In [2]:
import os
import re
import pandas as pd

# Directory containing the WordNet .txt exports.
txt_dir = r"C:\Users\neelb\Desktop\MEDD8925 - Analyzing Data Quantitatively\MEDD8925 Discord Project\WordNet"

# The CSVs are written to the parent of the WordNet folder (the project
# directory). That is where the keyword-classification cell loads
# all_wordnet_cleaned.csv from; writing relative to the kernel's working
# directory only lines up when the kernel happens to be started in that folder.
output_dir = os.path.dirname(txt_dir)

# Column order of the cleaned dataset. Passing it explicitly to the DataFrame
# constructor keeps the schema stable even when no entry was parsed.
ENTRY_COLUMNS = ["word", "category", "subcategory", "source_file"]

# Line grammar of the .txt files, as recognised by this parser:
#   CATEGORY     -> upper-case letters and spaces only
#   SUBCATEGORY  -> upper-case letters, digits, '.', '-', '_' and spaces
#   WORD line    -> headword followed by a count in parentheses, e.g. "ABDUCTED (1)"
CATEGORY_RE = re.compile(r'[A-Z ]+')
SUBCATEGORY_RE = re.compile(r'[A-Z0-9\.\-\_ ]+')
WORD_RE = re.compile(r'^([A-Z0-9\-_ ]+)\s+\(\d+\)$')


def parse_wordnet_lines(lines, source_file):
    """Turn the lines of one WordNet .txt file into a list of entry dicts.

    The file is read as a small state machine: a CATEGORY header sets the
    current category and clears the subcategory, a SUBCATEGORY header sets the
    current subcategory, and every WORD line emits one record tagged with the
    headers that are currently active.

    Parameters
    ----------
    lines : iterable of str
        Raw lines of the file (trailing newlines are fine).
    source_file : str
        Name stored in the ``source_file`` field of every record.

    Returns
    -------
    list of dict
        One dict per WORD line with the keys listed in ``ENTRY_COLUMNS``.
        ``category`` / ``subcategory`` are ``None`` until the corresponding
        header has been seen.
    """
    records = []
    category = None
    subcategory = None
    for raw_line in lines:
        line = raw_line.strip()
        if not line:
            continue

        # Category headers are tested first: they also satisfy the (wider)
        # subcategory pattern, so the order of these checks matters.
        if CATEGORY_RE.fullmatch(line):
            category = line
            subcategory = None  # a new category invalidates the old subcategory
            continue

        # A WORD line can never match here because '(' and ')' are not part of
        # the subcategory character class, so no extra ")" check is needed.
        if SUBCATEGORY_RE.fullmatch(line):
            subcategory = line
            continue

        # NOTE(review): headwords containing any other character (for example
        # an apostrophe or a period) do not match WORD_RE and are silently
        # skipped - confirm that this is intended for the source files.
        word_match = WORD_RE.match(line)
        if word_match:
            records.append({
                "word": word_match.group(1).strip(),
                "category": category,
                "subcategory": subcategory,
                "source_file": source_file
            })
    return records


# Find all relevant .txt files except LICENSE.txt. os.listdir returns names in
# arbitrary order; sorting makes the run reproducible, which matters because
# drop_duplicates below keeps the first occurrence (and thus its source_file).
txt_files = sorted(
    f for f in os.listdir(txt_dir)
    if f.lower().endswith('.txt') and "license" not in f.lower()
)

entries = []
for filename in txt_files:
    # 'utf-8-sig' decodes exactly like 'utf-8' but drops a leading byte-order
    # mark, which would otherwise stop the first header line from matching.
    with open(os.path.join(txt_dir, filename), encoding='utf-8-sig') as f:
        entries.extend(parse_wordnet_lines(f, filename))

# Build DataFrame (explicit columns: with zero parsed entries the frame would
# otherwise have no columns and drop_duplicates(subset=...) would raise KeyError)
df = pd.DataFrame(entries, columns=ENTRY_COLUMNS)

# Remove duplicates
df = df.drop_duplicates(subset=["word", "category", "subcategory"])

# Export the main cleaned dataset
df.to_csv(os.path.join(output_dir, "all_wordnet_cleaned.csv"), index=False)
print(f"Total entries (deduplicated): {len(df)}")

# Export a list of all unique categories and subcategories
categories = df[['category', 'subcategory']].drop_duplicates().sort_values(['category', 'subcategory'])
categories.to_csv(os.path.join(output_dir, "all_wordnet_categories.csv"), index=False)
print("Categories exported to all_wordnet_categories.csv")

# Optional: Show a few rows as preview
print(df.head(25))

Total entries (deduplicated): 172581
Categories exported to all_wordnet_categories.csv
                word    category subcategory  \
0           ABDUCTED  ADJECTIVES     ADJ.PPL   
1          ADSORBING  ADJECTIVES     ADJ.PPL   
2            AVENGED  ADJECTIVES     ADJ.PPL   
3         CALIBRATED  ADJECTIVES     ADJ.PPL   
4          CANTERING  ADJECTIVES     ADJ.PPL   
5         CARBONIZED  ADJECTIVES     ADJ.PPL   
6            CHARRED  ADJECTIVES     ADJ.PPL   
7   CLOSED-CAPTIONED  ADJECTIVES     ADJ.PPL   
8      CONTAINERISED  ADJECTIVES     ADJ.PPL   
9      CONTAINERIZED  ADJECTIVES     ADJ.PPL   
10         CONTESTED  ADJECTIVES     ADJ.PPL   
11            COOING  ADJECTIVES     ADJ.PPL   
12         CORBELLED  ADJECTIVES     ADJ.PPL   
13  COUNTERBALANCING  ADJECTIVES     ADJ.PPL   
14          CRUNCHED  ADJECTIVES     ADJ.PPL   
15           CURSING  ADJECTIVES     ADJ.PPL   
16           DEPOSED  ADJECTIVES     ADJ.PPL   
17           DIMPLED  ADJECTIVES     ADJ.PPL   
1

In [3]:
import pandas as pd
from nltk.corpus import stopwords
import nltk

# Make sure the NLTK stop-word corpus is available locally.
nltk.download('stopwords')

# --- File paths ---
wordnet_path = r"C:\Users\neelb\Desktop\MEDD8925 - Analyzing Data Quantitatively\MEDD8925 Discord Project\all_wordnet_cleaned.csv"
keywords_path = r"C:\Users\neelb\Desktop\MEDD8925 - Analyzing Data Quantitatively\MEDD8925 Discord Project\keywordlist.csv"
output_path = r"C:\Users\neelb\Desktop\MEDD8925 - Analyzing Data Quantitatively\MEDD8925 Discord Project\classified_keywords.csv"

# --- Load Data ---
# The join keys must be read verbatim. By default read_csv treats tokens such as
# "NA", "NULL", "null", "nan" or "None" as missing values, which would erase
# real words on both sides. Worse, pd.merge matches NaN keys to each other, so
# every keyword lost that way would be "classified" with the categories of the
# WordNet words that were lost the same way.
# For the WordNet table only empty category/subcategory cells stay NaN (they
# are written as empty cells when a word had no header above it).
df_wordnet = pd.read_csv(
    wordnet_path,
    dtype=str,
    keep_default_na=False,
    na_values={'category': [''], 'subcategory': ['']},
)

# The keyword file's schema is not known here, so it is first read with the
# default settings (other columns keep their inferred dtypes and NaNs) ...
df_keywords = pd.read_csv(keywords_path)
keyword_col = 'word' if 'word' in df_keywords.columns else df_keywords.columns[0]

# ... and then only the keyword column is re-read as raw text and swapped in.
# Both reads parse the same file, so the rows line up by position; to_numpy()
# makes the assignment positional and raises if the lengths ever differ.
raw_keywords = pd.read_csv(
    keywords_path,
    usecols=[keyword_col],
    dtype=str,
    keep_default_na=False,
)
df_keywords[keyword_col] = raw_keywords[keyword_col].to_numpy()

# --- Clean and Remove Stopwords ---
# The keyword column is guaranteed to hold strings now, so .str.lower() cannot
# fail on a column that pandas would otherwise have inferred as numeric.
df_keywords['lower_word'] = df_keywords[keyword_col].str.lower()
stop_words = set(stopwords.words('english'))
df_keywords = df_keywords[~df_keywords['lower_word'].isin(stop_words)]

# --- Prepare WordNet for Matching ---
df_wordnet['lower_word'] = df_wordnet['word'].str.lower()

# --- Merge/Join ---
# Left join: every remaining keyword is kept. A keyword that appears under
# several WordNet (category, subcategory) pairs yields one output row per pair;
# keywords without a match get NaN in the WordNet columns.
merged = pd.merge(df_keywords, df_wordnet, on='lower_word', how='left', suffixes=('_keyword', '_wordnet'))

# --- Save Output ---
merged.to_csv(output_path, index=False)
print(f"Classified keyword list saved to: {output_path}")

# Preview some results
print(merged[['lower_word', 'category', 'subcategory']].head(20))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\neelb\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Classified keyword list saved to: C:\Users\neelb\Desktop\MEDD8925 - Analyzing Data Quantitatively\MEDD8925 Discord Project\classified_keywords.csv
   lower_word    category         subcategory
0           n  ADJECTIVES            ADJ.PERT
1           n  ADJECTIVES             ADJ.ALL
2           n     ADVERBS             ADV.ALL
3           n       NOUNS           NOUN.TOPS
4           n       NOUNS       NOUN.ARTIFACT
5           n       NOUNS           NOUN.BODY
6           n       NOUNS  NOUN.COMMUNICATION
7           n       NOUNS         NOUN.OBJECT
8           n       NOUNS         NOUN.PERSON
9           n       NOUNS        NOUN.PROCESS
10          n       NOUNS       NOUN.QUANTITY
11          n       NOUNS       NOUN.RELATION
12          n       NOUNS      NOUN.SUBSTANCE
13          n       NOUNS           NOUN.TIME
14          e     ADVERBS             ADV.ALL
15          e       NOUNS  NOUN.COMMUNICATION
16          e       NOUNS       NOUN.QUANTITY
17          e       NOUNS