In [1]:
import pandas as pd
import re

In [2]:
# Input table of classified keywords. The path is relative to the notebook's
# working directory; point it at your own copy of the file if it lives elsewhere.
input_csv = 'refined_classified_keywords.csv'
df = pd.read_csv(input_csv)
# NOTE(review): with read_csv's default NA handling, literal tokens such as
# 'null' or 'nan' in the 'type' column would be loaded as NaN rather than as
# text — confirm whether the keyword list can contain such tokens.
df.head(5)

Unnamed: 0,type,freq_tar,range_tar,likelihood,effect,word,category,subcategory
0,like,1137753,735,16296.168,0.011,LIKE,ADJECTIVES,ADJ.ALL
1,im,1099153,736,15741.881,0.01,,,
2,welcome,784070,729,11221.085,0.007,WELCOME,ADJECTIVES,ADJ.ALL
3,get,624800,736,8938.409,0.006,GET,VERBS,VERB.BODY
4,oh,607688,734,8693.258,0.006,OH,NOUNS,NOUN.LOCATION


In [3]:
# A row survives the filter when its subcategory is 'NOUN.COMMUNICATION', or
# when no subcategory is recorded at all (NaN or an empty string).
subcategory = df['subcategory']
is_communication = subcategory == 'NOUN.COMMUNICATION'
is_unlabelled = subcategory.isnull() | (subcategory == '')
mask = is_communication | is_unlabelled
# .copy() gives filtered_df its own data, so columns added to it later do not
# write through to (or warn about) a slice of df.
filtered_df = df[mask].copy()
filtered_df.head()

Unnamed: 0,type,freq_tar,range_tar,likelihood,effect,word,category,subcategory
1,im,1099153,736,15741.881,0.01,,,
6,yes,542445,734,7758.75,0.005,YES,NOUNS,NOUN.COMMUNICATION
7,dont,517848,735,7406.509,0.005,,,
11,hi,442385,732,6326.092,0.004,HI,NOUNS,NOUN.COMMUNICATION
14,got,404171,736,5779.12,0.004,,,


In [4]:
def canonicalize_type(word):
    """Return the lower-cased string form of `word`, with 'bruh'/'good' variants folded.

    The value is first converted with str(), so non-string inputs are
    stringified (a float NaN becomes 'nan'). After lower-casing, exactly two
    spellings are rewritten:

    * 'n' + 'bruh' / 'n' + 'good'  -> 'bruh' / 'good'  (stray leading 'n')
    * 'bruh' + 'n' / 'good' + 'n'  -> 'bruh' / 'good'  (stray trailing 'n')

    Both patterns are anchored to the whole token, so anything else — including
    a token carrying an 'n' on both ends, or other words ending in 'n' — is
    returned unchanged apart from the lower-casing.
    """
    canonical = str(word).lower()
    # Leading-'n' form first, then trailing-'n' form. At most one of the two
    # can match a given token, since each requires the bare stem on the other side.
    for pattern in (r'^n(bruh|good)$', r'^(bruh|good)n$'):
        canonical = re.sub(pattern, r'\1', canonical)
    return canonical

# Compute the canonical spelling of every 'type' value, then store it as a new
# column; this is the key the variants are later grouped on.
canonical_types = filtered_df['type'].apply(canonicalize_type)
filtered_df['canonical_type'] = canonical_types
filtered_df.head(n=10)

Unnamed: 0,type,freq_tar,range_tar,likelihood,effect,word,category,subcategory,canonical_type
1,im,1099153,736,15741.881,0.01,,,,im
6,yes,542445,734,7758.75,0.005,YES,NOUNS,NOUN.COMMUNICATION,yes
7,dont,517848,735,7406.509,0.005,,,,dont
11,hi,442385,732,6326.092,0.004,HI,NOUNS,NOUN.COMMUNICATION,hi
14,got,404171,736,5779.12,0.004,,,,got
19,guys,319615,732,4569.182,0.003,,,,guys
25,hey,293683,735,4198.208,0.003,,,,hey
27,wanna,292759,734,4184.99,0.003,,,,wanna
34,gonna,237783,733,3398.675,0.002,,,,gonna
36,hello,235588,735,3367.284,0.002,HELLO,NOUNS,NOUN.COMMUNICATION,hello


In [5]:
# Columns that are added up across all rows sharing a canonical_type.
# Adjust this list if your numeric columns differ.
# NOTE(review): 'range_tar', 'likelihood' and 'effect' are summed exactly like
# the raw frequency. If range_tar counts texts and the other two are keyness
# statistics, they are not additive across merged variants — confirm that
# summing them is really what is wanted.
summed_columns = ['freq_tar', 'range_tar', 'likelihood', 'effect']

# Columns that are carried over from the group: 'first' takes the first
# non-missing value per column within each group (pandas' default behaviour).
# Add any additional columns you wish to keep here.
carried_columns = ['type', 'word', 'category', 'subcategory']

# Column -> reducer mapping, in the same order the columns should appear in the result.
agg_dict = {
    **dict.fromkeys(summed_columns, 'sum'),
    **dict.fromkeys(carried_columns, 'first'),
}

# Collapse to one row per canonical_type; as_index=False keeps canonical_type
# as an ordinary column instead of moving it into the index.
grouped = filtered_df.groupby('canonical_type', as_index=False)
result_df = grouped.agg(agg_dict)
result_df.head(10)

Unnamed: 0,canonical_type,freq_tar,range_tar,likelihood,effect,type,word,category,subcategory
0,aa,3840,682,54.856,0.0,aa,AA,NOUNS,NOUN.COMMUNICATION
1,aaah,1598,494,22.828,0.0,aaah,,,
2,aaahhh,2419,242,34.556,0.0,aaahhh,,,
3,aboutn,4795,685,68.499,0.0,aboutn,,,
4,abs,2981,566,42.585,0.0,abs,,,
5,abt,21268,709,303.835,0.0,abt,,,
6,abusing,2415,621,34.499,0.0,abusing,,,
7,accent,7601,702,108.584,0.0,accent,ACCENT,NOUNS,NOUN.COMMUNICATION
8,accountn,2200,578,31.428,0.0,accountn,,,
9,accounts,4888,695,69.827,0.0,accounts,,,


In [6]:
# Write the merged keyword table next to the notebook; index=False leaves the
# positional row index out of the file.
output_csv = 'filtered_keywords_cleaned.csv'
result_df.to_csv(output_csv, index=False)