In [18]:
!pip install nlpaug
!pip install googletrans
!pip install aiohttp
!pip install --upgrade googletrans==4.0.0-rc1
!pip install deepl


Collecting deepl
  Downloading deepl-1.21.0-py3-none-any.whl.metadata (30 kB)
Downloading deepl-1.21.0-py3-none-any.whl (38 kB)
Installing collected packages: deepl
Successfully installed deepl-1.21.0


In [1]:
import pandas as pd
# Location of the emotion-labelled corpus on the mounted Drive.
emotions_csv_path = '/content/drive/MyDrive/Module_2/data_set/processed_data/emotions.csv'

# Load the corpus and display it as the cell output.
df = pd.read_csv(emotions_csv_path)
df

Unnamed: 0,text,label,Detected Emotion
0,im a single person with an overflowing hoarder...,hoarding disorder,sadness
1,my grandmother has been a hoarder her entire l...,hoarding disorder,sadness
2,hoarding itself is not such a bad thing its th...,hoarding disorder,anger
3,i know eight hoarders i met them in my travels...,hoarding disorder,sadness
4,gad generalised anxiety disorder is the most c...,hoarding disorder,fear
...,...,...,...
76227,the difference between a narcissist and a psyc...,personality disorder,fear
76228,hey everyone looking for thoughtsadviceassista...,anxiety,fear
76229,what if i were god and i would ask you to dema...,eating disorder,sadness
76230,i want to kill her and him i want to watch her...,bpd,sadness


In [2]:
# Cross-tabulate rows: one row per detected emotion, one column per label,
# with 0 for combinations that never occur.
pair_sizes = df.groupby(['Detected Emotion', 'label']).size()
emotion_label_counts = pair_sizes.unstack(level='label', fill_value=0)
print(emotion_label_counts)

label             anxiety   bdd  bipolar   bpd  depression  eating disorder  \
Detected Emotion                                                              
anger                 245   388      470   708         418              344   
disgust                 8    85        7    15          21               77   
fear                 2705   964     1172   962         580             1106   
joy                   305   451      552   445         404              665   
neutral               110   344      239   237          73              369   
sadness               945  1384     1663  1874        2887             1627   
surprise              205   346      420   282         140              335   

label             hoarding disorder  mentalillness  normal   ocd  offmychest  \
Detected Emotion                                                               
anger                           691            498     341   445         783   
disgust                         140             

In [3]:
# Let pandas pick the display width automatically.
pd.set_option('display.width', None)

# Rows labelled "depression" whose detected emotion is "joy".
is_depression = df['label'] == 'depression'
is_joy = df['Detected Emotion'] == 'joy'
filtered_df = df[is_depression & is_joy]
filtered_df

Unnamed: 0,text,label,Detected Emotion
8607,hopefully gonna be dead within a few days enjo...,depression,joy
8753,i see a lot of people talk about how they want...,depression,joy
9038,last words here if i do not make it keep stron...,depression,joy
9243,today i woke up washed up fed my dog and cuddl...,depression,joy
9369,ironic is not it it is not that i do not love ...,depression,joy
...,...,...,...
75605,hello i need some help i just keep thinking th...,depression,joy
75806,i just want someone to tell me it is alright i...,depression,joy
75851,lol im so tired of this shit face_with_tears_...,depression,joy
75901,step one is sleep i am not kidding or being fi...,depression,joy


In [4]:
# List the distinct values of the `label` column.
df['label'].unique()

array(['hoarding disorder', 'bdd', 'bipolar', 'ocd', 'anxiety', 'stress',
       'schizophrenia', 'panic_disorder', 'depression',
       'personality disorder', 'offmychest', 'bpd', 'suicidal',
       'mentalillness', 'ptsd', 'normal', 'eating disorder'], dtype=object)

In [5]:
# Count rows per detected emotion; these counts drive the resampling targets below.
df['Detected Emotion'].value_counts()

Unnamed: 0_level_0,count
Detected Emotion,Unnamed: 1_level_1
sadness,25157
fear,22838
anger,8464
joy,8256
surprise,5652
neutral,5194
disgust,671


In [6]:
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
# Randomly undersample the four most frequent emotions down to a common size.
target_count = 6350
high_count_emotions = ['sadness', 'fear', 'anger', 'joy']
df_high = df[df['Detected Emotion'].isin(high_count_emotions)]

# Split into the feature columns and the class column expected by imblearn.
X_high = df_high.drop(columns=['Detected Emotion'])
y_high = df_high['Detected Emotion']

undersampler = RandomUnderSampler(
    sampling_strategy=dict.fromkeys(high_count_emotions, target_count),
    random_state=42,
)
X_resampled, y_resampled = undersampler.fit_resample(X_high, y_high)

# Re-attach the class column to the resampled features.
df_downsampled = pd.DataFrame(X_resampled, columns=X_high.columns)
df_downsampled['Detected Emotion'] = y_resampled
df_downsampled


Unnamed: 0,text,label,Detected Emotion
19019,my ex dumped me because he does not understand...,offmychest,anger
41576,recently a tik tok has been appearing on my fo...,personality disorder,anger
5240,hoarding itself is not such a bad thing its th...,hoarding disorder,anger
29701,lately i have been feeling an overwhelming urg...,anxiety,anger
6259,you have to make her listen i had signs of thi...,hoarding disorder,anger
...,...,...,...
65407,lamictal stoopids i could not retrieve the wo...,bipolar,sadness
71934,i feel like people took away my freedom i am h...,depression,sadness
53853,im not doing well at all im completely alone a...,bpd,sadness
36471,when i was first diagnosed my mom forced me in...,schizophrenia,sadness


In [7]:
# Check the per-emotion row counts after undersampling.
df_downsampled['Detected Emotion'].value_counts()

Unnamed: 0_level_0,count
Detected Emotion,Unnamed: 1_level_1
anger,6350
fear,6350
joy,6350
sadness,6350


In [8]:
from imblearn.over_sampling import RandomOverSampler
import pandas as pd

# Randomly oversample the two under-represented emotions up to a common size.
low_count_emotions = ['surprise', 'neutral']
target_count = 6250
sampling_strategy = dict.fromkeys(low_count_emotions, target_count)

df_low = df[df['Detected Emotion'].isin(low_count_emotions)]
y_low = df_low['Detected Emotion']
X_low = df_low.drop(columns=['Detected Emotion'])

oversampler = RandomOverSampler(sampling_strategy=sampling_strategy, random_state=42)
X_resampled, y_resampled = oversampler.fit_resample(X_low, y_low)

# Rebuild a single frame: resampled features plus their class column.
df_upsampled = pd.DataFrame(X_resampled, columns=X_low.columns)
df_upsampled['Detected Emotion'] = y_resampled


In [9]:
# Check the per-emotion row counts after oversampling.
df_upsampled['Detected Emotion'].value_counts()

Unnamed: 0_level_0,count
Detected Emotion,Unnamed: 1_level_1
surprise,6250
neutral,6250


In [10]:
# Stack the undersampled and oversampled frames, shuffle the rows with a
# fixed seed, and renumber the index from 0.
df_balanced = (
    pd.concat([df_downsampled, df_upsampled])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

print(df_balanced['Detected Emotion'].value_counts())


Detected Emotion
anger       6350
fear        6350
sadness     6350
joy         6350
surprise    6250
neutral     6250
Name: count, dtype: int64


# Data Augmentation

In [1]:
import time
import random
import torch
import pandas as pd
import re
import nlpaug.augmenter.word as naw
from tqdm import tqdm
from deep_translator import GoogleTranslator
from transformers import T5Tokenizer, T5ForConditionalGeneration
import nltk
# Seed Python's RNG: back_translate() picks a pivot language with
# random.choice and the augmentation loop uses random.sample, so without a
# seed the augmented set changes on every run (the resampling cells already
# pin random_state=42).
random.seed(42)

# POS-tagger data downloaded for NLTK (quietly).
nltk.download('averaged_perceptron_tagger', quiet=True)

# T5 tokenizer and model used by paraphrase().
tokenizer = T5Tokenizer.from_pretrained("t5-small", legacy=False)
model = T5ForConditionalGeneration.from_pretrained("t5-small")

# WordNet-based synonym replacement augmenter.
syn_aug = naw.SynonymAug(aug_src='wordnet', verbose=0)

def clean_text(text):
    """Strip HTML tags and every character that is not a letter, digit or whitespace.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string with leading and trailing whitespace removed.
    """
    # Patterns are applied in order: tags first, then remaining special characters.
    removal_patterns = (
        r'<[^>]*>',
        r'[^A-Za-z0-9\s]',
    )
    cleaned = text
    for pattern in removal_patterns:
        cleaned = re.sub(pattern, '', cleaned)
    return cleaned.strip()

def paraphrase(text):
    """Generate a paraphrase of ``text`` with the module-level T5 model.

    The input is prefixed with ``"paraphrase: "``, encoded (truncated to 512
    tokens) and decoded from a 5-beam search. The model can echo the task
    prefix back (outputs such as ``"paraphrase: ..."`` or a bare
    ``"Paraphrase"``), so that prefix is stripped from the result.

    Args:
        text: The text to paraphrase.

    Returns:
        The generated paraphrase, or the original ``text`` when generation
        fails or produces nothing but the echoed prompt.
    """
    try:
        input_text = f"paraphrase: {text}"
        encoding = tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True)
        # NOTE(review): no output length is passed to generate(), so the
        # library default applies - confirm it is long enough for full posts.
        output = model.generate(encoding, num_beams=5, num_return_sequences=1, no_repeat_ngram_size=2, early_stopping=True)
        decoded = tokenizer.decode(output[0], skip_special_tokens=True)
    except Exception:
        return text  # Best effort: return original text in case of error

    # Drop an echoed task prompt. It is only removed when followed by a colon
    # or when it is the entire output, so ordinary sentences that merely
    # start with the word "paraphrase" are left alone.
    cleaned = re.sub(r'^\s*paraphrase\s*(?::\s*|$)', '', decoded, flags=re.IGNORECASE).strip()
    return cleaned if cleaned else text

def back_translate(text, retries=3, delay=3):
    """Back-translate ``text`` through a randomly chosen pivot language.

    The text is translated from English to one of German, French, Spanish or
    Italian and then back to English. The same pivot language is used for
    every retry.

    Args:
        text: The English text to augment.
        retries: Maximum number of translation attempts.
        delay: Seconds to wait between failed attempts.

    Returns:
        The back-translated text, or ``text`` unchanged when it is empty,
        not a string, the translator returns nothing, or every attempt fails.
    """
    # Non-string values (e.g. NaN from a DataFrame) and blank text are passed through.
    if not isinstance(text, str) or not text.strip():
        return text

    languages = ['de', 'fr', 'es', 'it']
    chosen_lang = random.choice(languages)

    for attempt in range(retries):
        try:
            translated = GoogleTranslator(source='en', target=chosen_lang).translate(text)
            back_translated = GoogleTranslator(source=chosen_lang, target='en').translate(translated)
            return back_translated if back_translated else text
        except Exception:
            # Only wait when another attempt will follow; sleeping after the
            # final failure just slows the augmentation loop down.
            if attempt < retries - 1:
                time.sleep(delay)
    return text  # Return original text if retries fail
# Build up to two augmented variants (back-translation, T5 paraphrase, WordNet
# synonym replacement) for every row of `filtered_df`.
# NOTE(review): `filtered_df` is whatever an earlier cell last assigned; make
# sure the intended subset is selected before running this cell.
augmented_texts, augmented_labels, augmented_emotions = [], [], []
failed_calls = 0


def _as_variation(candidate, original_text):
    """Normalise one augmenter result; return None when it is unusable."""
    # Some nlpaug releases return a list of strings from augment() rather
    # than a single string; unwrap it so the text column only holds strings.
    if isinstance(candidate, (list, tuple)):
        candidate = candidate[0] if candidate else None
    if not isinstance(candidate, str):
        return None
    candidate = candidate.strip()
    # back_translate()/paraphrase() fall back to the input on failure; an
    # unchanged copy is not an augmentation, so it is discarded.
    if not candidate or candidate == original_text.strip():
        return None
    return candidate


augmenters = (back_translate, paraphrase, syn_aug.augment)

# Carry the emotion column along when the source frame has one, so the
# result can be concatenated with the emotion-labelled frames later on.
has_emotion = 'Detected Emotion' in filtered_df.columns
emotions = filtered_df['Detected Emotion'].tolist() if has_emotion else [None] * len(filtered_df)
rows = zip(filtered_df['text'].tolist(), filtered_df['label'].tolist(), emotions)

for original_text, label, emotion in tqdm(rows, desc="Processing Augmentation", total=len(filtered_df)):
    # Nothing to augment for missing or blank text.
    if not isinstance(original_text, str) or not original_text.strip():
        continue

    variations = []
    for augment in augmenters:
        try:
            candidate = _as_variation(augment(original_text), original_text)
        except Exception:
            # Best effort: one failing augmenter must not stop the run, but
            # KeyboardInterrupt is no longer swallowed as with a bare except.
            failed_calls += 1
            continue
        if candidate is not None and candidate not in variations:
            variations.append(candidate)

    selected_variations = random.sample(variations, min(len(variations), 2))  # Pick up to 2 variations
    for var in selected_variations:
        augmented_texts.append(var)
        augmented_labels.append(label)
        augmented_emotions.append(emotion)

augmented_df = pd.DataFrame({
    'text': augmented_texts,
    'label': augmented_labels
})
if has_emotion:
    augmented_df['Detected Emotion'] = augmented_emotions

# Nothing is written to disk in this cell, so the message no longer claims it.
print(f"Augmentation complete. {len(augmented_df)} samples generated ({failed_calls} augmenter calls failed).")


AttributeError: module 'pyarrow' has no attribute '__version__'

In [27]:
# Load previously generated augmented samples from Drive.
augmented_csv_path = '/content/drive/MyDrive/Module_2/data_set/processed_data/augmented_data.csv'
augmented_df = pd.read_csv(augmented_csv_path)

In [28]:
# Display the augmented samples that were just loaded.
augmented_df

Unnamed: 0,text,Detected Emotion,label
0,Hi assumed that many of us have a certain tend...,disgust,hoarding disorder
1,paraphrase: hi assuming that many of us have a...,disgust,hoarding disorder
2,her childhood scarred her into believing that ...,disgust,hoarding disorder
3,My only friend comes to mind for this question...,disgust,hoarding disorder
4,My only friend comes to mind for this question...,disgust,hoarding disorder
...,...,...,...
1337,"Yes, do they do it?\nThe best thing is honesty...",disgust,eating disorder
1338,book the deserters tells of how poorly allied ...,disgust,ptsd
1339,The book that the deserters tell how badly all...,disgust,ptsd
1340,Paraphrase,disgust,eating disorder


In [41]:
# Keep only the rows whose detected emotion is "disgust".
is_disgust = df['Detected Emotion'].eq('disgust')
filtered_df = df[is_disgust]

In [42]:
# Combine the original "disgust" rows with their augmented variants.
# The disgust subset is rebuilt from `df` instead of reusing `filtered_df`,
# so re-running this cell does not append `augmented_df` a second time.
filtered_df = pd.concat([df[df['Detected Emotion'] == 'disgust'], augmented_df])

In [48]:
# Display the combined original + augmented rows.
filtered_df

Unnamed: 0,text,label,Detected Emotion
5,hi assuming that many of us have a certain ten...,hoarding disorder,disgust
39,my one friend comes to mind for this question ...,hoarding disorder,disgust
53,my one friend comes to mind for this question ...,hoarding disorder,disgust
175,this is it this is the worst roomive never had...,hoarding disorder,disgust
189,animal hoarding usually starts because a perso...,hoarding disorder,disgust
...,...,...,...
1336,if someone with mental illness deserves to be ...,eating disorder,disgust
1337,"Yes, do they do it?\nThe best thing is honesty...",eating disorder,disgust
1338,book the deserters tells of how poorly allied ...,ptsd,disgust
1339,The book that the deserters tell how badly all...,ptsd,disgust


In [47]:
# Remove artefacts left by the T5 paraphraser echoing its task prompt.
# 1) Drop rows whose whole text is just the prompt word ("Paraphrase",
#    "paraphrase:", ...), not only the exact string 'Paraphrase'.
prompt_only = filtered_df['text'].astype(str).str.fullmatch(r'\s*paraphrase\s*:?\s*', case=False)
filtered_df = filtered_df[~prompt_only]

# 2) Strip a leading "paraphrase:" prefix from the remaining texts so the
#    prompt does not leak into the training data.
stripped_text = filtered_df['text'].str.replace(r'(?i)^\s*paraphrase\s*:\s*', '', regex=True)
# .str yields NaN for non-string cells; keep the original value for those.
filtered_df = filtered_df.assign(text=stripped_text.where(stripped_text.notna(), filtered_df['text']))

In [49]:
# Display the rows again after the 'Paraphrase' artefacts were removed.
filtered_df

Unnamed: 0,text,label,Detected Emotion
5,hi assuming that many of us have a certain ten...,hoarding disorder,disgust
39,my one friend comes to mind for this question ...,hoarding disorder,disgust
53,my one friend comes to mind for this question ...,hoarding disorder,disgust
175,this is it this is the worst roomive never had...,hoarding disorder,disgust
189,animal hoarding usually starts because a perso...,hoarding disorder,disgust
...,...,...,...
1336,if someone with mental illness deserves to be ...,eating disorder,disgust
1337,"Yes, do they do it?\nThe best thing is honesty...",eating disorder,disgust
1338,book the deserters tells of how poorly allied ...,ptsd,disgust
1339,The book that the deserters tell how badly all...,ptsd,disgust


In [24]:
# NOTE(review): `df_disgust` is not assigned in any visible cell, so this
# display would raise NameError on a fresh kernel unless it is defined elsewhere.
# Presumably it is df[df['Detected Emotion'] == 'disgust'] - confirm and define it.
df_disgust

Unnamed: 0,text,label,Detected Emotion
5,hi assuming that many of us have a certain ten...,hoarding disorder,disgust
39,my one friend comes to mind for this question ...,hoarding disorder,disgust
53,my one friend comes to mind for this question ...,hoarding disorder,disgust
175,this is it this is the worst roomive never had...,hoarding disorder,disgust
189,animal hoarding usually starts because a perso...,hoarding disorder,disgust
...,...,...,...
74795,hi everyone\n\nrecently got broken yo with by ...,eating disorder,disgust
75083,my husband cheated and the girl is pregnant ye...,offmychest,disgust
75107,yes they do you see the thing with an eating d...,eating disorder,disgust
75302,the book the deserters tells of how poorly all...,ptsd,disgust


In [50]:
# Prepend the (augmented) disgust rows to the balanced frame.
# Any disgust rows already present in `df_balanced` are dropped first, so
# re-running this cell does not stack a second copy of `filtered_df`; on the
# first run `df_balanced` holds no disgust rows and the result is unchanged.
df_balanced = pd.concat([filtered_df, df_balanced[df_balanced['Detected Emotion'] != 'disgust']])

In [51]:
# Check the per-emotion row counts after adding the disgust rows.
df_balanced['Detected Emotion'].value_counts()

Unnamed: 0_level_0,count
Detected Emotion,Unnamed: 1_level_1
anger,6350
fear,6350
sadness,6350
joy,6350
surprise,6250
neutral,6250
disgust,1987


In [52]:
# Persist the balanced dataset to Drive without writing the index column.
balanced_csv_path = '/content/drive/MyDrive/Module_2/data_set/processed_data/balanced_emotions.csv'
df_balanced.to_csv(balanced_csv_path, index=False)