# In this notebook we perform exploratory data analysis (EDA) from which some insights can be drawn. The data has been augmented using the Google API for back-translation and crossover tools.

In [None]:
# Fetch the project repository; a later cell imports helpers from `src.utils`
# (presumably provided by this repository - TODO confirm).
!git clone https://github.com/OopsWrongCode/nlp-project.git

In [None]:
# Switch into the cloned repository so that `src` can be imported from the working directory.
%cd nlp-project/

In [None]:
# `nlpaug` is only referenced by the commented-out augmentation cells further down;
# `regex` is not imported by this notebook directly (presumably needed by src.utils - TODO confirm).
# NOTE(review): versions are not pinned, so a re-run may pick up different releases.
%pip install regex nlpaug

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
import warnings
import torch

# NOTE(review): this silences every warning for the rest of the session,
# including deprecation warnings that may point at real problems in later cells.
warnings.filterwarnings('ignore')

import nltk
# Download NLTK's 'popular' resource collection; later cells use
# nltk.corpus.stopwords and nltk.tokenize.word_tokenize.
nltk.download('popular')

In [None]:
# Import exactly the helpers this notebook uses instead of `import *`, so it is
# clear where each name comes from and nothing gets shadowed silently.
from src.utils import clean_for_bert, extract_clean_words, loader

# loader() yields the directory holding the dataset files (it is passed to
# os.listdir / os.path.join below).
PATH = loader()
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# List the dataset files. As the last expression of the cell the listing is
# actually displayed; in the middle of the cell its result was discarded.
os.listdir(PATH)

In [None]:
def _split_path(keyword, fallback_index):
    """Return the path of the dataset file that belongs to one split.

    os.listdir() returns entries in arbitrary, filesystem-dependent order, so
    picking a split purely by position can silently swap train/validation/test.
    The file is therefore looked up by name first.

    Parameters:
        keyword: lowercase substring expected in the file name ('train', 'test', 'val').
        fallback_index: position in os.listdir(PATH) used when the name lookup
            is not conclusive (zero or several matches); this is the original
            positional behaviour.

    Returns:
        Full path (PATH joined with the chosen file name).
    """
    names = os.listdir(PATH)
    matches = [name for name in names if keyword in name.lower()]
    if len(matches) == 1:
        return os.path.join(PATH, matches[0])
    return os.path.join(PATH, names[fallback_index])


val_file_path = _split_path('val', 0)
test_file_path = _split_path('test', 1)
train_file_path = _split_path('train', 2)

In [None]:
# The split files are ';'-separated and carry no header row.
train, valid, test = (
    pd.read_csv(split_path, sep=';', header=None)
    for split_path in (train_file_path, val_file_path, test_file_path)
)

In [None]:
# Give the two positional columns readable names; the same mapping applies to every split.
column_names = {0 : 'text', 1 : 'label'}
train = train.rename(columns=column_names)
test = test.rename(columns=column_names)
valid = valid.rename(columns=column_names)

In [None]:
# Preview the first rows of the training split.
train.head(n=5)

In [None]:
# Preview the first rows of the test split.
test.head(n=5)

In [None]:
# Report (rows, columns) of every split, one per line.
shape_report = "\n".join([
    f"Shape of train : {train.shape}",
    f"Shape of validation : {valid.shape}",
    f"Shape of test : {test.shape}",
])
print(shape_report)

In [None]:
# Flag rows whose text already occurred earlier in the training split
# (the first occurrence is kept), report how many there are, then drop them.
train_dup_mask = train.duplicated(subset=['text'])
n_train_dups = int(train_dup_mask.sum())
train_dup_pct = round(n_train_dups / len(train) * 100, 2)
print(f"Number of duplicates: {n_train_dups} ({train_dup_pct}%)")
train = train[~train_dup_mask]

In [None]:
# Same duplicate report and removal for the validation split
# (first occurrence of each text is kept).
valid_dup_mask = valid.duplicated(subset=['text'])
n_valid_dups = int(valid_dup_mask.sum())
valid_dup_pct = round(n_valid_dups / len(valid) * 100, 2)
print(f"Number of duplicates: {n_valid_dups} ({valid_dup_pct}%)")
valid = valid[~valid_dup_mask]

In [None]:
# Relative frequency of each label in the training split, rounded to 4 decimals.
label_share = train['label'].value_counts(normalize=True)
label_share.round(4)

In [None]:
# import nlpaug.augmenter.word as naw

# # https://nlpaug.readthedocs.io/en/latest/augmenter/word/context_word_embs.html

# def compute_augment_size(label, train_data, max_augment=10, min_augment=0, base_multiplier=1.0):
#     class_freq = train_data['label'].value_counts(normalize=True)
    
#     max_freq = class_freq.max()
#     current_freq = class_freq[label]
    
#     augment_factor = max_freq / current_freq
#     augment_size = min(int(augment_factor * base_multiplier), max_augment)
    
#     if current_freq > 0.8 * max_freq: # 0 for others
#         augment_size = min_augment
    
#     return max(augment_size, min_augment)

# def data_augmentation(text, label, train_data, max_augment=10, base_multiplier=1.0):
#     augment_size = compute_augment_size(label, train_data, max_augment, min_augment=0, base_multiplier=base_multiplier)
    
#     aug = naw.ContextualWordEmbsAug(
#         model_path='roberta-base',
#         model_type='roberta',
#         action='substitute',
#         device=str(DEVICE),
#         top_k=50,
#         aug_max=5
#     )
    
#     augmented_texts = []
#     for _ in range(augment_size):
#         aug_text = aug.augment(text)[0]
#         augmented_texts.append(aug_text)
    
#     return augmented_texts

# def augment_dataset(train_data, labels_to_augment=['anger', 'fear', 'love', 'surprise'], max_augment=10, base_multiplier=1.0):
#     augmented_texts = []
#     augmented_labels = []
    
#     for label in labels_to_augment:
#         label_df = train_data[train_data['label'] == label]
#         print(f"Augmenting label: {label} ({len(label_df)} samples)")
        
#         for _, row in label_df.iterrows():
#             new_texts = data_augmentation(row['text'], label, train_data, max_augment, base_multiplier)
#             augmented_texts.extend(new_texts)
#             augmented_labels.extend([label] * len(new_texts))
        
#         print(f"Label {label} augmented successfully! Added {len(label_df) * compute_augment_size(label, train_data, max_augment, base_multiplier=base_multiplier)} samples.")
    
#     augmented_df = pd.DataFrame({
#         'text': augmented_texts,
#         'label': augmented_labels
#     })
    
#     train_augmented = pd.concat([train_data, augmented_df], ignore_index=True)
    
#     print(f"\nRows after augmentation: {len(train_augmented)}\n")
#     print("New class distribution:")
#     print(train_augmented['label'].value_counts(normalize=True))
    
#     return train_augmented

In [None]:
# source: https://www.kaggle.com/code/mujrush/data-augmentation-by-back-translation
# %pip install googletrans==4.0.0-rc1

In [None]:
# from googletrans import Translator
# from tqdm import tqdm

# # STEP 1: FIND APPROPRIATE SIZE OF AUGMENTATION
# def compute_augment_size(label, train_data, max_augment=10, min_augment=0, base_multiplier=1.0): 
#     class_freq = train_data['label'].value_counts(normalize=True)
#     max_freq = class_freq.max()
#     current_freq = class_freq[label]
    
#     augment_factor = max_freq / current_freq
#     augment_size = min(int(augment_factor * base_multiplier), max_augment)

#     if current_freq > 0.8 * max_freq:
#         augment_size = min_augment

#     return max(augment_size, min_augment)

# # STEP 2: MAIN FUNC
# def back_translate_text(text, translator, src_lang='en', via_lang='fr', max_len=512):
#     try:
#         if len(text) > max_len:
#             text = text[:max_len]

#         via = translator.translate(text, dest=via_lang).text
#         back = translator.translate(via, dest=src_lang).text
#         return back
#     except Exception as e:
#         print(f"Translation error: {e}")
#         return None

# # STEP 3: MAIN BODY
# def back_translation(train_data, labels_to_augment=['anger', 'fear', 'love', 'surprise'], max_augment=10, base_multiplier=1.0):
#     translator = Translator()
#     augmented_texts = []
#     augmented_labels = []

#     for label in labels_to_augment:
#         label_df = train_data[train_data['label'] == label]
#         augment_size = compute_augment_size(label, train_data, max_augment, base_multiplier=base_multiplier)

#         if augment_size == 0:
#             continue

#         n_to_augment = min(len(label_df), augment_size * len(label_df))
#         label_subset = label_df.sample(n=n_to_augment, random_state=42)

#         print(f"Augmenting label: {label} ({len(label_df)} samples), will augment: {len(label_subset)}")

#         for _, row in tqdm(label_subset.iterrows(), total=len(label_subset), desc=f"Back-translating {label}"):
#             orig_text = row['text']
#             translated = back_translate_text(orig_text, translator)
#             if translated:
#                 augmented_texts.append(translated)
#                 augmented_labels.append(label)

#     augmented_df = pd.DataFrame({'text': augmented_texts, 'label': augmented_labels})
#     train_augmented = pd.concat([train_data, augmented_df], ignore_index=True)
#     train_augmented.drop_duplicates(subset=['text', 'label'], inplace=True)

#     print(f"\nRows after augmentation: {len(train_augmented)}\n")
#     print("New class distribution:")
#     print(train_augmented['label'].value_counts(normalize=True))

#     return train_augmented


In [None]:
# train = back_translation(train, max_augment=10, base_multiplier=1.2)

In [None]:
# train_augmented = augment_dataset(train, labels_to_augment=['anger', 'fear', 'love', 'surprise'], max_augment=10, base_multiplier=1.0)
# train = train_augmented

In [None]:
# import nlpaug.augmenter.word as naw 

# # https://nlpaug.readthedocs.io/en/latest/augmenter/word/context_word_embs.html

# def compute_augment_size(label):
#     class_counts = train['label'].value_counts(normalize=True)
#     max_freq = class_counts.max()
#     current_freq = class_counts[label]
#     augment_size = int((max_freq - current_freq) * 100)
    
#     return max(augment_size, 1)


# def data_augmentation(text, label):
#     augment_size = compute_augment_size(label)
#     aug = naw.ContextualWordEmbsAug(
#         model_path='roberta-base',
#         model_type='roberta',
#         action='substitute',
#         device=str(DEVICE),
#         top_k=100,
#         aug_max=augment_size
#     )
#     return aug.augment(text)

In [None]:
# augmented_texts = []
# augmented_labels = []

# LABELS = ['anger', 'fear', 'love', 'surprise']

# for label in LABELS:
#     label_df = train[train['label'] == label]

#     for _, row in label_df.iterrows():
#         new_texts = data_augmentation(row['text'], label)
#         augmented_texts.extend(new_texts)
#         augmented_labels.extend([label] * len(new_texts))
#     print(f"Label {label} augmented successfully!")

In [None]:

# augmented_df = pd.DataFrame({
#     'text': augmented_texts,
#     'label': augmented_labels
# })

# augmented_df

In [None]:
# df.iloc[55]['text']

In [None]:
# import src.utils
# print(src.utils.re.__name__)

In [None]:
# Run the same two-step cleaning (clean_for_bert, then extract_clean_words)
# over the text column of every split, in the order train -> test -> valid.
for split_df in (train, test, valid):
    split_df['text'] = split_df['text'].apply(clean_for_bert).apply(extract_clean_words)

In [None]:
# Horizontal bar chart: number of training texts per label, smallest class first.
fig, ax = plt.subplots(figsize=(8,6))
texts_per_label = train.groupby('label').text.count().sort_values()
# First three bars grey, next three dark blue.
colors = ['grey','grey','grey','darkblue','darkblue','darkblue']
texts_per_label.plot.barh(ylim=0, color=colors, title= 'COUNT OF EACH CATEGORIES', ax=ax)
ax.set_xlabel('Number of Texts', fontsize = 10)

In [None]:
# #Visualize the data
# fig, ax = plt.subplots(figsize=(12,8))

# train['label'].value_counts(sort=True).plot(kind='bar', color='mediumseagreen', fontsize = 16)
# x = set(train['label'])

# default_x_ticks = range(len(x))

# plt.xticks(default_x_ticks, x, rotation=0, fontsize = 16)
# plt.title('Target distribution', fontsize = 20)
# plt.xlabel('Labels', fontsize = 20)
# plt.ylabel('Number of MBTIs', fontsize = 20)

# plt.show()

In [None]:
def _token_count(text):
    """Number of tokens in `text` (a token list, or a string split on whitespace)."""
    return len(text.split()) if isinstance(text, str) else len(text)


def _char_length(text):
    """Number of characters in `text`; token lists are joined with single spaces first."""
    return len(text) if isinstance(text, str) else len(" ".join(text))


# Both columns used to be computed with the same expression (len of the cell
# value), so 'text_length' was an exact copy of 'token_count' and the two
# boxplots further down showed the same data. They now measure different things.
train['token_count'] = [_token_count(text) for text in train['text']]
train['text_length'] = [_char_length(text) for text in train['text']]

In [None]:
# Distribution of the per-text token count for each label.
fig, ax = plt.subplots(figsize=(16, 8))

sns.boxplot(x='label', y='token_count', data=train, ax=ax)
ax.set_title('Tokens distribution', fontsize = 20)
ax.set_xlabel('Labels', fontsize = 20)
# The y axis shows tokens per text, not a number of texts (the old label was misleading).
ax.set_ylabel('Number of tokens', fontsize = 20)
# Leave a little headroom above the largest value.
ax.set_ylim(0, train['token_count'].max() + 10)
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()

In [None]:
# Distribution of the 'text_length' column for each label.
fig, ax = plt.subplots(figsize=(16, 8))

sns.boxplot(data=train, x='label', y='text_length', ax=ax)
ax.set_title('Length distribution', fontsize = 20)
ax.set_xlabel('Labels', fontsize = 20)
ax.set_ylabel('Length', fontsize = 20)
# Leave a little headroom above the largest value.
upper_limit = train['text_length'].max() + 10
ax.set_ylim((0, upper_limit))
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()

In [None]:
# Pie chart of the label distribution in the training split.
# (The previous comment and variable name referred to an unrelated "embarked town" column.)
label_counts = train['label'].value_counts()
fig, ax = plt.subplots(figsize=(10,10))
ax.pie(label_counts, labels=label_counts.index, colors=sns.color_palette('Set3'), autopct='%1.1f%%', startangle=140)
ax.set_title('Pie chart of target variable')
plt.show()

In [None]:
sw1 = ["a","a's","able","about","above","according","accordingly","across","actually","after","afterwards","again","against","ain't","all","allow","allows","almost","alone","along","already","also","although","always","am","among","amongst","an","and","another","any","anybody","anyhow","anyone","anything","anyway","anyways","anywhere","apart","appear","appreciate","appropriate","are","aren't","around","as","aside","ask","asking","associated","at","available","away","awfully","b","be","became","because","become","becomes","becoming","been","before","beforehand","behind","being","believe","below","beside","besides","best","better","between","beyond","both","brief","but","by","c","c'mon","c's","came","can","can't","cannot","cant","cause","causes","certain","certainly","changes","clearly","co","com","come","comes","concerning","consequently","consider","considering","contain","containing","contains","corresponding","could","couldn't","course","currently","d","definitely","described","despite","did","didn't","different","do","does","doesn't","doing","don't","done","down","downwards","during","e","each","edu","eg","eight","either","else","elsewhere","enough","entirely","especially","et","etc","even","ever","every","everybody","everyone","everything","everywhere","ex","exactly","example","except","f","far","few","fifth","first","five","followed","following","follows","for","former","formerly","forth","four","from","further","furthermore","g","get","gets","getting","given","gives","go","goes","going","gone","got","gotten","greetings","h","had","hadn't","happens","hardly","has","hasn't","have","haven't","having","he","he's","hello","help","hence","her","here","here's","hereafter","hereby","herein","hereupon","hers","herself","hi","him","himself","his","hither","hopefully","how","howbeit","however","i","i'd","i'll","i'm","i've","ie","if","ignored","immediate","in","inasmuch","inc","indeed","indicate","indicated","indicates","inner","insofar","instead","into","inward","is","is
n't","it","it'd","it'll","it's","its","itself","j","just","k","keep","keeps","kept","know","known","knows","l","last","lately","later","latter","latterly","least","less","lest","let","let's","like","liked","likely","little","look","looking","looks","ltd","m","mainly","many","may","maybe","me","mean","meanwhile","merely","might","more","moreover","most","mostly","much","must","my","myself","n","name","namely","nd","near","nearly","necessary","need","needs","neither","never","nevertheless","new","next","nine","no","nobody","non","none","noone","nor","normally","not","nothing","novel","now","nowhere","o","obviously","of","off","often","oh","ok","okay","old","on","once","one","ones","only","onto","or","other","others","otherwise","ought","our","ours","ourselves","out","outside","over","overall","own","p","particular","particularly","per","perhaps","placed","please","plus","possible","presumably","probably","provides","q","que","quite","qv","r","rather","rd","re","really","reasonably","regarding","regardless","regards","relatively","respectively","right","s","said","same","saw","say","saying","says","second","secondly","see","seeing","seem","seemed","seeming","seems","seen","self","selves","sensible","sent","serious","seriously","seven","several","shall","she","should","shouldn't","since","six","so","some","somebody","somehow","someone","something","sometime","sometimes","somewhat","somewhere","soon","sorry","specified","specify","specifying","still","sub","such","sup","sure","t","t's","take","taken","tell","tends","th","than","thank","thanks","thanx","that","that's","thats","the","their","theirs","them","themselves","then","thence","there","there's","thereafter","thereby","therefore","therein","theres","thereupon","these","they","they'd","they'll","they're","they've","think","third","this","thorough","thoroughly","those","though","three","through","throughout","thru","thus","to","together","too","took","toward","towards","tried","tries","truly","try","trying","twice","t
wo","u","un","under","unfortunately","unless","unlikely","until","unto","up","upon","us","use","used","useful","uses","using","usually","uucp","v","value","various","very","via","viz","vs","w","want","wants","was","wasn't","way","we","we'd","we'll","we're","we've","welcome","well","went","were","weren't","what","what's","whatever","when","whence","whenever","where","where's","whereafter","whereas","whereby","wherein","whereupon","wherever","whether","which","while","whither","who","who's","whoever","whole","whom","whose","why","will","willing","wish","with","within","without","won't","wonder","would","wouldn't","x","y","yes","yet","you","you'd","you'll","you're","you've","your","yours","yourself","yourselves","z","zero"]
from nltk.corpus import stopwords

# Combine the hand-maintained list above (sw1) with NLTK's English stop words.
# Source of sw1: https://github.com/6/stopwords-json/blob/master/dist/en.json
sw1 = set(sw1)
sw = set(stopwords.words("english"))
STOPWORDS = sw1 | sw

# Further stop words can be obtained from spaCy:
# spacy.load('en_core_web_sm'), then nlp.Defaults.stop_words

In [None]:
#noisy words
# Noisy high-frequency words that are excluded from the keyword charts
# in addition to STOPWORDS.
UNWANTED_WORDS = set([
    "i", "it", "ur", "na", "omg", "people", "time", "good", "back", "gon", "day", "love", "happy", "lt", "kst", 'im', 'feel', 'feeling', 'like', 'ive'
])

# One chart per label: the 15 most frequent remaining words in that label's texts.
for label in train['label'].unique():

    # Each cell of 'text' is an iterable of tokens; flatten all texts of this
    # label into one string so NLTK can tokenize it.
    label_text = " ".join(
        " ".join(tokens) for tokens in train.loc[train['label'] == label, 'text']
    )
    words = nltk.tokenize.word_tokenize(label_text)

    # Compare case-insensitively against BOTH word sets. The stop-word check
    # used to be case-sensitive, so capitalised stop words slipped through
    # while the unwanted-word check was already lowercased.
    filtered_words = []
    for word in words:
        lowered = word.lower()
        if word.isalnum() and lowered not in STOPWORDS and lowered not in UNWANTED_WORDS:
            filtered_words.append(word)

    # most_common() gives a deterministic order for equally frequent words.
    top_keywords = nltk.FreqDist(filtered_words).most_common(15)
    label_keywords_df = pd.DataFrame(top_keywords, columns=['Word', 'Frequency'])

    fig, ax = plt.subplots(figsize=(20, 8))
    sns.barplot(y=label_keywords_df['Word'], x=label_keywords_df['Frequency'], orient='h', ax=ax, palette="magma")

    ax.set_title(f'Top 15 keywords in {label} target tweets', fontsize=15)
    ax.set_xlabel('Keyword Frequency')

    plt.tight_layout()
    plt.show()
    # Release the figure so that one figure per label does not accumulate in memory.
    plt.close(fig)

In [None]:
# Write every processed split to a CSV file in the current working directory
# (row index omitted).
# NOTE(review): `train` also carries the helper columns added during the EDA
# (token_count, text_length), so its schema differs from test/valid - confirm
# this is intended by downstream consumers.
for split_df, csv_name in ((train, 'train.csv'), (test, 'test.csv'), (valid, 'valid.csv')):
    split_df.to_csv(csv_name, index=False)

print('Done!')