# __Exploratory Data Analysis__

In [2]:
# Python base libraries
import os

# Data science libraries
import csv
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# NLP libraries
import nltk
from nltk.corpus import stopwords
from textblob import Word
from wordcloud import WordCloud
from nltk.stem import WordNetLemmatizer
import plotly.express as px

# Custom libraries
import sys
sys.path.append('..')
from functions.eda import *

### Load data

In [25]:
train_path = '../data/train.tsv'
test_path = '../data/test.tsv'
validation_path = '../data/validation.tsv'

columns = ['comment', 'label', 'id']

train = pd.read_csv(train_path, sep='\t', names=columns)
test = pd.read_csv(test_path, sep='\t', names=columns)
validation = pd.read_csv(validation_path, sep='\t', names=columns)

print(train.head())
print(test.head())
print(validation.head())

                                             comment label       id
0  My favourite food is anything I didn't have to...    27  eebbqej
1  Now if he does off himself, everyone will thin...    27  ed00q6i
2                     WHY THE FUCK IS BAYLESS ISOING     2  eezlygj
3                        To make her feel threatened    14  ed7ypvh
4                             Dirty Southern Wankers     3  ed0bdzj
                                             comment label       id
0  I’m really sorry about your situation :( Altho...    25  eecwqtt
1    It's wonderful because it's awful. At not with.     0  ed5f85d
2  Kings fan here, good luck to you guys! Will be...    13  een27c3
3  I didn't know that, thank you for teaching me ...    15  eelgwd1
4  They got bored from haunting earth for thousan...    27  eem5uti
                                             comment label       id
0  Is this in New Orleans?? I really feel like th...    27  edgurhb
1  You know the answer man, you are programmed t

- In the validation set we see that there are comments with more than one tag. This can complicate model training. Therefore, we have decided to do an explode: replicate those records as many times as necessary so that each replica has a single label. 

- But before we do that, let's check that there are no duplicates or null values in the original datasets.

### Check for duplicates

In [4]:
# Number of duplicate rows
print('Number of duplicate rows in the train dataset:', train.duplicated().sum())
print('Number of duplicate rows in the test dataset:', train.duplicated().sum())
print('Number of duplicate rows in the validation dataset:', train.duplicated().sum())

Number of duplicate rows in the train dataset: 0
Number of duplicate rows in the test dataset: 0
Number of duplicate rows in the validation dataset: 0


### Check datatypes and missing values

In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43410 entries, 0 to 43409
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   comment  43410 non-null  object
 1   label    43410 non-null  object
 2   id       43410 non-null  object
dtypes: object(3)
memory usage: 1017.6+ KB


In [6]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5427 entries, 0 to 5426
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   comment  5427 non-null   object
 1   label    5427 non-null   object
 2   id       5427 non-null   object
dtypes: object(3)
memory usage: 127.3+ KB


In [7]:
validation.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5426 entries, 0 to 5425
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   comment  5426 non-null   object
 1   label    5426 non-null   object
 2   id       5426 non-null   object
dtypes: object(3)
memory usage: 127.3+ KB


### Explode the datasets so that there is only 1 label per record

In [26]:
# First test with the validation df, as we can check already printing the first 5 rows.

# Split the labels in a new column
validation['labels_list'] = validation['label'].str.split(',')

# Explode the dataframe based on the labels_list column
validation = validation.explode('labels_list')

# Delete the label column
validation = validation.drop(columns = ['label'])

# Rename the labels_list column to 'label'
validation = validation.rename(columns = {'labels_list' : 'label'})

# Print the first 5 rows to check
validation.head()

Unnamed: 0,comment,id,label
0,Is this in New Orleans?? I really feel like th...,edgurhb,27
1,"You know the answer man, you are programmed to...",ee84bjg,4
1,"You know the answer man, you are programmed to...",ee84bjg,27
2,I've never been this sad in my life!,edcu99z,25
3,The economy is heavily controlled and subsidiz...,edc32e2,4


In [27]:
# Train dataset
train['labels_list'] = train['label'].str.split(',')
train = train.explode('labels_list')
train = train.drop(columns = ['label'])
train = train.rename(columns = {'labels_list' : 'label'})

# Test dataset
test['labels_list'] = test['label'].str.split(',')
test = test.explode('labels_list')
test = test.drop(columns = ['label'])
test = test.rename(columns = {'labels_list' : 'label'})

In [13]:
# Let's see how much the number of rows has increased after the explode
print("Number of rows in the train dataset:", train.shape[0])
print("Number of rows in the test dataset:", test.shape[0])
print("Number of rows in the validation dataset:", validation.shape[0])

Number of rows in the train dataset: 51103
Number of rows in the test dataset: 6329
Number of rows in the validation dataset: 6380


- We can now also convert the label column to integer type, which reduces memory usage and improves computational efficiency.

In [28]:
# Cast column label to integer type
train['label'] = train['label'].astype(int)
test['label'] = test['label'].astype(int)
validation['label'] = validation['label'].astype(int)

### Analyse class imbalance


In [29]:
# Compute the frequencies and relative frequencies of the column label, for each dataset
train_freqs = check_class_imbalance(train, 'label').rename(columns = {'Freq' : 'Freq_train', 'RelFreq' : 'FreqRel_train'})
test_freqs = check_class_imbalance(test, 'label').rename(columns = {'Freq' : 'Freq_test', 'RelFreq' : 'FreqRel_test'})
validation_freqs = check_class_imbalance(validation, 'label').rename(columns = {'Freq' : 'Freq_validation', 'RelFreq' : 'FreqRel_validation'})

# Merge the dataframes on the column label, in order to print all the results together
merged_df = train_freqs.merge(test_freqs, on='label', how='outer')
merged_df = merged_df.merge(validation_freqs, on='label', how='outer')
merged_df = merged_df.sort_values(by='label')
merged_df

Unnamed: 0,label,Freq_train,FreqRel_train,Freq_test,FreqRel_test,Freq_validation,FreqRel_validation
0,0,4130,0.080817,504,0.079633,488,0.076489
1,1,2328,0.045555,264,0.041713,303,0.047492
2,2,1567,0.030664,198,0.031285,195,0.030564
3,3,2470,0.048334,320,0.050561,303,0.047492
4,4,2939,0.057511,351,0.055459,397,0.062226
5,5,1087,0.021271,135,0.02133,153,0.023981
6,6,1368,0.026769,153,0.024174,152,0.023824
7,7,2191,0.042874,284,0.044873,248,0.038871
8,8,641,0.012543,83,0.013114,77,0.012069
9,9,1269,0.024832,151,0.023858,163,0.025549


- When training a model, an unbalanced dataset can lead to a bias in favor of the most frequent class. To avoid this, techniques such as up-sampling (adding samples from minority classes), down-sampling (reducing samples from majority classes) or a combination of both can be used.

- Label 27 does not correspond to any emotion, it is the 'neutral' class. This makes sense, since there may be many comments that do not reflect any emotion. However, it is necessary to remove many of these comments to ensure correct training of the models. To do this, we proceed to downsampling by removing a determined number of random records with label 27.

In [34]:
# Filter records with label 27
train_27_all = train[train['label'] == 27]
test_27_all = test[test['label'] == 27]
validation_27_all = validation[validation['label'] == 27]

# Select a random sample of the records with the label 27
train_27_sample = train_27_all.sample(n=10000, random_state=42) 
test_27_sample = test_27_all.sample(n=1300, random_state=42) 
validation_27_sample = validation_27_all.sample(n=1300, random_state=42) 

# Remove the selected sample from the original df
train = train.drop(train_27_sample.index)
test = test.drop(test_27_sample.index)
validation = validation.drop(validation_27_sample.index)

# Compute the frequencies and relative frequencies of the column label, for each dataset
train_freqs = check_class_imbalance(train, 'label').rename(columns = {'Freq' : 'Freq_train', 'RelFreq' : 'FreqRel_train'})
test_freqs = check_class_imbalance(test, 'label').rename(columns = {'Freq' : 'Freq_test', 'RelFreq' : 'FreqRel_test'})
validation_freqs = check_class_imbalance(validation, 'label').rename(columns = {'Freq' : 'Freq_validation', 'RelFreq' : 'FreqRel_validation'})

# Merge the dataframes on the column label, in order to print all the results together
merged_df = train_freqs.merge(test_freqs, on='label', how='outer')
merged_df = merged_df.merge(validation_freqs, on='label', how='outer')
merged_df = merged_df.sort_values(by='label')
merged_df

Unnamed: 0,label,Freq_train,FreqRel_train,Freq_test,FreqRel_test,Freq_validation,FreqRel_validation
0,0,4062,0.101319,495,0.101144,479,0.096826
1,1,2289,0.057095,257,0.052513,295,0.059632
2,2,1522,0.037964,192,0.039232,191,0.038609
3,3,2372,0.059165,301,0.061504,287,0.058015
4,4,2802,0.069891,333,0.068043,378,0.07641
5,5,1043,0.026016,133,0.027176,145,0.029311
6,6,1301,0.032451,144,0.029424,142,0.028704
7,7,2106,0.05253,268,0.054761,234,0.047301
8,8,622,0.015515,83,0.01696,77,0.015565
9,9,1221,0.030456,144,0.029424,158,0.031939


In [35]:
# Print the new number of rows after removal
print("Number of rows in the train dataset:", train.shape[0])
print("Number of rows in the test dataset:", test.shape[0])
print("Number of rows in the validation dataset:", validation.shape[0])

Number of rows in the train dataset: 40091
Number of rows in the test dataset: 4894
Number of rows in the validation dataset: 4947


### Text preprocessing

### Frecuency distribution

### Word cloud