In [1]:
import pandas as pd


In [2]:
# Load the raw training data and show its (rows, columns) shape.
# NOTE(review): the previews below print `id` in float notation (1.245025e+18). Values of
# that magnitude are beyond float64's exact-integer range (2**53), so IDs may be rounded.
# If exact IDs matter, consider pd.read_csv('train.csv', dtype={'id': str}) — TODO confirm
# against the raw CSV.
df_train = pd.read_csv('train.csv')
df_train.shape

(3235, 6)

In [3]:
# Count the missing values in each column of the training dataframe
df_train.isnull().sum()

id                 0
original_text      0
lang               4
retweet_count      4
original_author    0
sentiment_class    0
dtype: int64

In [4]:
df_train.head()

Unnamed: 0,id,original_text,lang,retweet_count,original_author,sentiment_class
0,1.245025e+18,Happy #MothersDay to all you amazing mothers o...,en,0,BeenXXPired,0
1,1.245759e+18,Happy Mothers Day Mum - I'm sorry I can't be t...,en,1,FestiveFeeling,0
2,1.246087e+18,Happy mothers day To all This doing a mothers ...,en,0,KrisAllenSak,-1
3,1.244803e+18,Happy mothers day to this beautiful woman...ro...,en,0,Queenuchee,0
4,1.244876e+18,Remembering the 3 most amazing ladies who made...,en,0,brittan17446794,-1


In [5]:
# lang, retweet_count and original_author play little role in sentiment analysis, so drop them.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this cell
# re-runnable: executing it a second time is a no-op rather than a KeyError.
df_train = df_train.drop(columns=['lang','retweet_count','original_author'], errors='ignore')


In [6]:
df_train.shape

(3235, 3)

In [7]:
df_train.head()

Unnamed: 0,id,original_text,sentiment_class
0,1.245025e+18,Happy #MothersDay to all you amazing mothers o...,0
1,1.245759e+18,Happy Mothers Day Mum - I'm sorry I can't be t...,0
2,1.246087e+18,Happy mothers day To all This doing a mothers ...,-1
3,1.244803e+18,Happy mothers day to this beautiful woman...ro...,0
4,1.244876e+18,Remembering the 3 most amazing ladies who made...,-1


## Data cleaning Step 

In [8]:
import re
import cleantext
import contractions

In [9]:
def remove_urls(text):
    """Strip URLs and a few media-related tokens from ``text``.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        ``text`` with URLs and the listed tokens removed and every run of
        whitespace collapsed to a single space.
    """
    # Remove the whole URL: either an http/https scheme or a leading "www." followed by
    # everything up to the next whitespace. The alternation is grouped so the trailing
    # \S* applies to both branches (ungrouped, only the bare word "https" was removed
    # and the rest of the link was left behind).
    text = re.sub(r'(?:https?://|\bwww\.)\S*', '', text, flags=re.IGNORECASE)
    # NOTE(review): these patterns are unanchored and delete any run that starts with
    # "insta"/"photo"/"vedio" and has at least one more word character (so "instant" or
    # "photos" are removed too); "vedio" looks like a misspelling of "video" — confirm intent.
    text = re.sub(r'insta\w+','',text)
    text = re.sub(r'photo\w+','',text)
    text = re.sub(r'vedio\w+','',text)
    # Collapse whitespace last so the gaps left by the removals above are squeezed as well.
    text = re.sub(r'\s+', ' ', text)
    return text

In [10]:
def clean_text(text):
    """Normalise one raw tweet into plain lowercase text.

    The steps run in this order: hashtags are deleted (the ``#`` together with the
    word attached to it), URLs are stripped via ``remove_urls``, contractions such
    as "can't" are expanded with ``contractions.fix``, and finally
    ``cleantext.clean`` lowercases the text and handles digits, e-mail addresses,
    URLs and punctuation. In this notebook's previews a digit such as "3" comes
    out as "0" after this step.

    Parameters
    ----------
    text : str
        The original tweet text.

    Returns
    -------
    The value returned by ``cleantext.clean`` for the pre-processed text.
    """
    without_hashtags = re.sub(r'#\w+', '', text)
    without_urls = remove_urls(without_hashtags)
    expanded = contractions.fix(without_urls)
    return cleantext.clean(
        expanded,
        lower=True,
        no_urls=True,
        no_emails=True,
        no_digits=True,
        no_punct=True,
    )
    

In [11]:
# Run clean_text on every tweet and store the result in a new column
df_train['cleaned_text'] = df_train['original_text'].apply(clean_text)

In [12]:
# Shift sentiment_class up by one so the class ids are non-negative: [-1, 0, 1] --> [0, 1, 2]
df_train['label'] = df_train['sentiment_class'].add(1)

In [13]:
df_train.head()

Unnamed: 0,id,original_text,sentiment_class,cleaned_text,label
0,1.245025e+18,Happy #MothersDay to all you amazing mothers o...,0,happy to all you amazing mothers out there i k...,1
1,1.245759e+18,Happy Mothers Day Mum - I'm sorry I can't be t...,0,happy mothers day mum i am sorry i can not be ...,1
2,1.246087e+18,Happy mothers day To all This doing a mothers ...,-1,happy mothers day to all this doing a mothers ...,0
3,1.244803e+18,Happy mothers day to this beautiful woman...ro...,0,happy mothers day to this beautiful womanroyal...,1
4,1.244876e+18,Remembering the 3 most amazing ladies who made...,-1,remembering the 0 most amazing ladies who made...,0


In [14]:
# Keep only the columns needed from here on; original_text and sentiment_class are left out
df_train = df_train.loc[:, ['id','cleaned_text','label']]

In [15]:
df_train.head()

Unnamed: 0,id,cleaned_text,label
0,1.245025e+18,happy to all you amazing mothers out there i k...,1
1,1.245759e+18,happy mothers day mum i am sorry i can not be ...,1
2,1.246087e+18,happy mothers day to all this doing a mothers ...,0
3,1.244803e+18,happy mothers day to this beautiful womanroyal...,1
4,1.244876e+18,remembering the 0 most amazing ladies who made...,0


## Checking the distribution of the label classes in df_train

In [16]:
df_train['label'].value_counts()

1    1701
0     769
2     765
Name: label, dtype: int64

## There is a class imbalance in the training set, so we will balance the training dataframe by over-sampling until every class has the same number of samples

In [17]:
def over_sample(df: pd.DataFrame, random_state=None, label_col: str = 'label') -> pd.DataFrame:
    """Balance the classes of ``df`` by random over-sampling.

    Every class is brought up to the size of the largest class. All original rows
    are kept; only the shortfall of each smaller class is filled with rows drawn
    at random, with replacement, from that same class.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame containing the column named by ``label_col``.
    random_state : optional
        Forwarded to ``DataFrame.sample`` so the drawn rows are reproducible.
        ``None`` (the default) leaves the sampling unseeded.
    label_col : str, default 'label'
        Name of the column holding the class labels.

    Returns
    -------
    pd.DataFrame
        A new frame with a fresh 0..n-1 index. Classes appear in order of first
        occurrence in ``df``; within a class the original rows come first,
        followed by the extra sampled rows. An empty input yields an empty frame.
    """
    if df.empty:
        # Nothing to balance; max() over zero class counts would be undefined.
        return df.reset_index(drop=True)

    target_count = int(df[label_col].value_counts().max())

    # Collect the pieces and concatenate once instead of growing a frame in the loop.
    pieces = []
    for label in df[label_col].unique():
        class_rows = df[df[label_col] == label]
        pieces.append(class_rows)
        shortfall = target_count - len(class_rows)
        if shortfall > 0:
            # Draw only the missing rows so no original sample is lost.
            pieces.append(
                class_rows.sample(n=shortfall, replace=True, random_state=random_state)
            )

    return pd.concat(pieces).reset_index(drop=True)

In [18]:
df_train['label'].value_counts()

1    1701
0     769
2     765
Name: label, dtype: int64

In [19]:
# Balance the classes by over-sampling with over_sample.
# NOTE: over_sample draws rows with replace=True and this call does not fix a seed,
# so the rows that get duplicated can differ from run to run.
df_train = over_sample(df_train)

In [20]:
df_train['label'].value_counts()

2    1701
1    1701
0    1701
Name: label, dtype: int64

## The training data is now balanced across all labels, but the rows are still grouped by label, so we will shuffle the data to interleave the labels

In [21]:
df_train.iloc[0:1700,:]['label'].value_counts()

1    1700
Name: label, dtype: int64

In [22]:
df_train.iloc[1700:3400,:]['label'].value_counts()

0    1699
1       1
Name: label, dtype: int64

In [23]:
df_train.iloc[3400:,:]['label'].value_counts()

2    1701
0       2
Name: label, dtype: int64

In [24]:
# The slices above show that the rows are still grouped by label.
# Shuffle all rows (frac=1) so the labels are interleaved; random_state fixes the permutation.
# Shuffling mixes the classes but does not guarantee that equal labels never sit next to each other.
df_train = df_train.sample(frac=1, random_state=45)

In [25]:
df_train.iloc[0:1700,:]['label'].value_counts()

2    576
1    570
0    554
Name: label, dtype: int64

In [26]:
df_train.iloc[1700:3400,:]['label'].value_counts()

0    597
1    561
2    542
Name: label, dtype: int64

In [28]:
df_train.head()

Unnamed: 0,id,cleaned_text,label
3197,1.244244e+18,happy mothers day to all the amazing mums out ...,0
747,1.243923e+18,happy mothers day to my super mum ochie dike n...,1
2476,1.246533e+18,thismorning happy mothers day mum we miss you ...,0
769,1.244366e+18,happy mothers day to our mothers and intending...,1
4422,1.243976e+18,ndi nne mama happy mothers day to the anglican...,2


## Finally, resetting the index of the shuffled dataframe

In [29]:
# Give the shuffled frame a fresh 0..n-1 index and discard the old one
df_train = df_train.reset_index(drop=True)

In [30]:
df_train.head()

Unnamed: 0,id,cleaned_text,label
0,1.244244e+18,happy mothers day to all the amazing mums out ...,0
1,1.243923e+18,happy mothers day to my super mum ochie dike n...,1
2,1.246533e+18,thismorning happy mothers day mum we miss you ...,0
3,1.244366e+18,happy mothers day to our mothers and intending...,1
4,1.243976e+18,ndi nne mama happy mothers day to the anglican...,2


## Saving the transformed dataframe to a CSV file for future use

In [31]:
# index=False keeps the row index out of the written file
df_train.to_csv('balanced_train_dataset.csv',index=False)