## Balance Data

In [1]:
import numpy as np
import pandas as pd

# vectorization 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer


# resampling
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import TomekLinks


from sklearn.model_selection import train_test_split


In [2]:
# Load the training split from a path relative to the notebook's working directory.
# Forward slashes are used because the backslash-separated form only resolves on
# Windows; this form resolves on Windows, Linux and macOS alike.
df_train = pd.read_csv("../../../data/twitter_hate-speech/train.csv")

# Vectorization

In [3]:
# Pull the text column and the target column out of the training frame
tweets, y_train = df_train['tweet'], df_train['label']

print(tweets)

0         @user when a father is dysfunctional and is s...
1        @user @user thanks for #lyft credit i can't us...
2                                      bihday your majesty
3        #model   i love u take with u all the time in ...
4                   factsguide: society now    #motivation
                               ...                        
31957    ate @user isz that youuu?ðððððð...
31958      to see nina turner on the airwaves trying to...
31959    listening to sad songs on a monday morning otw...
31960    @user #sikh #temple vandalised in in #calgary,...
31961                     thank you @user for you follow  
Name: tweet, Length: 31962, dtype: object


In [4]:
# TF-IDF Vectorization

tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(tweets)

# Feature names/words; reused below as the column labels of the TF-IDF frame
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()

# Dense array form of the TF-IDF matrix (printed below and wrapped in a DataFrame)
tfidf_array = tfidf_matrix.toarray()

print("\nTF-IDF Vectorization:\n")
print("Feature Names:", tfidf_feature_names)
print("TF-IDF Matrix:\n", tfidf_array)

# Build the feature table: TF-IDF columns first, then every column of df_train
# other than the text and the label.
# NOTE(review): any identifier-like column left in df_train ends up as a model
# feature here - confirm that is intended.
other_columns = df_train.drop(columns=['tweet', 'label'])
X_combined = pd.concat(
    [pd.DataFrame(tfidf_array, columns=tfidf_feature_names), other_columns],
    axis=1,
)
print(f"Original training dataset shape: {Counter(y_train)}")




TF-IDF Vectorization:

Feature Names: ['00' '000' '000001' ... 'ó¾' 'øª' 'ø¹ù']
TF-IDF Matrix:
 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
Original training dataset shape: Counter({0: 29720, 1: 2242})


# Resampling

Manual Hybrid Sampling

In [None]:
# Manual hybrid sampling: oversample the minority class with SMOTE, then
# randomly undersample the majority class.

smote = SMOTE(sampling_strategy=0.5, random_state=42)
undersample = RandomUnderSampler(sampling_strategy=0.8, random_state=42)

# Resample the numeric feature table (vectorization result) instead of the text data
X_train_sm, y_train_sm = smote.fit_resample(X_combined, y_train)
X_train_combined, y_train_combined = undersample.fit_resample(X_train_sm, y_train_sm)

print(f"Resampled training dataset shape: {Counter(y_train_combined)}")

# Converting back to DataFrame.
# X_combined is a pandas DataFrame, so the resampled features are not a scipy
# sparse matrix and have no .toarray(); np.asarray accepts a DataFrame or an
# ndarray. Column labels are taken from X_combined so they also cover the
# columns that were appended after the TF-IDF features.
df_resampled_manually = pd.DataFrame(np.asarray(X_train_combined), columns=X_combined.columns)
# Assign labels positionally so the result does not depend on index alignment
df_resampled_manually['label'] = np.asarray(y_train_combined)


SMOTETomek

In [None]:
# Hybrid resampling with SMOTETomek on the combined feature table
smote_tomek = SMOTETomek(random_state=42)
X_train_resampled, y_train_resampled = smote_tomek.fit_resample(X_combined, y_train)

print("Resampled dataset shape:", X_train_resampled.shape, y_train_resampled.shape)
print(f"Resampled training dataset shape: {Counter(y_train_resampled)}")

# Converting back to DataFrame.
# X_combined is a pandas DataFrame, so the resampled features are not a scipy
# sparse matrix and have no .toarray(); np.asarray accepts a DataFrame or an
# ndarray.
df_resampled_smote_tomek = pd.DataFrame(np.asarray(X_train_resampled), columns=X_combined.columns)
# Assign labels positionally so the result does not depend on index alignment
df_resampled_smote_tomek['label'] = np.asarray(y_train_resampled)

print("\nFirst few rows of resampled data:")
print(df_resampled_smote_tomek.head())

SMOTEENN Hybrid Sampling

In [None]:
# Hybrid resampling with SMOTEENN on the combined feature table
smote_enn = SMOTEENN(random_state=42)
X_resampled_enn, y_resampled_enn = smote_enn.fit_resample(X_combined, y_train)

print("SMOTEENN resampled shape:", X_resampled_enn.shape, y_resampled_enn.shape)
print(f"Resampled training dataset shape: {Counter(y_resampled_enn)}")

# Converting back to DataFrame.
# X_combined is a pandas DataFrame, so the resampled features are not a scipy
# sparse matrix and have no .toarray(); np.asarray accepts a DataFrame or an
# ndarray. Column labels are taken from X_combined so they also cover the
# columns that were appended after the TF-IDF features.
df_resampled_SMOTEEN = pd.DataFrame(np.asarray(X_resampled_enn), columns=X_combined.columns)
# Assign labels positionally so the result does not depend on index alignment
df_resampled_SMOTEEN['label'] = np.asarray(y_resampled_enn)

print("\nFirst few rows of resampled data:")
print(df_resampled_SMOTEEN.head())


Undersampling

In [None]:
# Cleaning the dataset: TomekLinks under-sampling pass over the full feature table
tomek = TomekLinks()
(X_tomek, y_tomek) = tomek.fit_resample(X=X_combined, y=y_train)

print(f"Resampled training dataset shape: {Counter(y_tomek)}")

Resampled training dataset shape: Counter({0: 28754, 1: 2242})


In [6]:
# Undersampling: randomly undersample the Tomek-cleaned data.
# random_state is pinned (42, matching the other seeded samplers in this notebook)
# so the same rows are selected on every run.
undersample = RandomUnderSampler(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = undersample.fit_resample(X_tomek, y_tomek)

print(f"Resampled training dataset shape: {Counter(y_resampled)}")

Resampled training dataset shape: Counter({0: 2242, 1: 2242})


Oversampling

In [None]:
# SMOTE oversampling over the full feature table, with a fixed seed
smote = SMOTE(random_state=42, sampling_strategy='auto')
(X_smote, y_smote) = smote.fit_resample(X=X_combined, y=y_train)

print(f"Resampled training dataset shape: {Counter(y_smote)}")