## Twitter_hate-speech : data exploration

In [1]:
import numpy as np
import pandas as pd

# vectorization 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer


# resambling
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter
from imblearn.combine import SMOTETomek

from sklearn.model_selection import train_test_split


In [2]:
df_train = pd.read_csv("..\\..\\..\\data\\twitter_hate-speech\\train.csv")

# Vectorization

In [3]:
# extracting tweet-column 

tweets = df_train['tweet']
y_train = df_train['label']

print(tweets)

0         @user when a father is dysfunctional and is s...
1        @user @user thanks for #lyft credit i can't us...
2                                      bihday your majesty
3        #model   i love u take with u all the time in ...
4                   factsguide: society now    #motivation
                               ...                        
31957    ate @user isz that youuu?ðððððð...
31958      to see nina turner on the airwaves trying to...
31959    listening to sad songs on a monday morning otw...
31960    @user #sikh #temple vandalised in in #calgary,...
31961                     thank you @user for you follow  
Name: tweet, Length: 31962, dtype: object


In [4]:
# # vectorization of tweet column 

# count_vectorizer = CountVectorizer()

# count_matrix = count_vectorizer.fit_transform(tweets)

# count_array = count_matrix.toarray()

# feature_names = count_vectorizer.get_feature_names_out()

# # Output the results
# print("Count Vectorization:\n")
# print("Feature Names:", feature_names)
# print("Count Matrix:\n", count_array)

Count Vectorization:

Feature Names: ['00' '000' '000001' ... 'ó¾' 'øª' 'ø¹ù']
Count Matrix:
 [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [None]:
# TF-IDF Vectorization

tfidf_vectorizer = TfidfVectorizer()

tfidf_matrix = tfidf_vectorizer.fit_transform(tweets)

# Converting TF-IDF matrix to array >> better visualization
tfidf_array = tfidf_matrix.toarray()

# Getting the feature names/words
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()

print("\nTF-IDF Vectorization:\n")
print("Feature Names:", tfidf_feature_names)
print("TF-IDF Matrix:\n", tfidf_array)





TF-IDF Vectorization:

Feature Names: ['00' '000' '000001' ... 'ó¾' 'øª' 'ø¹ù']
TF-IDF Matrix:
 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
Original training dataset shape: Counter({0: 29720, 1: 2242})


# Resampling 

In [5]:
X_train = df_train['tweet']  
y_train = df_train['label']  

print(f"Original training dataset shape: {Counter(y_train)}")


Original training dataset shape: Counter({0: 29720, 1: 2242})


Manual Hybrid Sampling

In [6]:
#  Oversmapling the minority class & undersampling the majority class

smote = SMOTE(sampling_strategy=0.5, random_state=42)  
undersample = RandomUnderSampler(sampling_strategy=0.8, random_state=42)  

# using numeric data (verctorizaiton result) instead of text data
X_train_sm, y_train_sm = smote.fit_resample(tfidf_matrix, y_train)
X_train_combined, y_train_combined = undersample.fit_resample(X_train_sm, y_train_sm)

print(f"Resampled training dataset shape: {Counter(y_train_combined)}")

# Converting back to DataFrame 
df_resampled_manually = pd.DataFrame(X_train_combined.toarray(), columns=tfidf_feature_names)
df_resampled_manually['label'] = y_train_combined


Resampled training dataset shape: Counter({0: 18575, 1: 14860})


SMOTETomek Hybrid-Sampling

In [11]:
smote_tomek = SMOTETomek(random_state=42)
X_resampled_smt, y_resampled_smt = smote_tomek.fit_resample(tfidf_matrix, df_train['label'] )

# Checking resampled dataset shapes
print("Original dataset shape:", tfidf_matrix.shape, y_train)
print("SMOTEENN resampled shape:", X_resampled_smt.shape, y_resampled_smt.shape)
print(f"Resampled training dataset shape: {Counter(y_resampled_smt)}")

# Converting back to DataFrame 
X_resampled_SMOTEEN_df = pd.DataFrame(X_resampled_smt.toarray(), columns=tfidf_feature_names)
X_resampled_SMOTEEN_df['label'] = y_resampled_smt

Original dataset shape: (31962, 41392) 0        0
1        0
2        0
3        0
4        0
        ..
31957    0
31958    0
31959    0
31960    1
31961    0
Name: label, Length: 31962, dtype: int64
SMOTEENN resampled shape: (59440, 41392) (59440,)
Resampled training dataset shape: Counter({0: 29720, 1: 29720})


SMOTEEN Hyprid-Sampling

In [None]:
# Apply SMOTEENN
smote_enn = SMOTEENN(random_state=42)
X_resampled_enn, y_resampled_enn = smote_enn.fit_resample(tfidf_matrix, df_train['label'] )

# Checking resampled dataset shapes
print("Original dataset shape:", tfidf_matrix.shape, y_train)
print("SMOTEENN resampled shape:", X_resampled_enn.shape, y_resampled_enn.shape)
print(f"Resampled training dataset shape: {Counter(y_resampled_enn)}")

# Converting back to DataFrame 
X_resampled_SMOTEEN_df = pd.DataFrame(X_resampled_enn.toarray(), columns=tfidf_feature_names)
X_resampled_SMOTEEN_df['label'] = y_resampled_enn

Original dataset shape: (31962, 41392) 0        0
1        0
2        0
3        0
4        0
        ..
31957    0
31958    0
31959    0
31960    1
31961    0
Name: label, Length: 31962, dtype: int64
SMOTEENN resampled shape: (40696, 41392) (40696,)
