## Twitter_hate-speech : data exploration

In [7]:
import numpy as np
import pandas as pd

# vectorization 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer


# resampling
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter


In [8]:
# Load the training split of the Twitter hate-speech dataset.
# Forward slashes are accepted on Windows as well as on POSIX systems, so this
# relative path resolves regardless of the operating system.
df_train = pd.read_csv("../../../data/twitter_hate-speech/train.csv")

# Vectorization

In [12]:
# Pull the tweet text out of the training frame as a standalone Series

tweets = df_train["tweet"]

# Printing the Series gives a truncated head/tail preview plus its name, length and dtype
print(tweets)

0         @user when a father is dysfunctional and is s...
1        @user @user thanks for #lyft credit i can't us...
2                                      bihday your majesty
3        #model   i love u take with u all the time in ...
4                   factsguide: society now    #motivation
                               ...                        
31957    ate @user isz that youuu?ðððððð...
31958      to see nina turner on the airwaves trying to...
31959    listening to sad songs on a monday morning otw...
31960    @user #sikh #temple vandalised in in #calgary,...
31961                     thank you @user for you follow  
Name: tweet, Length: 31962, dtype: object


In [13]:
# vectorization of tweet column 

count_vectorizer = CountVectorizer()

count_matrix = count_vectorizer.fit_transform(tweets)

count_array = count_matrix.toarray()

feature_names = count_vectorizer.get_feature_names_out()

# Output the results
print("Count Vectorization:\n")
print("Feature Names:", feature_names)
print("Count Matrix:\n", count_array)

Count Vectorization:

Feature Names: ['00' '000' '000001' ... 'ó¾' 'øª' 'ø¹ù']
Count Matrix:
 [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [14]:
# TF-IDF Vectorization

tfidf_vectorizer = TfidfVectorizer()

tfidf_matrix = tfidf_vectorizer.fit_transform(tweets)

# Converting TF-IDF matrix to array >> better visualization
tfidf_array = tfidf_matrix.toarray()

# Getting the feature names/words
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()

print("\nTF-IDF Vectorization:\n")
print("Feature Names:", tfidf_feature_names)
print("TF-IDF Matrix:\n", tfidf_array)


TF-IDF Vectorization:

Feature Names: ['00' '000' '000001' ... 'ó¾' 'øª' 'ø¹ù']
TF-IDF Matrix:
 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


# Resampling 

In [15]:
# Features are the tweet texts, targets are the values of the 'label' column
X_train, y_train = df_train['tweet'], df_train['label']

# Class distribution before any resampling is applied
print(f"Original training dataset shape: {Counter(y_train)}")


Original training dataset shape: Counter({0: 29720, 1: 2242})


In [17]:
# Oversampling the minority class, then undersampling the majority class

# Step 1: SMOTE synthesises minority samples until minority/majority = 0.5
smote = SMOTE(sampling_strategy=0.5, random_state=42)
# Step 2: random undersampling drops majority samples until minority/majority = 0.8
undersample = RandomUnderSampler(sampling_strategy=0.8, random_state=42)

# Using numeric data (the TF-IDF vectorization result) instead of raw text data
X_train_sm, y_train_sm = smote.fit_resample(tfidf_matrix, y_train)
X_train_combined, y_train_combined = undersample.fit_resample(X_train_sm, y_train_sm)

print(f"Resampled training dataset shape: {Counter(y_train_combined)}")

# Converting back to a DataFrame.
# The column names must come from `tfidf_vectorizer`, the vectorizer that
# produced `tfidf_matrix`, so that they line up with the resampled features.
# NOTE(review): toarray() builds a dense copy of the resampled matrix, which
# can be memory-heavy for a large vocabulary.
df_resampled = pd.DataFrame(
    X_train_combined.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
)
df_resampled['label'] = y_train_combined



Resampled training dataset shape: Counter({0: 18575, 1: 14860})
