#### Imports & Downloads

In [1]:
import pandas as pd
import os
from gensim.models import Word2Vec
import numpy as np
from imblearn.under_sampling import NearMiss, CondensedNearestNeighbour, NeighbourhoodCleaningRule
from imblearn.over_sampling import SMOTE, BorderlineSMOTE, SVMSMOTE, ADASYN
from collections import Counter
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

### Einladen der Daten

#### Originaldaten ohne umfassende Vorverarbeitung

In [2]:
current_dir = os.getcwd()
csv_path_train = os.path.abspath(os.path.join(current_dir, '../../../data/twitter_hate-speech/train_basic_cleaned.csv'))
df = pd.read_csv(csv_path_train, encoding='utf-8', index_col=0)

df.head()

Unnamed: 0_level_0,label,tweet
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0,@user when a father is dysfunctional and is s...
2,0,@user @user thanks for #lyft credit i can't us...
3,0,bihday your majesty
4,0,#model i love u take with u all the time in ...
5,0,factsguide: society now #motivation


In [3]:
df['label'].value_counts()
positive = len(df[df['label'] == 1])
negative = len(df[df['label'] == 0])
negative/positive

13.669647292598112

#### Vorverarbeitete Daten

In [4]:
current_dir = os.getcwd()
csv_path_train = os.path.abspath(os.path.join(current_dir, '../../../data/twitter_hate-speech/train_cleaned.csv'))
df_cleaned = pd.read_csv(csv_path_train, encoding='utf-8', index_col=0)

df_cleaned.head()

Unnamed: 0_level_0,label,tweet,tweet_cleaned,user_handle,hashtags,emojis
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0,@user when a father is dysfunctional and is s...,father selfish drag kid run,1,['#run'],
2,0,@user @user thanks for #lyft credit i can't us...,thank lyft credit use cause offer van,2,"['#lyft', '#disapointed', '#getthanked']",
3,0,bihday your majesty,bihday majesty,0,[],
4,0,#model i love u take with u all the time in ...,model take time mobile phone kiss sunglass mou...,0,['#model'],":mobile_phone:,:kissing_face_with_smiling_eyes..."
5,0,factsguide: society now #motivation,factsguide society motivation,0,['#motivation'],


In [5]:
df_cleaned['label'].value_counts()
positive = len(df_cleaned[df_cleaned['label'] == 1])
negative = len(df_cleaned[df_cleaned['label'] == 0])
negative/positive

14.26780784097184

Verhältnis hat sich durch die Bereinigung sogar noch weiter verschlechter

### Vektorisierung der Daten
https://spotintelligence.com/2023/02/15/word2vec-for-text-classification/#Unsupervised_text_classification_Word2Vec

In [6]:
def vectorize(sentence, w2v_model):
    words = sentence.split()
    words_vecs = [w2v_model.wv[word] for word in words if word in w2v_model.wv]
    if len(words_vecs) == 0:
        return np.zeros(100)
    words_vecs = np.array(words_vecs)
    return words_vecs.mean(axis=0)

In [7]:
def vectorize_df(df):
    X = df["tweet"]
    y = df["label"]

    sentences_base = [sentence.split() for sentence in X]
    w2v_model_base = Word2Vec(sentences_base, window=5, min_count=5, workers=-1)

    x_data = np.array([vectorize(sentence, w2v_model_base) for sentence in X])
    return x_data, y

In [8]:
X_base, y_base = vectorize_df(df)

In [9]:
X_base

array([[ 1.30374171e-03, -6.65663800e-04,  3.21442029e-03, ...,
        -2.68328539e-03,  1.41547900e-03, -2.84765265e-03],
       [ 3.36569443e-04, -9.82483965e-04,  9.05304274e-04, ...,
        -4.07109677e-04,  5.64215006e-04,  1.94166932e-04],
       [ 1.23032148e-03, -2.46845209e-03,  2.51201470e-03, ...,
         2.88697542e-03,  4.11413144e-03,  4.88469563e-03],
       ...,
       [-9.87032778e-04,  5.68386691e-04, -1.85755307e-05, ...,
         3.09016541e-05, -3.14187346e-04, -8.47426010e-04],
       [ 1.40571001e-03,  4.23042104e-03,  2.36720010e-03, ...,
        -4.67062840e-04,  2.66610738e-03, -7.61655625e-04],
       [ 8.88627619e-05, -6.70738227e-04,  2.82955565e-03, ...,
        -3.34652583e-03,  8.70123040e-04,  3.40033951e-03]])

In [10]:
y_base

id
1        0
2        0
3        0
4        0
5        0
        ..
31957    0
31958    0
31959    0
31960    0
31962    0
Name: label, Length: 29530, dtype: int64

In [11]:
X_clean, y_clean = vectorize_df(df_cleaned)

In [12]:
X_clean

array([[ 0.000164  , -0.00097227,  0.00330843, ..., -0.00523365,
         0.00096938, -0.00285136],
       [-0.00232347, -0.0016694 ,  0.00038817, ..., -0.00020569,
         0.00022666,  0.0038221 ],
       [-0.00620588, -0.00316428,  0.00138663, ...,  0.00165645,
         0.00093876, -0.00017074],
       ...,
       [ 0.0017181 ,  0.00186271,  0.00110266, ..., -0.0023023 ,
        -0.00016485,  0.00787419],
       [-0.00182129,  0.00049515,  0.0007572 , ..., -0.00087936,
        -0.00065907, -0.00146304],
       [-0.00335515, -0.00040929, -0.00032946, ..., -0.00268573,
         0.00366877, -0.0030683 ]])

In [13]:
y_clean

id
1        0
2        0
3        0
4        0
5        0
        ..
31956    0
31957    0
31958    0
31959    0
31960    0
Name: label, Length: 27650, dtype: int64

### Train/Test Split

In [14]:
X_train_base, X_test_base, y_train_base, y_test_base = train_test_split(X_base, y_base)

In [15]:
X_train_clean, X_test_clean, y_train_clean, y_test_clean = train_test_split(X_clean, y_clean)

### 1. Resampling Methods
https://www.analyticsvidhya.com/blog/2022/05/handling-imbalanced-data-with-imbalance-learn-in-python/#h-techniques-for-handling-imbalanced-data

In [16]:
def print_data(y_before, y_after, sampling_technique):
    counter_before = Counter(y_before)
    counter_after = Counter(y_after)
    print("Before sampling with:", sampling_technique, counter_before)
    print("After sampling with:", sampling_technique, counter_after)

#### 1.1 Oversampling

##### 1.1.1 SMOTE

In [17]:
os_smote = SMOTE()

In [18]:
X_train_base_s, y_train_base_s = os_smote.fit_resample(X_train_base, y_train_base)
print_data(y_train_base, y_train_base_s, "SMOTE")

Before sampling with: SMOTE Counter({0: 20650, 1: 1497})
After sampling with: SMOTE Counter({0: 20650, 1: 20650})


In [19]:
X_train_clean_s, y_train_clean_s = os_smote.fit_resample(X_train_clean, y_train_clean)
print_data(y_train_clean, y_train_clean_s, "SMOTE")

Before sampling with: SMOTE Counter({0: 19342, 1: 1395})
After sampling with: SMOTE Counter({0: 19342, 1: 19342})


##### 1.1.2 Borderline-SMOTE

In [20]:
os_bsmote = BorderlineSMOTE()

In [21]:
X_train_base_bs, y_train_base_bs = os_bsmote.fit_resample(X_train_base, y_train_base)
print_data(y_train_base, y_train_base_bs, "BorderlineSMOTE")

Before sampling with: BorderlineSMOTE Counter({0: 20650, 1: 1497})
After sampling with: BorderlineSMOTE Counter({0: 20650, 1: 20650})


In [22]:
X_train_clean_bs, y_train_clean_bs = os_bsmote.fit_resample(X_train_clean, y_train_clean)
print_data(y_train_clean, y_train_clean_bs, "BorderlineSMOTE")

Before sampling with: BorderlineSMOTE Counter({0: 19342, 1: 1395})
After sampling with: BorderlineSMOTE Counter({0: 19342, 1: 19342})


##### 1.1.3 ADASYN

In [23]:
os_ada = ADASYN()

In [24]:
X_train_base_a, y_train_base_a = os_ada.fit_resample(X_train_base, y_train_base)
print_data(y_train_base, y_train_base_a, "ADASYN")

Before sampling with: ADASYN Counter({0: 20650, 1: 1497})
After sampling with: ADASYN Counter({1: 21007, 0: 20650})


In [25]:
X_train_clean_a, y_train_clean_a = os_ada.fit_resample(X_train_clean, y_train_clean)
print_data(y_train_clean, y_train_clean_a, "ADASYN")

Before sampling with: ADASYN Counter({0: 19342, 1: 1395})
After sampling with: ADASYN Counter({1: 19362, 0: 19342})


#### 1.2 Undersampling

##### 1.2.1 NearMiss

In [26]:
us_near_miss = NearMiss(version = 3, n_neighbors_ver3=3)

In [27]:
X_train_base_nm, y_train_base_nm = us_near_miss.fit_resample(X_train_base, y_train_base)
print_data(y_train_base, y_train_base_nm, "NearMiss")

Before sampling with: NearMiss Counter({0: 20650, 1: 1497})
After sampling with: NearMiss Counter({0: 1497, 1: 1497})


In [28]:
X_train_clean_nm, y_train_clean_nm = us_near_miss.fit_resample(X_train_clean, y_train_clean)
print_data(y_train_clean, y_train_clean_nm, "NearMiss")

Before sampling with: NearMiss Counter({0: 19342, 1: 1395})
After sampling with: NearMiss Counter({0: 1395, 1: 1395})


##### 1.2.2 Condensed Nearest Neighbor

In [29]:
us_cnn = CondensedNearestNeighbour(n_neighbors=1, n_jobs=-1) # sehr langsam

In [30]:
# X_train_base_cnn, y_train_base_cnn = us_cnn.fit_resample(X_train_base, y_train_base)
# print_data(y_train_base, y_train_base_cnn, "CondensedNearestNeighbour")

In [31]:
# X_train_clean_cnn, y_train_clean_cnn = us_cnn.fit_resample(X_train_clean, y_train_clean)
# print_data(y_train_clean, y_train_clean_cnn, "CondensedNearestNeighbour")

##### 1.2.3 Neighborhood Cleaning

In [32]:
us_cnn_cr = NeighbourhoodCleaningRule(n_neighbors=3, threshold_cleaning=0.5, n_jobs=-1) # sehr langsam

In [33]:
# X_base_ncr, y_base_ncr = us_cnn.fit_resample(X_train_base, y_train_base)
# print_data(y_train_base, y_base_ncr, "NeighbourhoodCleaningRule")

In [34]:
# X_clean_ncr, y_clean_ncr = us_cnn.fit_resample(X_train_clean, y_train_clean)
# print_data(y_train_clean, y_clean_ncr, "NeighbourhoodCleaningRule")

### 2. Ensemble Models

In [35]:
evaluation = pd.DataFrame(columns=["model", "variant", "train_acc", "train_prec", "train_rec", "train_f1", "test_acc", "test_prec", "test_rec", "test_f1"])

In [36]:
def add_to_eval_df(model, model_name, variant, x_data_train, y_data_train, x_data_test, y_data_test):
    train_acc = model.score(x_data_train, y_data_train)
    train_precision = precision_score(y_data_train, model.predict(x_data_train), average="weighted")
    train_recall = recall_score(y_data_train, model.predict(x_data_train), average="weighted")
    train_f1 = f1_score(y_data_train, model.predict(x_data_train), average="weighted")

    test_acc = model.score(x_data_test, y_data_test)
    test_precision = precision_score(y_data_test, model.predict(x_data_test), average="weighted")
    test_recall = recall_score(y_data_test, model.predict(x_data_test), average="weighted")
    test_f1 = f1_score(y_data_test, model.predict(x_data_test), average="weighted")

    evaluation.loc[len(evaluation.index)] = [model_name, variant, train_acc, train_precision, train_recall, train_f1, test_acc, test_precision, test_recall, test_f1]

In [37]:
def evaluate_model(model, x_test, y_test, sampling_method):
    pred = model.predict(x_test)
    accscore = metrics.accuracy_score(pred, y_test)

    print(f'{sampling_method} model accuracy for classification is =', str('{:04.2f}'.format(accscore * 100)) + '%')
    print('------------------------------------------------')
    print('Confusion Matrix:')
    print(pd.DataFrame(confusion_matrix(y_test, pred)))
    print('------------------------------------------------')
    print('Classification Report:')
    print(classification_report(y_test, pred))

#### 2.1 RandomForest

In [38]:
rf = RandomForestClassifier(n_jobs=-1)

In [39]:
# BASE
rf.fit(X_train_base, y_train_base)
add_to_eval_df(rf, "RandomForest", "base", X_train_base, y_train_base, X_test_base, y_test_base)

In [40]:
# CLEANED
rf.fit(X_train_clean, y_train_clean)
add_to_eval_df(rf, "RandomForest", "clean", X_train_clean, y_train_clean, X_test_clean, y_test_clean)

In [41]:
# SMOTE --- BASE
rf.fit(X_train_base_s, y_train_base_s)
add_to_eval_df(rf, "RandomForest", "SMOTE base", X_train_base_s, y_train_base_s, X_test_base, y_test_base)

In [42]:
# SMOTE --- CLEANED
rf.fit(X_train_clean_s, y_train_clean_s)
add_to_eval_df(rf, "RandomForest", "SMOTE clean", X_train_clean_s, y_train_clean_s, X_test_clean, y_test_clean)

In [43]:
# BorderlineSMOTE --- BASE
rf.fit(X_train_base_bs, y_train_base_bs)
add_to_eval_df(rf, "RandomForest", "BorderlineSMOTE base", X_train_base_bs, y_train_base_bs, X_test_base, y_test_base)

In [44]:
# BorderlineSMOTE --- CLEANED
rf.fit(X_train_clean_bs, y_train_clean_bs)
add_to_eval_df(rf, "RandomForest", "BorderlineSMOTE clean", X_train_clean_bs, y_train_clean_bs, X_test_clean, y_test_clean)

In [45]:
# ADASYN --- BASE
rf.fit(X_train_base_a, y_train_base_a)
add_to_eval_df(rf, "RandomForest", "ADASYN base", X_train_base_a, y_train_base_a, X_test_base, y_test_base)

In [46]:
# ADASYN --- CLEANED
rf.fit(X_train_clean_a, y_train_clean_a)
add_to_eval_df(rf, "RandomForest", "ADASYN clean", X_train_clean_a, y_train_clean_a, X_test_clean, y_test_clean)

In [47]:
# NearMiss --- BASE
rf.fit(X_train_base_nm, y_train_base_nm)
add_to_eval_df(rf, "RandomForest", "NearMiss base", X_train_base_nm, y_train_base_nm, X_test_base, y_test_base)

In [48]:
# NearMiss --- CLEANED
rf.fit(X_train_clean_nm, y_train_clean_nm)
add_to_eval_df(rf, "RandomForest", "NearMiss clean", X_train_clean_nm, y_train_clean_nm, X_test_clean, y_test_clean)

In [49]:
evaluation.sort_values(by=["test_rec"], ascending=False)

Unnamed: 0,model,variant,train_acc,train_prec,train_rec,train_f1,test_acc,test_prec,test_rec,test_f1
1,RandomForest,clean,0.999711,0.999711,0.999711,0.99971,0.940257,0.943828,0.940257,0.911734
0,RandomForest,base,0.999729,0.999729,0.999729,0.999729,0.931735,0.93152,0.931735,0.900609
5,RandomForest,BorderlineSMOTE clean,0.999586,0.999587,0.999586,0.999586,0.925503,0.901707,0.925503,0.911836
4,RandomForest,BorderlineSMOTE base,0.999855,0.999855,0.999855,0.999855,0.923608,0.89989,0.923608,0.907758
3,RandomForest,SMOTE clean,0.999741,0.999742,0.999741,0.999741,0.917257,0.898812,0.917257,0.907318
6,RandomForest,ADASYN base,0.999688,0.999688,0.999688,0.999688,0.916023,0.897853,0.916023,0.905434
7,RandomForest,ADASYN clean,0.999638,0.999639,0.999638,0.999638,0.915521,0.900077,0.915521,0.907285
2,RandomForest,SMOTE base,0.999661,0.999661,0.999661,0.999661,0.915346,0.896838,0.915346,0.904609
9,RandomForest,NearMiss clean,0.999642,0.999642,0.999642,0.999642,0.422537,0.895341,0.422537,0.543189
8,RandomForest,NearMiss base,0.998998,0.999,0.998998,0.998998,0.422322,0.88035,0.422322,0.535741
