In [1]:
import xgboost as xgb
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.metrics import f1_score, accuracy_score, recall_score, classification_report
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, cross_val_score

from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB



In [2]:
train_df = pd.read_csv('data/Twitter/hate_twitter/hate_train.csv')
val_df = pd.read_csv('data/Twitter/hate_twitter/hate_val.csv')
test_df = pd.read_csv('data/Twitter/hate_twitter/hate_test.csv')

In [3]:
# Check and drop na values in clean_tweet column
train_df[train_df['clean_tweet'].isnull()]

train_df = train_df[train_df['clean_tweet'].notna()]
val_df = val_df[val_df['clean_tweet'].notna()]
test_df = test_df[test_df['clean_tweet'].notna()]


In [4]:
train_df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,id,label,tweet,hash_tag,clean_tweet,tokenized_tweet,tokenized_tweet_NLTK
0,27857,27857,27858,0,"omg. omg. omg. yay! i found it, and at a wond...","['segasaturn', 'throwbackâ']",omg omg omg yay found wonderful price segasatu...,"omg. omg. omg. yay! i found it, and at a wond...",omg omg omg yay found wonderful price segasatu...
1,31205,31205,31206,0,#payintheusa polar bear climb racing: angry ...,['payintheusa'],payintheusa polar bear climb racing angry pola...,<hashtag> payintheusa <elong>polar bear climb...,payintheusa polar bear climb racing angry pola...
2,8440,8440,8441,0,#trainhard polar bear climb racing: angry po...,['trainhard'],trainhard polar bear climb racing angry polar ...,<hashtag> trainhard <elong>polar bear climb r...,trainhard polar bear climb racing angry polar ...
3,5005,5005,5006,1,he should turn in his resignation.,[],turn resignation,he should turn in his resignation.,turn resignation
4,3898,3898,3899,0,ððð . . happy bihday!! to hajime hoso...,"['bihday', '30æ', 'ã']",happy bihday hajime hosogai bihday bihday 30,ððð . . happy bihday! <repeat> to haj...,. . happy bihday hajime hosogai . . . bihday b...


In [5]:
x_train = train_df['clean_tweet']
y_train = train_df['label']

x_test = test_df['clean_tweet']
y_test = test_df['label']

x_val = val_df['clean_tweet']
y_val = val_df['label']




# Deal with data imbalance, upsample

In [6]:
from sklearn.utils import resample


In [7]:
train_majority = train_df[train_df.label==0]
train_minority = train_df[train_df.label==1]
train_minority_upsampled = resample(train_minority, 
                                 replace=True,    
                                 n_samples=len(train_majority),   
                                 random_state=123)
train_upsampled = pd.concat([train_minority_upsampled, train_majority])
train_upsampled['label'].value_counts()

0    20776
1    20776
Name: label, dtype: int64

In [8]:
x_train_upsampled = train_upsampled['clean_tweet']
y_train_upsampled = train_upsampled['label']

# Pipeline


In [9]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer


In [10]:
def bow_pipeline(train_df, test_df, val_df, ngram_range=(1,1)):
    '''
    Pipeline for Bag-of-words based models training and evaluation
    '''
    cls = [LogisticRegression(),
           MultinomialNB(), 
           SVC(),
           LinearSVC(C=0.01),
           GradientBoostingClassifier(),
           xgb.XGBClassifier(use_label =False),
           RandomForestClassifier(n_estimators=200),
           KNeighborsClassifier(n_neighbors = 5)]
    
    result_all_dict = {}

    vectorizer = CountVectorizer()
    train_model = vectorizer.fit_transform(train_df.clean_tweet)
    test_model = vectorizer.transform(test_df.clean_tweet)
    val_model = vectorizer.transform(val_df.clean_tweet)

    train_tfidf = pd.DataFrame(train_model)

    test_actual = test_df.label
    val_actual = val_df.label
    i = 0
    accuracy = []
    cls_name = []

    for cl in cls:
        result_dict = {}
            
        model = cl.fit(train_model,train_df.label)
        
        y_test_predict = model.predict(test_model)
        y_val_predict = model.predict(val_model)
        
        a = (100*accuracy_score(y_val_predict, val_actual))
        a = round(a,2)
        accuracy.append(a)
        cls_name.append(cl.__class__.__name__)
        
        result_dict["Validation Accuracy"] = accuracy_score(y_val_predict, val_actual)
        result_dict["Validation Binary Recall"] = recall_score(y_val_predict, val_actual)
        result_dict["Validation Macro Recall"] = recall_score(y_val_predict, val_actual, average='macro')
        result_dict["Validation Binary F1"] = f1_score(y_val_predict, val_actual)
        result_dict["Validation Macro F1"] = f1_score(y_val_predict, val_actual, average='macro')
        result_dict["Test Accuracy"] = accuracy_score(y_test_predict, test_actual)
        result_dict["Test Binary Recall"] = recall_score(y_test_predict, test_actual)
        result_dict["Test Macro Recall"] = recall_score(y_test_predict, test_actual, average='macro')
        result_dict["Test Binary F1"] = f1_score(y_test_predict, test_actual)
        result_dict["Test Macro F1"] = f1_score(y_test_predict, test_actual, average='macro')
        
        result_all_dict[cl.__class__.__name__] = result_dict
        
        i +=1
    
    return result_all_dict

## Original data frame


In [11]:
result_all_dict = bow_pipeline(train_df, test_df, val_df)

Parameters: { "use_label" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




In [12]:
result_pd = pd.DataFrame.from_dict(result_all_dict, orient='index')
result_pd

Unnamed: 0,Validation Accuracy,Validation Binary Recall,Validation Macro Recall,Validation Binary F1,Validation Macro F1,Test Accuracy,Test Binary Recall,Test Macro Recall,Test Binary F1,Test Macro F1
LogisticRegression,0.960752,0.910526,0.936676,0.64794,0.813579,0.963272,0.896175,0.931056,0.650794,0.815705
MultinomialNB,0.958246,0.867347,0.914736,0.62963,0.803753,0.959933,0.817734,0.891979,0.633588,0.806198
SVC,0.955115,0.932886,0.944357,0.563895,0.770117,0.959516,0.931973,0.94618,0.58547,0.782094
LinearSVC,0.946138,0.957447,0.951679,0.410959,0.691369,0.952212,0.933962,0.943294,0.4637,0.719346
GradientBoostingClassifier,0.945094,0.917526,0.931595,0.403628,0.687425,0.949708,0.892157,0.921558,0.43026,0.701976
XGBClassifier,0.949896,0.882353,0.917111,0.5,0.736813,0.955968,0.866667,0.91276,0.552017,0.764432
RandomForestClassifier,0.956994,0.841584,0.90183,0.622711,0.799954,0.962646,0.858586,0.912858,0.655106,0.81768
KNeighborsClassifier,0.937787,0.848485,0.89376,0.273171,0.620337,0.941987,0.811594,0.877743,0.287179,0.628471


## Upsampled data frame


In [13]:
# Upsampled data frame
result_all_dict_upsampled = bow_pipeline(train_upsampled, test_df, val_df)


Parameters: { "use_label" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




In [14]:
result_pd_upsampled = pd.DataFrame.from_dict(result_all_dict_upsampled, orient='index')
result_pd_upsampled

Unnamed: 0,Validation Accuracy,Validation Binary Recall,Validation Macro Recall,Validation Binary F1,Validation Macro F1,Test Accuracy,Test Binary Recall,Test Macro Recall,Test Binary F1,Test Macro F1
LogisticRegression,0.948434,0.633609,0.803929,0.650636,0.8114,0.952629,0.629834,0.804421,0.667643,0.82107
MultinomialNB,0.925261,0.487762,0.736176,0.60917,0.783925,0.921745,0.453608,0.720035,0.584718,0.77076
SVC,0.961378,0.866359,0.916123,0.670232,0.82486,0.963063,0.839623,0.9042,0.667917,0.824181
LinearSVC,0.940084,0.564334,0.771355,0.635324,0.801342,0.940735,0.541387,0.761603,0.630208,0.798997
GradientBoostingClassifier,0.932777,0.531609,0.747907,0.534682,0.749227,0.936561,0.522788,0.747137,0.56196,0.763882
XGBClassifier,0.932777,0.524336,0.749835,0.595477,0.77941,0.937187,0.522624,0.750967,0.605505,0.785691
RandomForestClassifier,0.953445,0.721612,0.844534,0.638574,0.806847,0.960351,0.750958,0.861685,0.67354,0.826217
KNeighborsClassifier,0.901461,0.359031,0.658644,0.408521,0.677388,0.901294,0.339662,0.651304,0.405031,0.675607
