In [1]:
#!pip install xgboost

In [2]:
import xgboost as xgb
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import f1_score, accuracy_score, recall_score
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, cross_val_score

In [3]:
train_df = pd.read_csv('data/Twitter/hate_twitter/hate_train.csv')
val_df = pd.read_csv('data/Twitter/hate_twitter/hate_val.csv')
test_df = pd.read_csv('data/Twitter/hate_twitter/hate_test.csv')

In [4]:
# Check and drop na values in clean_tweet column
train_df[train_df['clean_tweet'].isnull()]

train_df = train_df[train_df['clean_tweet'].notna()]
val_df = val_df[val_df['clean_tweet'].notna()]
test_df = test_df[test_df['clean_tweet'].notna()]


In [5]:
train_df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,id,label,tweet,hash_tag,clean_tweet,tokenized_tweet,tokenized_tweet_NLTK
0,27857,27857,27858,0,"omg. omg. omg. yay! i found it, and at a wond...","['segasaturn', 'throwbackâ']",omg omg omg yay found wonderful price segasatu...,"omg. omg. omg. yay! i found it, and at a wond...",omg omg omg yay found wonderful price segasatu...
1,31205,31205,31206,0,#payintheusa polar bear climb racing: angry ...,['payintheusa'],payintheusa polar bear climb racing angry pola...,<hashtag> payintheusa <elong>polar bear climb...,payintheusa polar bear climb racing angry pola...
2,8440,8440,8441,0,#trainhard polar bear climb racing: angry po...,['trainhard'],trainhard polar bear climb racing angry polar ...,<hashtag> trainhard <elong>polar bear climb r...,trainhard polar bear climb racing angry polar ...
3,5005,5005,5006,1,he should turn in his resignation.,[],turn resignation,he should turn in his resignation.,turn resignation
4,3898,3898,3899,0,ððð . . happy bihday!! to hajime hoso...,"['bihday', '30æ', 'ã']",happy bihday hajime hosogai bihday bihday 30,ððð . . happy bihday! <repeat> to haj...,. . happy bihday hajime hosogai . . . bihday b...


In [6]:
x_train = train_df['clean_tweet']
y_train = train_df['label']

x_test = test_df['clean_tweet']
y_test = test_df['label']

x_val = val_df['clean_tweet']
y_val = val_df['label']




# TFIDF+XGBoost

In [7]:
pipeline_xgb = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf',  TfidfTransformer()),
    ('nb', xgb.XGBClassifier(use_label =False)),])

In [8]:
model_xgb = pipeline_xgb.fit(x_train, y_train)
model = pipeline_xgb.fit(x_train, y_train)

Parameters: { "use_label" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "use_label" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




In [9]:
y_test_predict = model.predict(x_test)
y_val_predict = model.predict(x_val)

print('--'* 20)
print('Validation result')
print('Recall_Score: ', recall_score(y_val, y_val_predict))
print('F1_Score: ',f1_score(y_val, y_val_predict))
print('Accuracy_Score: ', accuracy_score(y_val, y_val_predict))



print('--'* 20)
print('Test result')
print('Recall_Score: ',recall_score(y_test, y_test_predict))
print('F1_Score: ',f1_score(y_test, y_test_predict))
print('Accuracy_Score: ', accuracy_score(y_test, y_test_predict))

----------------------------------------
Validation result
Recall_Score:  0.3313953488372093
F1_Score:  0.4840764331210191
Accuracy_Score:  0.9492693110647181
----------------------------------------
Test result
Recall_Score:  0.40809968847352024
F1_Score:  0.5598290598290598
Accuracy_Score:  0.9570116861435726


In [10]:
def evaluate_model(model):
    y_test_predict = model.predict(x_test)
    y_val_predict = model.predict(x_val)

    print('--'* 20)
    print('Validation result')
    print('Recall_Score: ', recall_score(y_val, y_val_predict))
    print('F1_Score: ',f1_score(y_val, y_val_predict))
    print('Accuracy_Score: ', accuracy_score(y_val, y_val_predict))



    print('--'* 20)
    print('Test result')
    print('Recall_Score: ',recall_score(y_test, y_test_predict))
    print('F1_Score: ',f1_score(y_test, y_test_predict))
    print('Accuracy_Score: ', accuracy_score(y_test, y_test_predict))


In [11]:
evaluate_model(model_xgb)

----------------------------------------
Validation result
Recall_Score:  0.3313953488372093
F1_Score:  0.4840764331210191
Accuracy_Score:  0.9492693110647181
----------------------------------------
Test result
Recall_Score:  0.40809968847352024
F1_Score:  0.5598290598290598
Accuracy_Score:  0.9570116861435726


In [12]:
# RepeatedKFOLD
def k_fold(pipeline):
    print('--'*20)
    print('---RepeatedKFOLD---')
    cv = RepeatedStratifiedKFold(n_splits = 5, n_repeats = 2, random_state =1)

    recall_score = cross_val_score(pipeline, x_train, y_train, cv=cv, scoring='recall', n_jobs=1)
    recall_score = np.mean(recall_score)

    f1_score = cross_val_score(pipeline, x_train, y_train, cv=cv, scoring='f1', n_jobs=1)
    f1_score = np.mean(f1_score)

    accuracy_score = cross_val_score(pipeline, x_train, y_train, cv=cv, scoring='accuracy', n_jobs=1)
    accuracy_score = np.mean(accuracy_score)
    print('--'* 20)
    print('RKFold_Recall_Score: ', recall_score)
    print('RKFold_F1_Score: ', f1_score)
    print('RKFold_Accuracy_Score: ', accuracy_score)

    print('--'* 20)

In [14]:
k_fold(pipeline_xgb)

----------------------------------------
---RepeatedKFOLD---
Parameters: { "use_label" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "use_label" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "use_label" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such 

Parameters: { "use_label" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "use_label" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "use_label" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "use_label" } might not be used.

  Th

# TFIDF+SVC/Linear SVC

In [15]:
from sklearn.svm import SVC, LinearSVC


In [18]:
pipeline_svc = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf',  TfidfTransformer()),
    ('nb', SVC()),])
model_svc = pipeline_svc.fit(x_train, y_train)

evaluate_model(model_svc)
# k_fold(pipeline_svc)

----------------------------------------
Validation result
Recall_Score:  0.42151162790697677
F1_Score:  0.5846774193548387
Accuracy_Score:  0.9569937369519833
----------------------------------------
Test result
Recall_Score:  0.43302180685358255
F1_Score:  0.5914893617021276
Accuracy_Score:  0.9599332220367279


In [19]:
pipeline_linear_svc = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf',  TfidfTransformer()),
    ('nb', LinearSVC(C=0.01)),])
model_linear_svc = pipeline_linear_svc.fit(x_train, y_train)

evaluate_model(model_linear_svc)
# k_fold(pipeline_linear_svc)

----------------------------------------
Validation result
Recall_Score:  0.02616279069767442
F1_Score:  0.05099150141643059
Accuracy_Score:  0.930062630480167
----------------------------------------
Test result
Recall_Score:  0.04361370716510903
F1_Score:  0.0835820895522388
Accuracy_Score:  0.9359348914858097


# TFIDF+LogisticRegression

In [21]:
from sklearn.linear_model import LogisticRegression


In [23]:
pipeline_lr = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf',  TfidfTransformer()),
    ('nb', LogisticRegression()),])
model_lr = pipeline_lr.fit(x_train, y_train)

evaluate_model(model_lr)
k_fold(pipeline_lr)

----------------------------------------
Validation result
Recall_Score:  0.2819767441860465
F1_Score:  0.43595505617977526
Accuracy_Score:  0.9475991649269311
----------------------------------------
Test result
Recall_Score:  0.2803738317757009
F1_Score:  0.43062200956937796
Accuracy_Score:  0.9503338898163606
----------------------------------------
---RepeatedKFOLD---
----------------------------------------
RKFold_Recall_Score:  0.2430108179152765
RKFold_F1_Score:  0.3858051711149444
RKFold_Accuracy_Score:  0.9455257270693511
----------------------------------------


# TFIDF+GradientBoosting/RandomForest

In [24]:
from sklearn.ensemble  import GradientBoostingClassifier, RandomForestClassifier


In [25]:
pipeline_gb = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf',  TfidfTransformer()),
    ('nb', GradientBoostingClassifier()),])
model_gb = pipeline_gb.fit(x_train, y_train)

evaluate_model(model_gb)
k_fold(pipeline_gb)

----------------------------------------
Validation result
Recall_Score:  0.2616279069767442
F1_Score:  0.40268456375838924
Accuracy_Score:  0.944258872651357
----------------------------------------
Test result
Recall_Score:  0.2866043613707165
F1_Score:  0.4309133489461358
Accuracy_Score:  0.9492904841402338
----------------------------------------
---RepeatedKFOLD---
----------------------------------------
RKFold_Recall_Score:  0.2881225356384592
RKFold_F1_Score:  0.43189157489685137
RKFold_Accuracy_Score:  0.9472259507829979
----------------------------------------


In [27]:
pipeline_rf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf',  TfidfTransformer()),
    ('nb', RandomForestClassifier()),])
model_rf = pipeline_rf.fit(x_train, y_train)

evaluate_model(model_rf)
k_fold(pipeline_rf)

----------------------------------------
Validation result
Recall_Score:  0.5058139534883721
F1_Score:  0.6432532347504621
Accuracy_Score:  0.9597077244258873
----------------------------------------
Test result
Recall_Score:  0.5233644859813084
F1_Score:  0.6511627906976744
Accuracy_Score:  0.9624373956594324
----------------------------------------
---RepeatedKFOLD---
----------------------------------------
RKFold_Recall_Score:  0.48603073501162675
RKFold_F1_Score:  0.6380260149836923
RKFold_Accuracy_Score:  0.9596420581655479
----------------------------------------
