In [1]:
#!pip install xgboost

In [2]:
import xgboost as xgb
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.metrics import f1_score, accuracy_score, recall_score, classification_report
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, cross_val_score

from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB



In [3]:
train_df = pd.read_csv('data/Twitter/hate_twitter/hate_train.csv')
val_df = pd.read_csv('data/Twitter/hate_twitter/hate_val.csv')
test_df = pd.read_csv('data/Twitter/hate_twitter/hate_test.csv')

In [4]:
# Check and drop na values in clean_tweet column
train_df[train_df['clean_tweet'].isnull()]

train_df = train_df[train_df['clean_tweet'].notna()]
val_df = val_df[val_df['clean_tweet'].notna()]
test_df = test_df[test_df['clean_tweet'].notna()]


In [5]:
train_df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,id,label,tweet,hash_tag,clean_tweet,tokenized_tweet,tokenized_tweet_NLTK
0,27857,27857,27858,0,"omg. omg. omg. yay! i found it, and at a wond...","['segasaturn', 'throwbackâ']",omg omg omg yay found wonderful price segasatu...,"omg. omg. omg. yay! i found it, and at a wond...",omg omg omg yay found wonderful price segasatu...
1,31205,31205,31206,0,#payintheusa polar bear climb racing: angry ...,['payintheusa'],payintheusa polar bear climb racing angry pola...,<hashtag> payintheusa <elong>polar bear climb...,payintheusa polar bear climb racing angry pola...
2,8440,8440,8441,0,#trainhard polar bear climb racing: angry po...,['trainhard'],trainhard polar bear climb racing angry polar ...,<hashtag> trainhard <elong>polar bear climb r...,trainhard polar bear climb racing angry polar ...
3,5005,5005,5006,1,he should turn in his resignation.,[],turn resignation,he should turn in his resignation.,turn resignation
4,3898,3898,3899,0,ððð . . happy bihday!! to hajime hoso...,"['bihday', '30æ', 'ã']",happy bihday hajime hosogai bihday bihday 30,ððð . . happy bihday! <repeat> to haj...,. . happy bihday hajime hosogai . . . bihday b...


In [6]:
x_train = train_df['clean_tweet']
y_train = train_df['label']

x_test = test_df['clean_tweet']
y_test = test_df['label']

x_val = val_df['clean_tweet']
y_val = val_df['label']




# TFIDF+XGBoost

In [7]:
pipeline_xgb = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf',  TfidfTransformer()),
    ('nb', xgb.XGBClassifier(use_label =False)),])

In [8]:
model_xgb = pipeline_xgb.fit(x_train, y_train)
model = pipeline_xgb.fit(x_train, y_train)

Parameters: { "use_label" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "use_label" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




In [9]:
y_test_predict = model.predict(x_test)
y_val_predict = model.predict(x_val)

print('--'* 20)
print('Validation result')
print('Recall_Score: ', recall_score(y_val, y_val_predict))
print('F1_Score: ',f1_score(y_val, y_val_predict))
print('Accuracy_Score: ', accuracy_score(y_val, y_val_predict))



print('--'* 20)
print('Test result')
print('Recall_Score: ',recall_score(y_test, y_test_predict))
print('F1_Score: ',f1_score(y_test, y_test_predict))
print('Accuracy_Score: ', accuracy_score(y_test, y_test_predict))

----------------------------------------
Validation result
Recall_Score:  0.3313953488372093
F1_Score:  0.4840764331210191
Accuracy_Score:  0.9492693110647181
----------------------------------------
Test result
Recall_Score:  0.40809968847352024
F1_Score:  0.5598290598290598
Accuracy_Score:  0.9570116861435726


In [10]:
def evaluate_model(model):
    y_test_predict = model.predict(x_test)
    y_val_predict = model.predict(x_val)

    print('--'* 20)
    print('Validation result')
    print('Recall_Score: ', recall_score(y_val, y_val_predict))
    print('F1_Score: ',f1_score(y_val, y_val_predict))
    print('Accuracy_Score: ', accuracy_score(y_val, y_val_predict))



    print('--'* 20)
    print('Test result')
    print('Recall_Score: ',recall_score(y_test, y_test_predict))
    print('F1_Score: ',f1_score(y_test, y_test_predict))
    print('Accuracy_Score: ', accuracy_score(y_test, y_test_predict))


In [11]:
evaluate_model(model_xgb)

----------------------------------------
Validation result
Recall_Score:  0.3313953488372093
F1_Score:  0.4840764331210191
Accuracy_Score:  0.9492693110647181
----------------------------------------
Test result
Recall_Score:  0.40809968847352024
F1_Score:  0.5598290598290598
Accuracy_Score:  0.9570116861435726


In [12]:
# RepeatedKFOLD
def k_fold(pipeline):
    print('--'*20)
    print('---RepeatedKFOLD---')
    cv = RepeatedStratifiedKFold(n_splits = 5, n_repeats = 2, random_state =1)

    recall_score = cross_val_score(pipeline, x_train, y_train, cv=cv, scoring='recall', n_jobs=1)
    recall_score = np.mean(recall_score)

    f1_score = cross_val_score(pipeline, x_train, y_train, cv=cv, scoring='f1', n_jobs=1)
    f1_score = np.mean(f1_score)

    accuracy_score = cross_val_score(pipeline, x_train, y_train, cv=cv, scoring='accuracy', n_jobs=1)
    accuracy_score = np.mean(accuracy_score)
    print('--'* 20)
    print('RKFold_Recall_Score: ', recall_score)
    print('RKFold_F1_Score: ', f1_score)
    print('RKFold_Accuracy_Score: ', accuracy_score)

    print('--'* 20)

In [13]:
k_fold(pipeline_xgb)

----------------------------------------
---RepeatedKFOLD---
Parameters: { "use_label" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "use_label" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "use_label" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such 

Parameters: { "use_label" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "use_label" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "use_label" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "use_label" } might not be used.

  Th

# TFIDF+SVC/Linear SVC

In [14]:
from sklearn.svm import SVC, LinearSVC


In [15]:
pipeline_svc = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf',  TfidfTransformer()),
    ('nb', SVC()),])
model_svc = pipeline_svc.fit(x_train, y_train)

evaluate_model(model_svc)
# k_fold(pipeline_svc)

----------------------------------------
Validation result
Recall_Score:  0.42151162790697677
F1_Score:  0.5846774193548387
Accuracy_Score:  0.9569937369519833
----------------------------------------
Test result
Recall_Score:  0.43302180685358255
F1_Score:  0.5914893617021276
Accuracy_Score:  0.9599332220367279


In [16]:
pipeline_linear_svc = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf',  TfidfTransformer()),
    ('nb', LinearSVC(C=0.01)),])
model_linear_svc = pipeline_linear_svc.fit(x_train, y_train)

evaluate_model(model_linear_svc)
# k_fold(pipeline_linear_svc)

----------------------------------------
Validation result
Recall_Score:  0.02616279069767442
F1_Score:  0.05099150141643059
Accuracy_Score:  0.930062630480167
----------------------------------------
Test result
Recall_Score:  0.04361370716510903
F1_Score:  0.0835820895522388
Accuracy_Score:  0.9359348914858097


# TFIDF+LogisticRegression

In [17]:
from sklearn.linear_model import LogisticRegression


In [18]:
pipeline_lr = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf',  TfidfTransformer()),
    ('nb', LogisticRegression()),])
model_lr = pipeline_lr.fit(x_train, y_train)

evaluate_model(model_lr)
k_fold(pipeline_lr)

----------------------------------------
Validation result
Recall_Score:  0.2819767441860465
F1_Score:  0.43595505617977526
Accuracy_Score:  0.9475991649269311
----------------------------------------
Test result
Recall_Score:  0.2803738317757009
F1_Score:  0.43062200956937796
Accuracy_Score:  0.9503338898163606
----------------------------------------
---RepeatedKFOLD---
----------------------------------------
RKFold_Recall_Score:  0.2430108179152765
RKFold_F1_Score:  0.3858051711149444
RKFold_Accuracy_Score:  0.9455257270693511
----------------------------------------


# TFIDF+GradientBoosting/RandomForest

In [19]:
from sklearn.ensemble  import GradientBoostingClassifier, RandomForestClassifier


In [20]:
pipeline_gb = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf',  TfidfTransformer()),
    ('nb', GradientBoostingClassifier()),])
model_gb = pipeline_gb.fit(x_train, y_train)

evaluate_model(model_gb)
k_fold(pipeline_gb)

----------------------------------------
Validation result
Recall_Score:  0.25872093023255816
F1_Score:  0.40362811791383213
Accuracy_Score:  0.9450939457202505
----------------------------------------
Test result
Recall_Score:  0.2866043613707165
F1_Score:  0.42890442890442887
Accuracy_Score:  0.948873121869783
----------------------------------------
---RepeatedKFOLD---
----------------------------------------
RKFold_Recall_Score:  0.2884369628955616
RKFold_F1_Score:  0.4330752143996256
RKFold_Accuracy_Score:  0.9470469798657717
----------------------------------------


In [21]:
pipeline_rf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf',  TfidfTransformer()),
    ('nb', RandomForestClassifier()),])
model_rf = pipeline_rf.fit(x_train, y_train)

evaluate_model(model_rf)
k_fold(pipeline_rf)

----------------------------------------
Validation result
Recall_Score:  0.5261627906976745
F1_Score:  0.6546112115732369
Accuracy_Score:  0.960125260960334
----------------------------------------
Test result
Recall_Score:  0.5264797507788161
F1_Score:  0.6537717601547388
Accuracy_Score:  0.9626460767946577
----------------------------------------
---RepeatedKFOLD---
----------------------------------------
RKFold_Recall_Score:  0.4825366494793246
RKFold_F1_Score:  0.6343990740255891
RKFold_Accuracy_Score:  0.9602237136465325
----------------------------------------


# Deal with data imbalance, upsample

In [22]:
from sklearn.utils import resample


In [23]:
train_majority = train_df[train_df.label==0]
train_minority = train_df[train_df.label==1]
train_minority_upsampled = resample(train_minority, 
                                 replace=True,    
                                 n_samples=len(train_majority),   
                                 random_state=123)
train_upsampled = pd.concat([train_minority_upsampled, train_majority])
train_upsampled['label'].value_counts()

0    20776
1    20776
Name: label, dtype: int64

In [24]:
x_train = train_upsampled['clean_tweet']
y_train = train_upsampled['label']

In [25]:
# XGBoost
pipeline_xgb = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf',  TfidfTransformer()),
    ('nb', xgb.XGBClassifier(use_label =False)),])
model_xgb = pipeline_xgb.fit(x_train, y_train)

evaluate_model(model_xgb)
k_fold(pipeline_xgb)

Parameters: { "use_label" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


----------------------------------------
Validation result
Recall_Score:  0.6744186046511628
F1_Score:  0.5679314565483475
Accuracy_Score:  0.9263048016701462
----------------------------------------
Test result
Recall_Score:  0.7071651090342679
F1_Score:  0.5646766169154229
Accuracy_Score:  0.9269616026711185
----------------------------------------
---RepeatedKFOLD---
Parameters: { "use_label" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such case

Parameters: { "use_label" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "use_label" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "use_label" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "use_label" } might not be used.

  Th

In [26]:
# SVC
pipeline_svc = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf',  TfidfTransformer()),
    ('nb', SVC()),])
model_svc = pipeline_svc.fit(x_train, y_train)

evaluate_model(model_svc)
# k_fold(pipeline_svc)

----------------------------------------
Validation result
Recall_Score:  0.5087209302325582
F1_Score:  0.660377358490566
Accuracy_Score:  0.9624217118997912
----------------------------------------
Test result
Recall_Score:  0.5264797507788161
F1_Score:  0.6706349206349206
Accuracy_Score:  0.9653589315525877


In [27]:
# Logistic Regression
pipeline_lr = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf',  TfidfTransformer()),
    ('nb', LogisticRegression()),])
model_lr = pipeline_lr.fit(x_train, y_train)

evaluate_model(model_lr)
k_fold(pipeline_lr)

----------------------------------------
Validation result
Recall_Score:  0.7151162790697675
F1_Score:  0.6525198938992043
Accuracy_Score:  0.9453027139874739
----------------------------------------
Test result
Recall_Score:  0.7757009345794392
F1_Score:  0.6578599735799207
Accuracy_Score:  0.9459515859766278
----------------------------------------
---RepeatedKFOLD---
----------------------------------------
RKFold_Recall_Score:  0.9975211921580618
RKFold_F1_Score:  0.9789589688959653
RKFold_Accuracy_Score:  0.9785569847045679
----------------------------------------


In [28]:
# Gradient Boost
pipeline_gb = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf',  TfidfTransformer()),
    ('nb', GradientBoostingClassifier()),])
model_gb = pipeline_gb.fit(x_train, y_train)

evaluate_model(model_gb)
k_fold(pipeline_gb)

----------------------------------------
Validation result
Recall_Score:  0.5436046511627907
F1_Score:  0.5327635327635327
Accuracy_Score:  0.9315240083507307
----------------------------------------
Test result
Recall_Score:  0.5950155763239875
F1_Score:  0.5528219971056438
Accuracy_Score:  0.9355175292153589
----------------------------------------
---RepeatedKFOLD---
----------------------------------------
RKFold_Recall_Score:  0.6389582920724709
RKFold_F1_Score:  0.7626105357525584
RKFold_Accuracy_Score:  0.8010925656209904
----------------------------------------


In [29]:
# Random Forest
pipeline_rf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf',  TfidfTransformer()),
    ('nb', RandomForestClassifier()),])
model_rf = pipeline_rf.fit(x_train, y_train)

evaluate_model(model_rf)
k_fold(pipeline_rf)

----------------------------------------
Validation result
Recall_Score:  0.5901162790697675
F1_Score:  0.6916524701873935
Accuracy_Score:  0.9622129436325678
----------------------------------------
Test result
Recall_Score:  0.5794392523364486
F1_Score:  0.6666666666666665
Accuracy_Score:  0.9611853088480802
----------------------------------------
---RepeatedKFOLD---
----------------------------------------
RKFold_Recall_Score:  1.0
RKFold_F1_Score:  0.9938659091119965
RKFold_Accuracy_Score:  0.9936104268464756
----------------------------------------


# Pipeline


In [32]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer


In [52]:
def tfidf_pipeline(train_df, test_df, val_df, ngram_range=(1,1)):
    '''
    Pipeline for TF-IDF based models training and evaluation
    '''
    cls = [LogisticRegression(),
           MultinomialNB(), 
           SVC(),
           LinearSVC(C=0.01),
           GradientBoostingClassifier(),
           xgb.XGBClassifier(use_label =False),
           RandomForestClassifier(n_estimators=200),
           KNeighborsClassifier(n_neighbors = 5)]
    
    result_all_dict = {}

    vectorizer = TfidfVectorizer(ngram_range=ngram_range)
    train_tfidf_model = vectorizer.fit_transform(train_df.clean_tweet)
    test_tfidf_model = vectorizer.transform(test_df.clean_tweet)
    val_tfidf_model = vectorizer.transform(val_df.clean_tweet)

    train_tfidf = pd.DataFrame(train_tfidf_model)

    test_actual = test_df.label
    val_actual = val_df.label
    i = 0
    accuracy = []
    cls_name = []

    for cl in cls:
        result_dict = {}
            
        model = cl.fit(train_tfidf_model,train_df.label)
        
        y_test_predict = model.predict(test_tfidf_model)
        y_val_predict = model.predict(val_tfidf_model)
        
        a = (100*accuracy_score(y_val_predict, val_actual))
        a = round(a,2)
        accuracy.append(a)
        cls_name.append(cl.__class__.__name__)
        
#         # Most important features
#         print('most important features')
#         print('--------> tfidf')
#         ngram_coef0 = abs(model.coef_[0])
#         imp0 = ngram_coef0.argsort()[-20:][::-1]
#         features = vectorizer.get_feature_names()
#         print([features[index] for index in imp0])
        
        
        result_dict["Validation Accuracy"] = accuracy_score(y_val_predict, val_actual)
        result_dict["Validation Binary Recall"] = recall_score(y_val_predict, val_actual)
        result_dict["Validation Macro Recall"] = recall_score(y_val_predict, val_actual, average='macro')
        result_dict["Validation Binary F1"] = f1_score(y_val_predict, val_actual)
        result_dict["Validation Macro F1"] = f1_score(y_val_predict, val_actual, average='macro')
        result_dict["Test Accuracy"] = accuracy_score(y_test_predict, test_actual)
        result_dict["Test Binary Recall"] = recall_score(y_test_predict, test_actual)
        result_dict["Test Macro Recall"] = recall_score(y_test_predict, test_actual, average='macro')
        result_dict["Test Binary F1"] = f1_score(y_test_predict, test_actual)
        result_dict["Test Macro F1"] = f1_score(y_test_predict, test_actual, average='macro')
        
        result_all_dict[cl.__class__.__name__] = result_dict
        
        i +=1
    
    return result_all_dict

## Original data frame: Only Unigram


In [53]:
result_all_dict = tfidf_pipeline(train_df, test_df, val_df)

Parameters: { "use_label" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




In [54]:
result_pd = pd.DataFrame.from_dict(result_all_dict, orient='index')
result_pd

Unnamed: 0,Validation Accuracy,Validation Binary Recall,Validation Macro Recall,Validation Binary F1,Validation Macro F1,Test Accuracy,Test Binary Recall,Test Macro Recall,Test Binary F1,Test Macro F1
LogisticRegression,0.947599,0.960396,0.95386,0.435955,0.704239,0.950334,0.927835,0.939317,0.430622,0.702328
MultinomialNB,0.939457,0.982143,0.960548,0.275,0.621705,0.943447,1.0,0.971426,0.269542,0.620063
SVC,0.956994,0.953947,0.95552,0.584677,0.781,0.959933,0.932886,0.946844,0.591489,0.785211
LinearSVC,0.930063,1.0,0.964965,0.050992,0.507343,0.935935,1.0,0.967874,0.083582,0.525195
GradientBoostingClassifier,0.945094,0.926316,0.935895,0.400911,0.68607,0.949917,0.878505,0.915026,0.439252,0.70652
XGBClassifier,0.949269,0.897638,0.924157,0.484076,0.7287,0.957012,0.891156,0.925126,0.559829,0.768616
RandomForestClassifier,0.962004,0.90099,0.93284,0.666667,0.82326,0.962437,0.865285,0.9159,0.649805,0.81498
KNeighborsClassifier,0.94572,0.833333,0.891045,0.446809,0.709134,0.952212,0.865079,0.909822,0.487696,0.731316


## Original data frame: Only Bigram


In [55]:
result_all_dict_bi = tfidf_pipeline(train_df, test_df, val_df, ngram_range=(2,2))
result_pd_bi = pd.DataFrame.from_dict(result_all_dict_bi, orient='index')
result_pd_bi

Parameters: { "use_label" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




Unnamed: 0,Validation Accuracy,Validation Binary Recall,Validation Macro Recall,Validation Binary F1,Validation Macro F1,Test Accuracy,Test Binary Recall,Test Macro Recall,Test Binary F1,Test Macro F1
LogisticRegression,0.935699,0.973684,0.95454,0.193717,0.580116,0.940317,0.972973,0.956518,0.201117,0.585059
MultinomialNB,0.936326,1.0,0.967901,0.203655,0.585246,0.941361,1.0,0.970434,0.221607,0.59557
SVC,0.941127,0.984375,0.962458,0.308824,0.639039,0.945326,0.983607,0.964219,0.314136,0.642832
LinearSVC,0.930063,1.0,0.964965,0.050992,0.507343,0.935935,1.0,0.967874,0.083582,0.525195
GradientBoostingClassifier,0.935908,0.974359,0.954976,0.198433,0.582526,0.941152,0.953488,0.947264,0.225275,0.597345
XGBClassifier,0.935908,0.974359,0.954976,0.198433,0.582526,0.941569,1.0,0.970533,0.226519,0.598079
RandomForestClassifier,0.949061,0.980769,0.964563,0.455357,0.714319,0.954508,0.972477,0.963283,0.493023,0.734604
KNeighborsClassifier,0.936326,1.0,0.967901,0.203655,0.585246,0.941569,1.0,0.970533,0.226519,0.598079


## Upsampled data frame: Only Unigram


In [None]:
# Upsampled data frame
result_all_dict_upsampled = tfidf_pipeline(train_upsampled, test_df, val_df)


In [None]:
result_pd_upsampled = pd.DataFrame.from_dict(result_all_dict_upsampled, orient='index')
result_pd_upsampled

## Upsampled data frame: Only Bigram



In [None]:
result_all_dict_upsampled_bi = tfidf_pipeline(train_upsampled, test_df, val_df, ngram_range=(2,2))
result_pd_upsampled_bi = pd.DataFrame.from_dict(result_all_dict_bi, orient='index')
result_pd_upsampled_bi

# Generate other features (TBC)

In [None]:
# https://github.com/t-davidson/hate-speech-and-offensive-language/blob/master/classifier/final_classifier.ipynb

# get other features
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer as VS
from textstat.textstat import *

sentiment_analyzer = VS()

def count_twitter_objs(text_string):
    """
    Accepts a text string and replaces:
    1) urls with URLHERE
    2) lots of whitespace with one instance
    3) mentions with MENTIONHERE
    4) hashtags with HASHTAGHERE

    This allows us to get standardized counts of urls and mentions
    Without caring about specific people mentioned.
    
    Returns counts of urls, mentions, and hashtags.
    """
    space_pattern = '\s+'
    giant_url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
        '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_regex = '@[\w\-]+'
    hashtag_regex = '#[\w\-]+'
    parsed_text = re.sub(space_pattern, ' ', text_string)
    parsed_text = re.sub(giant_url_regex, 'URLHERE', parsed_text)
    parsed_text = re.sub(mention_regex, 'MENTIONHERE', parsed_text)
    parsed_text = re.sub(hashtag_regex, 'HASHTAGHERE', parsed_text)
    return(parsed_text.count('URLHERE'),parsed_text.count('MENTIONHERE'),parsed_text.count('HASHTAGHERE'))

def other_features(tweet):
    """This function takes a string and returns a list of features.
    These include Sentiment scores, Text and Readability scores,
    as well as Twitter specific features"""
    ##SENTIMENT
    sentiment = sentiment_analyzer.polarity_scores(tweet)
    
    words = preprocess(tweet) #Get text only
    
    syllables = textstat.syllable_count(words) #count syllables in words
    num_chars = sum(len(w) for w in words) #num chars in words
    num_chars_total = len(tweet)
    num_terms = len(tweet.split())
    num_words = len(words.split())
    avg_syl = round(float((syllables+0.001))/float(num_words+0.001),4)
    num_unique_terms = len(set(words.split()))
    
    ###Modified FK grade, where avg words per sentence is just num words/1
    FKRA = round(float(0.39 * float(num_words)/1.0) + float(11.8 * avg_syl) - 15.59,1)
    ##Modified FRE score, where sentence fixed to 1
    FRE = round(206.835 - 1.015*(float(num_words)/1.0) - (84.6*float(avg_syl)),2)
    
    twitter_objs = count_twitter_objs(tweet) #Count #, @, and http://
    retweet = 0
    if "rt" in words:
        retweet = 1
    features = [FKRA, FRE,syllables, avg_syl, num_chars, num_chars_total, num_terms, num_words,
                num_unique_terms, sentiment['neg'], sentiment['pos'], sentiment['neu'], sentiment['compound'],
                twitter_objs[2], twitter_objs[1],
                twitter_objs[0], retweet]
    #features = pandas.DataFrame(features)
    return features

def get_feature_array(tweets):
    feats=[]
    for t in tweets:
        feats.append(other_features(t))
    return np.array(feats)

In [None]:
other_features_names = ["FKRA", "FRE","num_syllables", "avg_syl_per_word", "num_chars", "num_chars_total", \
                        "num_terms", "num_words", "num_unique_words", "vader neg","vader pos","vader neu", "vader compound", \
                        "num_hashtags", "num_mentions", "num_urls", "is_retweet"]

feats = get_feature_array(tweets)

