In [220]:
import pandas as pd
import numpy as np
import string
import csv

pd.options.display.float_format = '{:.3f}'.format

In [221]:
import pickle

In [222]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.dummy import DummyClassifier

from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

In [233]:
df_no_anno = pd.read_csv('no_gdpr_preprocessed.csv', engine='python')

In [304]:
df_anno = pd.read_excel('anno_final.xlsx')

In [235]:
def text_preprocess(data):
    #remove links
    data['text_processed'] = data['text'].replace(r'http\S+', '', regex=True).replace(r'www\S+', '', regex=True)
    #remove Twitter mentions
    data['text_processed'] = data['text_processed'].str.replace('@[A-Za-z0-9]+\s?', '', regex=True)
    #remove punctuations
    data['text_processed'] = data['text_processed'].str.replace('[{}]'.format(string.punctuation), '')
    #remove RT 
    data['text_processed'] = data['text_processed'].str.replace("RT ", '')
    #remove double quotes 
    data['text_processed'] = data['text_processed'].str.replace('“|”', '')
    #lowercase
    data['text_processed'] = data['text_processed'].str.lower().str.strip()
    #replace new line with space
    data['text_processed'] = data['text_processed'].str.replace("\n", ' ')
    #other stuff
    data['text_processed'] = data['text_processed'].str.replace("rt : ", '')
    data['text_processed'] = data['text_processed'].str.replace("rt _com: ", '')
    data['text_processed'] = data['text_processed'].str.replace(" via", "")
    data['text_processed'] = data['text_processed'].str.replace("read more:", "")
    data['text_processed'] = data['text_processed'].str.replace("learn more :", "")
    #replace multiple spaces with single space
    data['text_processed'] = data['text_processed'].str.replace(' +', ' ')
    #strip spaces
    data['text_processed'] = data['text_processed'].str.strip()
    
    return data

In [387]:
df_no_anno = text_preprocess(df_no_anno)

In [239]:
df_no_anno.text_processed = df_no_anno.text_processed.fillna('')

In [240]:
df_no_anno_no_null = df_no_anno[~(df_no_anno.text_processed.isnull())]

In [309]:
df_no_anno = df_no_anno_no_null.copy()

### Helper Functions

In [344]:
def get_pipeline(max_df=1.0, max_features=None):
    """
        Create SciKit Learn pipeline for TF-IDF vectorization.

        This also provides access to the `CountVectorizer` via 
        the dict-like key, 'count' on the returned `Pipeline` instance.

        Example:
            pipe = get_pipeline()
            pipe.fit(X, y)


    """
    return Pipeline([
        ('count', CountVectorizer(max_df=max_df, max_features=max_features, ngram_range=(1,2))),
        ('tfidf', TfidfTransformer())
    ])

In [314]:
def get_scores(y_actual, y_pred, name):
    """
        Get prediction scores as Pandas Series.
    """

    return pd.Series({n: fn(y_actual, y_pred) for n, fn in [
        ('F1', f1_score), 
        ('Recall', recall_score), 
        ('Precision', precision_score), 
        ('Accuracy', accuracy_score)
    ]}, name=name)

# classifier: HBM-related 

In [315]:
df_anno.HBM_related.value_counts()

0    4795
1    1205
Name: HBM_related, dtype: int64

### Balance the dataset

In [316]:
#balance the dataset
min_cls_label = df_anno.HBM_related.value_counts().idxmin()
min_cls_length = len(df_anno[df_anno.HBM_related == min_cls_label])

df_no = df_anno[df_anno.HBM_related == 0]
df_yes = df_anno[df_anno.HBM_related == 1]

# binary classes
if min_cls_label == 1: 
    df_no = df_no.sample(min_cls_length)
else: 
    df_yes = df_yes.sample(min_cls_length)

# combine balanced classes
df_hbm_related = pd.concat([df_no, df_yes])

In [317]:
#make sure label values are balanced
print(f'Balanced Class Value Counts:\n{df_hbm_related.HBM_related.value_counts()}')

Balanced Class Value Counts:
0    1205
1    1205
Name: HBM_related, dtype: int64


### Create Train/Test Splits

Indices are retained so original text can be matched with results.

In [318]:
# Create Splits
# `train_test_split` will return a Pandas Series with the corresponding Indices.
x = df_hbm_related.text_processed
y = df_hbm_related.HBM_related
# Stratification is probably unncessary here due to pre-balanced classes, but it won't hurt
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8, random_state=0, stratify=y)

# Save indices for accessing original text later on...
train_idx = x_train.copy().index.tolist()
test_idx = x_test.copy().index.tolist()

train_df = df_hbm_related.loc[train_idx]
test_df = df_hbm_related.loc[test_idx]

# convert to Numpy arrays (removing indices)
x_train = x_train.to_numpy()
x_test = x_test.to_numpy()

y_train = y_train.to_numpy()
y_test = y_test.to_numpy()

print('X train shape:', x_train.shape)
print('X test shape:', x_test.shape)

X train shape: (1928,)
X test shape: (482,)


### Build and Train CountVectorizer / TF-IDF Transformer Pipeline

Using 3000 features/vocabulary terms seemed to work well enough for all models. \
More or less might be better, but hyperparameter tuning is unlikely to improve scores significantly.

In [319]:
pipe = get_pipeline(max_features=3000)
pipe.fit(x_train)

X_train_tfidf = pipe.transform(x_train)
X_test_tfidf = pipe.transform(x_test)

X_train_count = pipe['count'].transform(x_train)
X_test_count = pipe['count'].transform(x_test)

vocab_arr = sorted(pipe['count'].vocabulary_.keys())

## Test Performance of Various Models

1. Dummy Classifier
2. SVM
3. Multinomial Naive Bayes
4. Random Forest
5. Logistic Regression

In [320]:
def ModelBuilder(model_cls, **kwargs):
    """
        Given a model class and keyword arguments,
        this will return a function (`wrapped`).

        Upon calling the returned function (`wrapped`),
        a new instance of the model class will be created
        with arguments in kwargs.

        Every call of 'wrapped' will create, fit, and predict a new model instance

        Ex:

            m = ModelBuilder(DummyClassifier, strategy='stratified', random_state=0)
            y_pred, model = m(x_train, y_train, x_test)

    """
    def wrapped(_x_train, _y_train, _x_test):
        _model = model_cls(**kwargs)
        _model.fit(_x_train, _y_train)
        _y_pred = _model.predict(_x_test)

        return _y_pred, _model

    return wrapped

In [321]:
# dummy classifier
print("Dummy Classifier: \n")
dummy_mb = ModelBuilder(DummyClassifier, strategy='stratified', random_state=0)
dummy_y_pred_count, dummy_model_count = dummy_mb(X_train_count, y_train, X_test_count)
dummy_y_pred_tfidf, dummy_model_tfidf = dummy_mb(X_train_tfidf, y_train, X_test_tfidf)

pd.concat([
    get_scores(y_test, dummy_y_pred_count, 'count'), 
    get_scores(y_test, dummy_y_pred_tfidf, 'tfidf')
], axis=1)


Dummy Classifier: 



Unnamed: 0,count,tfidf
F1,0.491,0.491
Recall,0.494,0.494
Precision,0.488,0.488
Accuracy,0.488,0.488


In [348]:
#SVM
print('Support Vector Machine Classifier:')
svm_mb = ModelBuilder(svm.SVC)
svm_y_pred_count, svm_model_count = svm_mb(X_train_count, y_train, X_test_count)
svm_y_pred_tfidf, svm_model_tfidf = svm_mb(X_train_tfidf, y_train, X_test_tfidf)

pd.concat([
    get_scores(y_test, svm_y_pred_count, 'count'), 
    get_scores(y_test, svm_y_pred_tfidf, 'tfidf')
], axis=1)


Support Vector Machine Classifier:


Unnamed: 0,count,tfidf
F1,0.817,0.819
Recall,0.788,0.817
Precision,0.848,0.821
Accuracy,0.824,0.82


In [371]:
#Naive Bayes
print('Multinomial Naive Bayes:')
nb_mb = ModelBuilder(MultinomialNB)
nb_y_pred_count, nb_model_count = nb_mb(X_train_count, y_train, X_test_count)
nb_y_pred_tfidf, nb_model_tfidf = nb_mb(X_train_tfidf, y_train, X_test_tfidf)

pd.concat([
    get_scores(y_test, nb_y_pred_count, 'count'), 
    get_scores(y_test, nb_y_pred_tfidf, 'tfidf')
], axis=1)

Multinomial Naive Bayes:


Unnamed: 0,count,tfidf
F1,0.801,0.801
Recall,0.809,0.826
Precision,0.793,0.777
Accuracy,0.799,0.795


In [346]:
#Random Forest
print("Random Forest:")
rf_mb = ModelBuilder(RandomForestClassifier)
rf_y_pred_count, rf_model_count = rf_mb(X_train_count, y_train, X_test_count)
rf_y_pred_tfidf, rf_model_tfidf = rf_mb(X_train_tfidf, y_train, X_test_tfidf)

pd.concat([
    get_scores(y_test, rf_y_pred_count, 'count'), 
    get_scores(y_test, rf_y_pred_tfidf, 'tfidf')
], axis=1)

Random Forest:


Unnamed: 0,count,tfidf
F1,0.844,0.841
Recall,0.851,0.834
Precision,0.837,0.848
Accuracy,0.842,0.842


In [343]:
#Logistic Regression
print("Logistic Regression:")
lr_mb = ModelBuilder(LogisticRegression,  max_iter=4000, random_state=0)
lr_y_pred_count, lr_model_count = lr_mb(X_train_count, y_train, X_test_count)
lr_y_pred_tfidf, lr_model_tfidf = lr_mb(X_train_tfidf, y_train, X_test_tfidf)

pd.concat([
    get_scores(y_test, lr_y_pred_count, 'count'), 
    get_scores(y_test, lr_y_pred_tfidf, 'tfidf')
], axis=1)

Logistic Regression:


Unnamed: 0,count,tfidf
F1,0.796,0.809
Recall,0.759,0.817
Precision,0.836,0.801
Accuracy,0.805,0.807


In [380]:
#Logistic Regression
model_lg = LogisticRegression(max_iter=4000, random_state=0)
model_lg.fit(X_train_count, y_train)
pred_lg = model_lg.predict(X_test_count)

print("lg:\n")
print(f1_score(y_test, pred_lg))
print(recall_score(y_test, pred_lg))
print(precision_score(y_test, pred_lg))
print(accuracy_score(y_test, pred_lg))

lg:

0.7956521739130434
0.7593360995850622
0.8356164383561644
0.8049792531120332


In [335]:
#annotate tweets with no anno
texts = list(df_no_anno.text_processed)

In [336]:
# X_test_tfidf = pipe.transform(x_test)
# X_train_count = pipe['count'].transform(x_train)

texts_count = pipe['count'].transform(texts)

In [381]:
pred_vec = model_lg.predict(texts_count)

In [382]:
df_no_anno['HBM_related'] = pred_vec

In [383]:
df_hbm_only = df_no_anno[df_no_anno.HBM_related == 1]

In [386]:
df_hbm_only.to_csv(r'hbm_only.csv', index=False)