In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score, train_test_split
from scipy.sparse import hstack
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import f1_score, roc_auc_score, precision_score, classification_report, precision_recall_curve, confusion_matrix

import dill


In [2]:
data_path = 'data'
model_path = 'models'

df = pd.read_csv(data_path + '/tweets.csv').drop('id', 1).fillna('')
df

Unnamed: 0,keyword,location,text,target
0,ablaze,,"Communal violence in Bhainsa, Telangana. ""Ston...",1
1,ablaze,,Telangana: Section 144 has been imposed in Bha...,1
2,ablaze,New York City,Arsonist sets cars ablaze at dealership https:...,1
3,ablaze,"Morgantown, WV",Arsonist sets cars ablaze at dealership https:...,1
4,ablaze,,"""Lord Jesus, your love brings freedom and pard...",0
...,...,...,...,...
11365,wrecked,Blue State in a red sea,Media should have warned us well in advance. T...,0
11366,wrecked,arohaonces,i feel directly attacked 💀 i consider moonbin ...,0
11367,wrecked,🇵🇭,i feel directly attacked 💀 i consider moonbin ...,0
11368,wrecked,auroraborealis,"ok who remember ""outcast"" nd the ""dora"" au?? T...",0


In [3]:
df['location'].value_counts()

                    3418
United States         96
Australia             83
London, England       81
UK                    77
                    ... 
D(M)V                  1
in mista's pants       1
your stomach           1
burritoblanket         1
Webster, MA            1
Name: location, Length: 4505, dtype: int64

In [4]:
df['keyword'].value_counts()

thunderstorm     93
flattened        88
mass%20murder    86
stretcher        86
drown            83
                 ..
electrocuted     16
rainstorm        11
deluged          10
siren            10
tsunami           6
Name: keyword, Length: 219, dtype: int64

In [5]:
X=df.drop('target', axis=1)
y=df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=0)

X_test.to_csv(data_path + "/X_test.csv", index=None)
y_test.to_csv(data_path + "/y_test.csv", index=None)
X_train.to_csv(data_path + "/X_train.csv", index=None)
y_train.to_csv(data_path + "/y_train.csv", index=None)


In [10]:
class FeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return X[self.column]


class OHEEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key
        self.columns = []

    def fit(self, X, y=None):
        self.columns = [col for col in pd.get_dummies(X, prefix=self.key).columns]
        return self

    def transform(self, X):
        X = pd.get_dummies(X, prefix=self.key)
        x_columns = [col for col in X.columns]
        for col_ in self.columns:
            if col_ not in x_columns:
                 X[col_] = 0
        return X[self.columns]

    
    
def run(name, pipeline, **fit_params):
    print(f'{name}: Cross-validation ...')
    cv_scores = cross_val_score(pipeline, X_train, y_train, cv=3, scoring='roc_auc')
    cv_score = np.mean(cv_scores)

    print(f'{name}: Training ...')
    pipeline.fit(X_train, y_train, **fit_params)

    dill_path = model_path + '/' + name + '.dill'

    print(f'{name}: Storing to {dill_path} ...')
    with open(dill_path, 'wb') as out_strm:
        dill.dump(pipeline, out_strm)

    print(f'{name}: Loading from {dill_path} ...')
    with open(dill_path, 'rb') as in_strm:
        pipeline = dill.load(in_strm)    
    
    print(f'{name}: Testing loaded pipeline ...')
    preds = pipeline.predict_proba(X_test)[:, 1]
    precision, recall, thresholds = precision_recall_curve(y_test, preds)

    fscore = (2 * precision * recall) / (precision + recall)
    ix = np.argmax(fscore)
    
    cnf_matrix = confusion_matrix(y_test, preds>thresholds[ix])
    
    TN = cnf_matrix[0][0]
    FN = cnf_matrix[1][0]
    TP = cnf_matrix[1][1]
    FP = cnf_matrix[0][1]
    
    metrics = (thresholds[ix], fscore[ix], precision[ix], recall[ix], cv_score, cnf_matrix)
    metrics_string = 'Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f, CV score=%.3f' % metrics[0:5]
    results[name] = (metrics_string,) + metrics
    return pipeline
    


In [11]:
features = FeatureUnion([
                           ('text', Pipeline([('text_selector', FeatureSelector(column='text')), 
                                             ('text_tfidf', TfidfVectorizer(sublinear_tf=True,
                                                            strip_accents='unicode',
                                                            analyzer='word',
                                                            token_pattern=r'\w{1,}',
                                                            stop_words='english',
                                                            ngram_range=(1, 1),
                                                            max_features=100)), 
                                             ])),
                           ('keyword', Pipeline([('keyword_selector', FeatureSelector(column='keyword')), 
                                                 ('ohe', OHEEncoder(key='keyword'))
                                             ])),
                           ('location', Pipeline([('location_selector', FeatureSelector(column='location')), 
                                             ('text_tfidf', TfidfVectorizer(sublinear_tf=True,
                                                            strip_accents='unicode',
                                                            analyzer='word',
                                                            token_pattern=r'\w{1,}',
                                                            stop_words='english',
                                                            ngram_range=(1, 1),
                                                            max_features=5)), 
                                             ])),
                        ])
features.fit_transform(X_train)
features

FeatureUnion(n_jobs=None,
             transformer_list=[('text',
                                Pipeline(memory=None,
                                         steps=[('text_selector',
                                                 FeatureSelector(column='text')),
                                                ('text_tfidf',
                                                 TfidfVectorizer(analyzer='word',
                                                                 binary=False,
                                                                 decode_error='strict',
                                                                 dtype=<class 'numpy.float64'>,
                                                                 encoding='utf-8',
                                                                 input='content',
                                                                 lowercase=True,
                                                                 max_df=1.0,
 

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import catboost as catb

def classifier(clf):
    return Pipeline([ ('features', features), ('classifier', clf) ])

results={}

run("LogisticRegression", classifier(LogisticRegression(C=0.1, solver='sag', random_state=42)))
run("RandomForestClassifier", classifier(RandomForestClassifier(random_state=42)))
run("CatBoostClassifier", classifier(catb.CatBoostClassifier(silent=True, random_state=42)))


LogisticRegression: Cross-validation ...
LogisticRegression: Training ...
LogisticRegression: Storing to models/LogisticRegression.dill ...
LogisticRegression: Loading from models/LogisticRegression.dill ...
LogisticRegression: Testing loaded pipeline ...
RandomForestClassifier: Cross-validation ...
RandomForestClassifier: Training ...
RandomForestClassifier: Storing to models/RandomForestClassifier.dill ...
RandomForestClassifier: Loading from models/RandomForestClassifier.dill ...
RandomForestClassifier: Testing loaded pipeline ...
CatBoostClassifier: Cross-validation ...
CatBoostClassifier: Training ...
CatBoostClassifier: Storing to models/CatBoostClassifier.dill ...
CatBoostClassifier: Loading from models/CatBoostClassifier.dill ...
CatBoostClassifier: Testing loaded pipeline ...


Pipeline(memory=None,
         steps=[('features',
                 FeatureUnion(n_jobs=None,
                              transformer_list=[('text',
                                                 Pipeline(memory=None,
                                                          steps=[('text_selector',
                                                                  FeatureSelector(column='text')),
                                                                 ('text_tfidf',
                                                                  TfidfVectorizer(analyzer='word',
                                                                                  binary=False,
                                                                                  decode_error='strict',
                                                                                  dtype=<class 'numpy.float64'>,
                                                                                  encoding='utf-8',
   

In [13]:
results

{'LogisticRegression': ('Best Threshold=0.228861, F-Score=0.523, Precision=0.441, Recall=0.644, CV score=0.804',
  0.22886101526908317,
  0.5233644859813084,
  0.4409448818897638,
  0.6436781609195402,
  0.8040616954279547,
  array([[411,  71],
         [ 32,  55]], dtype=int64)),
 'RandomForestClassifier': ('Best Threshold=0.422310, F-Score=0.553, Precision=0.646, Recall=0.483, CV score=0.788',
  0.4223095238095238,
  0.5526315789473685,
  0.6461538461538462,
  0.4827586206896552,
  0.7883604805988752,
  array([[459,  23],
         [ 46,  41]], dtype=int64)),
 'CatBoostClassifier': ('Best Threshold=0.363309, F-Score=0.517, Precision=0.633, Recall=0.437, CV score=0.802',
  0.3633087446456145,
  0.5170068027210883,
  0.6333333333333333,
  0.4367816091954023,
  0.801917711652648,
  array([[460,  22],
         [ 50,  37]], dtype=int64))}