### imports

In [1]:
import itertools
import pickle
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn import metrics
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [2]:
df = pd.read_csv("./data/combined_with_who.csv")

In [3]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Reds):
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="pink" if cm[i, j] > thresh else "red")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [4]:
X= df['text']
y= df['label'].map(lambda x: 1 if x== 'true' else 0 )
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state= 42)

In [5]:
models = [
    {
    'name': 'Multinomial Naive Bayes',
    'label' : 'mnnb',
    'model': MultinomialNB(),
    'params': {
            'mnnb__alpha' : [1, 1e-1, 1e-2, 10]
            }
    },
    {
    'name': 'Random Forest',
    'label' : 'rfc',
    'model': RandomForestClassifier(),
    'params': {
        'rfc__bootstrap' :  [True],
        'rfc__max_depth' : [50],
        'rfc__min_samples_split' : [ 5, 10],
        'rfc__min_samples_leaf' : [ 2, 4],
        }
    },
    {
    'name': 'Logistic Regression CV',
    'label' : 'lrcv',
    'model': LogisticRegressionCV(),
    'params':  {
        'lrcv__penalty': ['l1', 'l2'],
        'lrcv__solver': ['liblinear']
        }
    }
]

transformers = [
    {'name': 'Count Vectorizer',
    'label' : 'cvec',
    'transformer': CountVectorizer(),
    'params': {
        'cvec__stop_words' : [None],
        'cvec__max_features' : [2500, 5000],
        'cvec__min_df' : [2, 3],
        'cvec__max_df' : [.8, .9],
        'cvec__ngram_range' : [(1,1), (1,2)],
        }
    },
    {'name': 'TF-IDF Vectorizer',
    'label' : 'tvec',
    'transformer': TfidfVectorizer(),
    'params': {
        'tvec__stop_words' : [None],              
        'tvec__max_features' : [2500, 5000],
        'tvec__ngram_range' : [(1, 1), (1, 2)],  
        }    
    }
]

pipes = []

for model in models:
    for transformer in transformers:
        pipe = Pipeline([
            (transformer['label'], transformer['transformer']),
            (model['label'], model['model']),            
        ])

        pipe_dict = {
            'name' : transformer['name'] + ' and ' + model['name'],
            'pipe' : pipe,
        }
        pipe_dict['params'] = {**transformer['params'], **model['params']}
        pipes.append(pipe_dict)

In [18]:
searches = [{'name': 'Baseline', 
             'gs' : 'NA',
             'train_score': y.value_counts(normalize = True)[0],
             'test_score': y.value_counts(normalize = True)[0]}]
searchn = 1
for pipe in pipes:
    gs = GridSearchCV(pipe['pipe'], pipe['params'], verbose = 1, n_jobs = 1)
    nsearches = len(pipes)
    print(f"Search {searchn} of {nsearches}, {pipe['name']}")
    gs.fit(X_train, y_train)
    gs_dict = {'name' : pipe['name'],
                'gs': gs,
                'train_score': gs.score(X_train, y_train),
                'test_score': gs.score(X_test, y_test)}
    searches.append(gs_dict)
    
    #pickles 
    file_name = 'pfiles/'+ pipe['name'].replace(' ', '_') + '.p'
    pickle.dump(gs, open(file_name, 'wb'))
    
    
    print(f"Search # {searchn} of {nsearches} complete. \n Training Score: {gs_dict['train_score']}, Testing Score: {gs_dict['test_score']}")
    searchn += 1

Search 1 of 6, Count Vectorizer and Multinomial Naive Bayes
Fitting 5 folds for each of 128 candidates, totalling 640 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 640 out of 640 | elapsed:  4.0min finished


Search # 1 of 6 complete. 
 Training Score: 0.9407754010695187, Testing Score: 0.909382518043304
Search 2 of 6, TF-IDF Vectorizer and Multinomial Naive Bayes
Fitting 5 folds for each of 32 candidates, totalling 160 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 160 out of 160 | elapsed:   55.1s finished


Search # 2 of 6 complete. 
 Training Score: 0.9407754010695187, Testing Score: 0.8917401764234162
Search 3 of 6, Count Vectorizer and Random Forest
Fitting 5 folds for each of 128 candidates, totalling 640 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 640 out of 640 | elapsed: 17.8min finished


Search # 3 of 6 complete. 
 Training Score: 0.883288770053476, Testing Score: 0.8496391339214114
Search 4 of 6, TF-IDF Vectorizer and Random Forest
Fitting 5 folds for each of 32 candidates, totalling 160 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 160 out of 160 | elapsed:  4.1min finished


Search # 4 of 6 complete. 
 Training Score: 0.906283422459893, Testing Score: 0.8524458700882117
Search 5 of 6, Count Vectorizer and Logistic Regression CV
Fitting 5 folds for each of 64 candidates, totalling 320 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 320 out of 320 | elapsed: 15.7min finished


Search # 5 of 6 complete. 
 Training Score: 0.9901069518716578, Testing Score: 0.9149959903769046
Search 6 of 6, TF-IDF Vectorizer and Logistic Regression CV
Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed:  3.2min finished


Search # 6 of 6 complete. 
 Training Score: 0.993716577540107, Testing Score: 0.9133921411387329


In [19]:
for search in searches:
    print(f"{search['name']}: \n Train Score: {search['train_score']} \n Test Score: {search['test_score']}")
    if search['gs'] == 'NA':
        print('Baseline- Predicting 100% of the dominant class')
        print()
    else:
        print(search['gs'].best_params_)
        print()

Baseline: 
 Train Score: 0.7493483055945458 
 Test Score: 0.7493483055945458
Baseline- Predicting 100% of the dominant class

Count Vectorizer and Multinomial Naive Bayes: 
 Train Score: 0.9407754010695187 
 Test Score: 0.909382518043304
{'cvec__max_df': 0.8, 'cvec__max_features': 5000, 'cvec__min_df': 2, 'cvec__ngram_range': (1, 1), 'cvec__stop_words': None, 'mnnb__alpha': 1}

TF-IDF Vectorizer and Multinomial Naive Bayes: 
 Train Score: 0.9407754010695187 
 Test Score: 0.8917401764234162
{'mnnb__alpha': 0.1, 'tvec__max_features': 5000, 'tvec__ngram_range': (1, 2), 'tvec__stop_words': None}

Count Vectorizer and Random Forest: 
 Train Score: 0.883288770053476 
 Test Score: 0.8496391339214114
{'cvec__max_df': 0.9, 'cvec__max_features': 2500, 'cvec__min_df': 2, 'cvec__ngram_range': (1, 2), 'cvec__stop_words': None, 'rfc__bootstrap': True, 'rfc__max_depth': 50, 'rfc__min_samples_leaf': 2, 'rfc__min_samples_split': 5}

TF-IDF Vectorizer and Random Forest: 
 Train Score: 0.906283422459893 

## next we'll create an API via Flask 