# **Model Testing** - Text Only

In [1]:
# pip install -U sentence-transformers

In [2]:
import mysql.connector 
import numpy as np
import pandas as pd
import requests

from sodapy import Socrata
import sqlalchemy as db

import config_final as config
from schema import DbSchema

import pickle


In [3]:
# Build the project's database helper from the imported config module;
# the query cell below calls `bills_db.query(...)` on it.
bills_db = DbSchema(config)


In [4]:
# Fetch Title and PassH for bills from the 110th Congress onward.
# The inner JOIN keeps only bills that have a row in `topics`.
# NOTE(review): if a bill can have several topic rows, this JOIN would repeat
# its title once per topic -- confirm against the schema.
title_passh_sql = """
    SELECT
        cb.Title,
        cb.PassH
    FROM con_bills.current_bills as cb
    JOIN con_bills.topics as tp
    ON cb.BillID = tp.BillID
    WHERE cb.Cong >=110
    """

df = bills_db.query(title_passh_sql)
df.head()

In [5]:
# (rows, columns) of the query result.
df.shape

In [6]:
# Count of rows per PassH value (PassH is the modelling target below).
df['PassH'].value_counts()

# **Tokenizer:**

In [7]:
# Imports for the text-cleaning setup. They are repeated here because the
# notebook's spaCy / re / string import cell comes *after* this cell, which
# makes this cell raise NameError on a fresh "Restart & Run All".
import re
import string

import spacy
from spacy.lang.en import English

# Extra stop words specific to this notebook, added on top of spaCy's list.
CUSTOM_STOP_WORDS = {"bill", "amend", "purpose", "united", "state", "states",
                     "secretary", "act", "federal", "provide"}

# Extend the English stop-word set in place and keep a handle on the extended
# set; `tokenizer` filters its tokens against `stop_words`.
English.Defaults.stop_words |= CUSTOM_STOP_WORDS
stop_words = English.Defaults.stop_words

# Raw strings so that \[ \] \| and \s are regex escapes rather than
# (invalid) Python string escapes.
replace_with_space = re.compile(r'[/(){}\[\]\|@,;]')
just_words = re.compile(r'[^a-zA-Z\s]')

# Punctuation characters filtered out of the token list.
punctuations = string.punctuation

# spaCy pipeline that `tokenizer` uses to split and lemmatize text.
nlp = spacy.load('en_core_web_sm')

In [None]:
import spacy
from spacy.lang.en import English
import en_core_web_sm
import string
import re

def tokenizer(text):
    """Clean a title and return its lemmatized tokens without stop words.

    Steps: lowercase; turn every character matched by ``replace_with_space``
    into a space; drop every character matched by ``just_words`` (anything
    that is not a letter or whitespace); run the spaCy pipeline ``nlp``;
    lemmatize; drop stop words, punctuation and empty strings.

    Parameters
    ----------
    text : str
        Raw text to tokenize. Non-string input (e.g. a missing title) yields
        an empty list instead of raising on ``.lower()``.

    Returns
    -------
    list of str
        Lemmas that survive the stop-word / punctuation filter.
    """
    if not isinstance(text, str):
        return []

    lower_text = text.lower()

    # Separators such as "/" or "," become a space. Substituting an empty
    # string here would fuse neighbouring words ("health/education" ->
    # "healtheducation").
    spaced_text = replace_with_space.sub(' ', lower_text)

    # Remove digits and any remaining non-letter characters.
    just_words_text = just_words.sub('', spaced_text)

    doc = nlp(just_words_text, disable=['parser', 'ner'])

    # Tokens whose lemma is the "-PRON-" placeholder keep their lowercase
    # surface form instead.
    lemmas = [
        tok.lower_ if tok.lemma_ == "-PRON-" else tok.lemma_.strip()
        for tok in doc
    ]

    # TODO: map specific words onto a canonical form (e.g. veterans -> veteran).

    # Empty strings (left over from whitespace tokens) are dropped as well.
    return [
        lemma for lemma in lemmas
        if lemma and lemma not in stop_words and lemma not in punctuations
    ]
    

# Apply Tokenizer to Title Text

In [None]:
# Tokenize every title; the function is passed directly, no wrapper lambda needed.
df['tokenized_title'] = df['Title'].apply(tokenizer)

In [None]:
# Preview the frame, now including the tokenized_title column.
df.head()

In [None]:
# for_pickle = df.drop(columns='Title')

In [None]:

# #is it vectorizer or transformed?
# tk_titles = 'tokenized_titles.sav'
# pickle.dump(for_pickle, open(tk_titles, 'wb'))
 

Unpickle Tokenized Text

In [None]:
# Load the pickled tokenized-title frame from disk.
# `tk_titles` was only ever assigned inside the commented-out pickling cell,
# so it is defined here to keep this cell runnable on a fresh kernel.
tk_titles = 'tokenized_titles.sav'

# NOTE: pickle.load can execute arbitrary code -- only load files that this
# project wrote itself.
with open(tk_titles, 'rb') as pickle_file:
    test_1 = pickle.load(pickle_file)

test_1.head()

# **Modeling**

Import packages:

- CountVectorizer
- TFIDF

- Naive Bayes
- Logistic Regression
- Random Forest

**Remember to look at feature importances!**

In [None]:
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction import FeatureHasher


**Train test split** - Tokenized Text

In [None]:
from sklearn.model_selection import train_test_split

# Pre-tokenized titles as features, PassH as the target.
X, y = test_1['tokenized_title'], test_1['PassH']


In [None]:
#Train Test split!
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state=2)

In [None]:
# Preview the first rows of the tokenized training features.
X_train.head()

In [None]:
# Number of training samples in the tokenized split.
X_train.shape

**Train test split** - Regular Text

In [None]:
from sklearn.model_selection import train_test_split

# Raw title strings as features, PassH as the target.
X1, y1 = df['Title'], df['PassH']


In [None]:
#Train Test split!
X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, y1, test_size = .2, random_state=2)

# **Logistic Regression**

- Precision is the percentage of your predicted positives that are actually relevant.
- Recall is the percentage of all relevant results that your algorithm correctly classified.

Also make a precision recall curve

In [None]:
# CountVectorizer was never imported (its import line was commented out).
from sklearn.feature_extraction.text import CountVectorizer


def _passthrough(tokens):
    """Return an already-tokenized document unchanged."""
    return tokens


# X_train holds token lists (test_1['tokenized_title']), not strings, so the
# default lowercasing and the string `tokenizer` would fail on `.lower()`.
# A callable analyzer makes CountVectorizer count the given tokens as-is.
vectorizer = CountVectorizer(analyzer=_passthrough, max_df=0.90, max_features=1000)

# y_train is accepted for API compatibility; CountVectorizer ignores it.
transformed = vectorizer.fit_transform(X_train, y_train)

In [None]:
# from sklearn.feature_extraction.text import CountVectorizer

# vectorizer1 = CountVectorizer(tokenizer = tokenizer, max_df = 0.90, max_features = 10000) # max_df=0.90, min_df=10
# trainsformed = vectorizer1.fit_transform(X_train, X_test)

# print(len(vectorizer1.get_feature_names()))

# vectorizer = CountVectorizer(tokenizer=tokenizer, max_df=0.5, max_features=None)

# transformed = c_vectorizer.fit_transform(X_train, y_train)

Pickle Test and Train

In [None]:

# #is it vectorizer or transformed?
# filename_1 = 'finalized_countvectorizer_WORDSONLY_train.sav'
# pickle.dump(X_train_transformed, open(filename_1, 'wb'))
 

In [None]:
# #is it vectorizer or transformed?
# filename_2 = 'finalized_countvectorizer_WORDSONLY_test.sav'
# pickle.dump(X_test_fit, open(filename_2, 'wb'))
 

Open Pickle Files:

In [None]:
# # load the model from disk
# X_train1 = pickle.load(open(filename_1, 'rb'))
# X_train1

In [None]:
# # load the model from disk
# X_test1 = pickle.load(open(filename_2, 'rb'))

# X_test1

# Logistic Regression CountVectorizer for Regular Text

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

# THIS IS ONLY FOR NON TOKENIZED TEXT
# The pipeline is fit on raw title strings (X_train1), so the vectorizer must
# run the real `tokenizer`. An identity tokenizer/preprocessor on a raw string
# makes CountVectorizer iterate over its characters, i.e. it would count
# single characters rather than words.
lr_clf = Pipeline([
    ('vect', CountVectorizer(tokenizer=tokenizer, max_df=0.5, max_features=1000)),
    ('clf', LogisticRegression(class_weight='balanced', C=.8, random_state=2)),
])

lr_clf.fit(X_train1, y_train1)

lr_y_pred = lr_clf.predict(X_test1)

# Compute the confusion matrix once and reuse it for printing and plotting.
lr_confusion_matrix = confusion_matrix(y_test1, lr_y_pred)

print(lr_confusion_matrix)
print(classification_report(y_test1, lr_y_pred))

In [None]:
# Save the fitted pipeline to disk. The context manager closes the file even
# if pickling fails. Functions used inside the pipeline are pickled by
# reference, so they must be defined again before the file is loaded.
lr_word_model = 'lr_model_NON_TOKENIZED_CV.sav'
with open(lr_word_model, 'wb') as model_file:
    pickle.dump(lr_clf, model_file)
 
# some time later...


In [None]:
# Load the saved pipeline back from disk; the file is closed right after.
# NOTE: pickle.load can execute arbitrary code -- only load files that this
# notebook wrote itself.
with open(lr_word_model, 'rb') as model_file:
    Final_Model = pickle.load(model_file)


In [None]:
# Smoke test: predict the class for a single hand-written sample string.
test_ = Final_Model.predict(['national, implement, forest, veteran'])

In [None]:
# Show the prediction for the sample above.
test_

# Logistic Regression TFIDF for Regular Text

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TODO: bring in one-hot-encoded features?

# TF-IDF + logistic regression on the raw title strings.
lr_clf_tf = Pipeline([
    ('vect', TfidfVectorizer(tokenizer=tokenizer, encoding='utf-8', smooth_idf=True)),
    ('clf', LogisticRegression(class_weight='balanced', C=.8, random_state=2)),
])

lr_clf_tf.fit(X_train1, y_train1)

lr_y_pred_tf = lr_clf_tf.predict(X_test1)

# The predictions come from X_test1, so they must be scored against y_test1
# (not y_test, which belongs to the pre-tokenized split).
# NOTE: this rebinds `lr_confusion_matrix`; the heatmap cell below therefore
# plots the TF-IDF model's matrix.
lr_confusion_matrix = confusion_matrix(y_test1, lr_y_pred_tf)

print(lr_confusion_matrix)
print(classification_report(y_test1, lr_y_pred_tf))

In [None]:
# Smoke test: run the TF-IDF pipeline on one hand-written sample title.
test_2 = lr_clf_tf.predict(['A bill to designate postal office post office for forests in a forest of national veterans'])

In [None]:
# Show the prediction for the sample title.
test_2

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Divide by the grand total so every cell shows its share of all test samples.
lr_cm_share = lr_confusion_matrix / lr_confusion_matrix.sum()

ax = sns.heatmap(lr_cm_share, annot=True, fmt='.2%', cmap='Blues')
ax.set_ylabel('True Label')
ax.set_xlabel('Predicted Label')
ax.set_title('Confusion Matrix')

In [None]:
# Scores for the ROC / AUC analysis
import numpy as np
from sklearn.metrics import roc_auc_score

# lr_clf was fit on raw title strings, so it has to score the raw-text test
# split (X_test1); X_test holds token lists and does not match this model.
lr_dec = lr_clf.decision_function(X_test1)

# ROC AUC is computed from the continuous decision scores. Feeding hard 0/1
# predictions collapses the curve to a single operating point and misstates
# the area under it.
lr_roc_auc = roc_auc_score(y_test1, lr_dec)
lr_roc_auc

# Reminder: 0.5 corresponds to random ranking and 1.0 to a perfect ranking.

In [None]:
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

test_fpr, test_tpr, test_thresholds = roc_curve(y_test, lr_dec)

print('Test AUC: {}'.format(auc(test_fpr, test_tpr)))

# Seaborn's beautiful styling
sns.set_style('darkgrid', {'axes.facecolor': '0.9'})

plt.figure(figsize=(10, 8))
lw = 2

plt.plot(test_fpr, test_tpr, color='darkorange',
         lw=lw, label='Test ROC curve')

plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.yticks([i/20.0 for i in range(21)])
plt.xticks([i/20.0 for i in range(21)])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('(ROC) Curve - Logistic Regression')
plt.legend(loc='lower right')
plt.show()

In [None]:
from sklearn.metrics import average_precision_score

# lr_dec holds scores from lr_clf (the raw-text model), so compare them with
# the labels of the raw-text split.
lr_average_precision = average_precision_score(y_test1, lr_dec)

print('Average precision-recall score: {0:0.2f}'.format(
      lr_average_precision))

In [None]:
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import plot_precision_recall_curve
import matplotlib.pyplot as plt

# lr_clf expects raw title strings, so the curve is drawn on the raw-text
# test split (X_test1 / y_test1) rather than the pre-tokenized one.
# NOTE: plot_precision_recall_curve was removed in scikit-learn 1.2; on newer
# versions use PrecisionRecallDisplay.from_estimator(lr_clf, X_test1, y_test1).
disp = plot_precision_recall_curve(lr_clf, X_test1, y_test1)
disp.ax_.set_title('2-class Precision-Recall curve: '
                   'AP={0:0.2f}'.format(lr_average_precision))

# Logistic Regression Pre-Tokenized Text:

In [None]:
def dummy(x):
    """Identity function: return the argument unchanged.

    Passed to a vectorizer as `tokenizer` / `preprocessor` so that input
    which is already tokenized is not processed again.
    """
    return x
# Logistic regression on the PRE-TOKENIZED titles (X_train / X_test).
from sklearn.feature_extraction.text import CountVectorizer

# `tokenizer` / `preprocessor` are vectorizer options, not LogisticRegression
# arguments (passing them to the classifier raises TypeError).
lr_classifier = LogisticRegression(class_weight='balanced', C=.8, random_state=2)

# The identity tokenizer/preprocessor make CountVectorizer count the token
# lists as given. token_pattern=None silences the "token_pattern will not be
# used" warning that a custom tokenizer triggers.
lr_clf_pretok = Pipeline([
    ('vect', CountVectorizer(tokenizer=dummy, preprocessor=dummy, token_pattern=None,
                             max_df=0.5, max_features=1000)),
    ('clf', lr_classifier),
])

lr_clf_pretok.fit(X_train, y_train)

# Predict on the pre-tokenized test split, matching the training data.
lr_y_pred_pt = lr_clf_pretok.predict(X_test)

lr_confusion_matrix = confusion_matrix(y_test, lr_y_pred_pt)

print(lr_confusion_matrix)
print(classification_report(y_test, lr_y_pred_pt))

# **Random Forest** - Regular Text

Class Imbalance: To overcome this issue, we used repeated random sub-sampling. Initially, we construct the testing data and the NoS training data sub-samples. For each disease, we train NoS classifiers and test all of them on the same data set. The final labels of the testing data are computed using a majority voting scheme. (Note: this describes a sub-sampling approach from the literature; the pipeline below instead handles the imbalance with `class_weight="balanced"`.)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer

# Random forest on the raw title strings.
# max_df / max_features=1000 are vocabulary options and belong on the
# CountVectorizer. On the forest, `max_df` is not a valid argument and a
# second `max_features` keyword is a SyntaxError.
rf_pipeline = Pipeline(steps=[
    ('vectorizer', CountVectorizer(tokenizer=tokenizer, max_df=0.90, max_features=1000)),
    ('classifier', RandomForestClassifier(
        max_depth=None,
        max_features='sqrt',      # same as the legacy 'auto' for classifiers
        n_estimators=100,
        class_weight="balanced",
        random_state=2,           # seed so the forest is reproducible
    )),
])

rf_pipeline.fit(X_train1, y_train1)

rf_y_pred = rf_pipeline.predict(X_test1)

# Compute the confusion matrix once and reuse it for printing and plotting.
rf_confusion_matrix = confusion_matrix(y_test1, rf_y_pred)

print(rf_confusion_matrix)
print(classification_report(y_test1, rf_y_pred))

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Divide by the grand total so every cell shows its share of all test samples.
rf_cm_share = rf_confusion_matrix / rf_confusion_matrix.sum()

ax = sns.heatmap(rf_cm_share, annot=True, fmt='.2%', cmap='Blues')
ax.set_ylabel('True Label')
ax.set_xlabel('Predicted Label')
ax.set_title('Confusion Matrix')

In [None]:
# rf_pipeline has the steps 'vectorizer' and 'classifier' (there is no
# 'preprocessor' / 'onehot' step); the feature names are the vectorizer's
# vocabulary terms.
_rf_vectorizer = rf_pipeline.named_steps['vectorizer']

# get_feature_names_out replaces get_feature_names in newer scikit-learn
# releases; support both.
if hasattr(_rf_vectorizer, 'get_feature_names_out'):
    feature_names = list(_rf_vectorizer.get_feature_names_out())
else:
    feature_names = _rf_vectorizer.get_feature_names()

In [None]:
# Per-feature importances from the pipeline's second step (the classifier).
_name, _rf_step = rf_pipeline.steps[1]
importances = _rf_step.feature_importances_
len(importances)

In [None]:
# Rank feature positions from most to least important and keep the first ten.
top_k = 10
indices = np.argsort(importances)[::-1]
new_indices = indices[:top_k]

In [None]:
# Positions of the top_k most important features.
new_indices

In [None]:
import matplotlib.pyplot as plt

def plot_feature_importances(model, top_k=None):
    """Draw a horizontal bar chart of a fitted pipeline's feature importances.

    The importances and feature names are read from `model` itself, so the
    number of bars always matches the fitted vocabulary (no hard-coded count).

    Parameters
    ----------
    model : sklearn.pipeline.Pipeline
        Fitted pipeline with a 'vectorizer' step that provides the feature
        names and a 'classifier' step exposing ``feature_importances_``.
    top_k : int, optional
        If given, plot only the ``top_k`` most important features with the
        largest at the top. By default every feature is plotted in
        feature-index order.

    Returns
    -------
    None
        The chart is drawn on a new matplotlib figure.
    """
    vectorizer = model.named_steps['vectorizer']
    importances = model.named_steps['classifier'].feature_importances_

    # get_feature_names_out replaces get_feature_names in newer scikit-learn.
    if hasattr(vectorizer, 'get_feature_names_out'):
        names = np.asarray(vectorizer.get_feature_names_out())
    else:
        names = np.asarray(vectorizer.get_feature_names())

    if top_k:
        # argsort is ascending, so the most important feature is drawn last,
        # i.e. at the top of the horizontal bar chart.
        keep = np.argsort(importances)[-top_k:]
        importances, names = importances[keep], names[keep]

    n_features = len(importances)

    # Scale the height with the number of bars so the labels stay readable.
    plt.figure(figsize=(15, max(4, 0.2 * n_features)))
    plt.barh(range(n_features), importances, align='center')

    plt.yticks(np.arange(n_features), names)
    plt.xlabel('Feature importance')
    plt.ylabel('Feature')

# Draw the importance chart for the random-forest pipeline.
plot_feature_importances(rf_pipeline)

# Comparing all Models:

In [None]:
# These names were used below but never imported anywhere in the notebook.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

models = [
    RandomForestClassifier(n_estimators=100, max_depth=5, random_state=0),
    LinearSVC(),
    MultinomialNB(),
    LogisticRegression(random_state=0),
]

# `features` / `labels` were never defined in the notebook.
# NOTE(review): the pre-tokenized titles and PassH labels are assumed here --
# swap in a different feature set if that was the intent.
features, labels = X, y

# 5-fold cross-validation
CV = 5

entries = []
for model in models:
    model_name = model.__class__.__name__
    # Vectorize inside the pipeline so each fold fits its own vocabulary on
    # its training part only. `dummy` passes the token lists through as-is.
    candidate = Pipeline([
        ('vect', CountVectorizer(analyzer=dummy, max_df=0.90, max_features=1000)),
        ('clf', model),
    ])
    accuracies = cross_val_score(candidate, features, labels, scoring='accuracy', cv=CV)
    entries.extend(
        (model_name, fold_idx, accuracy)
        for fold_idx, accuracy in enumerate(accuracies)
    )

# One row per (model, fold) with that fold's accuracy.
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

In [None]:

# Mean and standard deviation of the fold accuracies for each model.
_accuracy_by_model = cv_df.groupby('model_name').accuracy
mean_accuracy = _accuracy_by_model.mean()
std_accuracy = _accuracy_by_model.std()

# `keys` labels the two columns directly while concatenating side by side.
acc = pd.concat(
    [mean_accuracy, std_accuracy],
    axis=1,
    keys=['Mean Accuracy', 'Standard deviation'],
)
acc