In [1]:
from scripture.functions import *
from scripture.NLP_Pipe import *
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
import pickle
from sklearn.naive_bayes import MultinomialNB

In [2]:
ads = get_ads_frame()
ads = ads[~ads['ad_copy'].isna()]

In [3]:
tf, count, tf_mat, count_mat, word_freq = nlp_pipe(ads)

In [4]:
count_mat.shape

(3477, 11174)

In [5]:
word_freq

[('black', 1713),
 ('police', 851),
 ('people', 647),
 ('com', 514),
 ('join', 460),
 ('stop', 398),
 ('like', 364),
 ('free', 357),
 ('man', 357),
 ('america', 346),
 ('community', 332),
 ('https', 331),
 ('matters', 324),
 ('american', 320),
 ('don', 309),
 ('just', 272),
 ('www', 269),
 ('white', 262),
 ('bm', 258),
 ('year', 246),
 ('follow', 229),
 ('facebook', 224),
 ('officer', 221),
 ('video', 218),
 ('life', 215),
 ('blackmattersus', 215),
 ('old', 212),
 ('new', 211),
 ('cops', 205),
 ('make', 204),
 ('support', 204),
 ('want', 202),
 ('officers', 199),
 ('time', 196),
 ('racism', 191),
 ('african', 190),
 ('school', 189),
 ('country', 187),
 ('day', 184),
 ('said', 184),
 ('let', 181),
 ('shot', 179),
 ('lives', 175),
 ('know', 174),
 ('news', 169),
 ('years', 168),
 ('justice', 166),
 ('brutality', 165),
 ('trump', 162),
 ('cop', 161),
 ('2nd', 160),
 ('stand', 154),
 ('rights', 153),
 ('twitter', 150),
 ('americans', 149),
 ('need', 148),
 ('united', 145),
 ('help', 145),


In [6]:
y_black = ads['African American']

In [27]:
X_train_af, X_test_af, y_train_af, y_test_af = train_test_split(tf_mat, y_black, test_size=.3)

In [28]:
Af_boost = GradientBoostingClassifier().fit(X_train_af, y_train_af)
Af_boost

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [29]:
Af_boost.score(X_test_af, y_test_af)

0.8113026819923371

## Naive Bayes AF

In [122]:
af_bay = MultinomialNB(fit_prior=False).fit(X_train_af, y_train_af)

In [123]:
af_bay.score(X_test_af, y_test_af)

0.8352490421455939

In [124]:
af_bay.fit(tf_mat, y_black)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=False)

In [125]:
af_pkl = open('web_app/data/af_bay.pkl', 'wb')
pickle.dump(af_bay, af_pkl)
af_pkl.close()

## Grid Search

In [27]:
gb_grid_params = {'learning_rate': [0.1, 0.05, 0.02, 0.01],
              'max_depth': [4, 6, 8],
              'min_samples_leaf': [20, 50,100,150],
              'max_features': [1.0, 0.3, 0.1]}

In [31]:
grid_search_af = GridSearchCV(GradientBoostingClassifier(), gb_grid_params, n_jobs=-1, verbose=3)
grid_search_af.fit(X_train_af, y_train_af)

Fitting 3 folds for each of 144 candidates, totalling 432 fits
[CV] learning_rate=0.1, max_depth=4, max_features=1.0, min_samples_leaf=20 
[CV] learning_rate=0.1, max_depth=4, max_features=1.0, min_samples_leaf=20 
[CV] learning_rate=0.1, max_depth=4, max_features=1.0, min_samples_leaf=20 
[CV] learning_rate=0.1, max_depth=4, max_features=1.0, min_samples_leaf=50 
[CV] learning_rate=0.1, max_depth=4, max_features=1.0, min_samples_leaf=50 
[CV] learning_rate=0.1, max_depth=4, max_features=1.0, min_samples_leaf=50 
[CV] learning_rate=0.1, max_depth=4, max_features=1.0, min_samples_leaf=100 
[CV] learning_rate=0.1, max_depth=4, max_features=1.0, min_samples_leaf=100 
[CV]  learning_rate=0.1, max_depth=4, max_features=1.0, min_samples_leaf=50, score=0.7469135802469136, total=   3.4s
[CV] learning_rate=0.1, max_depth=4, max_features=1.0, min_samples_leaf=100 
[CV]  learning_rate=0.1, max_depth=4, max_features=1.0, min_samples_leaf=50, score=0.7620221948212084, total=   3.5s
[CV] learning_ra

[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    7.2s


[CV]  learning_rate=0.1, max_depth=4, max_features=0.3, min_samples_leaf=150, score=0.7012345679012346, total=   2.5s
[CV] learning_rate=0.1, max_depth=4, max_features=0.1, min_samples_leaf=20 
[CV]  learning_rate=0.1, max_depth=4, max_features=0.3, min_samples_leaf=150, score=0.7266009852216748, total=   2.7s
[CV] learning_rate=0.1, max_depth=4, max_features=0.1, min_samples_leaf=20 
[CV]  learning_rate=0.1, max_depth=4, max_features=0.3, min_samples_leaf=150, score=0.7127003699136868, total=   2.8s
[CV] learning_rate=0.1, max_depth=4, max_features=0.1, min_samples_leaf=20 
[CV]  learning_rate=0.1, max_depth=4, max_features=0.3, min_samples_leaf=100, score=0.725925925925926, total=   3.5s
[CV] learning_rate=0.1, max_depth=4, max_features=0.1, min_samples_leaf=50 
[CV]  learning_rate=0.1, max_depth=4, max_features=0.3, min_samples_leaf=100, score=0.7475369458128078, total=   3.8s
[CV] learning_rate=0.1, max_depth=4, max_features=0.1, min_samples_leaf=50 
[CV]  learning_rate=0.1, max_de

[CV] learning_rate=0.1, max_depth=6, max_features=0.1, min_samples_leaf=100 
[CV]  learning_rate=0.1, max_depth=6, max_features=0.3, min_samples_leaf=100, score=0.7246913580246913, total=   4.0s
[CV] learning_rate=0.1, max_depth=6, max_features=0.1, min_samples_leaf=100 
[CV]  learning_rate=0.1, max_depth=6, max_features=0.1, min_samples_leaf=100, score=0.7450738916256158, total=   1.0s
[CV] learning_rate=0.1, max_depth=6, max_features=0.1, min_samples_leaf=100 
[CV]  learning_rate=0.1, max_depth=6, max_features=0.1, min_samples_leaf=20, score=0.7765432098765432, total=   2.1s
[CV] learning_rate=0.1, max_depth=6, max_features=0.1, min_samples_leaf=150 
[CV]  learning_rate=0.1, max_depth=6, max_features=0.1, min_samples_leaf=100, score=0.7250308261405672, total=   0.9s
[CV] learning_rate=0.1, max_depth=6, max_features=0.1, min_samples_leaf=150 
[CV]  learning_rate=0.1, max_depth=6, max_features=0.1, min_samples_leaf=20, score=0.8175092478421702, total=   2.2s
[CV] learning_rate=0.1, max

[CV]  learning_rate=0.1, max_depth=8, max_features=0.1, min_samples_leaf=100, score=0.7271604938271605, total=   0.9s
[CV] learning_rate=0.05, max_depth=4, max_features=1.0, min_samples_leaf=20 
[CV]  learning_rate=0.1, max_depth=8, max_features=0.1, min_samples_leaf=150, score=0.7114673242909988, total=   0.6s
[CV] learning_rate=0.05, max_depth=4, max_features=1.0, min_samples_leaf=20 
[CV]  learning_rate=0.1, max_depth=8, max_features=0.1, min_samples_leaf=50, score=0.7733990147783252, total=   2.3s
[CV] learning_rate=0.05, max_depth=4, max_features=1.0, min_samples_leaf=50 
[CV]  learning_rate=0.1, max_depth=8, max_features=0.1, min_samples_leaf=50, score=0.7681874229346486, total=   2.3s
[CV] learning_rate=0.05, max_depth=4, max_features=1.0, min_samples_leaf=50 
[CV]  learning_rate=0.1, max_depth=8, max_features=0.1, min_samples_leaf=150, score=0.7012345679012346, total=   0.6s
[CV] learning_rate=0.05, max_depth=4, max_features=1.0, min_samples_leaf=50 
[CV]  learning_rate=0.1, ma

[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:   51.3s


[CV]  learning_rate=0.05, max_depth=4, max_features=0.3, min_samples_leaf=20, score=0.7641975308641975, total=   3.5s
[CV] learning_rate=0.05, max_depth=4, max_features=0.3, min_samples_leaf=50 
[CV]  learning_rate=0.05, max_depth=4, max_features=0.3, min_samples_leaf=50, score=0.7684729064039408, total=   3.5s
[CV] learning_rate=0.05, max_depth=4, max_features=0.3, min_samples_leaf=50 
[CV]  learning_rate=0.05, max_depth=4, max_features=0.3, min_samples_leaf=20, score=0.8054187192118226, total=   3.7s
[CV] learning_rate=0.05, max_depth=4, max_features=0.3, min_samples_leaf=100 
[CV]  learning_rate=0.05, max_depth=4, max_features=1.0, min_samples_leaf=150, score=0.7012345679012346, total=   3.8s
[CV] learning_rate=0.05, max_depth=4, max_features=0.3, min_samples_leaf=100 
[CV]  learning_rate=0.05, max_depth=4, max_features=1.0, min_samples_leaf=150, score=0.7266009852216748, total=   3.9s
[CV] learning_rate=0.05, max_depth=4, max_features=0.3, min_samples_leaf=100 
[CV]  learning_rate=

[CV]  learning_rate=0.05, max_depth=6, max_features=0.3, min_samples_leaf=20, score=0.8054187192118226, total=   5.2s
[CV] learning_rate=0.05, max_depth=6, max_features=0.3, min_samples_leaf=150 
[CV]  learning_rate=0.05, max_depth=6, max_features=0.3, min_samples_leaf=20, score=0.8051787916152897, total=   5.2s
[CV] learning_rate=0.05, max_depth=6, max_features=0.3, min_samples_leaf=150 
[CV]  learning_rate=0.05, max_depth=6, max_features=0.3, min_samples_leaf=100, score=0.7475369458128078, total=   3.3s
[CV] learning_rate=0.05, max_depth=6, max_features=0.1, min_samples_leaf=20 
[CV]  learning_rate=0.05, max_depth=6, max_features=0.3, min_samples_leaf=150, score=0.7253694581280788, total=   2.0s
[CV] learning_rate=0.05, max_depth=6, max_features=0.1, min_samples_leaf=20 
[CV]  learning_rate=0.05, max_depth=6, max_features=0.3, min_samples_leaf=150, score=0.7012345679012346, total=   1.9s
[CV] learning_rate=0.05, max_depth=6, max_features=0.1, min_samples_leaf=20 
[CV]  learning_rate=

[CV]  learning_rate=0.05, max_depth=8, max_features=0.3, min_samples_leaf=150, score=0.7127003699136868, total=   2.8s
[CV] learning_rate=0.05, max_depth=8, max_features=0.1, min_samples_leaf=50 
[CV]  learning_rate=0.05, max_depth=8, max_features=0.3, min_samples_leaf=50, score=0.7669543773119606, total=   6.8s
[CV] learning_rate=0.05, max_depth=8, max_features=0.1, min_samples_leaf=50 
[CV]  learning_rate=0.05, max_depth=8, max_features=0.3, min_samples_leaf=50, score=0.737037037037037, total=   6.7s
[CV] learning_rate=0.05, max_depth=8, max_features=0.1, min_samples_leaf=100 
[CV]  learning_rate=0.05, max_depth=8, max_features=0.3, min_samples_leaf=100, score=0.725925925925926, total=   4.4s
[CV] learning_rate=0.05, max_depth=8, max_features=0.1, min_samples_leaf=100 
[CV]  learning_rate=0.05, max_depth=8, max_features=0.1, min_samples_leaf=20, score=0.8152709359605911, total=   3.5s
[CV] learning_rate=0.05, max_depth=8, max_features=0.1, min_samples_leaf=100 
[CV]  learning_rate=0.

[CV]  learning_rate=0.02, max_depth=4, max_features=0.1, min_samples_leaf=20, score=0.7780517879161529, total=   1.4s
[CV] learning_rate=0.02, max_depth=4, max_features=0.1, min_samples_leaf=150 
[CV]  learning_rate=0.02, max_depth=4, max_features=0.1, min_samples_leaf=100, score=0.7475369458128078, total=   0.8s
[CV] learning_rate=0.02, max_depth=4, max_features=0.1, min_samples_leaf=150 
[CV]  learning_rate=0.02, max_depth=4, max_features=0.1, min_samples_leaf=50, score=0.7623152709359606, total=   1.3s
[CV]  learning_rate=0.02, max_depth=4, max_features=0.1, min_samples_leaf=50, score=0.7345679012345679, total=   1.2s
[CV]  learning_rate=0.02, max_depth=4, max_features=0.1, min_samples_leaf=150, score=0.7016029593094945, total=   0.5s
[CV] learning_rate=0.02, max_depth=6, max_features=1.0, min_samples_leaf=20 
[CV] learning_rate=0.02, max_depth=6, max_features=1.0, min_samples_leaf=20 
[CV]  learning_rate=0.02, max_depth=4, max_features=0.1, min_samples_leaf=50, score=0.739827373612

[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:  2.0min


[CV]  learning_rate=0.02, max_depth=6, max_features=0.3, min_samples_leaf=150, score=0.7114673242909988, total=   2.3s
[CV] learning_rate=0.02, max_depth=6, max_features=0.1, min_samples_leaf=50 
[CV]  learning_rate=0.02, max_depth=6, max_features=0.3, min_samples_leaf=50, score=0.7435265104808878, total=   4.5s
[CV] learning_rate=0.02, max_depth=6, max_features=0.1, min_samples_leaf=50 
[CV]  learning_rate=0.02, max_depth=6, max_features=0.3, min_samples_leaf=50, score=0.7358024691358025, total=   4.3s
[CV] learning_rate=0.02, max_depth=6, max_features=0.1, min_samples_leaf=100 
[CV]  learning_rate=0.02, max_depth=6, max_features=0.3, min_samples_leaf=100, score=0.725925925925926, total=   3.3s
[CV] learning_rate=0.02, max_depth=6, max_features=0.1, min_samples_leaf=100 
[CV]  learning_rate=0.02, max_depth=6, max_features=0.1, min_samples_leaf=100, score=0.7463054187192119, total=   0.9s
[CV] learning_rate=0.02, max_depth=6, max_features=0.1, min_samples_leaf=100 
[CV]  learning_rate=

[CV]  learning_rate=0.02, max_depth=8, max_features=0.1, min_samples_leaf=20, score=0.7903822441430333, total=   2.6s
[CV] learning_rate=0.02, max_depth=8, max_features=0.1, min_samples_leaf=150 
[CV]  learning_rate=0.02, max_depth=8, max_features=0.1, min_samples_leaf=100, score=0.7213316892725031, total=   0.8s
[CV] learning_rate=0.02, max_depth=8, max_features=0.1, min_samples_leaf=150 
[CV]  learning_rate=0.02, max_depth=8, max_features=0.1, min_samples_leaf=150, score=0.7241379310344828, total=   0.6s
[CV] learning_rate=0.01, max_depth=4, max_features=1.0, min_samples_leaf=20 
[CV]  learning_rate=0.02, max_depth=8, max_features=0.1, min_samples_leaf=100, score=0.7246913580246913, total=   0.8s
[CV]  learning_rate=0.02, max_depth=8, max_features=0.1, min_samples_leaf=150, score=0.6954377311960542, total=   0.6s
[CV] learning_rate=0.01, max_depth=4, max_features=1.0, min_samples_leaf=20 
[CV] learning_rate=0.01, max_depth=4, max_features=1.0, min_samples_leaf=20 
[CV]  learning_rate

[CV]  learning_rate=0.01, max_depth=4, max_features=0.1, min_samples_leaf=50, score=0.7308641975308642, total=   1.6s
[CV] learning_rate=0.01, max_depth=6, max_features=1.0, min_samples_leaf=50 
[CV]  learning_rate=0.01, max_depth=4, max_features=0.1, min_samples_leaf=50, score=0.7274969173859432, total=   1.6s
[CV] learning_rate=0.01, max_depth=6, max_features=1.0, min_samples_leaf=50 
[CV]  learning_rate=0.01, max_depth=4, max_features=0.1, min_samples_leaf=100, score=0.7123456790123457, total=   1.1s
[CV] learning_rate=0.01, max_depth=6, max_features=1.0, min_samples_leaf=100 
[CV]  learning_rate=0.01, max_depth=4, max_features=0.1, min_samples_leaf=150, score=0.7, total=   0.7s
[CV] learning_rate=0.01, max_depth=6, max_features=1.0, min_samples_leaf=100 
[CV]  learning_rate=0.01, max_depth=6, max_features=1.0, min_samples_leaf=100, score=0.7253694581280788, total=   5.1s
[CV] learning_rate=0.01, max_depth=6, max_features=1.0, min_samples_leaf=100 
[CV]  learning_rate=0.01, max_dept

[CV]  learning_rate=0.01, max_depth=8, max_features=1.0, min_samples_leaf=50, score=0.7283950617283951, total=   5.3s
[CV] learning_rate=0.01, max_depth=8, max_features=1.0, min_samples_leaf=150 
[CV]  learning_rate=0.01, max_depth=8, max_features=1.0, min_samples_leaf=50, score=0.7561576354679803, total=   5.6s
[CV] learning_rate=0.01, max_depth=8, max_features=1.0, min_samples_leaf=150 
[CV]  learning_rate=0.01, max_depth=8, max_features=1.0, min_samples_leaf=50, score=0.7274969173859432, total=   5.5s
[CV] learning_rate=0.01, max_depth=8, max_features=0.3, min_samples_leaf=20 
[CV]  learning_rate=0.01, max_depth=8, max_features=1.0, min_samples_leaf=20, score=0.7881773399014779, total=   6.5s
[CV] learning_rate=0.01, max_depth=8, max_features=0.3, min_samples_leaf=20 
[CV]  learning_rate=0.01, max_depth=8, max_features=1.0, min_samples_leaf=20, score=0.7506172839506173, total=   6.3s
[CV] learning_rate=0.01, max_depth=8, max_features=0.3, min_samples_leaf=20 
[CV]  learning_rate=0.0

[Parallel(n_jobs=-1)]: Done 432 out of 432 | elapsed:  3.0min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'learning_rate': [0.1, 0.05, 0.02, 0.01], 'max_depth': [4, 6, 8], 'min_samples_leaf': [20, 50, 100, 150], 'max_features': [1.0, 0.3, 0.1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [20]:
af_grad = grid_search_af.best_estimator_
af_grad

NameError: name 'grid_search_af' is not defined

In [21]:
af_grad.score(X_test_af, y_test_af)

NameError: name 'af_grad' is not defined

In [97]:
af_grad.fit(tf_mat, y_black)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=8,
              max_features=0.1, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=20, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [100]:

#This is how to find important tf words
tf_array=np.array(tf.get_feature_names())
af_sorts = np.argsort(af_grad.feature_importances_)[::-1]
tf_array[af_sorts][:10]

array(['black', 'repost', 'com', 'woke', 'join', 'stop', 'organization',
       'matters', 'free', 'african'], dtype='<U34')

In [None]:
af_feature_array=af_grad.feature_importances_

In [65]:
af_pkl = open('data/af_grad.pkl', 'wb')
pickle.dump(af_grad, af_pkl)
af_pkl.close()

## Progressive

In [127]:
y_prog = ads['Progressive']

In [128]:
prog_grad = GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=8,
              max_features=0.1, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=20, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [129]:
X_train_pr, X_test_pr, y_train_pr, y_test_pr = train_test_split(tf_mat, y_prog)

In [130]:
prog_grad.fit(X_train_pr, y_train_pr)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=8,
              max_features=0.1, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=20, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [131]:
prog_grad.score(X_test_pr, y_test_pr)

0.8724137931034482

In [None]:
prog_grad.

In [135]:
prog_pickle = open('data/prog_grad.pkl', 'wb')
pickle.dump(prog_grad, prog_pickle)
prog_pickle.close()

In [155]:
prog_sorts = np.argsort(prog_bay.feature_log_prob_)[1][::-1]
tf_array[prog_sorts][:10]


array(['repost', 'police', 'black', 'bm', 'brutality', 'man', 'video',
       'cops', 'cop', 'join'], dtype='<U34')

In [149]:
prog_bay.predict_log_proba()

array([-0.69314718, -0.69314718])

## Naive Bayes Progressive
### Grad Boost is better for progressive

In [136]:
prog_bay = MultinomialNB(fit_prior=False).fit(X_train_pr, y_train_pr)

In [137]:
prog_bay.score(X_test_pr, y_test_pr)

0.8367816091954023

In [138]:
prog_pickle = open('web_app/data/prog_bay.pkl', 'wb')
pickle.dump(prog_bay, prog_pickle)
prog_pickle.close()

## Getting important words from tfidf

In [85]:

#This is how to find important tf words
tf_array=np.array(tf.get_feature_names())
prog_sorts = np.argsort(prog_grad.feature_importances_)[::-1]
tf_array[prog_sorts][:10]

array(['repost', 'join', 'want', 'brutality', 'cop', 'http', 'bm',
       'video', 'black', 'like'], dtype='<U34')

## LatinX

In [94]:
latin_y = ads['Latinx']
lat_X_train, lat_X_test, lat_y_train, lat_y_test = train_test_split(tf_mat, latin_y, test_size=.3)

In [115]:
lat_bayes = MultinomialNB(fit_prior=False).fit(lat_X_train, lat_y_train)

In [116]:
lat_bayes.score(lat_X_test, lat_y_test)

0.9224137931034483

In [117]:
lat_bayes.feature_log_prob_

array([[-9.87832114, -8.30317222, -9.87832114, ..., -9.72068145,
        -9.87832114, -9.6141061 ],
       [-9.34779403, -9.34779403, -9.34779403, ..., -9.34779403,
        -9.34779403, -9.34779403]])

In [118]:
lat_bayes.fit(tf_mat, latin_y)
tf_mat.shape

(3477, 11174)

In [126]:
lat_path = open('web_app/data/lat_bay.pkl', 'wb')
pickle.dump(lat_bayes, lat_path)
lat_path.close()

In [100]:
lat_bayes.predict_proba

<bound method BaseNB.predict_proba of MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)>

In [101]:
#This is how to find important tf words
tf_array=np.array(tf.get_feature_names())
lat_sorts = np.argsort(lat_bayes.feature_log_prob_)[::]
tf_array[lat_sorts][:10]

array([['majorityminority', 'dual', 'scariest', ..., 'police', 'join',
        'black'],
       ['johnny', 'overturn', 'overwhelming', ..., 'mexico', 'like',
        'mexican']], dtype='<U34')

## Patriotism

In [57]:
y_pat = ads['Patriotism']

In [77]:
X_train_pat, X_test_pat, y_train_pat,y_test_pat = train_test_split(tf_mat, y_pat)

In [108]:
pat_bay = MultinomialNB(fit_prior=False).fit(X_train_pat, y_train_pat)

In [109]:
pat_bay.score(X_test_pat, y_test_pat)

0.906896551724138

In [110]:
pat_bay.fit(tf_mat,y_pat)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=False)

In [111]:
pat_path= open('web_app/data/pat_bay.pkl', 'wb')
pickle.dump(pat_bay, pat_path)
pat_path.close()

In [112]:
pat_sorts = np.argsort(pat_bay.feature_log_prob_)[::]
tf_array[pat_sorts][:10]

array([['mimicking', 'mcmaster', 'meandmyflag', ..., 'police', 'join',
        'black'],
       ['00', 'nonchalantly', 'nonprofit', ..., 'homeland', 'heart',
        'texas']], dtype='<U34')

In [113]:
pat_bay.class_count_

array([3296.,  181.])

In [114]:
lat_bayes.class_count_

array([3271.,  206.])