In [172]:
import numpy as np
import pandas as pd
import pickle

from collections import Counter
from time import time

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import normalize
from sklearn import metrics
from scipy.stats import randint as sp_randint

from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier


In [70]:
### Import data ###

# Fixed seed so the shuffle below (and everything derived from the row order)
# is identical on every "Restart Kernel -> Run All".
RANDOM_SEED = 42

data = pd.read_csv("Ai4good_pre.csv")
data = data.iloc[:, 1:]  # drop the first column of the CSV

print(data.shape)

### Shuffle data and split into train and test ###
data = data.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)
train_data = data[:55]
test_data = data[55:]

# Show a short preview instead of dumping the whole frame into the output.
data.head(10)


(91, 3)


Unnamed: 0,Label,Title,Text
0,opinion,editorial : caq ' s immigration plan is bad...,coalition avenir quebec ' s newly revealed pro...
1,fact,military looks at foreign recruits to boost ranks,canada ' s military is considering lifting lo...
2,fact,he ' s a long shot for sc gov but gop firebran...,one series profiles on candidates running ...
3,opinion,after the santa fe and parkland school shootin...,what would happen millions parents across am...
4,opinion,the trump coalition is the future of the gop,there is long list conservative columnists i...
5,fact,federal changes to see asylum - seeker claims ...,asylum seekers who cross illegally into canada...
6,opinion,it ' s time we listened to melania,news media are obsessed with unstable stream...
7,fact,supreme court rules that companies can require...,ideologically divided supreme court ruled mon...
8,opinion,hard power long ago eclipsed the pursuit of pe...,theory relocation united states embassy i...
9,fact,trump says he ' s ' not familiar ' with wein...,washington ( cnn ) president donald trump sai...


In [68]:
# To look at an example text
#data["Text"][66]

' controversial sweeping development plan expected  cost more than  $ 140 million was scrapped by  divided  agitated richland county council on thursday  county already has spent more than  $ 8 million buying properties for different portions   now - stalled richland renaissance plan including three anchor stores at columbia place mall  it is uncertain what will happen  them now richland renaissance was supposed  include new county administrative offices at columbia place mall  new downtown courthouse complex  transportation hub  business incubator   st  andrews area  hospital  aquatics center  lower richland   countywide historic trail among other elements it was  transportation hub  business incubator that apparently drew  whole plan   halt thursday as council members argued over whether  purchase  piece  property for  $ 2 9 million near dutch square mall off bush river road  business incubator  transportation hub had not been properly vetted  might not be necessary or appropriate  t

In [58]:
### Building the vocabulary ###

# NOT USED!!!!

#def get_vocabulary(d, vocab_size=5000): 
#    #concatenate all the datapoint texts:
#    all_words = d.Text.str.cat()
#    #count the words: 
#    frequencies = Counter(all_words.split()).most_common()
#    return frequencies[0:vocab_size]

#vocab = get_vocabulary(train_data, 5000)
#len(vocab)

7218

In [127]:
def get_freqBoW_and_targetvect(dataset, train_test_split, vocab_length=10000, ngram_min_max=(1,1),
                               random_state=None):
    """Shuffle `dataset`, split it, and build L1-normalised bag-of-words features.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame exposing a `Text` column (documents) and a `Label` column (targets).
    train_test_split : int
        Number of shuffled rows used for training; the remaining rows form the test set.
    vocab_length : int, default 10000
        Passed to `CountVectorizer` as `max_features`.
    ngram_min_max : tuple, default (1, 1)
        Passed to `CountVectorizer` as `ngram_range`.
    random_state : optional
        Forwarded to `DataFrame.sample`; leave as None for an unseeded shuffle
        (the previous behaviour), or pass a value for a reproducible split.

    Returns
    -------
    x_train, y_train, x_test, y_test, vectorizer_train
        Row-normalised count matrices, label arrays, and the vectorizer fitted
        on the training texts.
    """
    # Shuffle the frame that was passed in.
    shuffled = dataset.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # Split the shuffled argument. The earlier version sliced the notebook-global
    # `data` here, which silently ignored both `dataset` and the shuffle above.
    train_data = shuffled[:train_test_split]
    test_data = shuffled[train_test_split:]

    # The vocabulary is learned from the training texts only.
    vectorizer_train = CountVectorizer(stop_words=None, max_features=vocab_length, ngram_range=ngram_min_max)

    # Training counts -> per-document frequencies (each row is divided by its L1 norm).
    x_train = vectorizer_train.fit_transform(train_data.Text)
    x_train = normalize(x_train, norm='l1', axis=1)
    # `.values` replaces `Series.as_matrix()`, which newer pandas no longer provides.
    y_train = train_data.Label.values

    # Reuse the fitted vectorizer for the test texts so that vocabulary AND
    # ngram_range match the training features. A second vectorizer built only
    # from `vocabulary_` falls back to unigrams and never fills n-gram columns.
    x_test = vectorizer_train.transform(test_data.Text)
    x_test = normalize(x_test, norm='l1', axis=1)
    y_test = test_data.Label.values

    # Return the frequency bag-of-words for the train and test sets.
    return x_train, y_train, x_test, y_test, vectorizer_train

In [125]:
# Utility function to report best scores in model selection
def report(results, n_top=10):
    """Print the candidates ranked 1..n_top from a search-results mapping.

    `results` is indexed with the keys 'rank_test_score', 'mean_test_score',
    'std_test_score' and 'params'. For each rank, every candidate whose
    'rank_test_score' equals that rank is printed (ties give several entries),
    followed by an empty line.
    """
    rank = 1
    while rank <= n_top:
        tied = np.flatnonzero(results['rank_test_score'] == rank)
        for idx in tied:
            print("Model with rank: {0}".format(rank))
            mean_score = results['mean_test_score'][idx]
            std_score = results['std_test_score'][idx]
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(mean_score, std_score))
            print("Parameters: {0}".format(results['params'][idx]))
            print("")
        rank += 1

In [136]:
### Build the train and test matrices ###

# 61 training rows, a 1000-term vocabulary, unigrams only.
x_train, y_train, x_test, y_test, vectorizer = get_freqBoW_and_targetvect(
    dataset=data, train_test_split=61, vocab_length=1000, ngram_min_max=(1, 1))

# For each split, show the shape of the feature matrix and then of the labels.
print("TRAIN")
for split_part in (x_train, y_train):
    print(split_part.shape)

print("\n\nTEST")
for split_part in (x_test, y_test):
    print(split_part.shape)

In [92]:
### RANDOM CLASSIFIER (baseline) ###

# For every test row, pick one of the labels present in y_test at random.
classes = np.unique(y_test)
random_predictions = []
for _ in range(x_test.shape[0]):
    random_predictions.append(np.random.choice(classes))

print("Random classifier")
print("Accuracy: ", metrics.accuracy_score(y_test, random_predictions))
# The remaining metrics all treat 'fact' as the positive class.
for metric_label, metric_fn in (("Precision:", metrics.precision_score),
                                ("Recall:   ", metrics.recall_score),
                                ("F1 score: ", metrics.f1_score)):
    print(metric_label, metric_fn(y_test, random_predictions, pos_label='fact'))

Random classifier
Accuracy:  0.466666666667
Precision: 0.357142857143
Recall:    0.416666666667
F1 score:  0.384615384615


In [137]:
### Multinomial Naive Bayes: model selection (exhaustive grid search) ###

# Every alpha value is tried with both settings of fit_prior.
param_grid = {
    "alpha": [0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1],
    "fit_prior": [True, False],
}
clf = MultinomialNB()
grid_search = GridSearchCV(clf, param_grid=param_grid)

# Time the search so its cost shows up next to the results.
start = time()
grid_search.fit(x_train, y_train)
elapsed = time() - start
n_candidates = len(grid_search.cv_results_['params'])

print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
      % (elapsed, n_candidates))
report(grid_search.cv_results_)

GridSearchCV took 0.15 seconds for 14 candidate parameter settings.
Model with rank: 1
Mean validation score: 0.902 (std: 0.041)
Parameters: {'alpha': 0.1, 'fit_prior': False}

Model with rank: 2
Mean validation score: 0.852 (std: 0.072)
Parameters: {'alpha': 0.01, 'fit_prior': False}

Model with rank: 3
Mean validation score: 0.820 (std: 0.065)
Parameters: {'alpha': 0.001, 'fit_prior': False}

Model with rank: 4
Mean validation score: 0.705 (std: 0.081)
Parameters: {'alpha': 1, 'fit_prior': False}

Model with rank: 5
Mean validation score: 0.689 (std: 0.067)
Parameters: {'alpha': 1e-06, 'fit_prior': False}

Model with rank: 5
Mean validation score: 0.689 (std: 0.067)
Parameters: {'alpha': 1e-05, 'fit_prior': False}

Model with rank: 5
Mean validation score: 0.689 (std: 0.067)
Parameters: {'alpha': 0.0001, 'fit_prior': False}

Model with rank: 8
Mean validation score: 0.656 (std: 0.033)
Parameters: {'alpha': 1e-05, 'fit_prior': True}

Model with rank: 8
Mean validation score: 0.656 (st

In [139]:
### Multinomial Naive Bayes ###

# Hyperparameters: the top-ranked combination from the grid search above.
alpha_ = 0.1        # smoothing strength
fit_prior_ = False  # True: learn class priors from the data; False: uniform prior
# class_prior is not passed to the model.

nb_classifier = MultinomialNB(alpha=alpha_, fit_prior=fit_prior_)
nb_classifier.fit(x_train, y_train)

train_predictions = nb_classifier.predict(x_train)
test_predictions = nb_classifier.predict(x_test)

print("Multinomial Naive Bayes")

# Same four metrics for both splits; 'fact' is the positive class.
for split_name, y_true, y_pred in (("TRAIN", y_train, train_predictions),
                                   ("TEST", y_test, test_predictions)):
    print(split_name)
    print("Accuracy: ", metrics.accuracy_score(y_true, y_pred))
    print("Precision:", metrics.precision_score(y_true, y_pred, pos_label='fact'))
    print("Recall:   ", metrics.recall_score(y_true, y_pred, pos_label='fact'))
    print("F1 score: ", metrics.f1_score(y_true, y_pred, pos_label='fact'))

Multinomial Naive Bayes
TRAIN
Accuracy:  0.983606557377
Precision: 1.0
Recall:    0.969696969697
F1 score:  0.984615384615
TEST
Accuracy:  0.733333333333
Precision: 0.625
Recall:    0.833333333333
F1 score:  0.714285714286


In [144]:
# Inspect the fitted model: the class order, the shape of the per-class
# feature counts, and the counts stored for the first class.
feature_counts = nb_classifier.feature_count_
print(nb_classifier.classes_)
print(feature_counts.shape)
print(feature_counts[0])


['fact' 'opinion']
(2, 1000)
[ 0.01506732  0.00652314  0.01277832  0.01414711  0.00613018  0.00529903
  0.01118634  0.00993383  0.02645226  0.02249208  0.01486579  0.01804398
  0.03426091  0.14969674  0.00704948  0.03231823  0.0067519   0.01006512
  0.0384392   0.01680955  0.00807498  0.00904461  0.02323049  0.02653991
  0.05276317  0.00628754  0.02103787  0.08277096  0.012915    0.092625
  0.01329402  0.02744915  0.00935412  0.01233084  0.01886825  0.0168389
  0.00316309  0.08849209  0.03793467  0.01214674  0.02068249  0.02619401
  0.01640313  0.01065148  0.0448939   0.08442516  0.01416005  0.01518267
  0.02269889  0.00899944  0.0266496   0.00946879  0.05393426  0.03496646
  0.04506388  0.01365862  0.03181883  0.          0.01965147  0.03948198
  0.01044008  0.02054811  0.00939984  0.00972047  0.01582998  0.01579765
  0.02422145  0.21475496  0.02194234  0.02537938  0.01038043  0.02124403
  0.34978621  0.00715507  0.01280239  0.02131228  0.02496073  0.23161509
  0.00815101  0.02351217 

In [197]:
# Trying to visualize most informative features

# One (coefficient, feature name, count in class 0, count in class 1) tuple
# per vocabulary term.
coef_features_c1_c2 = list(zip(nb_classifier.coef_[0],
                               vectorizer.get_feature_names(),
                               nb_classifier.feature_count_[0],
                               nb_classifier.feature_count_[1]))
index = len(coef_features_c1_c2)

# Tuples compare on the coefficient first, so this prints lowest to highest.
for entry in sorted(coef_features_c1_c2):
    print(entry)

(-7.1546153569136628, 'added', 0.026539908188448244, 0.0)
(-7.1546153569136628, 'aecon', 0.021037868162692847, 0.0)
(-7.1546153569136628, 'agencies', 0.013294017952023798, 0.0)
(-7.1546153569136628, 'agency', 0.027449151350626165, 0.0)
(-7.1546153569136628, 'angeles', 0.034966456186238191, 0.0)
(-7.1546153569136628, 'arbitration', 0.024221453287197232, 0.0)
(-7.1546153569136628, 'arizona', 0.025379382522239667, 0.0)
(-7.1546153569136628, 'association', 0.024960730535065135, 0.0)
(-7.1546153569136628, 'auto', 0.028531255915712567, 0.0)
(-7.1546153569136628, 'blaikie', 0.017605633802816902, 0.0)
(-7.1546153569136628, 'bryant', 0.033180778032036611, 0.0)
(-7.1546153569136628, 'caesar', 0.023404255319148935, 0.0)
(-7.1546153569136628, 'california', 0.034942976875597859, 0.0)
(-7.1546153569136628, 'car', 0.0187785489578953, 0.0)
(-7.1546153569136628, 'cars', 0.016051721571704577, 0.0)
(-7.1546153569136628, 'chavannes', 0.023404255319148935, 0.0)
(-7.1546153569136628, 'chinese', 0.0299792362

(-6.9907213176951224, 'cuts', 0.0, 0.017808947719724556)
(-6.9906498127976171, 'donation', 0.0, 0.017817371937639197)
(-6.9906498127976171, 'payment', 0.0, 0.017817371937639197)
(-6.9902140612262684, 'current', 0.013766269680994567, 0.017868722229765915)
(-6.9889826010667324, 'accept', 0.0070494794321634959, 0.018013962275557686)
(-6.9889557213380158, 'allies', 0.012146738126416142, 0.018017134501482509)
(-6.9886614957235924, 'minister', 0.062195481993489551, 0.018051863274190443)
(-6.9883868377867042, 'getting', 0.0095639062972013023, 0.018084291608548939)
(-6.9882494877156347, 'thing', 0.01340298202495515, 0.018100511608277057)
(-6.9873345745937545, 'position', 0.001053740779768177, 0.018208612760088814)
(-6.9873015204886517, 'both', 0.02971728612051917, 0.018212520104575577)
(-6.9870503534304538, 'paid', 0.013725706105819524, 0.018242214924516571)
(-6.9868353509860786, 'different', 0.019016562933465354, 0.018267640022884824)
(-6.9866490697371635, 'room', 0.00094073377234242712, 0.01

In [209]:
# NOTE: the `time` module is deliberately NOT imported here. The first cell
# binds `time` with `from time import time`, and the model-selection cells
# further down call `time()`. `import time` in this cell would rebind the name
# to the module and make those calls fail on "Restart Kernel -> Run All".
import codecs, re
from itertools import chain

# Lower-case English stop words. The feature-ranking helper defined later in
# this notebook uses this list to drop such words from the ranking it prints.
ENGLISH_STOP_WORDS = [
    "a", "about", "above", "across", "after", "afterwards", "again", "against",
    "all", "almost", "alone", "along", "already", "also", "although", "always",
    "am", "among", "amongst", "amoungst", "amount", "an", "and", "another",
    "any", "anyhow", "anyone", "anything", "anyway", "anywhere", "are",
    "around", "as", "at", "back", "be", "became", "because", "become",
    "becomes", "becoming", "been", "before", "beforehand", "behind", "being",
    "below", "beside", "besides", "between", "beyond", "bill", "both",
    "bottom", "but", "by", "call", "can", "cannot", "cant", "co", "con",
    "could", "couldnt", "cry", "de", "describe", "detail", "do", "done",
    "down", "due", "during", "each", "eg", "eight", "either", "eleven", "else",
    "elsewhere", "empty", "enough", "etc", "even", "ever", "every", "everyone",
    "everything", "everywhere", "except", "few", "fifteen", "fifty", "fill",
    "find", "fire", "first", "five", "for", "former", "formerly", "forty",
    "found", "four", "from", "front", "full", "further", "get", "give", "go",
    "had", "has", "hasnt", "have", "he", "hence", "her", "here", "hereafter",
    "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his",
    "how", "however", "hundred", "i", "ie", "if", "in", "inc", "indeed",
    "interest", "into", "is", "it", "its", "itself", "keep", "last", "latter",
    "latterly", "least", "less", "ltd", "made", "many", "may", "me",
    "meanwhile", "might", "mill", "mine", "more", "moreover", "most", "mostly",
    "move", "much", "must", "my", "myself", "name", "namely", "neither",
    "never", "nevertheless", "next", "nine", "no", "nobody", "none", "noone",
    "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on",
    "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our",
    "ours", "ourselves", "out", "over", "own", "part", "per", "perhaps",
    "please", "put", "rather", "re", "same", "see", "seem", "seemed",
    "seeming", "seems", "serious", "several", "she", "should", "show", "side",
    "since", "sincere", "six", "sixty", "so", "some", "somehow", "someone",
    "something", "sometime", "sometimes", "somewhere", "still", "such",
    "system", "take", "ten", "than", "that", "the", "their", "them",
    "themselves", "then", "thence", "there", "thereafter", "thereby",
    "therefore", "therein", "thereupon", "these", "they", "thick", "thin",
    "third", "this", "those", "though", "three", "through", "throughout",
    "thru", "thus", "to", "together", "too", "top", "toward", "towards",
    "twelve", "twenty", "two", "un", "under", "until", "up", "upon", "us",
    "very", "via", "was", "we", "well", "were", "what", "whatever", "when",
    "whence", "whenever", "where", "whereafter", "whereas", "whereby",
    "wherein", "whereupon", "wherever", "whether", "which", "while", "whither",
    "who", "whoever", "whole", "whom", "whose", "why", "will", "with",
    "within", "without", "would", "yet", "you", "your", "yours", "yourself",
    "yourselves"]


['bonjour', 'jello', 'would', 'hierarchy']
['bonjour', 'jello', 'hierarchy']


In [228]:

def most_informative_feature_for_binary_classification(vectorizer, classifier, n=30, stop_words=None):
    """Print the features with the lowest and the highest coefficients.

    Features are ranked by `classifier.coef_[0]`. The `n` lowest-ranked
    features are printed with the label `classifier.classes_[0]`. The `n`
    highest-ranked features that are not stop words are printed, highest
    first, with the label `classifier.classes_[1]`, followed by a list of
    those feature names in ascending order of coefficient.

    Parameters
    ----------
    vectorizer : object
        Must provide `get_feature_names_out()` or `get_feature_names()`.
    classifier : object
        Must provide `classes_` and either `coef_` or `feature_log_prob_`.
    n : int, default 30
        Number of features to print per class; expected to be non-negative.
    stop_words : iterable of str, optional
        Words excluded from the second ranking. Defaults to the notebook-level
        `ENGLISH_STOP_WORDS`.

    Returns
    -------
    None
    """
    if stop_words is None:
        stop_words = ENGLISH_STOP_WORDS
    # Set membership is constant-time; the list lookup was linear per feature.
    stop_words = frozenset(stop_words)

    class_labels = classifier.classes_

    # Newer scikit-learn only offers get_feature_names_out(); older releases
    # only offer get_feature_names(). Plain `str` keeps the printed list free
    # of numpy scalar reprs.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = [str(name) for name in vectorizer.get_feature_names_out()]
    else:
        feature_names = [str(name) for name in vectorizer.get_feature_names()]

    # Naive Bayes models in newer scikit-learn no longer expose `coef_`; for a
    # two-class model it used to be `feature_log_prob_[1:]`.
    coefs = getattr(classifier, "coef_", None)
    if coefs is None:
        coefs = classifier.feature_log_prob_[1:]

    # Sort once (ascending by coefficient) and reuse the result for both ends.
    ranked = sorted(zip(coefs[0], feature_names))
    ranked_minus_stop = [pair for pair in ranked if pair[1] not in stop_words]

    for coef, feat in ranked[:n]:
        print(class_labels[0], coef, feat)

    # `seq[-0:]` is the whole sequence, so n == 0 needs an explicit guard.
    top_class2 = ranked_minus_stop[-n:] if n > 0 else []
    for coef, feat in reversed(top_class2):
        print(class_labels[1], coef, feat)

    print([feat for _, feat in top_class2])


# Print the ranking for the fitted Naive Bayes model, using the default n.
most_informative_feature_for_binary_classification(vectorizer, nb_classifier)

fact -7.15461535691 added
fact -7.15461535691 aecon
fact -7.15461535691 agencies
fact -7.15461535691 agency
fact -7.15461535691 angeles
fact -7.15461535691 arbitration
fact -7.15461535691 arizona
fact -7.15461535691 association
fact -7.15461535691 auto
fact -7.15461535691 blaikie
fact -7.15461535691 bryant
fact -7.15461535691 caesar
fact -7.15461535691 california
fact -7.15461535691 car
fact -7.15461535691 cars
fact -7.15461535691 chavannes
fact -7.15461535691 chinese
fact -7.15461535691 clinton
fact -7.15461535691 congressional
fact -7.15461535691 congressman
fact -7.15461535691 corker
fact -7.15461535691 council
fact -7.15461535691 county
fact -7.15461535691 cox
fact -7.15461535691 discrimination
fact -7.15461535691 funding
fact -7.15461535691 georgia
fact -7.15461535691 gorsuch
fact -7.15461535691 grant
fact -7.15461535691 guard
opinion -5.84821689605 trump
opinion -6.1950672899 president
opinion -6.39802439337 united
opinion -6.41233274494 states
opinion -6.43494011432 mr
opinion -

In [106]:
### Bagging Multinomial NB ###

# Hyperparameters of the Multinomial NB base estimator.
# NOTE(review): the grid-search output shown earlier ranks alpha=0.1 first and
# alpha=0.001 third -- confirm that 0.001 is the intended value here.
alpha_ = 0.001
fit_prior_ = False

# Hyperparameters of the bagging (max_samples and max_features are not set).
n_estimators_ = 10

base_nb = MultinomialNB(alpha=alpha_, fit_prior=fit_prior_)
bagged_nb = BaggingClassifier(base_estimator=base_nb, n_estimators=n_estimators_)
bagged_nb.fit(x_train, y_train)

train_predictions = bagged_nb.predict(x_train)
test_predictions = bagged_nb.predict(x_test)

print("Bagged Multinomial Naive Bayes")

# Same four metrics for both splits; 'fact' is the positive class.
for split_name, y_true, y_pred in (("TRAIN", y_train, train_predictions),
                                   ("TEST", y_test, test_predictions)):
    print(split_name)
    print("Accuracy: ", metrics.accuracy_score(y_true, y_pred))
    print("Precision:", metrics.precision_score(y_true, y_pred, pos_label='fact'))
    print("Recall:   ", metrics.recall_score(y_true, y_pred, pos_label='fact'))
    print("F1 score: ", metrics.f1_score(y_true, y_pred, pos_label='fact'))

Bagged Multinomial Naive Bayes
TRAIN
Accuracy:  1.0
Precision: 1.0
Recall:    1.0
F1 score:  1.0
TEST
Accuracy:  0.633333333333
Precision: 0.529411764706
Recall:    0.75
F1 score:  0.620689655172


In [114]:
### Logistic Regression with L2 regularization: model selection ###

# Candidate values that the randomized search samples from.
param_dist = {
    "C": [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    "tol": [1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1],
    "solver": ["newton-cg", "sag", "lbfgs"],
}
n_iter_search = 20  # number of sampled parameter settings

clf = LogisticRegression(penalty='l2', multi_class='multinomial', dual=False,
                         max_iter=1000, n_jobs=-1)
random_search = RandomizedSearchCV(clf,
                                   param_distributions=param_dist,
                                   n_iter=n_iter_search,
                                   scoring='accuracy')

# Time the search so its cost shows up next to the results.
start = time()
random_search.fit(x_train, y_train)
elapsed = time() - start

print("RandomizedSearchCV took %.2f seconds for %d candidates parameter settings."
      % (elapsed, n_iter_search))
report(random_search.cv_results_)


RandomizedSearchCV took 17.75 seconds for 20 candidates parameter settings.
Model with rank: 1
Mean validation score: 0.902 (std: 0.038)
Parameters: {'tol': 0.001, 'solver': 'newton-cg', 'C': 1000}

Model with rank: 1
Mean validation score: 0.902 (std: 0.038)
Parameters: {'tol': 1e-05, 'solver': 'sag', 'C': 1000}

Model with rank: 3
Mean validation score: 0.885 (std: 0.020)
Parameters: {'tol': 0.001, 'solver': 'lbfgs', 'C': 1000}

Model with rank: 4
Mean validation score: 0.820 (std: 0.091)
Parameters: {'tol': 0.001, 'solver': 'lbfgs', 'C': 100}

Model with rank: 4
Mean validation score: 0.820 (std: 0.091)
Parameters: {'tol': 0.1, 'solver': 'newton-cg', 'C': 100}

Model with rank: 4
Mean validation score: 0.820 (std: 0.091)
Parameters: {'tol': 1e-06, 'solver': 'newton-cg', 'C': 100}

Model with rank: 4
Mean validation score: 0.820 (std: 0.091)
Parameters: {'tol': 0.0001, 'solver': 'sag', 'C': 100}

Model with rank: 8
Mean validation score: 0.541 (std: 0.012)
Parameters: {'tol': 1e-06, 

In [148]:
### Logistic regression WITH L2 ###

# Hyperparameters: one of the top-ranked settings from the search output above.
C_ = 1000
tol_ = 0.001
solver_ = "newton-cg"

lr_params = dict(penalty='l2', multi_class='multinomial', dual=False,
                 max_iter=1000, n_jobs=-1, C=C_, tol=tol_, solver=solver_)
clf = LogisticRegression(**lr_params)
clf.fit(x_train, y_train)

train_predictions = clf.predict(x_train)
test_predictions = clf.predict(x_test)

print("Logistic regression with L2")

# Same four metrics for both splits; 'fact' is the positive class.
for split_name, y_true, y_pred in (("TRAIN", y_train, train_predictions),
                                   ("TEST", y_test, test_predictions)):
    print(split_name)
    print("Accuracy: ", metrics.accuracy_score(y_true, y_pred))
    print("Precision:", metrics.precision_score(y_true, y_pred, pos_label='fact'))
    print("Recall:   ", metrics.recall_score(y_true, y_pred, pos_label='fact'))
    print("F1 score: ", metrics.f1_score(y_true, y_pred, pos_label='fact'))

Logistic regression with L2
TRAIN
Accuracy:  1.0
Precision: 1.0
Recall:    1.0
F1 score:  1.0
TEST
Accuracy:  0.866666666667
Precision: 0.785714285714
Recall:    0.916666666667
F1 score:  0.846153846154


In [149]:
### Bagging logistic regression WITH L2 ###

# Hyperparameters of the logistic-regression base estimator (same values as
# the single L2 model).
C_ = 1000
tol_ = 0.001
solver_ = "newton-cg"

# The bagging itself is left at its defaults: n_estimators, max_samples and
# max_features are not passed.
base_lr = LogisticRegression(penalty='l2', multi_class='multinomial', dual=False,
                             max_iter=1000, n_jobs=-1, C=C_, tol=tol_,
                             solver=solver_)
bagged_lr = BaggingClassifier(base_estimator=base_lr)
bagged_lr.fit(x_train, y_train)

train_predictions = bagged_lr.predict(x_train)
test_predictions = bagged_lr.predict(x_test)

print("Bagged logistic regression with L2")

# Same four metrics for both splits; 'fact' is the positive class.
for split_name, y_true, y_pred in (("TRAIN", y_train, train_predictions),
                                   ("TEST", y_test, test_predictions)):
    print(split_name)
    print("Accuracy: ", metrics.accuracy_score(y_true, y_pred))
    print("Precision:", metrics.precision_score(y_true, y_pred, pos_label='fact'))
    print("Recall:   ", metrics.recall_score(y_true, y_pred, pos_label='fact'))
    print("F1 score: ", metrics.f1_score(y_true, y_pred, pos_label='fact'))

Bagged logistic regression with L2
TRAIN
Accuracy:  1.0
Precision: 1.0
Recall:    1.0
F1 score:  1.0
TEST
Accuracy:  0.866666666667
Precision: 0.785714285714
Recall:    0.916666666667
F1 score:  0.846153846154


In [150]:
### Logistic regression with L1: model selection ###

# Candidate values that the randomized search samples from.
param_dist = {
    "C": [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    "tol": [1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1],
}
n_iter_search = 10  # number of sampled parameter settings

clf = LogisticRegression(penalty='l1', multi_class='multinomial', dual=False,
                         max_iter=500, solver='saga', n_jobs=-1)
random_search = RandomizedSearchCV(clf,
                                   param_distributions=param_dist,
                                   n_iter=n_iter_search,
                                   scoring='accuracy')

# Time the search so its cost shows up next to the results.
start = time()
random_search.fit(x_train, y_train)
elapsed = time() - start

print("RandomizedSearchCV took %.2f seconds for %d candidates parameter settings."
      % (elapsed, n_iter_search))
report(random_search.cv_results_)



RandomizedSearchCV took 4.65 seconds for 10 candidates parameter settings.
Model with rank: 1
Mean validation score: 0.885 (std: 0.045)
Parameters: {'tol': 0.01, 'C': 100}

Model with rank: 2
Mean validation score: 0.869 (std: 0.061)
Parameters: {'tol': 1e-05, 'C': 10}

Model with rank: 2
Mean validation score: 0.869 (std: 0.061)
Parameters: {'tol': 1e-05, 'C': 1000}

Model with rank: 2
Mean validation score: 0.869 (std: 0.061)
Parameters: {'tol': 0.0001, 'C': 10}

Model with rank: 5
Mean validation score: 0.525 (std: 0.035)
Parameters: {'tol': 0.001, 'C': 0.01}

Model with rank: 5
Mean validation score: 0.525 (std: 0.035)
Parameters: {'tol': 0.0001, 'C': 0.1}

Model with rank: 7
Mean validation score: 0.508 (std: 0.042)
Parameters: {'tol': 0.001, 'C': 0.001}

Model with rank: 8
Mean validation score: 0.492 (std: 0.042)
Parameters: {'tol': 1e-05, 'C': 0.1}

Model with rank: 8
Mean validation score: 0.492 (std: 0.042)
Parameters: {'tol': 0.0001, 'C': 0.01}

Model with rank: 10
Mean vali

In [151]:
# Logistic regression WITH L1

# Hyperparameters: the top-ranked setting from the search output above.
C_ = 100
tol_ = 0.01  # taken from the search above
# NOTE(review): the search above ran with max_iter=500 but this model uses
# max_iter=100 -- confirm the difference is intended.

lr_params = dict(penalty='l1', multi_class='multinomial', dual=False,
                 max_iter=100, solver='saga', n_jobs=-1, C=C_, tol=tol_)
clf = LogisticRegression(**lr_params)
clf.fit(x_train, y_train)

train_predictions = clf.predict(x_train)
test_predictions = clf.predict(x_test)

print("Logistic regression with L1")

# Same four metrics for both splits; 'fact' is the positive class.
for split_name, y_true, y_pred in (("TRAIN", y_train, train_predictions),
                                   ("TEST", y_test, test_predictions)):
    print(split_name)
    print("Accuracy: ", metrics.accuracy_score(y_true, y_pred))
    print("Precision:", metrics.precision_score(y_true, y_pred, pos_label='fact'))
    print("Recall:   ", metrics.recall_score(y_true, y_pred, pos_label='fact'))
    print("F1 score: ", metrics.f1_score(y_true, y_pred, pos_label='fact'))

Logistic regression with L1
TRAIN
Accuracy:  0.950819672131
Precision: 0.941176470588
Recall:    0.969696969697
F1 score:  0.955223880597
TEST
Accuracy:  0.866666666667
Precision: 0.785714285714
Recall:    0.916666666667
F1 score:  0.846153846154


In [156]:
### Random forests: model selection ###

# Search space: lists supply discrete candidates, and sp_randint(low, high)
# draws integers from low up to, but not including, high.
param_dist = {
    "max_depth": [2, 3, 5, 7, None],
    "max_features": sp_randint(1, 11),
    "min_samples_split": sp_randint(2, 11),
    "min_samples_leaf": sp_randint(1, 11),
    "bootstrap": [True, False],
    "criterion": ["gini", "entropy"],
}
n_iter_search = 100  # number of sampled parameter settings

# The forest size is fixed at 20 trees for every candidate.
clf = RandomForestClassifier(n_estimators=20)
random_search = RandomizedSearchCV(clf,
                                   param_distributions=param_dist,
                                   n_iter=n_iter_search)

# Time the search so its cost shows up next to the results.
start = time()
random_search.fit(x_train, y_train)
elapsed = time() - start

print("RandomizedSearchCV took %.2f seconds for %d candidates parameter settings."
      % (elapsed, n_iter_search))
report(random_search.cv_results_)

RandomizedSearchCV took 11.55 seconds for 100 candidates parameter settings.
Model with rank: 1
Mean validation score: 0.820 (std: 0.101)
Parameters: {'bootstrap': True, 'criterion': 'entropy', 'max_depth': 5, 'max_features': 10, 'min_samples_leaf': 2, 'min_samples_split': 2}

Model with rank: 2
Mean validation score: 0.787 (std: 0.145)
Parameters: {'bootstrap': False, 'criterion': 'gini', 'max_depth': None, 'max_features': 10, 'min_samples_leaf': 6, 'min_samples_split': 10}

Model with rank: 2
Mean validation score: 0.787 (std: 0.044)
Parameters: {'bootstrap': True, 'criterion': 'gini', 'max_depth': None, 'max_features': 9, 'min_samples_leaf': 2, 'min_samples_split': 9}

Model with rank: 2
Mean validation score: 0.787 (std: 0.065)
Parameters: {'bootstrap': False, 'criterion': 'entropy', 'max_depth': 3, 'max_features': 10, 'min_samples_leaf': 2, 'min_samples_split': 10}

Model with rank: 5
Mean validation score: 0.721 (std: 0.021)
Parameters: {'bootstrap': True, 'criterion': 'entropy',

In [157]:
### Random forests ###

# Hyperparameters: the top-ranked setting from the search output above.
bootstrap_ = True
criterion_ = 'entropy'
max_depth_ = 5
max_features_ = 10
min_samples_leaf_ = 2
min_samples_split_ = 2

rf_params = dict(n_estimators=20,
                 bootstrap=bootstrap_,
                 criterion=criterion_,
                 max_depth=max_depth_,
                 max_features=max_features_,
                 min_samples_leaf=min_samples_leaf_,
                 min_samples_split=min_samples_split_)
clf = RandomForestClassifier(**rf_params)
clf.fit(x_train, y_train)

train_predictions = clf.predict(x_train)
test_predictions = clf.predict(x_test)

print("Random Forest")

# Same four metrics for both splits; 'fact' is the positive class.
for split_name, y_true, y_pred in (("TRAIN", y_train, train_predictions),
                                   ("TEST", y_test, test_predictions)):
    print(split_name)
    print("Accuracy: ", metrics.accuracy_score(y_true, y_pred))
    print("Precision:", metrics.precision_score(y_true, y_pred, pos_label='fact'))
    print("Recall:   ", metrics.recall_score(y_true, y_pred, pos_label='fact'))
    print("F1 score: ", metrics.f1_score(y_true, y_pred, pos_label='fact'))

Random Forest
TRAIN
Accuracy:  0.983606557377
Precision: 1.0
Recall:    0.969696969697
F1 score:  0.984615384615
TEST
Accuracy:  0.666666666667
Precision: 0.55
Recall:    0.916666666667
F1 score:  0.6875


In [162]:
### Neural Network: model selection ###

# Candidate values that the randomized search samples from.
param_dist = {
    "alpha": [0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1],
    "solver": ["lbfgs", "sgd", "adam"],
}
n_iter_search = 20  # number of sampled parameter settings

# Hidden layers of 256 and 3 units; random_state is fixed to 1.
mlp_clf = MLPClassifier(hidden_layer_sizes=(256, 3), random_state=1)
random_search = RandomizedSearchCV(mlp_clf,
                                   param_distributions=param_dist,
                                   n_iter=n_iter_search)

# Time the search so its cost shows up next to the results.
start = time()
random_search.fit(x_train, y_train)
elapsed = time() - start

print("RandomizedSearchCV took %.2f seconds for %d candidates parameter settings."
      % (elapsed, n_iter_search))
report(random_search.cv_results_)



RandomizedSearchCV took 95.70 seconds for 20 candidates parameter settings.
Model with rank: 1
Mean validation score: 0.885 (std: 0.063)
Parameters: {'solver': 'lbfgs', 'alpha': 0.001}

Model with rank: 1
Mean validation score: 0.885 (std: 0.045)
Parameters: {'solver': 'lbfgs', 'alpha': 1e-06}

Model with rank: 3
Mean validation score: 0.869 (std: 0.061)
Parameters: {'solver': 'lbfgs', 'alpha': 1e-05}

Model with rank: 4
Mean validation score: 0.836 (std: 0.045)
Parameters: {'solver': 'lbfgs', 'alpha': 0.0001}

Model with rank: 5
Mean validation score: 0.590 (std: 0.052)
Parameters: {'solver': 'adam', 'alpha': 0.01}

Model with rank: 6
Mean validation score: 0.574 (std: 0.054)
Parameters: {'solver': 'adam', 'alpha': 0.0001}

Model with rank: 6
Mean validation score: 0.574 (std: 0.054)
Parameters: {'solver': 'adam', 'alpha': 1e-06}

Model with rank: 6
Mean validation score: 0.574 (std: 0.054)
Parameters: {'solver': 'adam', 'alpha': 1e-05}

Model with rank: 6
Mean validation score: 0.574

In [238]:
### Neural Network ###

# NOTE(review): in the search output shown above, lbfgs with alpha=0.0001 is
# ranked fourth (alpha=0.001 and alpha=1e-06 share rank 1) -- confirm this
# choice of alpha.
solver_ = 'lbfgs'
alpha_ = 0.0001

mlp_params = dict(solver=solver_, alpha=alpha_, hidden_layer_sizes=(256, 3), random_state=1)
mlp_clf = MLPClassifier(**mlp_params)
mlp_clf.fit(x_train, y_train)

train_predictions = mlp_clf.predict(x_train)
test_predictions = mlp_clf.predict(x_test)

print("Simple Neural Network")

# Same four metrics for both splits; 'fact' is the positive class.
for split_name, y_true, y_pred in (("TRAIN", y_train, train_predictions),
                                   ("TEST", y_test, test_predictions)):
    print(split_name)
    print("Accuracy: ", metrics.accuracy_score(y_true, y_pred))
    print("Precision:", metrics.precision_score(y_true, y_pred, pos_label='fact'))
    print("Recall:   ", metrics.recall_score(y_true, y_pred, pos_label='fact'))
    print("F1 score: ", metrics.f1_score(y_true, y_pred, pos_label='fact'))


Simple Neural Network
TRAIN
Accuracy:  1.0
Precision: 1.0
Recall:    1.0
F1 score:  1.0
TEST
Accuracy:  0.9
Precision: 0.846153846154
Recall:    0.916666666667
F1 score:  0.88


In [174]:
def load_pickle(filepath):
    """Load and return the object stored in the pickle file at ``filepath``.

    Parameters
    ----------
    filepath : str or path-like
        Location of the pickle file; opened in binary read mode.

    Returns
    -------
    object
        Whatever object was pickled into the file.

    Raises
    ------
    OSError
        If the file cannot be opened.
    Exception
        Whatever ``pickle.load`` raises for empty or corrupt content.
    """
    # SECURITY: unpickling can execute arbitrary code, so only pass files that
    # come from a trusted source (e.g. ones written by save_pickle).
    # The ``with`` block closes the handle even when pickle.load raises.
    with open(filepath, 'rb') as pickle_file:
        return pickle.load(pickle_file)

def save_pickle(data, filepath):
    """Pickle ``data`` into the file at ``filepath``, overwriting any existing file.

    Parameters
    ----------
    data : object
        Any picklable object.
    filepath : str or path-like
        Destination file; opened in binary write mode.

    Returns
    -------
    None

    Raises
    ------
    OSError
        If the file cannot be opened for writing.
    Exception
        Whatever ``pickle.dump`` raises when ``data`` cannot be pickled.
    """
    # The ``with`` block closes (and flushes) the handle even when
    # pickle.dump raises part-way through.
    with open(filepath, 'wb') as pickle_file:
        pickle.dump(data, pickle_file)

In [239]:
# Pickling stuff to be used in the script

# Persist the fitted objects under the exact file names that BiasBarometer
# opens: 'nn_model.pickle', 'nb_model.pickle' and 'vectorizer.pickle'.
# Each call overwrites any existing file of the same name in the working directory.
save_pickle(mlp_clf, 'nn_model.pickle')
# NOTE(review): nb_classifier and vectorizer are created in earlier cells that
# are not part of this section — confirm they are the fitted objects intended
# for the script before re-running this cell.
save_pickle(nb_classifier, 'nb_model.pickle')
save_pickle(vectorizer, 'vectorizer.pickle')

In [243]:
### Script to classify a new text.
# Also returns the list of the most opinion-related words, as determined by a binary classifier trained earlier,
# so that these can be highlighted in the text. ###

import codecs, re, time
from itertools import chain

# Hard-coded list of English stop words. most_informative_features_opinion
# drops any feature found in this list, so that common function words are not
# reported as "informative".
# NOTE(review): this looks like a copy of scikit-learn's built-in English
# stop-word list — confirm before assuming the two stay in sync.
ENGLISH_STOP_WORDS = [
    "a", "about", "above", "across", "after", "afterwards", "again", "against",
    "all", "almost", "alone", "along", "already", "also", "although", "always",
    "am", "among", "amongst", "amoungst", "amount", "an", "and", "another",
    "any", "anyhow", "anyone", "anything", "anyway", "anywhere", "are",
    "around", "as", "at", "back", "be", "became", "because", "become",
    "becomes", "becoming", "been", "before", "beforehand", "behind", "being",
    "below", "beside", "besides", "between", "beyond", "bill", "both",
    "bottom", "but", "by", "call", "can", "cannot", "cant", "co", "con",
    "could", "couldnt", "cry", "de", "describe", "detail", "do", "done",
    "down", "due", "during", "each", "eg", "eight", "either", "eleven", "else",
    "elsewhere", "empty", "enough", "etc", "even", "ever", "every", "everyone",
    "everything", "everywhere", "except", "few", "fifteen", "fifty", "fill",
    "find", "fire", "first", "five", "for", "former", "formerly", "forty",
    "found", "four", "from", "front", "full", "further", "get", "give", "go",
    "had", "has", "hasnt", "have", "he", "hence", "her", "here", "hereafter",
    "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his",
    "how", "however", "hundred", "i", "ie", "if", "in", "inc", "indeed",
    "interest", "into", "is", "it", "its", "itself", "keep", "last", "latter",
    "latterly", "least", "less", "ltd", "made", "many", "may", "me",
    "meanwhile", "might", "mill", "mine", "more", "moreover", "most", "mostly",
    "move", "much", "must", "my", "myself", "name", "namely", "neither",
    "never", "nevertheless", "next", "nine", "no", "nobody", "none", "noone",
    "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on",
    "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our",
    "ours", "ourselves", "out", "over", "own", "part", "per", "perhaps",
    "please", "put", "rather", "re", "same", "see", "seem", "seemed",
    "seeming", "seems", "serious", "several", "she", "should", "show", "side",
    "since", "sincere", "six", "sixty", "so", "some", "somehow", "someone",
    "something", "sometime", "sometimes", "somewhere", "still", "such",
    "system", "take", "ten", "than", "that", "the", "their", "them",
    "themselves", "then", "thence", "there", "thereafter", "thereby",
    "therefore", "therein", "thereupon", "these", "they", "thick", "thin",
    "third", "this", "those", "though", "three", "through", "throughout",
    "thru", "thus", "to", "together", "too", "top", "toward", "towards",
    "twelve", "twenty", "two", "un", "under", "until", "up", "upon", "us",
    "very", "via", "was", "we", "well", "were", "what", "whatever", "when",
    "whence", "whenever", "where", "whereafter", "whereas", "whereby",
    "wherein", "whereupon", "wherever", "whether", "which", "while", "whither",
    "who", "whoever", "whole", "whom", "whose", "why", "will", "with",
    "within", "without", "would", "yet", "you", "your", "yours", "yourself",
    "yourselves"]


def most_informative_features_opinion(vectorizer, classifier, n=30):
    """Return up to ``n`` non-stop-word features with the largest ``coef_[0]`` values.

    Every feature name is paired with its entry in ``classifier.coef_[0]``,
    the pairs are sorted ascending by coefficient (ties broken by feature
    name), names found in ``ENGLISH_STOP_WORDS`` are dropped, and the last
    ``n`` names are returned. The result is therefore ordered from smaller to
    larger coefficient, with the largest one last.

    Parameters
    ----------
    vectorizer : object
        Fitted vectorizer exposing ``get_feature_names_out()`` or the older
        ``get_feature_names()``.
    classifier : object
        Fitted estimator exposing ``coef_``; only row 0 is read.
        NOTE(review): the function name assumes row 0 describes the 'opinion'
        class — which class it describes depends on the estimator; confirm.
        NOTE(review): some scikit-learn estimators no longer expose ``coef_``
        in recent releases — verify against the installed version.
    n : int, default 30
        Maximum number of feature names to return; ``n <= 0`` yields ``[]``.

    Returns
    -------
    list
        Feature names, at most ``n`` of them.
    """
    if n <= 0:
        # A plain ``[-n:]`` slice with n == 0 is ``[0:]`` and would return
        # every feature instead of none.
        return []

    # Newer scikit-learn releases replaced get_feature_names() with
    # get_feature_names_out(); accept whichever the vectorizer provides.
    get_names = getattr(vectorizer, "get_feature_names_out", None)
    if get_names is None:
        get_names = vectorizer.get_feature_names
    feature_names = get_names()

    # Set membership keeps the stop-word filter O(1) per feature.
    stop_words = frozenset(ENGLISH_STOP_WORDS)

    ranked = sorted(zip(classifier.coef_[0], feature_names))
    kept_names = [name for _, name in ranked if name not in stop_words]
    return kept_names[-n:]

def BiasBarometer(text):
    """Classify ``text`` with the pickled neural network and list opinion-related words.

    The three artefacts 'nn_model.pickle', 'nb_model.pickle' and
    'vectorizer.pickle' are read from the current working directory on every
    call.

    Parameters
    ----------
    text : str
        Raw text of the document to classify.

    Returns
    -------
    tuple
        ``(prediction, words)``: ``prediction`` is whatever
        ``nn_classifier.predict`` returns for the single vectorized document,
        and ``words`` is the list from ``most_informative_features_opinion``
        with ``n=30``. ``words`` is computed from the stored models only and
        does not depend on ``text``.

    Raises
    ------
    OSError
        If one of the pickle files cannot be opened.
    """
    def _load(path):
        # SECURITY: unpickling can execute arbitrary code; these files must
        # come from a trusted source.
        # The ``with`` block closes each handle, which the previous version
        # never did.
        with open(path, 'rb') as pickle_in:
            return pickle.load(pickle_in)

    nn_classifier = _load('nn_model.pickle')
    nb_classifier = _load('nb_model.pickle')
    vectorizer = _load('vectorizer.pickle')

    # Minimal preprocessing: lower-case the input. str.lower() returns a new
    # string, so the result has to be assigned back to take effect.
    text = text.lower()

    # Vectorize against the stored vocabulary. With a fixed vocabulary nothing
    # needs fitting, so transform() is used directly.
    vectorizer_test = CountVectorizer(stop_words=None, vocabulary=vectorizer.vocabulary_, input='content')
    vector = vectorizer_test.transform([text])
    # NOTE(review): presumably mirrors the L1 row normalisation applied to the
    # training matrix — confirm against the training cells.
    vector = normalize(vector, norm='l1', axis=1)

    # Make the prediction.
    prediction = nn_classifier.predict(vector)

    # Get the most informative words (for opinion texts).
    words = most_informative_features_opinion(vectorizer, nb_classifier, n=30)

    return prediction, words
    

In [244]:
# test

# Sample passage used as a manual smoke test for BiasBarometer.
text = "Cigna released their U.S. Loneliness Survey this month and reported that loneliness among Americans has reached epidemic levels. Their survey of over 20,000 Americans found that nearly half reported sometimes or always feeling alone (46 percent) or left out (47 percent). The survey used a 20-item questionnaire that assesses subjective feelings of loneliness and social isolation.The potential effects of loneliness on health are well established. A 2013 study on loneliness showed elevated levels of stress hormones and inflammation, which can increase the risk of heart disease, dementia, and Type 2 diabetes and suicide attempts. Loneliness is prevalent in society. The rising numbers of single adults, the breakdown of the family and the loss of neighborhood and community have all contributed to an immense sense of loneliness in many people's lives."

# Prints the (prediction, words) tuple returned by BiasBarometer; the call
# reads the three pickle files from the working directory.
print(BiasBarometer(text))


(array(['opinion'],
      dtype='<U7'), ['white', 'way', 'political', 'nfl', 'security', 'national', 'long', 'israel', 'law', 'years', 'like', 'quebec', 'kim', 'north', 'iran', 'time', 'said', 'policy', 'people', 'just', 'party', 'ford', 'new', 'social', 'government', 'mr', 'states', 'united', 'president', 'trump'])
