In [None]:
# Purpose:  Use supervised learning to train a classifier to predict candidate-sentiment for tweets.
#           Evaluate multiple combinations of models and features.
#           Use a 70/30 train/test split to estimate performance.
#           Train final featurizer and model on full training set and save to file.
# Author:  Carol Sniegoski
# Date:  05/16/16
# Course:  MAS DSE capstone, Spring 2016

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
%matplotlib inline
import math
import pickle

pd.set_option('display.max_colwidth', 240)
pd.set_option('display.max_columns', 50)

import sys
import os
import time
import datetime

from sklearn.externals import joblib

from sklearn.feature_extraction.text import TfidfVectorizer  # for featurizing using term frequencies
from sklearn.feature_extraction.text import CountVectorizer  # for featurizing using word n-grams

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
import sklearn.metrics as metrics

from sklearn import cross_validation
from sklearn.cross_validation import KFold
from sklearn.cross_validation import StratifiedShuffleSplit

from sklearn.multiclass import OneVsOneClassifier
from sklearn.multiclass import OneVsRestClassifier

from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB

from sklearn.tree import DecisionTreeClassifier

from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.ensemble import RandomForestClassifier 
from sklearn.ensemble import AdaBoostClassifier

#from textblob import TextBlob
#from textblob import Blobber
#from textblob.taggers import NLTKTagger
#from textblob.sentiments import NaiveBayesAnalyzer

print 'done'

done


In [3]:
# Function defs.

def to_ascii(s):
    """Return s with surrounding whitespace stripped and non-ASCII characters removed.

    A character is kept only when ord(character) is below 128. Stripping happens
    before filtering, so whitespace that ends up at either end only because a
    non-ASCII character next to it was dropped is left in place.
    """
    trimmed = s.strip()
    kept = (symbol for symbol in trimmed if ord(symbol) < 128)
    return "".join(kept)

def sample_equally( df, col, fraction ):
    """Return a sample of df in which every value of column `col` is equally represented.

    Parameters:
        df:       DataFrame to sample from.
        col:      name of the column holding the class labels.
        fraction: fraction of df's row count that the whole sample should total.

    Returns:
        A DataFrame made of int(len(df) * fraction) // n_classes rows per class,
        concatenated in the order the class values first appear. If df has no
        class values at all (e.g. it is empty), an empty slice of df is returned.

    Rows are drawn with np.random.choice, which samples WITH replacement by
    default, so the same row can appear more than once and a class smaller than
    the per-class quota is oversampled rather than raising an error.
    """
    n = int(df.shape[0] * fraction)  # Total number of records to sample.
    vals = pd.unique(df[col].values.ravel())  # The class labels.
    if len(vals) == 0:
        # Nothing to sample from; also avoids dividing by zero below.
        return df.iloc[0:0]
    n_to_sample = n // len(vals)  # Number of records to sample from each class.

    samples = []
    for val in vals:
        # Draw index *labels* of this class, then select them by label.
        # .loc replaces the deprecated (and later removed) .ix indexer.
        # (df[df[col]==val].sample(n=n_to_sample) is the without-replacement
        # alternative available from pandas 0.16.1 onwards.)
        class_labels = df[df[col] == val].index.values
        rows = np.random.choice(class_labels, n_to_sample)
        samples.append(df.loc[rows])

    return pd.concat(samples)

# Split dataframe into test/train set and validation set.
# Return tuple of ( train/test df, validation df ).
def get_validation_splits(df, label_field, validation_size=0.3):
    sss = StratifiedShuffleSplit(df[label_field], 1, validation_size, random_state=0)
    print len(sss)

    for train_index, test_index in sss:
        print len(train_index), len(test_index)
        print("TRAIN:", train_index, "TEST:", test_index)
        #df_trainTest, df_validate = df.ix[train_index], df.ix[test_index]
        #df_trainTest, df_validate = df.loc[train_index], df.loc[test_index]
        df_trainTest, df_validate = df.iloc[train_index], df.iloc[test_index]

        #print df_trainTest, df_validate

    return df_trainTest, df_validate

def cohen_kappa_score(y1, y2, labels=None, weights=None):
    """Compute Cohen's kappa between two labelings y1 and y2.

    Local copy of a function taken from a newer library release (presumably
    scikit-learn, which the rest of this notebook uses) than the one installed.

    Parameters:
        y1, y2:  the two label sequences to compare.
        labels:  forwarded to confusion_matrix to select/order the classes.
        weights: None for unweighted kappa, or "linear" / "quadratic" to weight
                 disagreements by the (squared) distance between class positions.

    Returns:
        1 - (weighted observed disagreement / weighted expected disagreement).

    Raises:
        ValueError: if weights is not None, "linear" or "quadratic".
    """
    confusion = confusion_matrix(y1, y2, labels=labels)
    n_classes = confusion.shape[0]
    sum0 = np.sum(confusion, axis=0)
    sum1 = np.sum(confusion, axis=1)
    # float(...) forces true division. The notebook runs under Python 2 without
    # `from __future__ import division`, where int-array / int truncates and
    # silently corrupts the expected-agreement matrix.
    expected = np.outer(sum0, sum1) / float(np.sum(sum0))

    # Builtin int is used as dtype; the np.int alias no longer exists in recent NumPy.
    if weights is None:
        w_mat = np.ones([n_classes, n_classes], dtype=int)
        w_mat.flat[:: n_classes + 1] = 0  # zero the diagonal: agreement costs nothing
    elif weights == "linear" or weights == "quadratic":
        w_mat = np.zeros([n_classes, n_classes], dtype=int)
        w_mat += np.arange(n_classes)
        if weights == "linear":
            w_mat = np.abs(w_mat - w_mat.T)
        else:
            w_mat = (w_mat - w_mat.T) ** 2
    else:
        raise ValueError("Unknown kappa weighting type.")

    # Same true-division concern as above for the final ratio.
    k = np.sum(w_mat * confusion) / float(np.sum(w_mat * expected))
    return 1 - k

print 'done'

done


In [4]:
# Locate the data.
%ls ../data/candidatesentiment/

20151013.csv
2016211.csv
[34mML_tagged[m[m/
candidatesentiment_fromDebbie_05-14-16.csv
candidatesentiment_fromDebbie_05-17-16.csv
candidatesentiment_validationSet_random_05-19-16.csv
candidatesentiment_validationSet_random_caslabeled_05-20-16.bak.csv
candidatesentiment_validationSet_random_caslabeled_05-20-16.csv
candidatesentiment_validationSet_random_caslabeled_05-20-16_2.csv
candidatesentiment_validationSet_random_caslabeled_05-20-16_2_LinearSVC_char3and4grams_predicted.csv
candidatesentiment_validationSet_random_caslabeled_05-20-16_2_Logistic_huberLossL2_termFrequency_predicted.csv
candidatesentiment_validationSet_random_caslabeled_05-20-16_2_predicted.csv
candidatesentiment_validationSet_random_caslabeled_05-21-16_2_predicted_termFrequency_MultinomialNB.csv
sentimentsamples_random_05-18-16.csv


In [5]:
# Set the names of the cleaned text field and the class label field,
# to be used in the sampling, featurizing, and classification steps.
# Both are plain column names; the columns themselves are created by later cells.
clean_text = 'ascii_clean'  # Column that will hold the cleaned tweet text.
label_field = 'label' # Column that will hold the class label.

print 'done'

done


In [6]:
# Load data file.
# The CSV is read with header=None, so column names are supplied explicitly below.

prefix = "../data/candidatesentiment/"
# Earlier input files, kept for reference:
#filename = "candidatesentiment_fromDebbie_05-14-16.csv"
#filename = "candidatesentiment_fromDebbie_05-17-16.csv"
filename = "sentimentsamples_random_05-18-16.csv"

# NOTE(review): only four names are given. If each CSV row carries an extra
# leading field, pandas uses it as the row index -- the head() output of the
# cleaning cell suggests this is the case (date-like index values); confirm.
columns = ["candidate", "sentiment", "id_str", "text"]
df = pd.read_csv(prefix + filename, header=None, names=columns)
print df.shape
print df.columns.values

(356344, 4)
['candidate' 'sentiment' 'id_str' 'text']


In [7]:
# Clean the 'text' field.
# The steps below run in order and each one rewrites df[clean_text] in place,
# so the order matters (see the notes on individual steps).

# Convert to ascii
df['ascii'] = df['text'].apply(to_ascii)
df[clean_text] = df['ascii']

# Remove hashtags (disabled: hashtags are kept as features)
#df[clean_text] = df[clean_text].str.replace(r"#([A-Za-z0-9_]+)", " ")

# Remove handles (disabled: @handles are kept as features)
#df[clean_text] = df[clean_text].str.replace(r"@([A-Za-z0-9_]+)", " ")

# Remove URLs
#df[clean_text] = df[clean_text].str.replace(r"http([A-Za-z0-9_:.\/]+)", " ")
# Replace URLs with "URL"
# (done before '.' and '-' are blanked out below, which would otherwise break URLs apart)
df[clean_text] = df[clean_text].str.replace(r"http([A-Za-z0-9_:.\/]+)", "URL")

# Remove punctuation symbols - but not @ or #
df[clean_text] = df[clean_text].str.replace(r"(['';:%()\+\*\"\…\“\”])", "")
#df[clean_text] = df[clean_text].str.replace(r"(['';:@%#()\+\*\"\…\“\”])", "")
#df[clean_text] = df[clean_text].str.replace(r"([;:@%#()\+\*\"\…\“\”])", "")
# Periods and hyphens become spaces rather than being deleted.
df[clean_text] = df[clean_text].str.replace(r"([.-])", " ")

# Remove eol symbols
df[clean_text] = df[clean_text].str.replace(r"\n", " ")

# Remove &x symbols
# (the ';' of an entity such as '&amp;' was already removed by the punctuation step,
#  so the pattern only needs to match '&' followed by lowercase letters)
df[clean_text] = df[clean_text].str.replace(r"&[a-z]+", " ")

# Convert to lowercase (this also turns the "URL" placeholder into "url")
df[clean_text] = df[clean_text].str.lower()

# Display a few rows to compare the raw and cleaned text.
#print df[df['ascii']!=df[clean_text]][['ascii', 'K_sentiment']].head(10)
df[['text', clean_text]].head(3)
#df[df['ascii'].str.contains("\n")][['text', 'ascii']].head(10)

Unnamed: 0,text,ascii_clean
2015_10_11,"""Latinos &amp; immigrants support Donald Trump, ""Doing it Right"" http://t.co/ss8uq9w6Vh ""","latinos immigrants support donald trump, doing it right url"
2015_10_11,"""#Trump2016 #Trump #TrumpTrain OKAY CHERI KEEP UP THE LIES I KNOW YOU ARE FAR SUPERIOR TO ALL HUMAN LIFE.DISHONEST! https://t.co/GA6q1H1fge """,#trump2016 #trump #trumptrain okay cheri keep up the lies i know you are far superior to all human life dishonest! url
2015_10_11,"""@bensonmya123 @LindaEpai457450 @TrumpNewsNetwrk That's a ridiculous statement. Trump supporters understand U.S. citizens being SCREWED. """,@bensonmya123 @lindaepai457450 @trumpnewsnetwrk thats a ridiculous statement trump supporters understand u s citizens being screwed


In [9]:
# Populate the label_field.
# Each label is "<candidate>_<sentiment>", e.g. "Trump_pos".
df[label_field] = df['candidate'] + '_' + df['sentiment']

# Show the class frequencies.
counts = df[label_field].value_counts()
total = counts.sum()

# Replace each raw count with a (count, fraction-of-total) pair of display strings.
counts = counts.apply(lambda x: ("Count: %d" % x, "Frequency: %.2f" % (float(x)/total)) )

#print("%.2f" % a)
#string = 'string%d' % (i,)

print counts
print type(counts)

Trump_pos      (Count: 37962, Frequency: 0.11)
Cruz_pos       (Count: 37668, Frequency: 0.11)
Sanders_pos    (Count: 37544, Frequency: 0.11)
Clinton_pos    (Count: 37254, Frequency: 0.10)
Trump_neg      (Count: 36728, Frequency: 0.10)
Clinton_neg    (Count: 36594, Frequency: 0.10)
Rubio_pos      (Count: 33815, Frequency: 0.09)
Cruz_neg       (Count: 33743, Frequency: 0.09)
Sanders_neg    (Count: 33066, Frequency: 0.09)
Rubio_neg      (Count: 31970, Frequency: 0.09)
dtype: object
<class 'pandas.core.series.Series'>


In [10]:
### 
# Set aside the validation set.

from sklearn.cross_validation import StratifiedShuffleSplit
sss = StratifiedShuffleSplit(df[label_field], 1, test_size=0.3, random_state=0)
print type(sss)

X = df[clean_text]
y = df[label_field]
print type(X)

for train_index, test_index in sss:
    #print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
print type(y_train)

<class 'sklearn.cross_validation.StratifiedShuffleSplit'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>


In [11]:
# Classes are balanced, so no need to under- or oversample. Just use the original unsampled data.

# True if you want to set aside a validation set upfront.
do_validation = True

# Parallel lists with one entry per candidate:
#   df_samples           - rows to train on (the train/test part when do_validation is True)
#   sample_names         - the candidate's name
#   df_validation_splits - the held-out rows (only filled when do_validation is True)
df_samples = []
sample_names = []
df_validation_splits = []
    
#df_samples = [ df ]
#sample_names = [ 'original data' ]

# Treat each candidate as a separate sample.
candidates = df['candidate'].unique()
print candidates

for candidate in candidates:
    sample_df = df[ df['candidate']==candidate]
    #sample_df = df[ df['candidate']==candidate].copy(deep=True)
    #print sample_df.shape
    #print sample_df.head(3)

    if do_validation:
        print candidate
        # Stratified split on the label column; 0.3 of the rows are held out.
        trainTest_df, validate_df = get_validation_splits( sample_df, label_field, 0.3 )
        df_samples.append( trainTest_df )
        df_validation_splits.append( validate_df )
    else:
        # No hold-out: the candidate's full data becomes the sample.
        #df_samples.append( df[ df['candidate']==candidate] )
        df_samples.append( sample_df )
    sample_names.append( candidate )
    #df_samples.append(sample_equally( df, label_field, data_fraction ))
    #sample_names.append( str(data_fraction)+' random sample' )

print len(df_samples), len(df_validation_splits)
print 'done'

['Trump' 'Clinton' 'Sanders' 'Rubio' 'Cruz']
Trump
1
52283 22407
('TRAIN:', array([29003, 15337, 68002, ..., 25273,  6253, 37646]), 'TEST:', array([62288, 11032, 58251, ..., 62107, 57237, 68296]))
Clinton
1
51693 22155
('TRAIN:', array([10326, 40409, 63825, ..., 54243, 64399, 48349]), 'TEST:', array([20926, 72099, 29490, ..., 11564, 24082, 41166]))
Sanders
1
49427 21183
('TRAIN:', array([35163, 49675, 25215, ..., 24577, 54898, 68655]), 'TEST:', array([25835, 36616, 44740, ...,  9706, 63430, 50877]))
Rubio
1
46049 19736
('TRAIN:', array([24111, 18399, 52532, ..., 40835, 26667,   151]), 'TEST:', array([55855, 43903, 43681, ..., 56443, 12591, 13572]))
Cruz
1
49987 21424
('TRAIN:', array([57377,  1402, 46261, ...,  7322, 19662, 69484]), 'TEST:', array([20578, 57128, 61704, ..., 47184, 41211, 44732]))
5 5
done


In [12]:
# Convert data & their class labels into ndarrays.
# X_samples / y_samples hold, per candidate, the cleaned text and the labels
# of the training rows; the *_validation_splits lists hold the same for the
# held-out rows.
X_samples = []
y_samples = []

for df_sample in df_samples:
    X_samples.append(df_sample[clean_text].values)
    y_samples.append(df_sample[label_field].values)
    
if do_validation:
    X_validation_splits = []
    y_validation_splits = []
    for df_validation_split in df_validation_splits:
        X_validation_splits.append(df_validation_split[clean_text].values)
        y_validation_splits.append(df_validation_split[label_field].values)

# Sanity checks on types, list lengths and shapes.
print type(X_samples[0])
print len(X_samples)
print len(y_samples)
print X_samples[0].shape

# NOTE(review): X_validation_splits is only defined when do_validation is True,
# so these two prints raise NameError on a fresh kernel with do_validation = False.
print len(X_validation_splits)
print X_validation_splits[0].shape

<type 'numpy.ndarray'>
5
5
(52283,)
5
(22407,)


In [13]:
# Initialize list of types of feature vectors to use.
# Add feature vectors to this list by running one or more of the feature-generation cells below.
# Each feature-generation cell appends one entry (a per-candidate list of matrices)
# to feature_vector_lists and feature_vector_validation_lists, and its name to feature_names.
# Re-run this cell before re-running feature cells, otherwise entries accumulate.

feature_vector_lists = []
feature_vector_validation_lists = []
feature_names = []
featurizers = []  # NOTE(review): not appended to by any of the feature cells visible here.
print 'done'

done


In [14]:
# Create feature vectors using character n-grams.
# For every candidate the vectorizer is re-fit on that candidate's training text,
# and the candidate's validation text is transformed with that same fit.

this_feature_name = "char3and4grams"
this_feature_vector_list = []
this_feature_vector_validation_list = []

ngram_min_size = 3
ngram_max_size = 4

#test_string = ["I really like python, it's pretty awesome."]
vectorizer = CountVectorizer(ngram_range=(ngram_min_size,ngram_max_size),  # use n-gram sizes from min to max, inclusive
                             min_df = 5,     # min number of docs a token must appear in
                             max_df = .8,    # max percent of docs a token can appear in
                             analyzer='char'  # create character ngrams
                             )

# fit_transform() refits the vocabulary on each pass, so the validation text must be
# transformed inside the same iteration, before the next candidate overwrites the fit.
for X_sample, X_validation_split, sample_name in zip(X_samples, X_validation_splits, sample_names):
    this_feature_vector_list.append( vectorizer.fit_transform(X_sample) )
    this_feature_vector_validation_list.append( vectorizer.transform(X_validation_split) )
    #feature_names.append( this_feature_name + ', ' + sample_name )

# Register this feature type (one list entry per candidate) for the training cells.
feature_vector_lists.append(this_feature_vector_list)
feature_vector_validation_lists.append(this_feature_vector_validation_list)
feature_names.append(this_feature_name)

# Report the (documents, features) shape of each candidate's training matrix.
for feature_vector, sample_name in zip(this_feature_vector_list, sample_names):
    print sample_name, feature_vector.shape
print feature_names
    
#print('{1}-grams: {0}'.format(vect.get_feature_names(), ngram_size))
#print 'Number of char n-grams:', len(vectorizer.get_feature_names())
#print vectorizer.get_feature_names()[1000:1025]


Trump (52283, 60677)
Clinton (51693, 58368)
Sanders (49427, 57571)
Rubio (46049, 53090)
Cruz (49987, 57003)
['char3and4grams']


In [199]:
# Create feature vectors using word n-grams.
# Same procedure as the character n-gram cell: per candidate, fit on the training
# text and transform the validation text with that fit.

this_feature_name = 'word 1- & 2grams'
this_feature_vector_list = []
this_feature_vector_validation_list = []

ngram_min_size = 1
ngram_max_size = 2

vectorizer = CountVectorizer(ngram_range=(ngram_min_size,ngram_max_size),  # use n-gram sizes from min to max, inclusive
                             min_df = 5,      # min number of docs a token must appear in (if an integer value)
                             max_df = 0.8,    # max percent of docs a token can appear in (if a float value)
                             analyzer='word'  # create word ngrams; this is the default
                             )

# The vectorizer is refit for every candidate; transform validation data in the same pass.
for X_sample, X_validation_split, sample_name in zip(X_samples, X_validation_splits, sample_names):
    this_feature_vector_list.append( vectorizer.fit_transform(X_sample) )
    this_feature_vector_validation_list.append( vectorizer.transform(X_validation_split) )
    #feature_names.append( this_feature_name + ', ' + sample_name )
    
# Register this feature type for the training cells.
feature_vector_lists.append(this_feature_vector_list)
feature_vector_validation_lists.append(this_feature_vector_validation_list)
feature_names.append(this_feature_name)

# Report the (documents, features) shape of each candidate's training matrix.
for feature_vector, sample_name in zip(this_feature_vector_list, sample_names):
    print sample_name, feature_vector.shape
print feature_names

Trump (52283, 35509)
Clinton (51693, 35765)
Sanders (49427, 33618)
Rubio (46049, 35306)
Cruz (49987, 33953)
['char 3grams', 'word 1- & 2grams']


In [15]:
# Create feature vectors using term frequency: TfidfVectorizer with idf weighting
# switched off. With no ngram_range or analyzer given, the vectorizer's defaults apply.
# NOTE(review): because of sublinear_tf and the vectorizer's default normalization,
# the values are presumably not raw word counts -- confirm against the sklearn docs.

this_feature_name = 'term freq'
this_feature_vector_list = []
this_feature_vector_validation_list = []

vectorizer = TfidfVectorizer(min_df = 5,     # Min number of docs a token must appear in (if an integer value)
                             max_df = 0.8,   # Max percent of docs a token can appear in (if a float value)
                             sublinear_tf = True,   # Sublinear tf scaling: replace tf with 1 + log(tf)
                             use_idf = False)  # Don't use inverse document frequency weighting

# The vectorizer is refit for every candidate; transform validation data in the same pass.
for X_sample, X_validation_split, sample_name in zip(X_samples, X_validation_splits, sample_names):
    this_feature_vector_list.append( vectorizer.fit_transform(X_sample) )
    this_feature_vector_validation_list.append( vectorizer.transform(X_validation_split) )
    #feature_names.append( this_feature_name + ', ' + sample_name )
    
# Register this feature type for the training cells.
feature_vector_lists.append(this_feature_vector_list)
feature_vector_validation_lists.append(this_feature_vector_validation_list)
feature_names.append(this_feature_name)

# Report the (documents, features) shape of each candidate's training matrix.
for feature_vector, sample_name in zip(this_feature_vector_list, sample_names):
    print sample_name, feature_vector.shape
print feature_names

Trump (52283, 9732)
Clinton (51693, 9696)
Sanders (49427, 9261)
Rubio (46049, 8310)
Cruz (49987, 8784)
['char3and4grams', 'term freq']


In [201]:
# Create feature vectors using tfidf.
# Identical to the term-frequency cell except that idf weighting is enabled.

this_feature_name = 'tfidf'
this_feature_vector_list = []
this_feature_vector_validation_list = []

vectorizer = TfidfVectorizer(min_df = 5,     # Min number of docs a token must appear in (if an integer value)
                             max_df = 0.8,   # Max percent of docs a token can appear in (if a float value)
                             sublinear_tf = True,   # Sublinear tf scaling: replace tf with 1 + log(tf)
                             use_idf = True)  # Use idf

# The vectorizer is refit for every candidate; transform validation data in the same pass.
for X_sample, X_validation_split, sample_name in zip(X_samples, X_validation_splits, sample_names):
    this_feature_vector_list.append( vectorizer.fit_transform(X_sample) )
    this_feature_vector_validation_list.append( vectorizer.transform(X_validation_split) )
    #feature_names.append( this_feature_name + ', ' + sample_name )
    
# Register this feature type for the training cells.
feature_vector_lists.append(this_feature_vector_list)
feature_vector_validation_lists.append(this_feature_vector_validation_list)
feature_names.append(this_feature_name)

# Report the (documents, features) shape of each candidate's training matrix.
for feature_vector, sample_name in zip(this_feature_vector_list, sample_names):
    print sample_name, feature_vector.shape
print feature_names

Trump (52283, 9732)
Clinton (51693, 9696)
Sanders (49427, 9261)
Rubio (46049, 8310)
Cruz (49987, 8784)
['char 3grams', 'word 1- & 2grams', 'term freq', 'tfidf']


In [16]:
# Initialize list of classifiers to use.
# Add classifiers to this list by running one or more of the classifier-creation cells below.
# classifiers and classifier_names are parallel lists (same position = same model).
# Re-run this cell before re-running classifier cells, otherwise entries accumulate.

classifiers = []
classifier_names = []
print 'done'

done


In [17]:
# Create Naive Bayes classifiers.
# Appends to the parallel lists classifiers / classifier_names.

# Multinomial NaiveBayes. Commonly used for text classification.
classifier_names.append("Multinomial NB")
classifiers.append(MultinomialNB(alpha=1.0,        # Use default Laplacian smoothing
                                 fit_prior=True))  # Fit priors based on training data
                  
# Bernoulli NaiveBayes. Commonly used for text classification for short documents.
# Expects boolean features (e.g., word occurrence/nonoccurrence instead of term frequency or tfidf).
# Currently disabled.
#classifier_names.append('Bernoulli NB')
#classifiers.append(BernoulliNB(alpha=1.0,        # Use default Laplacian smoothing
#                               binarize=0,       # Threshold for binarizing the input features
#                               fit_prior=True))  # Fit priors based on training data
print 'done'
print classifier_names

done
['Multinomial NB']


In [18]:
# Create SVM / logistic regression classifiers.

# SVM with linear kernel.
classifier_names.append('LinearSVC')
classifiers.append(LinearSVC(random_state=0)) 

# Logistic regression (sometimes called maxent).
classifier_names.append('Logistic')
classifiers.append(SGDClassifier(loss='hinge', 
                                 penalty='l2', 
                                 alpha=1e-3, 
                                 n_iter=100, 
                                 random_state=0))
# Regular logistic regression.
#classifier_names.append('Logistic_regular')
#classifiers.append(LogisticRegression())


print 'done'
print classifier_names

done
['Multinomial NB', 'LinearSVC', 'Logistic']


In [19]:
# Initialize scores.
# Filled by the training cells as
#   scores[classifier_name][feature_name][sample_name] = (true labels, predicted labels)
scores = {}
print 'done'

done


In [20]:
# Initialize labeled results.
# Filled by the train/validate cell as
#   labeled_results[classifier_name][feature_name][sample_name] = copy of the
#   validation dataframe with an added column of predicted labels.
labeled_results = {}
print 'done'

done


In [21]:
# Train classifiers on train/test set, test on the validation set.
# Every (classifier, feature type, candidate) combination is fit once on the
# candidate's training matrix and used to predict that candidate's validation rows.
# The same classifier object is re-fit for each combination.
for classifier, classifier_name in zip(classifiers, classifier_names):
    print 'beginning classifier ' + classifier_name
    # Start fresh result dicts for this classifier (discards earlier entries under this name).
    scores[classifier_name] = {}
    labeled_results[classifier_name] = {}
    for feature_vector_list, feature_vector_validation_list, feature_name \
    in zip(feature_vector_lists, feature_vector_validation_lists, feature_names):
        scores[classifier_name][feature_name] = {}
        labeled_results[classifier_name][feature_name] = {}

        #for X, y_sample, sample_name in zip(feature_vector_list, y_samples, sample_names):
        #    y_pred = cross_validation.cross_val_predict(classifier, X, y_sample, cv=cv)
        #    scores[classifier_name][feature_name][sample_name] = ( y_sample, y_pred )

        #for X, X_validation, y_sample, y_validation_split, sample_name in \
        #zip(feature_vector_list, feature_vector_validation_list, y_samples, y_validation_splits, sample_names):
        # All zipped lists are per-candidate and in the same candidate order.
        for X, X_validation, y_sample, y_validation_split, df_validation_split, sample_name in \
        zip(feature_vector_list, feature_vector_validation_list, y_samples, y_validation_splits, df_validation_splits, sample_names):
            
            #y_pred = cross_validation.cross_val_predict(classifier, X, y_sample, cv=cv)
            print X.shape, X_validation.shape
            classifier.fit( X, y_sample )
            y_pred = classifier.predict( X_validation )
            # Keep (true labels, predictions) for the reporting cell.
            scores[classifier_name][feature_name][sample_name] = ( y_validation_split, y_pred )
            # Keep a copy of the validation rows with the predictions attached, for export.
            df_withNewLabel = df_validation_split.copy(deep=True)
            df_withNewLabel['predicted'+ '_' + classifier_name + '_' + feature_name + '_' + sample_name] = y_pred
            labeled_results[classifier_name][feature_name][sample_name] = ( df_withNewLabel )

        print '   done with ' + feature_name
    
print 'done training and validating classifiers' 

beginning classifier Multinomial NB
(52283, 60677) (22407, 60677)
(51693, 58368) (22155, 58368)
(49427, 57571) (21183, 57571)
(46049, 53090) (19736, 53090)
(49987, 57003) (21424, 57003)
   done with char3and4grams
(52283, 9732) (22407, 9732)
(51693, 9696) (22155, 9696)
(49427, 9261) (21183, 9261)
(46049, 8310) (19736, 8310)
(49987, 8784) (21424, 8784)
   done with term freq
beginning classifier LinearSVC
(52283, 60677) (22407, 60677)
(51693, 58368) (22155, 58368)
(49427, 57571) (21183, 57571)
(46049, 53090) (19736, 53090)
(49987, 57003) (21424, 57003)
   done with char3and4grams
(52283, 9732) (22407, 9732)
(51693, 9696) (22155, 9696)
(49427, 9261) (21183, 9261)
(46049, 8310) (19736, 8310)
(49987, 8784) (21424, 8784)
   done with term freq
beginning classifier Logistic
(52283, 60677) (22407, 60677)
(51693, 58368) (22155, 58368)
(49427, 57571) (21183, 57571)
(46049, 53090) (19736, 53090)
(49987, 57003) (21424, 57003)
   done with char3and4grams
(52283, 9732) (22407, 9732)
(51693, 9696) (

In [43]:
# Write data with new labels to file.
prefix = "../data/candidatesentiment/output/"
filename = "candidatesentiment_fromDebbie_05-14-16"


i = 0
desired = [0, 1, 2, 3, 4, 5]
for classifier_name in classifier_names:
    for feature_name in feature_names:
        for sample_name in sample_names:
            if (i in desired):
                print classifier_name + ', ' + feature_name + ', ' + sample_name
                df = labeled_results[classifier_name][feature_name][sample_name]
                print df.shape
                new_label_field = 'predicted' + '_' + classifier_name + '_' + feature_name + '_' + sample_name
                df.to_csv(prefix + filename + '_' + new_label_field + '.csv')
            i+=1

print 'done'

Multinomial NB, term freq, Trump
(120, 9)
Multinomial NB, term freq, Sanders
(120, 9)
Multinomial NB, term freq, Clinton
(120, 9)
Multinomial NB, term freq, Cruz
(120, 9)
Multinomial NB, term freq, Rubio
(82, 9)
done


In [122]:
# Train and test classifiers using crossvalidation.
# This will use all the types of sampling, features, and classifiers that were created.
# Alternative to the hold-out validation cell: results go into the same `scores`
# dict, and scores[classifier_name] is reset here, so running this cell replaces
# whatever that classifier had stored before.

cv = 3  # Number of crossvalidation folds to use.
#scores = {}
#score_descriptions = []

for classifier, classifier_name in zip(classifiers, classifier_names):
    print 'beginning classifier ' + classifier_name
    scores[classifier_name] = {}
    for feature_vector_list, feature_name in zip(feature_vector_lists, feature_names):
        scores[classifier_name][feature_name] = {}
        for X, y_sample, sample_name in zip(feature_vector_list, y_samples, sample_names):
            # Store (true labels, cross-validated predictions) for the reporting cell.
            y_pred = cross_validation.cross_val_predict(classifier, X, y_sample, cv=cv)
            scores[classifier_name][feature_name][sample_name] = ( y_sample, y_pred )
            #scores[classifier_name][feature_name][sample_name] = cross_validation.cross_val_score(classifier, X, y_sample, cv=cv)
            #scores.append( cross_validation.cross_val_score(classifier, X, y_sample, cv=cv) )
            #score_descriptions.append( classifier_name + ', ' + feature_name + ', ' + sample_name )
        print '   done with ' + feature_name
    
print 'done training and testing classifiers' 

beginning classifier Multinomial NB
   done with word 1- & 2grams
   done with char 3grams
   done with term freq
   done with tfidf
beginning classifier LinearSVC
   done with word 1- & 2grams
   done with char 3grams
   done with term freq
   done with tfidf
beginning classifier Logistic
   done with word 1- & 2grams
   done with char 3grams
   done with term freq
   done with tfidf
done training and testing classifiers


In [22]:
# Print results.
# For every classifier / feature type / sample stored in `scores`, print the
# per-class classification report followed by the overall accuracy.
# Iteration follows dict order, which need not match the order of training.

for classifier_name, classifier_results in scores.items():
    print classifier_name
    print
    for feature_name, feature_results in classifier_results.items():
        print feature_name
        print
        #for sample_name, sample_scores in feature_results.items():
        for sample_name, results in feature_results.items():
            #print "  " + sample_name + ": mean=" + str(sample_scores.mean()) + ", std=" + str(sample_scores.std())
            print "  " + sample_name
            # Each entry is a (true labels, predicted labels) pair.
            y, y_pred = results
            #print type(results)
            #print confusion_matrix( y, y_pred )
            print classification_report( y, y_pred )
            print metrics.accuracy_score( y, y_pred )
            #print cohen_kappa_score( y, y_pred )
            #print 
        print
    print
print 'done'

LinearSVC

term freq

  Sanders
             precision    recall  f1-score   support

Sanders_neg       0.83      0.82      0.83      9920
Sanders_pos       0.85      0.85      0.85     11263

avg / total       0.84      0.84      0.84     21183

0.839116272483
  Rubio
             precision    recall  f1-score   support

  Rubio_neg       0.86      0.86      0.86      9591
  Rubio_pos       0.86      0.86      0.86     10145

avg / total       0.86      0.86      0.86     19736

0.860204702067
  Clinton
             precision    recall  f1-score   support

Clinton_neg       0.85      0.88      0.87     10978
Clinton_pos       0.88      0.85      0.87     11177

avg / total       0.87      0.87      0.87     22155

0.865989618596
  Trump
             precision    recall  f1-score   support

  Trump_neg       0.82      0.85      0.83     11018
  Trump_pos       0.85      0.82      0.83     11389

avg / total       0.83      0.83      0.83     22407

0.833712679074
  Cruz
             pr

In [254]:
# Define the final featurizer and classifier to use.
# The *_name strings are labels only; keep them in sync with the objects built below.
# The commented-out blocks are the alternatives that were tried.

final_featurizer_name = "char3and4grams"
#final_featurizer = TfidfVectorizer(min_df = 5,   # Min number of docs a token must appear in (if an integer value)
#                             max_df = 0.8,       # Max percent of docs a token can appear in (if a float value)
#                             sublinear_tf = True,   # Sublinear tf scaling: replace tf with 1 + log(tf)
#                             use_idf = False)    # Don't use inverse document frequency weighting
# Same settings as the character 3- and 4-gram feature cell.
final_featurizer = CountVectorizer(ngram_range=(3,4),  # use n-gram sizes from min to max, inclusive
                             min_df = 5,     # min number of docs a token must appear in
                             max_df = .8,    # max percent of docs a token can appear in
                             analyzer='char')  # create character ngrams
                             

final_classifier_name = "LinearSVC"
#final_classifier = MultinomialNB(alpha=1.0,        # Use default Laplacian smoothing
#                                 fit_prior=True)  # Fit priors based on training data
# 
#classifiers.append(BernoulliNB(alpha=1.0,        # Use default Laplacian smoothing
#                               binarize=0,       # Threshold for binarizing the input features
#                               fit_prior=True))  # Fit priors based on training data
# Note: seeded with 2016 here, whereas the evaluation cell's LinearSVC uses random_state=0.
final_classifier = LinearSVC(random_state=2016)
#classifiers.append(SGDClassifier(loss='hinge', 
#                                 penalty='l2', 
#                                alpha=1e-3, 
#                                 n_iter=5, 
#                                 random_state=0))
#final_classifier = LogisticRegression()

print 'done'

done


In [255]:
# Check that the main df is still ok.
# Guards against `df` having been rebound or modified by another cell: the class
# counts printed here should match those printed right after the labels were built.
# Show the class frequencies.
counts = df[label_field].value_counts()
total = counts.sum()

# Replace each raw count with a (count, fraction-of-total) pair of display strings.
counts = counts.apply(lambda x: ("Count: %d" % x, "Frequency: %.2f" % (float(x)/total)) )

#print("%.2f" % a)
#string = 'string%d' % (i,)

print counts
print type(counts)

Trump_pos      (Count: 37962, Frequency: 0.11)
Cruz_pos       (Count: 37668, Frequency: 0.11)
Sanders_pos    (Count: 37544, Frequency: 0.11)
Clinton_pos    (Count: 37254, Frequency: 0.10)
Trump_neg      (Count: 36728, Frequency: 0.10)
Clinton_neg    (Count: 36594, Frequency: 0.10)
Rubio_pos      (Count: 33815, Frequency: 0.09)
Cruz_neg       (Count: 33743, Frequency: 0.09)
Sanders_neg    (Count: 33066, Frequency: 0.09)
Rubio_neg      (Count: 31970, Frequency: 0.09)
dtype: object
<class 'pandas.core.series.Series'>


In [256]:
# Locate output directory.
# NOTE: this literal path is the same directory that `prefix` names; keep the two in sync.
%ls ../classification/candidatesentiment/

LinearSVC_char3and4grams_Clinton_05-21-16.pkl                   Logistic_regular_termFrequency_Cruz_05-22-16.pkl_03.npy
LinearSVC_char3and4grams_Clinton_05-21-16.pkl_01.npy            Logistic_regular_termFrequency_Cruz_Deb-05-22-16.pkl
LinearSVC_char3and4grams_Clinton_05-21-16.pkl_02.npy            Logistic_regular_termFrequency_Cruz_Deb-05-22-16.pkl_01.npy
LinearSVC_char3and4grams_Cruz_05-21-16.pkl                      Logistic_regular_termFrequency_Cruz_Deb-05-22-16.pkl_02.npy
LinearSVC_char3and4grams_Cruz_05-21-16.pkl_01.npy               Logistic_regular_termFrequency_Cruz_Deb-05-22-16.pkl_03.npy
LinearSVC_char3and4grams_Cruz_05-21-16.pkl_02.npy               Logistic_regular_termFrequency_Rubio_05-22-16.pkl
LinearSVC_char3and4grams_Rubio_05-21-16.pkl                     Logistic_regular_termFrequency_Rubio_05-22-16.pkl_01.npy
LinearSVC_char3and4grams_Rubio_05-21-16.pkl_01.npy              Logistic_regular_termFrequency_Rubio_05-22-16.pkl_02.npy
LinearSVC_char3and4grams_Ru

In [257]:
# Output directory (relative path, with trailing slash) that the save loop
# prepends to every featurizer/classifier pickle filename.
prefix = "../classification/candidatesentiment/"

In [264]:
# Show the candidate names that the per-candidate training loop iterates over.
print candidates

['Trump' 'Clinton' 'Sanders' 'Cruz' 'Rubio']


In [262]:
# For each candidate: fit the featurizer on that candidate's entire sample,
# train the classifier on the resulting features, and pickle both to disk.
#
# NOTE: the same `final_featurizer` / `final_classifier` objects are refit on
# every pass.  Each pickle therefore captures the state right after that
# candidate's fit, and once the loop ends both objects hold the fit for the
# last candidate only.

date = "05-25-16"   # version tag embedded in every output filename


def _pickle_to(obj, path):
    # Serialize `obj` to `path`; the with-block closes the file even on error.
    # (joblib.dump(obj, path) is the alternative that was tried earlier.)
    with open(path, 'wb') as f:
        pickle.dump(obj, f)


for candidate in candidates:
    print(candidate)

    # Restrict the data to the rows for this candidate.
    candidate_df = df[df['candidate'] == candidate]

    # Fit the featurizer on this candidate's text and vectorize it in one step.
    X = final_featurizer.fit_transform(candidate_df[clean_text])

    # Train the classifier on those features and their labels.
    y = candidate_df[label_field]
    final_classifier.fit(X, y)

    # Write both fitted objects to file.  os.path.join yields exactly
    # prefix + filename when `prefix` ends in a separator, and still builds a
    # valid path if the trailing separator is ever dropped.
    featurizer_filename = final_featurizer_name + "_" + candidate + "_" + date + ".pkl"
    featurizer_path = os.path.join(prefix, featurizer_filename)
    _pickle_to(final_featurizer, featurizer_path)
    print("Wrote featurizer to file: " + featurizer_path)

    classifier_filename = final_classifier_name + "_" + final_featurizer_name + "_" + candidate + "_" + date + ".pkl"
    classifier_path = os.path.join(prefix, classifier_filename)
    _pickle_to(final_classifier, classifier_path)
    print("Wrote classifier to file: " + classifier_path)

print('done')

Trump
Wrote featurizer to file: ../classification/candidatesentiment/char3and4grams_Trump_05-25-16.pkl
Wrote classifier to file: ../classification/candidatesentiment/LinearSVC_char3and4grams_Trump_05-25-16.pkl
Clinton
Wrote featurizer to file: ../classification/candidatesentiment/char3and4grams_Clinton_05-25-16.pkl
Wrote classifier to file: ../classification/candidatesentiment/LinearSVC_char3and4grams_Clinton_05-25-16.pkl
Sanders
Wrote featurizer to file: ../classification/candidatesentiment/char3and4grams_Sanders_05-25-16.pkl
Wrote classifier to file: ../classification/candidatesentiment/LinearSVC_char3and4grams_Sanders_05-25-16.pkl
Cruz
Wrote featurizer to file: ../classification/candidatesentiment/char3and4grams_Cruz_05-25-16.pkl
Wrote classifier to file: ../classification/candidatesentiment/LinearSVC_char3and4grams_Cruz_05-25-16.pkl
Rubio
Wrote featurizer to file: ../classification/candidatesentiment/char3and4grams_Rubio_05-25-16.pkl
Wrote classifier to file: ../classification/cand

In [263]:
# Verify that the files were written to the output directory.
# NOTE: this literal path is the same directory that `prefix` names; keep the two in sync.
%ls ../classification/candidatesentiment/

LinearSVC_char3and4grams_Clinton_05-21-16.pkl                   Logistic_regular_termFrequency_Cruz_05-22-16.pkl_03.npy
LinearSVC_char3and4grams_Clinton_05-21-16.pkl_01.npy            Logistic_regular_termFrequency_Cruz_Deb-05-22-16.pkl
LinearSVC_char3and4grams_Clinton_05-21-16.pkl_02.npy            Logistic_regular_termFrequency_Cruz_Deb-05-22-16.pkl_01.npy
LinearSVC_char3and4grams_Cruz_05-21-16.pkl                      Logistic_regular_termFrequency_Cruz_Deb-05-22-16.pkl_02.npy
LinearSVC_char3and4grams_Cruz_05-21-16.pkl_01.npy               Logistic_regular_termFrequency_Cruz_Deb-05-22-16.pkl_03.npy
LinearSVC_char3and4grams_Cruz_05-21-16.pkl_02.npy               Logistic_regular_termFrequency_Rubio_05-22-16.pkl
LinearSVC_char3and4grams_Rubio_05-21-16.pkl                     Logistic_regular_termFrequency_Rubio_05-22-16.pkl_01.npy
LinearSVC_char3and4grams_Rubio_05-21-16.pkl_01.npy              Logistic_regular_termFrequency_Rubio_05-22-16.pkl_02.npy
LinearSVC_char3and4grams_Ru

Results on the large dataset: 1000 positive and 1000 negative tweets per candidate per partition (40 partitions).
05-21-16


LinearSVC

tfidf

  Sanders
[[8110 1810]
 [1662 9601]]
             precision    recall  f1-score   support

Sanders_neg       0.83      0.82      0.82      9920
Sanders_pos       0.84      0.85      0.85     11263

avg / total       0.84      0.84      0.84     21183

0.836094981825
  Rubio
[[8180 1411]
 [1344 8801]]
             precision    recall  f1-score   support

  Rubio_neg       0.86      0.85      0.86      9591
  Rubio_pos       0.86      0.87      0.86     10145

avg / total       0.86      0.86      0.86     19736

0.860407377381
  Clinton
[[9670 1308]
 [1635 9542]]
             precision    recall  f1-score   support

Clinton_neg       0.86      0.88      0.87     10978
Clinton_pos       0.88      0.85      0.87     11177

avg / total       0.87      0.87      0.87     22155

0.867163168585
  Trump
[[9254 1764]
 [2013 9376]]
             precision    recall  f1-score   support

  Trump_neg       0.82      0.84      0.83     11018
  Trump_pos       0.84      0.82      0.83     11389

avg / total       0.83      0.83      0.83     22407

0.831436604632
  Cruz
[[8729 1394]
 [1585 9716]]
             precision    recall  f1-score   support

   Cruz_neg       0.85      0.86      0.85     10123
   Cruz_pos       0.87      0.86      0.87     11301

avg / total       0.86      0.86      0.86     21424

0.860950336072

term freq

  Sanders
[[8162 1758]
 [1650 9613]]
             precision    recall  f1-score   support

Sanders_neg       0.83      0.82      0.83      9920
Sanders_pos       0.85      0.85      0.85     11263

avg / total       0.84      0.84      0.84     21183

0.839116272483
  Rubio
[[8212 1379]
 [1380 8765]]
             precision    recall  f1-score   support

  Rubio_neg       0.86      0.86      0.86      9591
  Rubio_pos       0.86      0.86      0.86     10145

avg / total       0.86      0.86      0.86     19736

0.860204702067
  Clinton
[[9657 1321]
 [1648 9529]]
             precision    recall  f1-score   support

Clinton_neg       0.85      0.88      0.87     10978
Clinton_pos       0.88      0.85      0.87     11177

avg / total       0.87      0.87      0.87     22155

0.865989618596
  Trump
[[9353 1665]
 [2061 9328]]
             precision    recall  f1-score   support

  Trump_neg       0.82      0.85      0.83     11018
  Trump_pos       0.85      0.82      0.83     11389

avg / total       0.83      0.83      0.83     22407

0.833712679074
  Cruz
[[8810 1313]
 [1665 9636]]
             precision    recall  f1-score   support

   Cruz_neg       0.84      0.87      0.86     10123
   Cruz_pos       0.88      0.85      0.87     11301

avg / total       0.86      0.86      0.86     21424

0.860997012696

word 1- & 2grams

  Sanders
[[7909 2011]
 [1917 9346]]
             precision    recall  f1-score   support

Sanders_neg       0.80      0.80      0.80      9920
Sanders_pos       0.82      0.83      0.83     11263

avg / total       0.81      0.81      0.81     21183

0.81456828589
  Rubio
[[8108 1483]
 [1428 8717]]
             precision    recall  f1-score   support

  Rubio_neg       0.85      0.85      0.85      9591
  Rubio_pos       0.85      0.86      0.86     10145

avg / total       0.85      0.85      0.85     19736

0.85250304013
  Clinton
[[9417 1561]
 [1765 9412]]
             precision    recall  f1-score   support

Clinton_neg       0.84      0.86      0.85     10978
Clinton_pos       0.86      0.84      0.85     11177

avg / total       0.85      0.85      0.85     22155

0.84987587452
  Trump
[[8868 2150]
 [2190 9199]]
             precision    recall  f1-score   support

  Trump_neg       0.80      0.80      0.80     11018
  Trump_pos       0.81      0.81      0.81     11389

avg / total       0.81      0.81      0.81     22407

0.80631052796
  Cruz
[[8453 1670]
 [1600 9701]]
             precision    recall  f1-score   support

   Cruz_neg       0.84      0.84      0.84     10123
   Cruz_pos       0.85      0.86      0.86     11301

avg / total       0.85      0.85      0.85     21424

0.847367438387

char 3grams

  Sanders
[[7893 2027]
 [1972 9291]]
             precision    recall  f1-score   support

Sanders_neg       0.80      0.80      0.80      9920
Sanders_pos       0.82      0.82      0.82     11263

avg / total       0.81      0.81      0.81     21183

0.811216541566
  Rubio
[[7932 1659]
 [1480 8665]]
             precision    recall  f1-score   support

  Rubio_neg       0.84      0.83      0.83      9591
  Rubio_pos       0.84      0.85      0.85     10145

avg / total       0.84      0.84      0.84     19736

0.840950547223
  Clinton
[[9172 1806]
 [1754 9423]]
             precision    recall  f1-score   support

Clinton_neg       0.84      0.84      0.84     10978
Clinton_pos       0.84      0.84      0.84     11177

avg / total       0.84      0.84      0.84     22155

0.839313924622
  Trump
[[8863 2155]
 [2218 9171]]
             precision    recall  f1-score   support

  Trump_neg       0.80      0.80      0.80     11018
  Trump_pos       0.81      0.81      0.81     11389

avg / total       0.80      0.80      0.80     22407

0.80483777391
  Cruz
[[8309 1814]
 [1657 9644]]
             precision    recall  f1-score   support

   Cruz_neg       0.83      0.82      0.83     10123
   Cruz_pos       0.84      0.85      0.85     11301

avg / total       0.84      0.84      0.84     21424

0.837985436893


Multinomial NB

tfidf

  Sanders
[[8007 1913]
 [1594 9669]]
             precision    recall  f1-score   support

Sanders_neg       0.83      0.81      0.82      9920
Sanders_pos       0.83      0.86      0.85     11263

avg / total       0.83      0.83      0.83     21183

0.834442713497
  Rubio
[[7765 1826]
 [1385 8760]]
             precision    recall  f1-score   support

  Rubio_neg       0.85      0.81      0.83      9591
  Rubio_pos       0.83      0.86      0.85     10145

avg / total       0.84      0.84      0.84     19736

0.837302391569
  Clinton
[[9602 1376]
 [1711 9466]]
             precision    recall  f1-score   support

Clinton_neg       0.85      0.87      0.86     10978
Clinton_pos       0.87      0.85      0.86     11177

avg / total       0.86      0.86      0.86     22155

0.860663507109
  Trump
[[9165 1853]
 [2147 9242]]
             precision    recall  f1-score   support

  Trump_neg       0.81      0.83      0.82     11018
  Trump_pos       0.83      0.81      0.82     11389

avg / total       0.82      0.82      0.82     22407

0.821484357567
  Cruz
[[8794 1329]
 [2071 9230]]
             precision    recall  f1-score   support

   Cruz_neg       0.81      0.87      0.84     10123
   Cruz_pos       0.87      0.82      0.84     11301

avg / total       0.84      0.84      0.84     21424

0.841299477222

term freq

  Sanders
[[7877 2043]
 [1598 9665]]
             precision    recall  f1-score   support

Sanders_neg       0.83      0.79      0.81      9920
Sanders_pos       0.83      0.86      0.84     11263

avg / total       0.83      0.83      0.83     21183

0.828116886182
  Rubio
[[7680 1911]
 [1480 8665]]
             precision    recall  f1-score   support

  Rubio_neg       0.84      0.80      0.82      9591
  Rubio_pos       0.82      0.85      0.84     10145

avg / total       0.83      0.83      0.83     19736

0.828182002432
  Clinton
[[9460 1518]
 [1774 9403]]
             precision    recall  f1-score   support

Clinton_neg       0.84      0.86      0.85     10978
Clinton_pos       0.86      0.84      0.85     11177

avg / total       0.85      0.85      0.85     22155

0.851410516813
  Trump
[[9146 1872]
 [2200 9189]]
             precision    recall  f1-score   support

  Trump_neg       0.81      0.83      0.82     11018
  Trump_pos       0.83      0.81      0.82     11389

avg / total       0.82      0.82      0.82     22407

0.818271076003
  Cruz
[[8777 1346]
 [2261 9040]]
             precision    recall  f1-score   support

   Cruz_neg       0.80      0.87      0.83     10123
   Cruz_pos       0.87      0.80      0.83     11301

avg / total       0.83      0.83      0.83     21424

0.831637415982

word 1- & 2grams

  Sanders
[[8375 1545]
 [1764 9499]]
             precision    recall  f1-score   support

Sanders_neg       0.83      0.84      0.84      9920
Sanders_pos       0.86      0.84      0.85     11263

avg / total       0.84      0.84      0.84     21183

0.843789831469
  Rubio
[[8161 1430]
 [1389 8756]]
             precision    recall  f1-score   support

  Rubio_neg       0.85      0.85      0.85      9591
  Rubio_pos       0.86      0.86      0.86     10145

avg / total       0.86      0.86      0.86     19736

0.857164572355
  Clinton
[[9659 1319]
 [1643 9534]]
             precision    recall  f1-score   support

Clinton_neg       0.85      0.88      0.87     10978
Clinton_pos       0.88      0.85      0.87     11177

avg / total       0.87      0.87      0.87     22155

0.866305574362
  Trump
[[9526 1492]
 [2207 9182]]
             precision    recall  f1-score   support

  Trump_neg       0.81      0.86      0.84     11018
  Trump_pos       0.86      0.81      0.83     11389

avg / total       0.84      0.83      0.83     22407

0.83491765966
  Cruz
[[9166  957]
 [2259 9042]]
             precision    recall  f1-score   support

   Cruz_neg       0.80      0.91      0.85     10123
   Cruz_pos       0.90      0.80      0.85     11301

avg / total       0.86      0.85      0.85     21424

0.849887976102

char 3grams

  Sanders
[[8190 1730]
 [2544 8719]]
             precision    recall  f1-score   support

Sanders_neg       0.76      0.83      0.79      9920
Sanders_pos       0.83      0.77      0.80     11263

avg / total       0.80      0.80      0.80     21183

0.798234433272
  Rubio
[[7618 1973]
 [1870 8275]]
             precision    recall  f1-score   support

  Rubio_neg       0.80      0.79      0.80      9591
  Rubio_pos       0.81      0.82      0.81     10145

avg / total       0.81      0.81      0.81     19736

0.805279691934
  Clinton
[[9204 1774]
 [2085 9092]]
             precision    recall  f1-score   support

Clinton_neg       0.82      0.84      0.83     10978
Clinton_pos       0.84      0.81      0.82     11177

avg / total       0.83      0.83      0.83     22155

0.825818099752
  Trump
[[9465 1553]
 [3125 8264]]
             precision    recall  f1-score   support

  Trump_neg       0.75      0.86      0.80     11018
  Trump_pos       0.84      0.73      0.78     11389

avg / total       0.80      0.79      0.79     22407

0.791225956174
  Cruz
[[8935 1188]
 [3369 7932]]
             precision    recall  f1-score   support

   Cruz_neg       0.73      0.88      0.80     10123
   Cruz_pos       0.87      0.70      0.78     11301

avg / total       0.80      0.79      0.79     21424

0.787294622853


Bernoulli NB

tfidf

  Sanders
[[8182 1738]
 [1833 9430]]
             precision    recall  f1-score   support

Sanders_neg       0.82      0.82      0.82      9920
Sanders_pos       0.84      0.84      0.84     11263

avg / total       0.83      0.83      0.83     21183

0.831421422839
  Rubio
[[7985 1606]
 [1568 8577]]
             precision    recall  f1-score   support

  Rubio_neg       0.84      0.83      0.83      9591
  Rubio_pos       0.84      0.85      0.84     10145

avg / total       0.84      0.84      0.84     19736

0.839177138225
  Clinton
[[9503 1475]
 [1695 9482]]
             precision    recall  f1-score   support

Clinton_neg       0.85      0.87      0.86     10978
Clinton_pos       0.87      0.85      0.86     11177

avg / total       0.86      0.86      0.86     22155

0.856917174453
  Trump
[[9307 1711]
 [2211 9178]]
             precision    recall  f1-score   support

  Trump_neg       0.81      0.84      0.83     11018
  Trump_pos       0.84      0.81      0.82     11389

avg / total       0.83      0.82      0.82     22407

0.824965412594
  Cruz
[[8889 1234]
 [2350 8951]]
             precision    recall  f1-score   support

   Cruz_neg       0.79      0.88      0.83     10123
   Cruz_pos       0.88      0.79      0.83     11301

avg / total       0.84      0.83      0.83     21424

0.832710978342

term freq

  Sanders
[[8182 1738]
 [1833 9430]]
             precision    recall  f1-score   support

Sanders_neg       0.82      0.82      0.82      9920
Sanders_pos       0.84      0.84      0.84     11263

avg / total       0.83      0.83      0.83     21183

0.831421422839
  Rubio
[[7985 1606]
 [1568 8577]]
             precision    recall  f1-score   support

  Rubio_neg       0.84      0.83      0.83      9591
  Rubio_pos       0.84      0.85      0.84     10145

avg / total       0.84      0.84      0.84     19736

0.839177138225
  Clinton
[[9503 1475]
 [1695 9482]]
             precision    recall  f1-score   support

Clinton_neg       0.85      0.87      0.86     10978
Clinton_pos       0.87      0.85      0.86     11177

avg / total       0.86      0.86      0.86     22155

0.856917174453
  Trump
[[9307 1711]
 [2211 9178]]
             precision    recall  f1-score   support

  Trump_neg       0.81      0.84      0.83     11018
  Trump_pos       0.84      0.81      0.82     11389

avg / total       0.83      0.82      0.82     22407

0.824965412594
  Cruz
[[8889 1234]
 [2350 8951]]
             precision    recall  f1-score   support

   Cruz_neg       0.79      0.88      0.83     10123
   Cruz_pos       0.88      0.79      0.83     11301

avg / total       0.84      0.83      0.83     21424

0.832710978342

word 1- & 2grams

  Sanders
[[8341 1579]
 [1758 9505]]
             precision    recall  f1-score   support

Sanders_neg       0.83      0.84      0.83      9920
Sanders_pos       0.86      0.84      0.85     11263

avg / total       0.84      0.84      0.84     21183

0.842468016806
  Rubio
[[8270 1321]
 [1480 8665]]
             precision    recall  f1-score   support

  Rubio_neg       0.85      0.86      0.86      9591
  Rubio_pos       0.87      0.85      0.86     10145

avg / total       0.86      0.86      0.86     19736

0.858076611269
  Clinton
[[9638 1340]
 [1661 9516]]
             precision    recall  f1-score   support

Clinton_neg       0.85      0.88      0.87     10978
Clinton_pos       0.88      0.85      0.86     11177

avg / total       0.86      0.86      0.86     22155

0.864545249379
  Trump
[[9591 1427]
 [2232 9157]]
             precision    recall  f1-score   support

  Trump_neg       0.81      0.87      0.84     11018
  Trump_pos       0.87      0.80      0.83     11389

avg / total       0.84      0.84      0.84     22407

0.836702816084
  Cruz
[[9115 1008]
 [2264 9037]]
             precision    recall  f1-score   support

   Cruz_neg       0.80      0.90      0.85     10123
   Cruz_pos       0.90      0.80      0.85     11301

avg / total       0.85      0.85      0.85     21424

0.847274085138

char 3grams

  Sanders
[[8170 1750]
 [2585 8678]]
             precision    recall  f1-score   support

Sanders_neg       0.76      0.82      0.79      9920
Sanders_pos       0.83      0.77      0.80     11263

avg / total       0.80      0.80      0.80     21183

0.795354765614
  Rubio
[[7644 1947]
 [1939 8206]]
             precision    recall  f1-score   support

  Rubio_neg       0.80      0.80      0.80      9591
  Rubio_pos       0.81      0.81      0.81     10145

avg / total       0.80      0.80      0.80     19736

0.803100932306
  Clinton
[[9204 1774]
 [2250 8927]]
             precision    recall  f1-score   support

Clinton_neg       0.80      0.84      0.82     10978
Clinton_pos       0.83      0.80      0.82     11177

avg / total       0.82      0.82      0.82     22155

0.818370570977
  Trump
[[9418 1600]
 [3104 8285]]
             precision    recall  f1-score   support

  Trump_neg       0.75      0.85      0.80     11018
  Trump_pos       0.84      0.73      0.78     11389

avg / total       0.80      0.79      0.79     22407

0.790065604499
  Cruz
[[8890 1233]
 [3348 7953]]
             precision    recall  f1-score   support

   Cruz_neg       0.73      0.88      0.80     10123
   Cruz_pos       0.87      0.70      0.78     11301

avg / total       0.80      0.79      0.79     21424

0.786174383869


Logistic

tfidf

  Sanders
[[7465 2455]
 [2184 9079]]
             precision    recall  f1-score   support

Sanders_neg       0.77      0.75      0.76      9920
Sanders_pos       0.79      0.81      0.80     11263

avg / total       0.78      0.78      0.78     21183

0.78100363499
  Rubio
[[7509 2082]
 [2277 7868]]
             precision    recall  f1-score   support

  Rubio_neg       0.77      0.78      0.78      9591
  Rubio_pos       0.79      0.78      0.78     10145

avg / total       0.78      0.78      0.78     19736

0.779134576409
  Clinton
[[9400 1578]
 [2996 8181]]
             precision    recall  f1-score   support

Clinton_neg       0.76      0.86      0.80     10978
Clinton_pos       0.84      0.73      0.78     11177

avg / total       0.80      0.79      0.79     22155

0.793545475062
  Trump
[[9719 1299]
 [4103 7286]]
             precision    recall  f1-score   support

  Trump_neg       0.70      0.88      0.78     11018
  Trump_pos       0.85      0.64      0.73     11389

avg / total       0.78      0.76      0.76     22407

0.758914624894
  Cruz
[[9083 1040]
 [3686 7615]]
             precision    recall  f1-score   support

   Cruz_neg       0.71      0.90      0.79     10123
   Cruz_pos       0.88      0.67      0.76     11301

avg / total       0.80      0.78      0.78     21424

0.779406273338

term freq

  Sanders
[[7491 2429]
 [2928 8335]]
             precision    recall  f1-score   support

Sanders_neg       0.72      0.76      0.74      9920
Sanders_pos       0.77      0.74      0.76     11263

avg / total       0.75      0.75      0.75     21183

0.747108530425
  Rubio
[[6993 2598]
 [2560 7585]]
             precision    recall  f1-score   support

  Rubio_neg       0.73      0.73      0.73      9591
  Rubio_pos       0.74      0.75      0.75     10145

avg / total       0.74      0.74      0.74     19736

0.738650182408
  Clinton
[[9182 1796]
 [3824 7353]]
             precision    recall  f1-score   support

Clinton_neg       0.71      0.84      0.77     10978
Clinton_pos       0.80      0.66      0.72     11177

avg / total       0.76      0.75      0.74     22155

0.746332656285
  Trump
[[9496 1522]
 [4563 6826]]
             precision    recall  f1-score   support

  Trump_neg       0.68      0.86      0.76     11018
  Trump_pos       0.82      0.60      0.69     11389

avg / total       0.75      0.73      0.72     22407

0.728433078949
  Cruz
[[8331 1792]
 [3622 7679]]
             precision    recall  f1-score   support

   Cruz_neg       0.70      0.82      0.75     10123
   Cruz_pos       0.81      0.68      0.74     11301

avg / total       0.76      0.75      0.75     21424

0.747292755788

word 1- & 2grams

  Sanders
[[8174 1746]
 [1732 9531]]
             precision    recall  f1-score   support

Sanders_neg       0.83      0.82      0.82      9920
Sanders_pos       0.85      0.85      0.85     11263

avg / total       0.84      0.84      0.84     21183

0.835811735826
  Rubio
[[8417 1174]
 [1659 8486]]
             precision    recall  f1-score   support

  Rubio_neg       0.84      0.88      0.86      9591
  Rubio_pos       0.88      0.84      0.86     10145

avg / total       0.86      0.86      0.86     19736

0.856455208756
  Clinton
[[9793 1185]
 [1920 9257]]
             precision    recall  f1-score   support

Clinton_neg       0.84      0.89      0.86     10978
Clinton_pos       0.89      0.83      0.86     11177

avg / total       0.86      0.86      0.86     22155

0.859851049425
  Trump
[[9620 1398]
 [2429 8960]]
             precision    recall  f1-score   support

  Trump_neg       0.80      0.87      0.83     11018
  Trump_pos       0.87      0.79      0.82     11389

avg / total       0.83      0.83      0.83     22407

0.829205159102
  Cruz
[[8927 1196]
 [1850 9451]]
             precision    recall  f1-score   support

   Cruz_neg       0.83      0.88      0.85     10123
   Cruz_pos       0.89      0.84      0.86     11301

avg / total       0.86      0.86      0.86     21424

0.85782300224

char 3grams

  Sanders
[[8113 1807]
 [2142 9121]]
             precision    recall  f1-score   support

Sanders_neg       0.79      0.82      0.80      9920
Sanders_pos       0.83      0.81      0.82     11263

avg / total       0.81      0.81      0.81     21183

0.813576924893
  Rubio
[[8030 1561]
 [1713 8432]]
             precision    recall  f1-score   support

  Rubio_neg       0.82      0.84      0.83      9591
  Rubio_pos       0.84      0.83      0.84     10145

avg / total       0.83      0.83      0.83     19736

0.834110255371
  Clinton
[[9313 1665]
 [1826 9351]]
             precision    recall  f1-score   support

Clinton_neg       0.84      0.85      0.84     10978
Clinton_pos       0.85      0.84      0.84     11177

avg / total       0.84      0.84      0.84     22155

0.842428345746
  Trump
[[9032 1986]
 [2269 9120]]
             precision    recall  f1-score   support

  Trump_neg       0.80      0.82      0.81     11018
  Trump_pos       0.82      0.80      0.81     11389

avg / total       0.81      0.81      0.81     22407

0.810103985362
  Cruz
[[8973 1150]
 [2432 8869]]
             precision    recall  f1-score   support

   Cruz_neg       0.79      0.89      0.83     10123
   Cruz_pos       0.89      0.78      0.83     11301

avg / total       0.84      0.83      0.83     21424

0.832804331591


done
