In [1]:
# Purpose:  Use supervised learning to train a classifier to predict Kaggle-assigned tweet sentiment.
#           Compare a variety of sample sizes, types of features, and classifiers.
#           Save the chosen classifier and featurizer to disk.
# Author:  Carol Sniegoski
# Date:  04/27/16
# Course:  MAS DSE capstone, Spring 2016

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
%matplotlib inline
import math

pd.set_option('display.max_colwidth', 240)
pd.set_option('display.max_columns', 50)

import sys
import os
import time
import datetime

from sklearn.externals import joblib

from sklearn.feature_extraction.text import TfidfVectorizer  # for featurizing using term frequencies
from sklearn.feature_extraction.text import CountVectorizer  # for featurizing using word n-grams

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
import sklearn.metrics as metrics

from sklearn import cross_validation
from sklearn.cross_validation import KFold

from sklearn.multiclass import OneVsOneClassifier
from sklearn.multiclass import OneVsRestClassifier

from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB

from sklearn.tree import DecisionTreeClassifier

from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier

from sklearn.ensemble import RandomForestClassifier 
from sklearn.ensemble import AdaBoostClassifier

#from textblob import TextBlob
#from textblob import Blobber
#from textblob.taggers import NLTKTagger
#from textblob.sentiments import NaiveBayesAnalyzer

print 'done'

done


In [23]:
# Function defs.

# return string with non-ascii chars removed
def to_ascii(s):
    returnstr = s.strip()
    returnstr = "".join([ch for ch in returnstr if ord(ch)< 128])
    return returnstr

# Return df containing 'fraction' fraction of the original df,
# with each value in column 'col' equally represented.
def sample_equally( df, col, fraction ):
    n = int(df.shape[0] * fraction)  # Get the total number of records to sample.
    vals = pd.unique(df[col].values.ravel())  # Get the class labels.
    n_to_sample = int(n/len(vals))   # Get the number of records to sample from each class.
    
    samples = []
    for val in vals:
        #samples.append( df[df[col]==val].sample(n=n_to_sample) )  # This should work in python 0.16.1
        rows = np.random.choice(df[df[col]==val].index.values, n_to_sample)
        sampled_df = df.ix[rows]
        samples.append(sampled_df)
    
    result = pd.concat(samples)

    return result

# From newer version of python.
def cohen_kappa_score(y1, y2, labels=None, weights=None):
    confusion = confusion_matrix(y1, y2, labels=labels)
    n_classes = confusion.shape[0]
    sum0 = np.sum(confusion, axis=0)
    sum1 = np.sum(confusion, axis=1)
    expected = np.outer(sum0, sum1) / np.sum(sum0)

    if weights is None:
        w_mat = np.ones([n_classes, n_classes], dtype=np.int)
        w_mat.flat[:: n_classes + 1] = 0
    elif weights == "linear" or weights == "quadratic":
        w_mat = np.zeros([n_classes, n_classes], dtype=np.int)
        w_mat += np.arange(n_classes)
        if weights == "linear":
            w_mat = np.abs(w_mat - w_mat.T)
        else:
            w_mat = (w_mat - w_mat.T) ** 2
    else:
        raise ValueError("Unknown kappa weighting type.")

    k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
    return 1 - k

print 'done'

done


In [5]:
# Locate the data.
%ls ../data/Kaggle/Kaggle_1stGOPDebateTweets/

Kaggle_topHashtagsByTopic_04-07-16.csv  database.sqlite
Sentiment.csv                           hashes.txt


In [6]:
# Set the names of the cleaned text field and the class label field,
# to be used in the sampling, featurizing, and classification steps.
clean_text = 'ascii_clean'  # This is where the cleaned text will be put.
label_field = 'K_sentiment' # This is where the labels will be put.

print 'done'

done


In [8]:
# Populate the clean_text field.
# Load data file & clean the 'text' column.

prefix = "../data/Kaggle/Kaggle_1stGOPDebateTweets/"
df = pd.read_csv(prefix + "Sentiment.csv")

# Convert to ascii
df['ascii'] = df['text'].apply(to_ascii)
df[clean_text] = df['ascii']

# Remove hashtags
#df[clean_text] = df[clean_text].str.replace(r"#([A-Za-z0-9_]+)", " ")

# Remove handles
#df[clean_text] = df[clean_text].str.replace(r"@([A-Za-z0-9_]+)", " ")

# Remove URLs
#df[clean_text] = df[clean_text].str.replace(r"http([A-Za-z0-9_:.\/]+)", " ")
# Replace URLs with "URL"
df[clean_text] = df[clean_text].str.replace(r"http([A-Za-z0-9_:.\/]+)", "URL")

# Remove punctuation symbols - but not @ or #
df[clean_text] = df[clean_text].str.replace(r"(['';:%()\+\*\"\…\“\”])", "")
#df[clean_text] = df[clean_text].str.replace(r"(['';:@%#()\+\*\"\…\“\”])", "")
#df[clean_text] = df[clean_text].str.replace(r"([;:@%#()\+\*\"\…\“\”])", "")

# Remove eol symbols
df[clean_text] = df[clean_text].str.replace(r"\n", " ")

# Remove &x symbols
df[clean_text] = df[clean_text].str.replace(r"&[a-z]+", " ")

# Convert to lowercase
df[clean_text] = df[clean_text].str.lower()

#print df[df['ascii']!=df[clean_text]][['ascii', 'K_sentiment']].head(10)
df[['text', clean_text]].head(3)
#df[df['ascii'].str.contains("\n")][['text', 'ascii']].head(10)

Unnamed: 0,text,ascii_clean
0,RT @NancyLeeGrahn: How did everyone feel about the Climate Change question last night? Exactly. #GOPDebate,rt @nancyleegrahn how did everyone feel about the climate change question last night? exactly. #gopdebate
1,RT @ScottWalker: Didn't catch the full #GOPdebate last night. Here are some of Scott's best lines in 90 seconds. #Walker16 http://t.co/ZSfF…,rt @scottwalker didnt catch the full #gopdebate last night. here are some of scotts best lines in 90 seconds. #walker16 url
2,RT @TJMShow: No mention of Tamir Rice and the #GOPDebate was held in Cleveland? Wow.,rt @tjmshow no mention of tamir rice and the #gopdebate was held in cleveland? wow.


In [9]:
# Populate the label_field.
df[label_field] = df['sentiment']

# Show the class frequencies.
counts = df[label_field].value_counts()
total = counts.sum()

counts = counts.apply(lambda x: ("Count: %d" % x, "Frequency: %.2f" % (float(x)/total)) )

#print("%.2f" % a)
#string = 'string%d' % (i,)

print counts
print type(counts)

Negative    (Count: 8493, Frequency: 0.61)
Neutral     (Count: 3142, Frequency: 0.23)
Positive    (Count: 2236, Frequency: 0.16)
dtype: object
<class 'pandas.core.series.Series'>


In [66]:
# Sample the data so as to get equal numbers of each class.
# The sizes of the sampled dataset will be the following fractions of the size of the original dataset.
data_fractions = [.75] 

# Include the original unsampled data.
df_samples = [ df ]
sample_names = [ 'original data' ]

# Now add additional samples using random over- or undersampling so as to obtain equal numbers of each class. 
for data_fraction in data_fractions:
    df_samples.append(sample_equally( df, label_field, data_fraction ))
    sample_names.append( str(data_fraction)+' random sample' )

#for df_sample in df_samples:
for sample_name, df_sample in zip(sample_names, df_samples):
    print sample_name
    print df_sample.shape
    print df_sample[label_field].value_counts()
    print

print 'done'

original data
(13871, 24)
Negative    8493
Neutral     3142
Positive    2236
dtype: int64

0.75 random sample
(10401, 24)
Positive    3467
Neutral     3467
Negative    3467
dtype: int64

done


In [67]:
# Convert sampled data & their class labels into ndarrays.
X_samples = []
y_samples = []

for df_sample in df_samples:
    X_samples.append(df_sample[clean_text].values)
    y_samples.append(df_sample[label_field].values)

print type(X_samples[0])
print len(X_samples)
print len(y_samples)

<type 'numpy.ndarray'>
2
2


In [68]:
# Initialize list of types of feature vectors to use.
# Add feature vectors to this list by running one or more of the feature-generation cells below.

feature_vector_lists = []
feature_names = []
featurizers = []
print 'done'

done


In [69]:
# Create feature vectors using character n-grams.

this_feature_name = "char 3grams"
this_feature_vector_list = []

ngram_min_size = 3
ngram_max_size = 3

#test_string = ["I really like python, it's pretty awesome."]
vectorizer = CountVectorizer(ngram_range=(ngram_min_size,ngram_max_size),  # use n-gram sizes from min to max, inclusive
                             min_df = 5,     # min number of docs a token must appear in
                             max_df = .8,    # max percent of docs a token can appear in
                             analyzer='char'  # create character ngrams
                             )

for X_sample, sample_name in zip(X_samples, sample_names):
    #X_featurized = vectorizer.fit_transform(X)
    #print X_featurized.shape
    this_feature_vector_list.append( vectorizer.fit_transform(X_sample) )
    #feature_names.append( this_feature_name + ', ' + sample_name )
    
feature_vector_lists.append(this_feature_vector_list)
feature_names.append(this_feature_name)

for feature_vector, sample_name in zip(this_feature_vector_list, sample_names):
    print sample_name, feature_vector.shape
print feature_names
    
#print('{1}-grams: {0}'.format(vect.get_feature_names(), ngram_size))
#print 'Number of char n-grams:', len(vectorizer.get_feature_names())
#print vectorizer.get_feature_names()[1000:1025]


original data (13871, 7374)
0.75 random sample (10401, 6769)
['char 3grams']


In [15]:
# Create feature vectors using word n-grams.

this_feature_name = 'word 1- & 2grams'
this_feature_vector_list = []

ngram_min_size = 1
ngram_max_size = 2

vectorizer = CountVectorizer(ngram_range=(ngram_min_size,ngram_max_size),  # use n-gram sizes from min to max, inclusive
                             min_df = 5,      # min number of docs a token must appear in (if an integer value)
                             max_df = 0.8,    # max percent of docs a token can appear in (if a float value)
                             analyzer='word'  # create word ngrams; this is the default
                             )

for X_sample, sample_name in zip(X_samples, sample_names):
    this_feature_vector_list.append( vectorizer.fit_transform(X_sample) )
    
feature_vector_lists.append(this_feature_vector_list)
feature_names.append(this_feature_name)

for feature_vector, sample_name in zip(this_feature_vector_list, sample_names):
    print sample_name, feature_vector.shape
print feature_names

original data (13871, 8859)
0.75 random sample (10401, 8066)
['char 3grams', 'tfidf', 'word 1- & 2grams']


In [14]:
# Create feature vectors using term frequency (same as word 1-grams, presumably).

this_feature_name = 'term freq'
this_feature_vector_list = []

vectorizer = TfidfVectorizer(min_df = 5,     # Min number of docs a token must appear in (if an integer value)
                             max_df = 0.8,   # Max percent of docs a token can appear in (if a float value)
                             sublinear_tf = True,   # Need to look up what this is
                             use_idf = False)  # Don't use inverse document frequency weighting

for X_sample, sample_name in zip(X_samples, sample_names):
    this_feature_vector_list.append( vectorizer.fit_transform(X_sample) )
    
feature_vector_lists.append(this_feature_vector_list)
feature_names.append(this_feature_name)

for feature_vector, sample_name in zip(this_feature_vector_list, sample_names):
    print sample_name, feature_vector.shape
print feature_names

original data (13871, 3242)
0.5 random sample (6933, 2042)
0.75 random sample (10401, 2841)
1.8 random sample (24966, 5494)
['char 3grams', 'word 1- & 2grams', 'term freq']


In [14]:
# Create feature vectors using tfidf.

this_feature_name = 'tfidf'
this_feature_vector_list = []

vectorizer = TfidfVectorizer(min_df = 5,     # Min number of docs a token must appear in (if an integer value)
                             max_df = 0.8,   # Max percent of docs a token can appear in (if a float value)
                             sublinear_tf = True,   # Need to look up what this is
                             use_idf = True)  # Use idf

for X_sample, sample_name in zip(X_samples, sample_names):
    this_feature_vector_list.append( vectorizer.fit_transform(X_sample) )
    
feature_vector_lists.append(this_feature_vector_list)
feature_names.append(this_feature_name)

for feature_vector, sample_name in zip(this_feature_vector_list, sample_names):
    print sample_name, feature_vector.shape
print feature_names

original data (13871, 3242)
0.75 random sample (10401, 2858)
['char 3grams', 'tfidf']


In [70]:
# Initialize list of classifiers to use.
# Add classifiers to this list by running one or more of the classifier-creation cells below.

classifiers = []
classifier_names = []
print 'done'

done


In [17]:
# Create Naive Bayes classifiers.

# Multinomial NaiveBayes. Commonly used for text classification.
classifier_names.append("Multinomial NB")
classifiers.append(MultinomialNB(alpha=1.0,        # Use default Laplacian smoothing
                                 fit_prior=True))  # Fit priors based on training data
                  
# Bernoulli NaiveBayes. Commonly used for text classification for short documents.
# Expects boolean features (e.g., word occurence/nonoccurence instead of term frequency or tfidf).
classifier_names.append('Bernoulli NB')
classifiers.append(BernoulliNB(alpha=1.0,        # Use default Laplacian smoothing
                               binarize=0,       # Threshold for binarizing the input features
                               fit_prior=True))  # Fit priors based on training data
print 'done'
print classifier_names

done
['Multinomial NB', 'Bernoulli NB']


In [71]:
# Create SVM / logistic regression classifiers.

# SVM with linear kernel.
classifier_names.append('LinearSVC')
classifiers.append(LinearSVC(random_state=0)) 

# Logistic regression (sometimes called maxent).
#classifier_names.append('Logistic')
#classifiers.append(SGDClassifier(loss='hinge', 
#                                 penalty='l2', 
#                                 alpha=1e-3, 
#                                 n_iter=5, 
#                                 random_state=0))
print 'done'
print classifier_names

done
['LinearSVC']


In [54]:
# Create ensemble classifiers.

# RandomForest.
classifier_names.append('RandomForest')
classifiers.append(RandomForestClassifier(n_estimators = 100))  # Number of decision stumps to use in the ensemble

# AdaBoost with decision stumps.
### WARNING: Training this is very, very slow. ###
#classifier_names.append('Adaboost on DecTree')
#classifiers.append( AdaBoostClassifier(
#        #BernoulliNB(alpha=1.0, binarize=0, fit_prior=True),  # Use BernoulliNB as the weak estimators
#        DecisionTreeClassifier(max_depth=2),   # Use decision tree stumps as the weak estimators
#        n_estimators=600,    # The max number of estimators at which boosting is terminated
#        learning_rate=1) )   # By how much to shrink the contribution of each (successive?) estimator
#                             # By default, AdaBoost uses estimated class probabilities, not boolean class values
print 'done'
print classifier_names

done
['LinearSVC', 'RandomForest']


In [72]:
# Initialize scores.
scores = {}
print 'done'

done


In [73]:
# Train and test classifiers using crossvalidation.
# This will use all the types of sampling, features, and classifiers that were created.

cv = 3  # Number of crossvalidation folds to use.
#scores = {}
#score_descriptions = []

for classifier, classifier_name in zip(classifiers, classifier_names):
    print 'beginning classifier ' + classifier_name
    scores[classifier_name] = {}
    for feature_vector_list, feature_name in zip(feature_vector_lists, feature_names):
        scores[classifier_name][feature_name] = {}
        for X, y_sample, sample_name in zip(feature_vector_list, y_samples, sample_names):
            y_pred = cross_validation.cross_val_predict(classifier, X, y_sample, cv=cv)
            scores[classifier_name][feature_name][sample_name] = ( y_sample, y_pred )
            #scores[classifier_name][feature_name][sample_name] = cross_validation.cross_val_score(classifier, X, y_sample, cv=cv)
            #scores.append( cross_validation.cross_val_score(classifier, X, y_sample, cv=cv) )
            #score_descriptions.append( classifier_name + ', ' + feature_name + ', ' + sample_name )
        print '   done with ' + feature_name
    
print 'done training and testing classifiers' 

beginning classifier LinearSVC
   done with char 3grams
done training and testing classifiers


In [75]:
# Print results.

for classifier_name, classifier_results in scores.items():
    print classifier_name
    print
    for feature_name, feature_results in classifier_results.items():
        print feature_name
        print
        #for sample_name, sample_scores in feature_results.items():
        for sample_name, results in feature_results.items():
            #print "  " + sample_name + ": mean=" + str(sample_scores.mean()) + ", std=" + str(sample_scores.std())
            print "  " + sample_name
            y, y_pred = results
            #print type(results)
            #print confusion_matrix( y, y_pred )
            #print classification_report( y, y_pred )
            print metrics.accuracy_score( y, y_pred )
            print cohen_kappa_score( y, y_pred )
            #print 
        print
    print
print 'done'

LinearSVC

char 3grams

  original data
0.570903323481
1
  0.75 random sample
0.755408133833
1


done


In [119]:
# Fit classifiers to the whole (sampled) dataset and look at the confusion matrix and classification report for each.

#OneVsOneClassifier(LinearSVC(random_state=0)).fit(X, y).predict(X)
predicteds = []
for classifier in classifiers:
    classifier.fit(X_featurized, y)
    predicteds.append( classifier.predict(X_featurized))
    
for y_pred in predicteds:
    print accuracy_score(y, y_pred)
    print '   Neg ', 'Neu ', 'Pos'
    print confusion_matrix(y, y_pred)
    print classification_report(y, y_pred)
    print

print 'done'

0.698202095952
   Neg  Neu  Pos
[[2434  679  354]
 [ 661 2343  463]
 [ 381  601 2485]]
             precision    recall  f1-score   support

   Negative       0.70      0.70      0.70      3467
    Neutral       0.65      0.68      0.66      3467
   Positive       0.75      0.72      0.73      3467

avg / total       0.70      0.70      0.70     10401


0.697817517546
   Neg  Neu  Pos
[[2261  885  321]
 [ 530 2560  377]
 [ 319  711 2437]]
             precision    recall  f1-score   support

   Negative       0.73      0.65      0.69      3467
    Neutral       0.62      0.74      0.67      3467
   Positive       0.78      0.70      0.74      3467

avg / total       0.71      0.70      0.70     10401


0.793192962215
   Neg  Neu  Pos
[[2709  470  288]
 [ 499 2573  395]
 [ 204  295 2968]]
             precision    recall  f1-score   support

   Negative       0.79      0.78      0.79      3467
    Neutral       0.77      0.74      0.76      3467
   Positive       0.81      0.86      0.8

In [120]:
# Apply classifiers to the whole (not sampled) dataset and look at the confusion matrix and classification report for each.

# Format the whole dataset.
X_all = df[clean_text].values
y_all = df[label_field].values
print type(X)

# Create feature vectors from the tweets. Use the same feature vector as above.
X_featurized_all = vectorizer.transform(X_all)

# Generate predictions using the classifier trained on the equally sampled data.
predicteds = []
for classifier in classifiers:
    predicteds.append( classifier.predict(X_featurized_all) )
    
for y_pred_all in predicteds:
    print accuracy_score(y_all, y_pred_all)
    print '   Neg ', 'Neu ', 'Pos'
    print confusion_matrix(y_all, y_pred_all)
    print classification_report(y_all, y_pred_all)
    print

#y_pred_all = classifier_2.predict(X_featurized_all)

# Get scores.
#print accuracy_score(y_all, y_pred_all)
#print '   Neg ', 'Neu ', 'Pos'
#print confusion_matrix(y_all, y_pred_all)
#print classification_report(y_all, y_pred_all)


<type 'numpy.ndarray'>
0.643068271934
   Neg  Neu  Pos
[[5475 2011 1007]
 [ 710 1949  483]
 [ 298  442 1496]]
             precision    recall  f1-score   support

   Negative       0.84      0.64      0.73      8493
    Neutral       0.44      0.62      0.52      3142
   Positive       0.50      0.67      0.57      2236

avg / total       0.70      0.64      0.66     13871


0.626559008002
   Neg  Neu  Pos
[[5070 2494  929]
 [ 557 2179  406]
 [ 252  542 1442]]
             precision    recall  f1-score   support

   Negative       0.86      0.60      0.71      8493
    Neutral       0.42      0.69      0.52      3142
   Positive       0.52      0.64      0.58      2236

avg / total       0.71      0.63      0.64     13871


0.685098406748
   Neg  Neu  Pos
[[5693 1765 1035]
 [ 617 2067  458]
 [ 210  283 1743]]
             precision    recall  f1-score   support

   Negative       0.87      0.67      0.76      8493
    Neutral       0.50      0.66      0.57      3142
   Positive       

In [None]:
### SAVE CLASSIFIERS TO DISK ###

In [None]:
# Now create/recreate the classifiers and featurizers to save to disk.
# This is not written very well; it's repetitive cut & paste.

In [56]:
# Create final classifier for LinearSVC, char 3grams, 1.8 sample.

# Get the desired feature vector. Sigh, if only I had used dicts from the beginning ...
classifier_name = "LinearSVC"
feature_name = "char 3grams"
sample_name = "1.8 random sample"

classifier_ix = classifier_names.index(classifier_name)
feature_ix = feature_names.index(feature_name)
sample_ix = sample_names.index(sample_name)

classifier = classifiers[classifier_ix]
X_featurized = feature_vector_lists[feature_ix][sample_ix]
y = y_samples[sample_ix]
print type(X_featurized), X_featurized.shape
print type(y), y.shape

print 'done'

<class 'scipy.sparse.csr.csr_matrix'> (24966, 9393)
<type 'numpy.ndarray'> (24966,)
done


In [57]:
%%timeit -n1 -r1

# Trein classifier on the entire feature vector.
classifier.fit(X_featurized, y)

1 loops, best of 1: 34.1 s per loop


In [58]:
# Save trained classifiers to pickle files.

# Find output directory.
%ls ../

[34mclassification[m[m/                [34mdocuments[m[m/                     [34mspark-1.6.0-bin-hadoop2.4[m[m/     tweet_neo4j_2-2016-01-23.zip
[34mcurl[m[m/                          [34mscala-2.11.7[m[m/                  spark-1.6.0-bin-hadoop2.4.tgz  [34mworkspace[m[m/
[34mdata[m[m/                          scala-2.11.7.tgz               [34mtarget[m[m/
[34mdataCharacterization[m[m/          [34mscripts[m[m/                       [34mtweet_neo4j_2[m[m/


In [59]:
# Create file names.

out_dir = '../classification/'
outfile_name = classifier_name + '_' + feature_name + '_' + sample_name + '_' + str(datetime.date.today()) + '.pkl'
outfile_name = outfile_name.replace(' ', '')
outfile = out_dir + outfile_name
print outfile

../classification/LinearSVC_char3grams_1.8randomsample_2016-04-29.pkl


In [60]:
%%timeit -n1 -r1

# Save trained classifier object to file.
joblib.dump(classifier, outfile) 

1 loops, best of 1: 14.8 ms per loop


In [28]:
# Create final classifier for LinearSVC, char 3grams, original sample.

# Get the desired feature vector. Sigh, if only I had used dicts from the beginning ...
classifier_name = "LinearSVC"
feature_name = "char 3grams"
sample_name = "original data"

classifier_ix = classifier_names.index(classifier_name)
feature_ix = feature_names.index(feature_name)
sample_ix = sample_names.index(sample_name)

classifier = classifiers[classifier_ix]
X_featurized = feature_vector_lists[feature_ix][sample_ix]
y = y_samples[sample_ix]
print type(X_featurized), X_featurized.shape
print type(y), y.shape

print 'done'

<class 'scipy.sparse.csr.csr_matrix'> (13871, 7374)
<type 'numpy.ndarray'> (13871,)
done


In [29]:
%%timeit -n1 -r1

# Trein classifier on the entire feature vector.
classifier.fit(X_featurized, y)

1 loops, best of 1: 12.9 s per loop


In [30]:
# Find output directory.
%ls ../

[34mclassification[m[m/                [34mdocuments[m[m/                     [34mspark-1.6.0-bin-hadoop2.4[m[m/     tweet_neo4j_2-2016-01-23.zip
[34mcurl[m[m/                          [34mscala-2.11.7[m[m/                  spark-1.6.0-bin-hadoop2.4.tgz  [34mworkspace[m[m/
[34mdata[m[m/                          scala-2.11.7.tgz               [34mtarget[m[m/
[34mdataCharacterization[m[m/          [34mscripts[m[m/                       [34mtweet_neo4j_2[m[m/


In [31]:
# Create file names.

out_dir = '../classification/'
outfile_name = classifier_name + '_' + feature_name + '_' + sample_name + '_' + str(datetime.date.today()) + '.pkl'
outfile_name = outfile_name.replace(' ', '')
outfile = out_dir + outfile_name
print outfile

../classification/LinearSVC_char3grams_originaldata_2016-04-28.pkl


In [32]:
%%timeit -n1 -r1

# Save trained classifier object to file.
joblib.dump(classifier, outfile) 

1 loops, best of 1: 15.9 ms per loop


In [33]:
# Create final classifier for Random Forest-depth 5, char 3grams, original sample.

# Get the desired feature vector. Sigh, if only I had used dicts from the beginning ...
classifier_name = "RandomForest-d5"
feature_name = "char 3grams"
sample_name = "original data"

classifier_ix = classifier_names.index(classifier_name)
feature_ix = feature_names.index(feature_name)
sample_ix = sample_names.index(sample_name)

classifier = classifiers[classifier_ix]
X_featurized = feature_vector_lists[feature_ix][sample_ix]
y = y_samples[sample_ix]
print type(X_featurized), X_featurized.shape
print type(y), y.shape

print 'done'

<class 'scipy.sparse.csr.csr_matrix'> (13871, 7374)
<type 'numpy.ndarray'> (13871,)
done


In [34]:
%%timeit -n1 -r1

# Trein classifier on the entire feature vector.
classifier.fit(X_featurized, y)

1 loops, best of 1: 929 ms per loop


In [35]:
# Create file names.

out_dir = '../classification/'
outfile_name = classifier_name + '_' + feature_name + '_' + sample_name + '_' + str(datetime.date.today()) + '.pkl'
outfile_name = outfile_name.replace(' ', '')
outfile = out_dir + outfile_name
print outfile

../classification/RandomForest-d5_char3grams_originaldata_2016-04-28.pkl


In [36]:
%%timeit -n1 -r1

# Save trained classifier object to file.
joblib.dump(classifier, outfile) 

1 loops, best of 1: 279 ms per loop


In [37]:
# Create final classifier for Random Forest, char 3grams, 0.75 random sample.

# Get the desired feature vector. Sigh, if only I had used dicts from the beginning ...
classifier_name = "RandomForest"
feature_name = "char 3grams"
sample_name = "0.75 random sample"

classifier_ix = classifier_names.index(classifier_name)
feature_ix = feature_names.index(feature_name)
sample_ix = sample_names.index(sample_name)

classifier = classifiers[classifier_ix]
X_featurized = feature_vector_lists[feature_ix][sample_ix]
y = y_samples[sample_ix]
print type(X_featurized), X_featurized.shape
print type(y), y.shape

print 'done'

<class 'scipy.sparse.csr.csr_matrix'> (10401, 6746)
<type 'numpy.ndarray'> (10401,)
done


In [38]:
%%timeit -n1 -r1

# Trein classifier on the entire feature vector.
classifier.fit(X_featurized, y)

1 loops, best of 1: 45.3 s per loop


In [39]:
# Create file names.

out_dir = '../classification/'
outfile_name = classifier_name + '_' + feature_name + '_' + sample_name + '_' + str(datetime.date.today()) + '.pkl'
outfile_name = outfile_name.replace(' ', '')
outfile = out_dir + outfile_name
print outfile

../classification/RandomForest_char3grams_0.75randomsample_2016-04-28.pkl


In [40]:
%%timeit -n1 -r1

# Save trained classifier object to file.
joblib.dump(classifier, outfile) 

1 loops, best of 1: 401 ms per loop


In [None]:
### SAVE FEATURIZERS TO DISK ###

In [61]:
# Recreate the featurizer to save:  char 3grams, 1.8 sample.

sample_name = '1.8 random sample'
sample_ix = sample_names.index(sample_name)
X_sample = X_samples[sample_ix]

ngram_min_size = 3
ngram_max_size = 3

vectorizer = CountVectorizer(ngram_range=(ngram_min_size,ngram_max_size),  # use n-gram sizes from min to max, inclusive
                             min_df = 5,     # min number of docs a token must appear in
                             max_df = .8,    # max percent of docs a token can appear in
                             analyzer='char'  # create character ngrams
                             )
vectorizer.fit(X_sample)
print X_sample.shape

(24966,)


In [62]:
# Create file names.

out_dir = '../classification/'
outfile_name = 'featurizer_' + feature_name + '_' + sample_name + '_' + str(datetime.date.today()) + '.pkl'
outfile_name = outfile_name.replace(' ', '')
outfile = out_dir + outfile_name
print outfile

../classification/featurizer_char3grams_1.8randomsample_2016-04-29.pkl


In [63]:
%%timeit -n1 -r1

# Save featurizer object to file.
joblib.dump(vectorizer, outfile) 

1 loops, best of 1: 494 ms per loop


In [41]:
# Recreate the featurizer to save:  char 3grams, original data.

sample_name = 'original data'
sample_ix = sample_names.index(sample_name)
X_sample = X_samples[sample_ix]

ngram_min_size = 3
ngram_max_size = 3

vectorizer = CountVectorizer(ngram_range=(ngram_min_size,ngram_max_size),  # use n-gram sizes from min to max, inclusive
                             min_df = 5,     # min number of docs a token must appear in
                             max_df = .8,    # max percent of docs a token can appear in
                             analyzer='char' # create character ngrams
                             )
vectorizer.fit(X_sample)
print X_sample.shape

(13871,)


In [42]:
# Create file names.

out_dir = '../classification/'
outfile_name = 'featurizer_' + feature_name + '_' + sample_name + '_' + str(datetime.date.today()) + '.pkl'
outfile_name = outfile_name.replace(' ', '')
outfile = out_dir + outfile_name
print outfile

../classification/featurizer_char3grams_originaldata_2016-04-28.pkl


In [43]:
%%timeit -n1 -r1

# Save featurizer object to file.
joblib.dump(vectorizer, outfile) 

1 loops, best of 1: 370 ms per loop


In [44]:
# Recreate the featurizer to save:  char 3grams, 0.75 random sample.

sample_name = '0.75 random sample'
sample_ix = sample_names.index(sample_name)
X_sample = X_samples[sample_ix]

ngram_min_size = 3
ngram_max_size = 3

vectorizer = CountVectorizer(ngram_range=(ngram_min_size,ngram_max_size),  # use n-gram sizes from min to max, inclusive
                             min_df = 5,     # min number of docs a token must appear in
                             max_df = .8,    # max percent of docs a token can appear in
                             analyzer='char' # create character ngrams
                             )
vectorizer.fit(X_sample)
print X_sample.shape

(10401,)


In [45]:
# Create file names.

out_dir = '../classification/'
outfile_name = 'featurizer_' + feature_name + '_' + sample_name + '_' + str(datetime.date.today()) + '.pkl'
outfile_name = outfile_name.replace(' ', '')
outfile = out_dir + outfile_name
print outfile

../classification/featurizer_char3grams_0.75randomsample_2016-04-28.pkl


In [46]:
%%timeit -n1 -r1

# Save featurizer object to file.
joblib.dump(vectorizer, outfile) 

1 loops, best of 1: 322 ms per loop


In [None]:
### DO NOT USE BELOW THIS POINT ###

In [None]:
# Visualize results.

#df = pd.read_table("data.csv",sep="|")
#grouped = df.groupby('app')['hours']

title = 'TITLE'
colors = "rgbcmyk"

fig, ax = plt.subplots()
initial_gap = 0.1
start = initial_gap
width = 1.0
gap = 0.05

#for app,group in grouped:
for 
    size = group.shape[0]
    ind = np.linspace(start,start + width, size+1)[:-1]   
    w = (ind[1]-ind[0])
    start = start + width + gap
    plt.bar(ind,group,w,color=list(colors[:size]))

tick_loc = (np.arange(len(grouped)) * (width+gap)) + initial_gap + width/2
ax.set_xticklabels([app for app,_ in grouped])
ax.xaxis.set_major_locator(mtick.FixedLocator(tick_loc))

plt.show()



In [None]:
# Read classifier back in.
# classifier = joblib.load(outfile_name) 

In [125]:
### OLD VERSION ###
# Train, test, and score classifiers using crossvalidation.

cv = 5  # Number of crossvalidation folds to use.
scores = []
score_descriptions = []

for classifier, classifier_name in zip(classifiers, classifier_names):
    print 'beginning classifier ' + classifier_name
    for feature_vector_list, feature_name in zip(feature_vector_lists, feature_names):
        for X, y_sample, sample_name in zip(feature_vector_list, y_samples, sample_names):
            scores.append( cross_validation.cross_val_score(classifier, X, y_sample, cv=cv) )
            score_descriptions.append( classifier_name + ', ' + feature_name + ', ' + sample_name )
        print '   done with ' + feature_name
    #print 'done with classifier ' + classifier_name
print 'done training and testing classifiers' 
print

for score, score_description in zip(scores, score_descriptions):
    print score_description
    #print score
    print score.mean(), score.std()
    print

print 'done'

beginning classifier LinearSVC
   done with char 3grams
   done with word 1- & 2grams
   done with term freq
   done with tfidf
beginning classifier Logistic
   done with char 3grams
   done with word 1- & 2grams
   done with term freq
   done with tfidf
done training and testing classifiers

LinearSVC, char 3grams, original data
0.58872294631 0.041397971201

LinearSVC, char 3grams, 0.75 random sample
0.765214793745 0.00575572887056

LinearSVC, char 3grams, 1.8 random sample
0.879196576384 0.00447410561315

LinearSVC, word 1- & 2grams, original data
0.603999992602 0.0239679891631

LinearSVC, word 1- & 2grams, 0.75 random sample
0.757136342151 0.00430706192601

LinearSVC, word 1- & 2grams, 1.8 random sample
0.869623854624 0.00539175961027

LinearSVC, term freq, original data
0.654172112618 0.0192540811024

LinearSVC, term freq, 0.75 random sample
0.702433003009 0.00358254670367

LinearSVC, term freq, 1.8 random sample
0.768886819512 0.00517943280617

LinearSVC, tfidf, original data
0.64

In [118]:

# Create classifier(s). Classify & score using crossvalidation.
# Note that all sklearn classifiers do multiclass classification.
# So we do not need to use sklearn.multiclass unless we want to experiment with different multiclass strategies.
# e.g.:
#classifiers.append( OneVsOneClassifier(LinearSVC(random_state=0)) )
#classifiers.append( OneVsRestClassifier(LinearSVC(random_state=0)) )


#classifiers.append( svm.SVC() )

# Multinomial NaiveBayes. Commonly used for text classification.
classifiers.append(MultinomialNB(alpha=1.0,        # Use default Laplacian smoothing
                                 fit_prior=True))  # Fit priors based on training data
                  
# Bernoulli NaiveBayes. Commonly used for text classification for short documents.
# Expects boolean features (e.g., word occurence/nonoccurence instead of term frequency or tfidf).
classifiers.append(BernoulliNB(alpha=1.0,        # Use default Laplacian smoothing
                               binarize=0,       # Threshold for binarizing the input features
                               fit_prior=True))  # Fit priors based on training data

# SVM with linear kernel.
classifiers.append(LinearSVC(random_state=0)) 

# Logistic regression (sometimes called maxent).
classifiers.append(SGDClassifier(loss='hinge', 
                                 penalty='l2', 
                                 alpha=1e-3, 
                                 n_iter=5, 
                                 random_state=0))

# RandomForest.
classifiers.append(RandomForestClassifier(n_estimators = 100))  # Number of decision stumps to use in the ensemble

# AdaBoost with decision stumps.
classifiers.append( AdaBoostClassifier(
        #BernoulliNB(alpha=1.0, binarize=0, fit_prior=True),  # Use BernoulliNB as the weak estimators
        DecisionTreeClassifier(max_depth=2),   # Use decision tree stumps as the weak estimators
        n_estimators=600,    # The max number of estimators at which boosting is terminated
        learning_rate=1) )   # By how much to shrink the contribution of each (successive?) estimator
                             # By default, AdaBoost uses estimated class probabilities, not boolean class values
 
#classifiers.append( AdaBoostClassifier( 
#            DecisionTreeClassifier(max_depth=2),
#            n_estimators=600,
#            learning_rate=1.5,
#            algorithm="SAMME") )  # Use boolean class values instead of estimated class probabilities

#classifiers.append( AdaBoostClassifier( 
#            DecisionTreeClassifier(max_depth=2),
#            n_estimators=600,
#            learning_rate=1.5,
#            algorithm="SAMME") )  # Use boolean class values instead of estimated class probabilities


cv = 5  # Number of crossvalidation folds to use.
scores = []

for classifier in classifiers:
    scores.append( cross_validation.cross_val_score(classifier, X_featurized, y, cv=cv) )
    
for score in scores:
    print score
    print score.mean()
    print

print 'done'

[ 0.65802113  0.64889529  0.65560366  0.65897066  0.63492063]
0.651282275202

[ 0.65513929  0.64457253  0.67003367  0.65897066  0.63203463]
0.65215015532

[ 0.70653218  0.69644573  0.70851371  0.69215969  0.68927369]
0.698584999161

[ 0.6364073   0.63400576  0.64165464  0.63251563  0.61519962]
0.631956590746

[ 0.7857829   0.79538905  0.8027898   0.79268879  0.77537278]
0.79040466418

[ 0.74447646  0.72910663  0.73256373  0.74266474  0.71284271]
0.73233085625

done


In [None]:
# Split into training & test sets.
# The KFold function returns lists of indices to use for the splits.

kf = KFold(len(X_selected), n_folds=10, shuffle=True, random_state=0)
print type(kf)

for train, test in kf:
    #print("%s %s" % (train.shape, test.shape))
    print train
    print test
    

In [None]:
# Create feature vectors.
vectorizer = TfidfVectorizer(min_df=5,
                             max_df = 0.8,
                             sublinear_tf=True,
                             use_idf=True)
train_vectors = vectorizer.fit_transform(train_data)
test_vectors = vectorizer.transform(test_data)

print 'done'

In [None]:
# Use basic linear regression.
# Estimate error using 10-fold cross validation.

clf = sklearn.linear_model.LinearRegression()
scores = sklearn.cross_validation.cross_val_score(clf, X_selected, y, cv=10)
print scores
print scores.mean()
print 'done'

In [None]:
# Perform classification with SVM, kernel=rbf.
# Get the time needed for training and for classification.

classifier_rbf = svm.SVC()

t0 = time.time()
classifier_rbf.fit(train_vectors, train_labels)
t1 = time.time()

prediction_rbf = classifier_rbf.predict(test_vectors)
t2 = time.time()
time_rbf_train = t1-t0
time_rbf_predict = t2-t1

print 'done'

In [None]:
# Look at the confusion matrix.

train_pred = clf.predict(X_train)
print iris.target_names
confusion_matrix(y_train, train_pred)


In [None]:
# Return df containing 'fraction' fraction of the original df,
# with equal representation by the value in column 'col'.
# Don't use this function.
def get_equal_samples_dumb(df, col, fraction):
    #vals = pd.unique(df[col].values.ravel())
    #print type(vals)
    #print "vals=", vals
    #vals.sort()
    
    weight_col = 'my_temp_weight'
    
    denom = df.shape[0]
    val_counts = df[col].value_counts()
    val_weights_dict = {}
    for val in val_counts.keys():
        val_weights_dict[val] = (val_counts[val] / float(denom))
    print val_weights_dict
  
    df[weight_col] = df[col]
    #df[weight_col].replace({ 'pos':'Positive', 'neg':'Negative' }, inplace=True)
    df[weight_col].replace(val_weights_dict, inplace=True)
    
    df_sampled = df.sample(frac=fraction, weight=df[weight_col])
    
    df_sampled.drop(weight_col, axis=1, inplace=True)
    
    return df_sampled


In [148]:
print df.shape

(13871, 23)
