Preparing Data
==============

In [1]:
import json
# import cPickle as pickle
import pickle
import numpy as np
import pandas as pd
from pandas import factorize

In [2]:
!ls bossa/*json

bossa/control_tasks.json       bossa/tasks_export.json
bossa/control_tasks_runs.json  bossa/tasks_runs_export.json


BOSSA Results
-------------

Processing the BOSSA task runs (`bossa/tasks_runs_export.json`) to get a *dictionary* whose keys are the task ids and whose values are the average of the scores. To do that, we first convert the scores from categorical labels (`vneg`, `neg`, `neu`, `pos`, `vpos`) to a numeric scale.

In [3]:
# Load the task-run export and give its columns descriptive names.
bossa_results = pd.read_json("bossa/tasks_runs_export.json")
bossa_results = bossa_results.rename(columns={"created": "start_time", "id": "result_id", "info": "score"})
for time_column in ("start_time", "finish_time"):
    bossa_results[time_column] = pd.to_datetime(bossa_results[time_column], dayfirst=True)
# Map the five answer labels onto a numeric scale from -2 (vneg) to 2 (vpos).
# The renamed categorical is assigned back explicitly because recent pandas
# releases no longer accept `inplace=True` in `rename_categories`.
bossa_results["score"] = pd.Categorical(bossa_results["score"], categories=["vneg", "neg", "neu", "pos", "vpos"])
bossa_results["score"] = bossa_results["score"].cat.rename_categories([-2, -1, 0, 1, 2])
# Normalize everything to -1, 0, 1
# bossa_results['score'] = bossa_results['score'].astype(float).apply(lambda x: -1 if x < 0 else 1 if x > 0 else 0)
# Duration of each run as float seconds; `.dt.total_seconds()` gives the same
# unit regardless of how the installed pandas handles `astype('timedelta64[us]')`.
bossa_results["seconds"] = (bossa_results["finish_time"] - bossa_results["start_time"]).dt.total_seconds()
bossa_results = bossa_results[["result_id", "seconds", "task_id", "score"]]
# `.loc` replaces the removed `.ix` indexer; the list keeps the result a one-row DataFrame.
bossa_results.loc[[50]]

Unnamed: 0,result_id,seconds,task_id,score
50,11203,2.5e-05,52775,1


The information about the sentence comes as a dictionary inside the cells of the series `info`, so we expand it.

In [4]:
# Load the task export; only the task id and the `info` payload are kept.
bossa_tasks = pd.read_json("bossa/tasks_export.json")
bossa_tasks["created"] = pd.to_datetime(bossa_tasks["created"], dayfirst=True)
bossa_tasks = bossa_tasks.rename(columns={"id": "task_id"})
bossa_tasks = bossa_tasks[["task_id", "info"]]
# `.loc` replaces the removed `.ix` indexer.
bossa_tasks.loc[[50]]

Unnamed: 0,task_id,info
50,52851,{'url': 'http://www.nytimes.com/2013/02/22/art...


And finally we merge the `DataFrame` with the scores with the one containing the sentences.

In [5]:
# Attach the task payload to every run through the shared `task_id` column.
bossa_tasks_scores = pd.merge(bossa_results, bossa_tasks, on="task_id")
# `.loc` replaces the removed `.ix` indexer.
bossa_tasks_scores.loc[[50]]

Unnamed: 0,result_id,seconds,task_id,score,info
50,11195,2.1e-05,52776,2,{'url': 'http://dealbook.nytimes.com/2013/05/1...


Let's now expand the column `info` into as many new columns as there are keys in the `info` dictionary.

In [6]:
# Select the cell by row label and column name: attribute access (`row.info`)
# collides with the `Series.info` method in recent pandas, and `.ix` was removed.
bossa_tasks_scores.loc[50, "info"].keys()

dict_keys(['url', 'search_words', 'pub_date', 'text', 'appears_in_sentence', 'sentence', 'sentence_id', 'is_company', 'noun_phrases', 'appears_in_noun_phrases', 'media'])

In [7]:
def json_to_series(info):
    """Convert one `info` dictionary into a Series indexed by its keys.

    Used with `Series.apply` so that every key of the dictionary becomes a
    column of the resulting DataFrame.

    Parameters
    ----------
    info : dict
        Mapping of field name to value; may be empty.

    Returns
    -------
    pandas.Series
        The dictionary values, labelled by the dictionary keys in their
        original order. An empty dictionary yields an empty Series instead of
        failing while unpacking.
    """
    keys = list(info.keys())
    values = list(info.values())
    # An explicit dtype for the empty case avoids pandas guessing one for no data.
    return pd.Series(values, index=keys, dtype=object if not keys else None)

# Expand every `info` dictionary into its own set of columns.
bossa_info = bossa_tasks_scores["info"].apply(json_to_series)
# Replace the raw `info` column with the expanded columns. Both frames share
# the same row index, so a column-wise concat lines them up.
bossa = pd.concat([bossa_tasks_scores.drop("info", axis=1), bossa_info], axis=1)
# bossa['id'] = bossa['id'].astype(float)
# Label-based slice (inclusive on both ends); `.loc` replaces the removed `.ix`.
bossa.loc[50:53]

Unnamed: 0,result_id,seconds,task_id,score,url,search_words,pub_date,text,appears_in_sentence,sentence,sentence_id,is_company,noun_phrases,appears_in_noun_phrases,media
50,11195,2.1e-05,52776,2,http://dealbook.nytimes.com/2013/05/17/a-toeho...,executive,2013-05-17T11:47:51Z,Chinese investors are increasingly opting to b...,0,Chinese investors are increasingly opting to b...,14,0,"[chinese investors, overseas companies, politi...",0,nyt
51,11205,1.8e-05,52776,-1,http://dealbook.nytimes.com/2013/05/17/a-toeho...,executive,2013-05-17T11:47:51Z,Chinese investors are increasingly opting to b...,0,Chinese investors are increasingly opting to b...,14,0,"[chinese investors, overseas companies, politi...",0,nyt
52,11207,1.7e-05,52776,1,http://dealbook.nytimes.com/2013/05/17/a-toeho...,executive,2013-05-17T11:47:51Z,Chinese investors are increasingly opting to b...,0,Chinese investors are increasingly opting to b...,14,0,"[chinese investors, overseas companies, politi...",0,nyt
53,11209,1.7e-05,52776,-2,http://dealbook.nytimes.com/2013/05/17/a-toeho...,executive,2013-05-17T11:47:51Z,Chinese investors are increasingly opting to b...,0,Chinese investors are increasingly opting to b...,14,0,"[chinese investors, overseas companies, politi...",0,nyt


Aggregate
---------

We now aggregate the scores per `sentence` using a group by (taking the majority vote, or alternatively the average). In the process we lose the source of the data, which is why we first have to save it.

In [8]:
# Save the un-aggregated rows (one per task run) before they are grouped below.
bossa.to_csv("sentiment/scores_ungrouped.csv", encoding="utf8")

Finally, we aggregate and create a new `DataFrame` for the different sentences and their score.

In [9]:
from collections import Counter

def majority(series):
    """Return the most frequent polarity among the scores of `series`.

    Each score is first reduced to its sign (1, -1 or 0). If the two most
    frequent signs are equally common the vote is ambiguous and -9 is
    returned; otherwise the winning sign is returned.
    """
    def polarity(score):
        if score > 0:
            return 1
        if score < 0:
            return -1
        return 0

    # Only the two best-ranked signs are needed to detect a tie at the top.
    ranked = Counter(polarity(score) for score in series).most_common(2)
    if len(ranked) == 2 and ranked[0][1] == ranked[1][1]:
        return -9
    return ranked[0][0]
#     return Counter(map(lambda x: 1 if x > 0 else -1 if x < 0 else 0, series)).most_common(1)[0][0]

# score_calculate = np.average
# Aggregation applied to the scores of each sentence.
score_calculate = majority

# One row per distinct sentence, holding its aggregated score.
sentences = bossa.groupby(['sentence'])[['score']].agg(score_calculate)
sentences.to_csv("sentiment/scores.csv", encoding="utf8")
print(sentences.count())
# Positional peek at three of the aggregated rows.
sentences.iloc[1001:1004]

score    8996
dtype: int64


Unnamed: 0_level_0,score
sentence,Unnamed: 1_level_1
"'We must hope after so much prevarication that this time Google's proposals represent a genuine attempt to address the concerns identified,' said David Wood, the legal counsel for Icomp, an industry group backed by Microsoft and a number of other companies.",0
"'We must push our leaders to step up and commit to action,' said Hugh Evans, the founder and chief executive of the charity.",-1
"'We need them to tell the story of how we are making decisions and putting the organization together,' said George Postolos, the Astros' president and chief executive, who added that the team would not want a broadcaster who was uncomfortable explaining the front office's strategy.",-1


Create the training and testing sets (data and labels) from a randomized version of the set of assessed sentences.

In [10]:
# Distribution of the aggregated labels (-9 is the value `majority` returns for a tied vote).
sentences.reset_index()['score'].value_counts()

 1    3991
-1    2193
-9    1530
 0    1282
dtype: int64

In [11]:
# Non-null count per column once the `sentence` index is turned back into a column.
sentences.reset_index().count()

sentence    8996
score       8996
dtype: int64

In [12]:
# raw_scores = sentences.reset_index()
# scores = raw_scores
# print('Zero:', len(scores[scores.score==0]))
# print('Non-zero:', len(scores[scores.score!=0]))

We could consider 3 classes, but it turns out that binary classification seems to produce better results. Still, multi-class classifiers are worth trying.

In [13]:
raw_scores = sentences.reset_index()
# Keep only sentences with a clear polarity: drop the neutral ones (0) and the
# ambiguous ones (-9, the value `majority` returns for a tied vote).
is_polar = (raw_scores['score'] != 0) & (raw_scores['score'] != -9)
# Take an explicit copy so the new column is added to an independent frame
# rather than to a slice of `raw_scores`.
scores = raw_scores[is_polar].copy()
scores['sentiment'] = scores['score'].apply(lambda s: 'pos' if s > 0 else 'neg')
# scores['sentiment'] = scores['score'].apply(lambda s: 'pos' if s > 0 else 'neg' if s < 0 else 0)

In [14]:
# Keep only the text and its label, renumbering the rows from 0.
sentences_df = scores[['sentence', 'sentiment']].reset_index(drop=True)
# Optional shuffle of the rows, currently disabled:
# sentences_df = sentences_df.reindex(np.random.permutation(sentences_df.index))

In [15]:
# Preview the first rows of the labelled sentences.
sentences_df.head()

Unnamed: 0,sentence,sentiment
0,"General Motors will recall nearly 3,200 manua...",neg
1,""" And Aim's problem is that many of its larges...",neg
2,"""'Don't be evil,' he cried, while being chased...",neg
3,"""A lot of companies seem to prefer it to other...",pos
4,"""An entrepreneur may well be unreasonable beca...",neg


# SCIKIT-LEARN

In [16]:
from __future__ import print_function

import logging
import numpy as np
from optparse import OptionParser
import sys
from time import time
import matplotlib.pyplot as plt

from nltk.corpus import stopwords
from sklearn import metrics
from sklearn.cross_validation import cross_val_score, StratifiedKFold,train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import PassiveAggressiveClassifier, Perceptron, RidgeClassifier, SGDClassifier
from sklearn.metrics import precision_recall_fscore_support
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.utils.extmath import density

In [17]:
# Emit progress logs at INFO level and above, each prefixed with timestamp and level.
# No stream is passed, so `basicConfig` uses its default handler, which writes to stderr.
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s')

In [18]:
from sklearn.cross_validation import train_test_split

# categories = [
#     'pos',
#     'neg',
# ]
# print("Loading sentences for categories:")
# print(categories)
# data_train, data_test = train_test_split(document_df, train_size=0.9, test_size=0.1, random_state=100)
# print('Data loaded:')
# print('Train set: {} samples'.format(len(data_train)))
# print('Test set: {} samples'.format(len(data_test)))

In [19]:
def size_mb(docs):
    """Return the combined UTF-8 encoded size of `docs` in megabytes (1 MB = 1e6 bytes)."""
    total_bytes = 0
    for text in docs:
        total_bytes += len(text.encode('utf-8'))
    return total_bytes / 1e6

# data_train_size_mb = size_mb(data_train['sentence'])
# data_test_size_mb = size_mb(data_test['sentence'])

# print("%d documents - %0.3fMB (training set)" % (
#     len(data_train), data_train_size_mb))
# print("%d documents - %0.3fMB (test set)" % (
#     len(data_test), data_test_size_mb))
# print("%d categories" % len(categories))
# print()

In [20]:
###############################################################################
# Benchmark classifiers
def benchmark(name, clf, X_train, X_test, y_train, y_test):
    """Fit `clf` on the training split, evaluate it on the test split and print a report.

    Parameters
    ----------
    name : label printed in the "Training:" header line.
    clf : estimator or pipeline exposing `fit(X, y)` and `predict(X)`;
        it is fitted in place.
    X_train, y_train : samples and labels used for fitting.
    X_test, y_test : samples and labels used for evaluation.

    Returns
    -------
    The accuracy of the predictions on the test split, as computed by
    `metrics.accuracy_score`.
    """
    print("_" * 80)
    print("Training:", name)
    print(clf)

    # Time the fit and the prediction separately.
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    print("test time:  %0.3fs" % test_time)

    score = metrics.accuracy_score(y_test, pred)
    print("accuracy:   %0.3f" % score)

    print("classification report:")
    print(metrics.classification_report(y_test, pred))

    print("confusion matrix:")
    print(metrics.confusion_matrix(y_test, pred))

    print()
    return score

In [21]:
# # Classifiers
# clf_list = [
#     (RidgeClassifier(tol=1e-2, solver="lsqr"), "Ridge classifier"),
# #     (Perceptron(n_iter=50), "Perceptron"),
# #     (PassiveAggressiveClassifier(n_iter=50), "Passive-aggressive"),
# #     (KNeighborsClassifier(n_neighbors=10), "kNN"),
# #     (RandomForestClassifier(n_estimators=100), "Random forest")
# ]

# Classifiers to compare, as (estimator, display name) pairs.
# Display names must be unique: results are collected per name, so two
# entries sharing a label would have their scores merged together.
clf_list = [
    (RidgeClassifier(alpha=.00001, tol=1e-2, solver="lsqr"), "Ridge classifier"),
    (Perceptron(alpha=.00001, n_iter=50), "Perceptron"),
    (PassiveAggressiveClassifier(n_iter=50), "Passive-aggressive"),
    (KNeighborsClassifier(n_neighbors=10), "kNN"),
    (RandomForestClassifier(n_estimators=100), "Random forest"),
    (LinearSVC(loss='squared_hinge', penalty='l2', dual=False, tol=1e-3), 'Linear SVC (L2 penalty)'),
    (SGDClassifier(alpha=.000001, n_iter=50, penalty='l1'), 'SGDClassifier'),
    (NearestCentroid(), 'Nearest Centroid'),
    (MultinomialNB(alpha=.00001), 'Multinomial NB'),
    (BernoulliNB(alpha=.00001), 'Bernoulli NB'),
    (LinearSVC(penalty="l1", dual=False, tol=1e-3), 'Linear SVC (L1 penalty)'),
]

In [22]:
# with open('sentiment/myclf.pickle', 'wb') as myclf_file:
#     pickle.dump(pipeline, myclf_file, protocol=2)

In [23]:
# with open('sentiment/myclf.pickle', 'rb') as myclf_file:
#     my_clf = pickle.load(myclf_file)

In [24]:
# Params
# Number of folds passed to StratifiedKFold in the cells below.
K = 10
# Search grid for GridSearchCV. Keys follow the `<step>__<parameter>` form,
# where the step names ('vect', 'tfidf', 'clf') are the ones given to the
# Pipeline below. The active entries span 3 * 3 * 3 * 2 * 2 * 2 = 216 combinations.
parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    'vect__max_features': (None, 1000, 2000),
    'vect__ngram_range': ((1, 1), (1, 2), (1, 3)),
    'vect__stop_words': (None, stopwords.words('english')),
    'tfidf__use_idf': (True, False),
    'clf__metric': ('euclidean', 'manhattan'),
#     'clf__shrink_threshold': (None, 0.1),
}

In [25]:
# Features are the raw sentences; targets are their 'pos'/'neg' labels.
X = sentences_df['sentence']
y = sentences_df['sentiment']

# Stratified K-fold split used as the cross-validation scheme of the search.
skf = StratifiedKFold(y, K)
# Bag-of-words counts -> tf-idf weighting -> nearest-centroid classifier.
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', NearestCentroid()),
])
# Search over every combination in `parameters`, using all cores (n_jobs=-1).
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, cv=skf)
grid_search.fit(X, y)
# Short aliases for the best score and best estimator; `be` is reused by later cells.
bs = grid_search.best_score_
be = grid_search.best_estimator_

print('*'*40)
print('Best score:', bs)
print()
print('Best parameters:')
# Note: get_params() lists every parameter of the pipeline, not only the tuned ones.
print(be.get_params())
print('*'*40)

Fitting 10 folds for each of 216 candidates, totalling 2160 fits


[Parallel(n_jobs=-1)]: Done   1 jobs       | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done  50 jobs       | elapsed:   18.9s
[Parallel(n_jobs=-1)]: Done 200 jobs       | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 450 jobs       | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 800 jobs       | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done 1250 jobs       | elapsed:  7.2min
[Parallel(n_jobs=-1)]: Done 1800 jobs       | elapsed: 12.5min
[Parallel(n_jobs=-1)]: Done 2160 out of 2160 | elapsed: 15.8min finished


****************************************
Best score: 0.659282018111

Best parameters:
{'tfidf__smooth_idf': True, 'vect__analyzer': 'word', 'vect__encoding': 'utf-8', 'vect__stop_words': ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'b

In [26]:
# Persist the fitted grid-search object.
# NOTE(review): protocol=2 is presumably chosen so the file can also be read from Python 2 - confirm.
# Pickle files should only ever be loaded back from a trusted source.
with open('sentiment/gs.pickle', 'wb') as gs_file:
    pickle.dump(grid_search, gs_file, protocol=2)

In [27]:
# Persist the best estimator found by the grid search on its own.
# NOTE(review): protocol=2 is presumably chosen so the file can also be read from Python 2 - confirm.
with open('sentiment/be.pickle', 'wb') as be_file:
    pickle.dump(be, be_file, protocol=2)

In [28]:
# Accuracy of the grid-search winner on each of the K stratified folds.
# NOTE(review): `be` was selected by a search over a StratifiedKFold(y, K)
# split of this same data, so these scores are likely optimistic.
results = []

for train_index, test_index in StratifiedKFold(y, K):
    # Fold indices are row positions, so select with `.iloc` rather than
    # relying on the Series labels happening to match the positions.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # `benchmark` refits `be` in place on every fold.
    benchmark_result = benchmark('NearestCentroid', be, X_train, X_test, y_train, y_test)
    results.append(benchmark_result)

________________________________________________________________________________
Training: NearestCentroid
Pipeline(steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.5, max_features=None, min_df=1,
        ngram_range=(1, 3), preprocessor=None,
        stop_words=['i', 'me',...inear_tf=False, use_idf=True)), ('clf', NearestCentroid(metric='euclidean', shrink_threshold=None))])
train time: 1.581s
test time:  0.135s
accuracy:   0.603
classification report:
             precision    recall  f1-score   support

        neg       0.45      0.56      0.50       220
        pos       0.72      0.63      0.67       400

avg / total       0.63      0.60      0.61       620

confusion matrix:
[[123  97]
 [149 251]]

________________________________________________________________________________
Training: NearestCentroid
Pipeline(steps=[('vect', CountVector

In [143]:
# Per-fold accuracy of a hand-configured pipeline.
results = []

X = sentences_df['sentence']
y = sentences_df['sentiment']
# Hold out 10% of the data as a validation set; cross-validate on the rest.
X_data, X_val, y_data, y_val = train_test_split(X, y, train_size=0.9, test_size=0.1, random_state=100)
X_data = X_data.reset_index(drop=True)
X_val = X_val.reset_index(drop=True)
y_data = y_data.reset_index(drop=True)
y_val = y_val.reset_index(drop=True)
for train_index, test_index in StratifiedKFold(y_data, K):
    # Fold indices are row positions, so select with `.iloc`.
    X_train, X_test = X_data.iloc[train_index], X_data.iloc[test_index]
    y_train, y_test = y_data.iloc[train_index], y_data.iloc[test_index]

    # A fresh, unfitted pipeline for every fold.
    pipeline = Pipeline([
        ('vect', CountVectorizer(ngram_range=(1, 2), max_df=1.0, stop_words='english')),
        ('tfidf', TfidfTransformer(use_idf=True)),
        ('clf', NearestCentroid(metric='euclidean')),
    ])
    benchmark_results = benchmark('NearestCentroid', pipeline, X_train, X_test, y_train, y_test)

    # Record the fold accuracy; previously `results` was created but never filled.
    results.append(benchmark_results)

________________________________________________________________________________
Training: NearestCentroid
Pipeline(steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words='english',
        ...inear_tf=False, use_idf=True)), ('clf', NearestCentroid(metric='euclidean', shrink_threshold=None))])
train time: 0.776s
test time:  0.082s
accuracy:   0.631
classification report:
             precision    recall  f1-score   support

        neg       0.45      0.46      0.45       221
        pos       0.72      0.72      0.72       438

avg / total       0.63      0.63      0.63       659

confusion matrix:
[[101 120]
 [123 315]]



In [93]:
from collections import defaultdict


# For each outer fold: run a grid search on the training part, then benchmark
# the winner on the held-out part. Collects (best estimator, inner CV score).
results = []

X = sentences_df['sentence']
y = sentences_df['sentiment']
# Hold out 10% of the data as a validation set; cross-validate on the rest.
X_data, X_val, y_data, y_val = train_test_split(X, y, train_size=0.9, test_size=0.1, random_state=100)
X_data = X_data.reset_index(drop=True)
X_val = X_val.reset_index(drop=True)
y_data = y_data.reset_index(drop=True)
y_val = y_val.reset_index(drop=True)
for train_index, test_index in StratifiedKFold(y_data, K):
    # Fold indices are row positions, so select with `.iloc`.
    X_train, X_test = X_data.iloc[train_index], X_data.iloc[test_index]
    y_train, y_test = y_data.iloc[train_index], y_data.iloc[test_index]

    pipeline = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', NearestCentroid()),
    ])
    # No `cv` argument is given, so the search uses GridSearchCV's default split.
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)
    grid_search.fit(X_train, y_train)
    be = grid_search.best_estimator_
    print('*'*40)
    print(be.get_params())
    print('*'*40)
    benchmark_results = benchmark('NearestCentroid', be, X_train, X_test, y_train, y_test)

    results.append((be, grid_search.best_score_))

Fitting 3 folds for each of 540 candidates, totalling 1620 fits


[Parallel(n_jobs=-1)]: Done   1 jobs       | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done  50 jobs       | elapsed:   13.0s
[Parallel(n_jobs=-1)]: Done 200 jobs       | elapsed:   47.4s
[Parallel(n_jobs=-1)]: Done 450 jobs       | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 800 jobs       | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 1250 jobs       | elapsed:  6.3min
[Parallel(n_jobs=-1)]: Done 1620 out of 1620 | elapsed:  8.1min finished


****************************************
{'vect__lowercase': True, 'vect__decode_error': 'strict', 'vect__tokenizer': None, 'tfidf__norm': 'l2', 'vect__ngram_range': (1, 1), 'vect__token_pattern': '(?u)\\b\\w\\w+\\b', 'tfidf__sublinear_tf': False, 'vect__analyzer': 'word', 'clf__shrink_threshold': None, 'tfidf__smooth_idf': True, 'tfidf': TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True), 'vect__max_df': 0.75, 'vect__strip_accents': None, 'vect__dtype': <class 'numpy.int64'>, 'clf': NearestCentroid(metric='l1', shrink_threshold=None), 'vect__vocabulary': None, 'vect__min_df': 1, 'clf__metric': 'l1', 'vect__input': 'content', 'vect__stop_words': None, 'vect__encoding': 'utf-8', 'tfidf__use_idf': True, 'vect__max_features': 1000, 'vect': CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.75, max_features=1000, min_df=1,
        ngram_ran

[Parallel(n_jobs=-1)]: Done   1 jobs       | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done  50 jobs       | elapsed:   12.3s
[Parallel(n_jobs=-1)]: Done 200 jobs       | elapsed:   46.0s
[Parallel(n_jobs=-1)]: Done 450 jobs       | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 800 jobs       | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 1250 jobs       | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done 1620 out of 1620 | elapsed:  8.0min finished


****************************************
{'vect__lowercase': True, 'vect__decode_error': 'strict', 'vect__tokenizer': None, 'tfidf__norm': 'l2', 'vect__ngram_range': (1, 1), 'vect__token_pattern': '(?u)\\b\\w\\w+\\b', 'tfidf__sublinear_tf': False, 'vect__analyzer': 'word', 'clf__shrink_threshold': None, 'tfidf__smooth_idf': True, 'tfidf': TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True), 'vect__max_df': 0.75, 'vect__strip_accents': None, 'vect__dtype': <class 'numpy.int64'>, 'clf': NearestCentroid(metric='l1', shrink_threshold=None), 'vect__vocabulary': None, 'vect__min_df': 1, 'clf__metric': 'l1', 'vect__input': 'content', 'vect__stop_words': None, 'vect__encoding': 'utf-8', 'tfidf__use_idf': True, 'vect__max_features': 1000, 'vect': CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.75, max_features=1000, min_df=1,
        ngram_ran

[Parallel(n_jobs=-1)]: Done   1 jobs       | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done  50 jobs       | elapsed:   12.4s
[Parallel(n_jobs=-1)]: Done 200 jobs       | elapsed:   46.6s
[Parallel(n_jobs=-1)]: Done 450 jobs       | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 800 jobs       | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 1250 jobs       | elapsed:  6.1min
[Parallel(n_jobs=-1)]: Done 1620 out of 1620 | elapsed:  8.0min finished


****************************************
{'vect__lowercase': True, 'vect__decode_error': 'strict', 'vect__tokenizer': None, 'tfidf__norm': 'l2', 'vect__ngram_range': (1, 1), 'vect__token_pattern': '(?u)\\b\\w\\w+\\b', 'tfidf__sublinear_tf': False, 'vect__analyzer': 'word', 'clf__shrink_threshold': None, 'tfidf__smooth_idf': True, 'tfidf': TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True), 'vect__max_df': 1.0, 'vect__strip_accents': None, 'vect__dtype': <class 'numpy.int64'>, 'clf': NearestCentroid(metric='l1', shrink_threshold=None), 'vect__vocabulary': None, 'vect__min_df': 1, 'clf__metric': 'l1', 'vect__input': 'content', 'vect__stop_words': None, 'vect__encoding': 'utf-8', 'tfidf__use_idf': True, 'vect__max_features': 1000, 'vect': CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=1000, min_df=1,
        ngram_range

[Parallel(n_jobs=-1)]: Done   1 jobs       | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done  50 jobs       | elapsed:   12.5s
[Parallel(n_jobs=-1)]: Done 200 jobs       | elapsed:   45.8s
[Parallel(n_jobs=-1)]: Done 450 jobs       | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 800 jobs       | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 1250 jobs       | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done 1620 out of 1620 | elapsed:  8.1min finished


****************************************
{'vect__lowercase': True, 'vect__decode_error': 'strict', 'vect__tokenizer': None, 'tfidf__norm': 'l2', 'vect__ngram_range': (1, 2), 'vect__token_pattern': '(?u)\\b\\w\\w+\\b', 'tfidf__sublinear_tf': False, 'vect__analyzer': 'word', 'clf__shrink_threshold': None, 'tfidf__smooth_idf': True, 'tfidf': TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True), 'vect__max_df': 1.0, 'vect__strip_accents': None, 'vect__dtype': <class 'numpy.int64'>, 'clf': NearestCentroid(metric='l1', shrink_threshold=None), 'vect__vocabulary': None, 'vect__min_df': 1, 'clf__metric': 'l1', 'vect__input': 'content', 'vect__stop_words': None, 'vect__encoding': 'utf-8', 'tfidf__use_idf': True, 'vect__max_features': 1000, 'vect': CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=1000, min_df=1,
        ngram_range

[Parallel(n_jobs=-1)]: Done   1 jobs       | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done  50 jobs       | elapsed:   12.5s
[Parallel(n_jobs=-1)]: Done 200 jobs       | elapsed:   46.3s
[Parallel(n_jobs=-1)]: Done 450 jobs       | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 800 jobs       | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 1250 jobs       | elapsed:  6.1min
[Parallel(n_jobs=-1)]: Done 1606 out of 1620 | elapsed:  8.0min remaining:    4.2s
[Parallel(n_jobs=-1)]: Done 1620 out of 1620 | elapsed:  8.0min finished


****************************************
{'vect__lowercase': True, 'vect__decode_error': 'strict', 'vect__tokenizer': None, 'tfidf__norm': 'l2', 'vect__ngram_range': (1, 1), 'vect__token_pattern': '(?u)\\b\\w\\w+\\b', 'tfidf__sublinear_tf': False, 'vect__analyzer': 'word', 'clf__shrink_threshold': None, 'tfidf__smooth_idf': True, 'tfidf': TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True), 'vect__max_df': 1.0, 'vect__strip_accents': None, 'vect__dtype': <class 'numpy.int64'>, 'clf': NearestCentroid(metric='l1', shrink_threshold=None), 'vect__vocabulary': None, 'vect__min_df': 1, 'clf__metric': 'l1', 'vect__input': 'content', 'vect__stop_words': None, 'vect__encoding': 'utf-8', 'tfidf__use_idf': True, 'vect__max_features': 1000, 'vect': CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=1000, min_df=1,
        ngram_range

[Parallel(n_jobs=-1)]: Done   1 jobs       | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done  50 jobs       | elapsed:   12.6s
[Parallel(n_jobs=-1)]: Done 200 jobs       | elapsed:   46.2s
[Parallel(n_jobs=-1)]: Done 450 jobs       | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 800 jobs       | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 1250 jobs       | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done 1606 out of 1620 | elapsed:  8.0min remaining:    4.2s
[Parallel(n_jobs=-1)]: Done 1620 out of 1620 | elapsed:  8.1min finished


****************************************
{'vect__lowercase': True, 'vect__decode_error': 'strict', 'vect__tokenizer': None, 'tfidf__norm': 'l2', 'vect__ngram_range': (1, 3), 'vect__token_pattern': '(?u)\\b\\w\\w+\\b', 'tfidf__sublinear_tf': False, 'vect__analyzer': 'word', 'clf__shrink_threshold': None, 'tfidf__smooth_idf': True, 'tfidf': TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True), 'vect__max_df': 0.5, 'vect__strip_accents': None, 'vect__dtype': <class 'numpy.int64'>, 'clf': NearestCentroid(metric='l1', shrink_threshold=None), 'vect__vocabulary': None, 'vect__min_df': 1, 'clf__metric': 'l1', 'vect__input': 'content', 'vect__stop_words': None, 'vect__encoding': 'utf-8', 'tfidf__use_idf': True, 'vect__max_features': 2000, 'vect': CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.5, max_features=2000, min_df=1,
        ngram_range

[Parallel(n_jobs=-1)]: Done   1 jobs       | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done  50 jobs       | elapsed:   12.6s
[Parallel(n_jobs=-1)]: Done 200 jobs       | elapsed:   46.4s
[Parallel(n_jobs=-1)]: Done 450 jobs       | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 800 jobs       | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 1250 jobs       | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done 1620 out of 1620 | elapsed:  8.1min finished


****************************************
{'vect__lowercase': True, 'vect__decode_error': 'strict', 'vect__tokenizer': None, 'tfidf__norm': 'l2', 'vect__ngram_range': (1, 1), 'vect__token_pattern': '(?u)\\b\\w\\w+\\b', 'tfidf__sublinear_tf': False, 'vect__analyzer': 'word', 'clf__shrink_threshold': None, 'tfidf__smooth_idf': True, 'tfidf': TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True), 'vect__max_df': 0.5, 'vect__strip_accents': None, 'vect__dtype': <class 'numpy.int64'>, 'clf': NearestCentroid(metric='l1', shrink_threshold=None), 'vect__vocabulary': None, 'vect__min_df': 1, 'clf__metric': 'l1', 'vect__input': 'content', 'vect__stop_words': ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am',

[Parallel(n_jobs=-1)]: Done   1 jobs       | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done  50 jobs       | elapsed:   12.9s
[Parallel(n_jobs=-1)]: Done 200 jobs       | elapsed:   46.3s
[Parallel(n_jobs=-1)]: Done 450 jobs       | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 800 jobs       | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 1250 jobs       | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done 1606 out of 1620 | elapsed:  8.0min remaining:    4.2s
[Parallel(n_jobs=-1)]: Done 1620 out of 1620 | elapsed:  8.1min finished


****************************************
{'vect__lowercase': True, 'vect__decode_error': 'strict', 'vect__tokenizer': None, 'tfidf__norm': 'l2', 'vect__ngram_range': (1, 2), 'vect__token_pattern': '(?u)\\b\\w\\w+\\b', 'tfidf__sublinear_tf': False, 'vect__analyzer': 'word', 'clf__shrink_threshold': None, 'tfidf__smooth_idf': True, 'tfidf': TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True), 'vect__max_df': 0.5, 'vect__strip_accents': None, 'vect__dtype': <class 'numpy.int64'>, 'clf': NearestCentroid(metric='l1', shrink_threshold=None), 'vect__vocabulary': None, 'vect__min_df': 1, 'clf__metric': 'l1', 'vect__input': 'content', 'vect__stop_words': None, 'vect__encoding': 'utf-8', 'tfidf__use_idf': True, 'vect__max_features': 1000, 'vect': CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.5, max_features=1000, min_df=1,
        ngram_range

[Parallel(n_jobs=-1)]: Done   1 jobs       | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done  50 jobs       | elapsed:   12.5s
[Parallel(n_jobs=-1)]: Done 200 jobs       | elapsed:   46.1s
[Parallel(n_jobs=-1)]: Done 450 jobs       | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 800 jobs       | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 1250 jobs       | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done 1620 out of 1620 | elapsed:  8.1min finished


****************************************
{'vect__lowercase': True, 'vect__decode_error': 'strict', 'vect__tokenizer': None, 'tfidf__norm': 'l2', 'vect__ngram_range': (1, 1), 'vect__token_pattern': '(?u)\\b\\w\\w+\\b', 'tfidf__sublinear_tf': False, 'vect__analyzer': 'word', 'clf__shrink_threshold': None, 'tfidf__smooth_idf': True, 'tfidf': TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True), 'vect__max_df': 0.5, 'vect__strip_accents': None, 'vect__dtype': <class 'numpy.int64'>, 'clf': NearestCentroid(metric='l1', shrink_threshold=None), 'vect__vocabulary': None, 'vect__min_df': 1, 'clf__metric': 'l1', 'vect__input': 'content', 'vect__stop_words': None, 'vect__encoding': 'utf-8', 'tfidf__use_idf': True, 'vect__max_features': 1000, 'vect': CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.5, max_features=1000, min_df=1,
        ngram_range

[Parallel(n_jobs=-1)]: Done   1 jobs       | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done  50 jobs       | elapsed:   12.7s
[Parallel(n_jobs=-1)]: Done 200 jobs       | elapsed:   47.0s
[Parallel(n_jobs=-1)]: Done 450 jobs       | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 800 jobs       | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 1250 jobs       | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done 1606 out of 1620 | elapsed:  8.1min remaining:    4.2s
[Parallel(n_jobs=-1)]: Done 1620 out of 1620 | elapsed:  8.1min finished


****************************************
{'vect__lowercase': True, 'vect__decode_error': 'strict', 'vect__tokenizer': None, 'tfidf__norm': 'l2', 'vect__ngram_range': (1, 3), 'vect__token_pattern': '(?u)\\b\\w\\w+\\b', 'tfidf__sublinear_tf': False, 'vect__analyzer': 'word', 'clf__shrink_threshold': None, 'tfidf__smooth_idf': True, 'tfidf': TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True), 'vect__max_df': 0.5, 'vect__strip_accents': None, 'vect__dtype': <class 'numpy.int64'>, 'clf': NearestCentroid(metric='euclidean', shrink_threshold=None), 'vect__vocabulary': None, 'vect__min_df': 1, 'clf__metric': 'euclidean', 'vect__input': 'content', 'vect__stop_words': ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 

In [76]:
from collections import defaultdict

# Benchmark every classifier in `clf_list` on each cross-validation fold.
# `results` maps classifier name -> list holding whatever benchmark() returns,
# one entry per fold.
results = defaultdict(list)
N_FOLDS = 2

X = sentences_df['sentence']
y = sentences_df['sentiment']

# Split off 10% as X_val / y_val (fixed random_state so the split is
# repeatable) and renumber every part 0..n-1.
X_data, X_val, y_data, y_val = [
    part.reset_index(drop=True)
    for part in train_test_split(X, y, train_size=0.9, test_size=0.1, random_state=100)
]

for train_index, test_index in StratifiedKFold(y_data, N_FOLDS):
    # The fold indices are positions, not index labels, so select with .iloc.
    # Plain `series[positions]` is label-based: it only works while labels
    # happen to equal positions and otherwise yields NaN rows or a KeyError.
    X_train, X_test = X_data.iloc[train_index], X_data.iloc[test_index]
    y_train, y_test = y_data.iloc[train_index], y_data.iloc[test_index]

    for clf, name in clf_list:
        # Same text-processing front end for every classifier; only the final
        # estimator changes.
        pipeline = Pipeline([
            ('vect', CountVectorizer(ngram_range=(1, 2))),
            ('tfidf', TfidfTransformer()),
            ('clf', clf),
        ])
        benchmark_results = benchmark(name, pipeline, X_train, X_test, y_train, y_test)
        results[name].append(benchmark_results)

________________________________________________________________________________
Training: Ridge classifier
Pipeline(steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        strip...True,
        fit_intercept=True, max_iter=None, normalize=False, solver='lsqr',
        tol=0.01))])
train time: 0.481s
test time:  0.233s
accuracy:   0.661
classification report:
             precision    recall  f1-score   support

        neg       0.49      0.32      0.39       986
        pos       0.71      0.83      0.77      1975

avg / total       0.64      0.66      0.64      2961

confusion matrix:
[[ 320  666]
 [ 339 1636]]

________________________________________________________________________________
Training: Perceptron
Pipeline(steps=[('vect', CountVector

In [63]:
# Count how many samples of each label are in the current y_test.
y_test.value_counts()

pos    1975
neg     986
dtype: int64

In [47]:
# Preview the first rows of X_data under a fresh 0..n-1 index.
# The result is only displayed; X_data itself is not reassigned here.
X_data.reset_index(drop=True).head()

0    Three years ago, according to a previous NPD s...
1    While it might be counter-intuitive to expect ...
2    'I've seen five movies today, and it has been ...
3    Large checks came in from Pfizer, Caterpillar,...
4    The group stressed it had not employed any sen...
Name: sentence, dtype: object

In [48]:
# Preview the first labels of y_data under a fresh 0..n-1 index.
# The result is only displayed; y_data itself is not reassigned here.
y_data.reset_index(drop=True).head()

0    pos
1    pos
2    pos
3    pos
4    neg
Name: sentiment, dtype: object

In [52]:
# Number of entries in the current X_train.
len(X_train)

5329

In [76]:
# Display X_train in full.
# NOTE(review): the recorded output contains NaN rows — presumably the result of
# selecting fold positions by label on a non-reset index; confirm.
X_train

582     'Our results for the fourth quarter of 2012 co...
594     'People enjoy the humor on board, and that we ...
595     'People have been bragging about booking six f...
596     'People have been expressing interest in this ...
597                                                   NaN
598     'People say you can't change China, but I woul...
599     'People were driving by daily and not realizin...
600                                                   NaN
601     'People who are trying too hard or people who ...
602                                                   NaN
603                                                   NaN
604     'President Obama is on pace to break the two m...
605     'President Obama's plan to cut Social Security...
606                                                   NaN
607                                                   NaN
608                                                   NaN
609     'Quality content has never been more important...
610           

In [85]:
# reindex() is called without a new index, and the recorded output still shows
# the shuffled labels (61, 5264, ...), so nothing is renumbered here.
# NOTE(review): reset_index(drop=True) was presumably intended — confirm.
X_data.reindex()

61      "The tapering of quantitative easing has only ...
5264    More and more Chinese parents apparently disag...
5267    More recently, General Motors offered shares t...
3719    He managed Lewis until 2010, when tensions ari...
4226    In a recent and noteworthy post at Realclimate...
399     'If you're buying something in an Apple store,...
8370    Three individuals with different levels of obj...
7080    The French choreographer and founder of the L....
8166    There is also some bad blood behind this story...
6235    Raiding the endowment 'was a suicidal thing to...
7348    The business's founders promote it as a low-pr...
3075    Don Steinbrugge, managing partner with Agecrof...
3963    Hyundai is offering a $750 credit for people w...
4437    In time, music became a sideline, as he took a...
7283    The authorities said the fraud was carried out...
4107    In 1998, Research in Motion sought professiona...
650     'Students' social media and digital footprint ...
4332    In its

In [73]:
# Look up entry 597 in X_data; the recorded run raised KeyError: 597.
X_data.ix[597]

KeyError: 597

In [60]:
# Look up entry 597 in X_train; the recorded output is nan.
X_train.ix[597]

nan

In [30]:
# K = 10
# parameters = {
#     'vect__max_df': (0.5, 0.75, 1.0),
#     'vect__max_features': (None, 1000, 5000, 10000, 50000),
#     'vect__ngram_range': ((1, 1), (1, 2), (2, 2), (1, 3), (2, 3), (3, 3)),  # unigrams or bigrams or trigrams
#     'vect__stop_words': (None, stopwords.words('english')),
#     'tfidf__use_idf': (True, False),
#     'tfidf__norm': ('l1', 'l2'),
# #     'clf__alpha': (1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6),
# #     'clf__penalty': ('l1', 'l2', 'elasticnet'),
# #     'clf__n_iter': (10, 50, 80),
# #     'clf__loss': ('log', 'modified_huber'),
# }
# csf_list = [
#     (RidgeClassifier(tol=1e-2, solver="lsqr"), "Ridge Classifier"),
#     (Perceptron(n_iter=50), "Perceptron"),
#     (PassiveAggressiveClassifier(n_iter=50), "Passive-Aggressive"),
#     (KNeighborsClassifier(n_neighbors=10), "kNN"),
#     (RandomForestClassifier(n_estimators=100), "Random forest")
# ]
# results = {}
# for clf, name in csf_list:
#     pipeline = Pipeline([
#         ('vect', CountVectorizer()),
#         ('tfidf', TfidfTransformer()),
#         ('clf', clf),
#     ])
#     grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)
#     skf = StratifiedKFold(y, K)
#     results[name] = cross_val_score(grid_search, X, y, cv=skf, n_jobs=-1, verbose=1)

In [36]:
def _report_extraction(seconds, size_mb, matrix):
    """Print elapsed time, throughput in MB/s and the shape of ``matrix``."""
    print("done in %fs at %0.3fMB/s" % (seconds, size_mb / seconds))
    print("n_samples: %d, n_features: %d" % matrix.shape)
    print()


# Labels for the two partitions.
y_train, y_test = train_labels, test_labels

# Fit the TF-IDF vocabulary on the training sentences and vectorize them.
print("Extracting features from the training data using a sparse vectorizer")
t0 = time()
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.5,
                             sublinear_tf=True)
X_train = vectorizer.fit_transform(data_train.sentence.tolist())
duration = time() - t0
_report_extraction(duration, data_train_size_mb, X_train)

# Vectorize the test sentences with the already-fitted vectorizer (no refit).
print("Extracting features from the test data using the same vectorizer")
t0 = time()
X_test = vectorizer.transform(data_test.sentence.tolist())
duration = time() - t0
_report_extraction(duration, data_test_size_mb, X_test)

Extracting features from the training data using a sparse vectorizer
done in 0.398901s at 2.688MB/s
n_samples: 5922, n_features: 18088

Extracting features from the test data using the same vectorizer
done in 0.027828s at 4.383MB/s
n_samples: 659, n_features: 18088



In [37]:
# Number of features to keep after the chi-squared ranking.
opts_select_chi2 = 100

# Token strings, ordered by the vectorizer's integer feature index.
feature_names = vectorizer.get_feature_names()

print("Extracting %d best features by a chi-squared test" % (opts_select_chi2,))
t0 = time()
ch2 = SelectKBest(chi2, k=opts_select_chi2)
# Fit the selector on the training matrix, then apply the same column
# selection to the test matrix.
X_train = ch2.fit_transform(X_train, y_train)
X_test = ch2.transform(X_test)
if feature_names:
    # Reduce the name list to the columns the selector kept.
    kept_columns = ch2.get_support(indices=True)
    feature_names = [feature_names[col] for col in kept_columns]
print("done in %fs" % (time() - t0))
print()

feature_names = np.asarray(feature_names)

Extracting 1000 best features by a chi-squared test
done in 0.022103s



In [38]:
# feature_names

In [39]:
def trim(s):
    """Shorten ``s`` for display on an 80-column terminal.

    Strings of fewer than 80 characters are returned unchanged; anything
    longer is cut to its first 75 characters followed by "...".
    """
    if len(s) >= 80:
        return s[:75] + "..."
    return s

In [74]:
# Leftover introspection: evaluates to the function object without calling it.
metrics.pairwise.pairwise_distances

<function sklearn.metrics.pairwise.pairwise_distances>