Preparing Data
==============

In [30]:
import json
import cPickle as pickle
import numpy as np
import pandas as pd
from pandas import factorize

In [31]:
!ls bossa/*json

bossa/tasks_export.json  bossa/tasks_runs_export.json


BOSSA Results
-------------

Processing `results_bossa.json` to get a *dictionary* with keys the task ids, and values in as the average value of the scores. To do that, we first convert scores from categorical (`neg`, `neu`, `pos`) to a numeric scale.

In [32]:
bossa_results = pd.read_json("bossa/tasks_runs_export.json")
bossa_results.rename(columns={"created": "start_time", "id": "result_id", "info": "score"}, inplace=True)
bossa_results[['start_time']]= bossa_results[['start_time']].apply(pd.to_datetime, dayfirst=True)
bossa_results[['finish_time']]= bossa_results[['finish_time']].apply(pd.to_datetime, dayfirst=True)
bossa_results['score'] = pd.Categorical(bossa_results['score'], categories=['vneg', 'neg', 'neu', 'pos', 'vpos'])
bossa_results['score'].cat.rename_categories([-2, -1, 0, 1, 2], inplace=True)
# Normalize everything to -1, 0, 1
# bossa_results['score'] = bossa_results['score'].astype(float).apply(lambda x: -1 if x < 0 else 1 if x > 0 else 0)
bossa_results["seconds"] = (bossa_results["finish_time"] - bossa_results["start_time"]).astype('timedelta64[us]') / 1e6
bossa_results = bossa_results[["result_id", "seconds", "task_id", "score"]]
bossa_results.ix[[50]]

Unnamed: 0,result_id,seconds,task_id,score
50,11203,2.5e-05,52775,1


The information about the sentence comes in a dictionary inside the cells of the serie `info`, so we expand it.

In [33]:
bossa_tasks = pd.read_json("bossa/tasks_export.json")
bossa_tasks[['created']]= bossa_tasks[['created']].apply(pd.to_datetime, dayfirst=True)
bossa_tasks.rename(columns={'id': 'task_id'}, inplace=True)
bossa_tasks = bossa_tasks[['task_id', 'info']]
bossa_tasks.ix[[50]]

Unnamed: 0,task_id,info
50,52851,"{u'search_words': u'founder', u'appears_in_sen..."


And finally we merge the `DataFrame` with the scores with the one containing the sentences.

In [34]:
bossa_tasks_scores = pd.merge(bossa_results, bossa_tasks, on='task_id')
bossa_tasks_scores.ix[[50]]

Unnamed: 0,result_id,seconds,task_id,score,info
50,11195,2.1e-05,52776,2,"{u'search_words': u'executive', u'appears_in_s..."


Let's now expand the column `info` into as many new columns as keys has the dictionary `info`.

In [35]:
bossa_tasks_scores.ix[50].info.keys()

[u'search_words',
 u'appears_in_sentence',
 u'url',
 u'media',
 u'appears_in_noun_phrases',
 u'noun_phrases',
 u'sentence_id',
 u'text',
 u'sentence',
 u'pub_date',
 u'is_company']

In [36]:
def json_to_series(info):
    keys, values = zip(*info.iteritems())
    return pd.Series(values, index=keys)

bossa_info = bossa_tasks_scores["info"].apply(json_to_series)
bossa_info.reset_index()
bossa = pd.concat([bossa_tasks_scores, bossa_info], axis=1)
bossa.pop("info")
# bossa['id'] = bossa['id'].astype(float)
bossa.ix[50:53]

Unnamed: 0,result_id,seconds,task_id,score,search_words,appears_in_sentence,url,media,appears_in_noun_phrases,noun_phrases,sentence_id,text,sentence,pub_date,is_company
50,11195,2.1e-05,52776,2,executive,0,http://dealbook.nytimes.com/2013/05/17/a-toeho...,nyt,0,"[chinese investors, overseas companies, politi...",14,Chinese investors are increasingly opting to b...,Chinese investors are increasingly opting to b...,2013-05-17T11:47:51Z,0
51,11205,1.8e-05,52776,-1,executive,0,http://dealbook.nytimes.com/2013/05/17/a-toeho...,nyt,0,"[chinese investors, overseas companies, politi...",14,Chinese investors are increasingly opting to b...,Chinese investors are increasingly opting to b...,2013-05-17T11:47:51Z,0
52,11207,1.7e-05,52776,1,executive,0,http://dealbook.nytimes.com/2013/05/17/a-toeho...,nyt,0,"[chinese investors, overseas companies, politi...",14,Chinese investors are increasingly opting to b...,Chinese investors are increasingly opting to b...,2013-05-17T11:47:51Z,0
53,11209,1.7e-05,52776,-2,executive,0,http://dealbook.nytimes.com/2013/05/17/a-toeho...,nyt,0,"[chinese investors, overseas companies, politi...",14,Chinese investors are increasingly opting to b...,Chinese investors are increasingly opting to b...,2013-05-17T11:47:51Z,0


Aggregate
---------

We now aggregate calculating the average per `sentence_id` using a group by. In the process, we lose the source of the data, that's why we first have to save it.

In [37]:
bossa.to_csv("sentiment/scores_ungrouped.csv", encoding="utf8")

Finally, we aggregate and create a new `DataFrame` for the different sentences and their score.

In [38]:
sentences = bossa.groupby(['sentence'])[['score']].aggregate(np.average)
sentences.to_csv("sentiment/scores.csv", encoding="utf8")
print(sentences.count())
sentences[1001:1004]

score    8996
dtype: int64


Unnamed: 0_level_0,score
sentence,Unnamed: 1_level_1
"'We must hope after so much prevarication that this time Google's proposals represent a genuine attempt to address the concerns identified,' said David Wood, the legal counsel for Icomp, an industry group backed by Microsoft and a number of other companies.",-0.333333
"'We must push our leaders to step up and commit to action,' said Hugh Evans, the founder and chief executive of the charity.",-0.285714
"'We need them to tell the story of how we are making decisions and putting the organization together,' said George Postolos, the Astros' president and chief executive, who added that the team would not want a broadcaster who was uncomfortable explaining the front office's strategy.",-0.666667


Sentence Classifier
-------------------

In [39]:
from nltk.corpus import stopwords
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import precision_recall_fscore_support
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

Create the tranining and testing sets (data and labels) from a randomized version of the set of assessed sentences.

In [40]:
sentences.reset_index().count()

sentence    8996
score       8996
dtype: int64

We could consider 3 classes, but it toruns out that using binary classficication seems to produce better results. Still, try multi-classs classifiers is something worth trying.

In [41]:
raw_scores = sentences.reset_index()
scores = raw_scores
scores = scores[scores.score!=0]  # We ignore the neutral sentences
scores['sentiment'] = scores['score'].apply(lambda s: 'pos' if s > 0 else 'neg')
percentage = 0.85  #  percentage for training, rest for for testing
# We split to have enough representativenesss for both positive and negative sentiments
sent_min = min(
    scores[scores.sentiment=='pos'].sentiment.count(),
    scores[scores.sentiment=='neg'].sentiment.count(),
)
scores = scores[["sentence", "sentiment"]]
train_data = np.array([])
train_labels = np.array([])
test_data = np.array([])
test_labels = np.array([])
for sent in ('pos', 'neg'):
    sent_scores = scores[scores['sentiment']==sent]
    sent_scores = sent_scores.reindex(np.random.permutation(sent_scores.index))
    sent_sentences_count = int(sent_scores['sentence'].count())
    sent_train = sent_scores[["sentence", "sentiment"]][:int(sent_sentences_count * percentage)]
    sent_test = sent_scores[["sentence", "sentiment"]][int(sent_sentences_count * percentage) + 1:]
    print(sent, sent_min, sent_train.sentiment.count(), sent_test.sentiment.count())
    train_data = np.append(train_data, sent_train["sentence"])
    train_labels = np.append(train_labels, sent_train["sentiment"])
    test_data = np.append(test_data, sent_test["sentence"])
    test_labels = np.append(test_labels, sent_test["sentiment"])

pos 2939 4281 755
neg 2939 2498 440


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [42]:
document_df = scores[['sentence', 'sentiment']]
document_df = document_df.reindex(np.random.permutation(document_df.index))

In [43]:
size = int(len(document_df) * 0.9)
size

7178

In [44]:
# Author: Peter Prettenhofer <peter.prettenhofer@gmail.com>
#         Olivier Grisel <olivier.grisel@ensta.org>
#         Mathieu Blondel <mathieu@mblondel.org>
#         Lars Buitinck <L.J.Buitinck@uva.nl>
# License: BSD 3 clause

from __future__ import print_function

import logging
import numpy as np
from optparse import OptionParser
import sys
from time import time
import matplotlib.pyplot as plt

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import RidgeClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.extmath import density
from sklearn import metrics

In [45]:
# Display progress logs on stdout
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s')

In [56]:
categories = [
    'pos',
    'neg',
]
print("Loading sentences for categories:")
print(categories if categories else "all")
data_train = document_df[:size]
data_test = document_df[size:]
print('data loaded')

Loading sentences for categories:
['pos']
data loaded


In [154]:
document_df['sentiment'].value_counts()

pos    5037
neg    2939
dtype: int64

In [152]:
data_train['sentiment'].value_counts()

pos    4540
neg    2638
dtype: int64

In [57]:
train_labels = np.array(map(lambda x: 1 if x == 'pos' else 0, data_train['sentiment']))
train_labels

array([0, 1, 1, ..., 1, 1, 1])

In [58]:
test_labels = np.array(map(lambda x: 1 if x == 'pos' else 0, data_test['sentiment']))
test_labels

array([1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1,
       0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1,
       1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1,
       1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0,
       1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1,
       0, 0,

In [59]:
def size_mb(docs):
    return sum(len(s.encode('utf-8')) for s in docs) / 1e6

data_train_size_mb = size_mb(data_train['sentence'])
data_test_size_mb = size_mb(data_test['sentence'])

print("%d documents - %0.3fMB (training set)" % (
    len(data_train), data_train_size_mb))
print("%d documents - %0.3fMB (test set)" % (
    len(data_test), data_test_size_mb))
print("%d categories" % len(categories))
print()

7178 documents - 1.296MB (training set)
798 documents - 0.145MB (test set)
1 categories



In [83]:
np.any(vectorizer.fit_transform(data_train.sentence.tolist()) == vectorizer.fit_transform(data_train['sentence']))

<7178x20204 sparse matrix of type '<type 'numpy.bool_'>'
	with 145024312 stored elements in Compressed Sparse Row format>

In [70]:
vectorizer.fit_transform(data_train['sentence']) == vectorizer.fit_transform(data_train[['sentence']])

ValueError: max_df corresponds to < documents than min_df

In [155]:
# split a training set and a test set
y_train = train_labels
y_test = test_labels

print("Extracting features from the training data using a sparse vectorizer")
t0 = time()
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                             stop_words='english')
X_train = vectorizer.fit_transform(data_train.sentence.tolist())
print(X_train)
duration = time() - t0
print("done in %fs at %0.3fMB/s" % (duration, data_train_size_mb / duration))
print("n_samples: %d, n_features: %d" % X_train.shape)
print()

print("Extracting features from the test data using the same vectorizer")
t0 = time()
X_test = vectorizer.transform(data_test.sentence.tolist())
duration = time() - t0
print("done in %fs at %0.3fMB/s" % (duration, data_test_size_mb / duration))
print("n_samples: %d, n_features: %d" % X_test.shape)
print()

Extracting features from the training data using a sparse vectorizer
  (0, 13906)	0.280931966791
  (0, 19579)	0.258754235708
  (0, 5814)	0.23936049477
  (0, 16228)	0.183016452411
  (0, 2998)	0.179695401064
  (0, 19066)	0.177099732255
  (0, 641)	0.22360336359
  (0, 1712)	0.218671204275
  (0, 1492)	0.249549645659
  (0, 7314)	0.102164020077
  (0, 16563)	0.249549645659
  (0, 13361)	0.229436869908
  (0, 7274)	0.214398773542
  (0, 14135)	0.236576504625
  (0, 2371)	0.258754235708
  (0, 15495)	0.218671204275
  (0, 16628)	0.271727376742
  (0, 11237)	0.253822076392
  (0, 12822)	0.173562867311
  (0, 11968)	0.170716929997
  (1, 11388)	0.171541827176
  (1, 3919)	0.169621729789
  (1, 8054)	0.237998274652
  (1, 3538)	0.156759319167
  (1, 6554)	0.0596913408339
  :	:
  (7176, 16893)	0.142118600074
  (7176, 10645)	0.0957326757805
  (7176, 3078)	0.165865612074
  (7176, 19904)	0.177144468939
  (7176, 12374)	0.0868874065707
  (7176, 11105)	0.0784995308889
  (7176, 4024)	0.080406761479
  (7176, 1341)	0.0796

In [145]:
# mapping from integer feature name to original token string
feature_names = vectorizer.get_feature_names()
opts_select_chi2 = 100

print("Extracting %d best features by a chi-squared test" %
      opts_select_chi2)
t0 = time()
ch2 = SelectKBest(chi2, k=opts_select_chi2)
X_train = ch2.fit_transform(X_train2, y_train)
X_test = ch2.transform(X_test2)
if feature_names:
    # keep selected feature names
    feature_names = [feature_names[i] for i
                     in ch2.get_support(indices=True)]
print("done in %fs" % (time() - t0))
print()

feature_names = np.asarray(feature_names)

Extracting 100 best features by a chi-squared test
done in 0.020813s



In [146]:
feature_names

array([u'accusations', u'accused', u'affected', u'alan', u'allegations',
       u'anti', u'argument', u'arrested', u'asked', u'attacked',
       u'authorities', u'ban', u'best', u'branch', u'bribes', u'case',
       u'cases', u'charged', u'charges', u'civil', u'comment',
       u'communication', u'continuing', u'controversy', u'corrupt',
       u'corruption', u'creative', u'crimes', u'criminal', u'crisis',
       u'cut', u'death', u'debate', u'debt', u'declined', u'demanding',
       u'depuy', u'didn', u'difficult', u'emergency', u'entrepreneur',
       u'error', u'evidence', u'fallen', u'fell', u'force', u'forced',
       u'frustrated', u'gained', u'government', u'groom', u'guilty',
       u'halt', u'ignored', u'include', u'information', u'invested',
       u'investigation', u'irish', u'killed', u'killing', u'lawsuit',
       u'lose', u'losing', u'losses', u'lost', u'mails', u'marriage',
       u'mckesson', u'mobile', u'paid', u'plane', u'political',
       u'problems', u'prosecutors'

In [147]:
def trim(s):
    """Trim string to fit on terminal (assuming 80-column display)"""
    return s if len(s) < 80 else s[:75] + "..."

In [148]:
###############################################################################
# Benchmark classifiers
def benchmark(clf):
    print('_' * 80)
    print("Training: ")
    print(clf)
    t0 = time()
#     print(X_train)
#     print(y_train)
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    print("test time:  %0.3fs" % test_time)

    score = metrics.accuracy_score(y_test, pred)
    print("accuracy:   %0.3f" % score)

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))

        print('qqq',clf.coef_)
        
#         if feature_names is not None:
#             print("top 10 keywords per class:")
#             for i, category in enumerate(categories):
#                 print('>>>',i, category)
#                 print('www',np.argsort(clf.coef_[i]))
#                 top10 = np.argsort(clf.coef_[i])[-10:]
#                 print(trim("%s: %s"
#                       % (category, " ".join(feature_names[top10]))))
#         print()

#     print("classification report:")
#     print(metrics.classification_report(y_test, pred,
#                                         target_names=categories))

#     print("confusion matrix:")
#     print(metrics.confusion_matrix(y_test, pred))

#     print()
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time

In [149]:
results = []
for clf, name in (
        #(RidgeClassifier(tol=1e-2, solver="lsqr"), "Ridge Classifier"),
        (Perceptron(n_iter=50), "Perceptron"),
        (PassiveAggressiveClassifier(n_iter=50), "Passive-Aggressive"),
        (KNeighborsClassifier(n_neighbors=10), "kNN"),
        (RandomForestClassifier(n_estimators=100), "Random forest")
    ):
    print('=' * 80)
    print(name)
    results.append(benchmark(clf))

for penalty in ["l2", "l1"]:
    print('=' * 80)
    print("%s penalty" % penalty.upper())
    # Train Liblinear model
    results.append(benchmark(LinearSVC(loss='l2', penalty=penalty,
                                            dual=False, tol=1e-3)))

    # Train SGD model
    results.append(benchmark(SGDClassifier(alpha=.0001, n_iter=50,
                                           penalty=penalty)))

# Train SGD with Elastic Net penalty
print('=' * 80)
print("Elastic-Net penalty")
results.append(benchmark(SGDClassifier(alpha=.0001, n_iter=50,
                                       penalty="elasticnet")))

# Train NearestCentroid without threshold
print('=' * 80)
print("NearestCentroid (aka Rocchio classifier)")
results.append(benchmark(NearestCentroid()))

# Train sparse Naive Bayes classifiers
print('=' * 80)
print("Naive Bayes")
results.append(benchmark(MultinomialNB(alpha=.01)))
results.append(benchmark(BernoulliNB(alpha=.01)))

print('=' * 80)
print("LinearSVC with L1-based feature selection")
# The smaller C, the stronger the regularization.
# The more regularization, the more sparsity.
results.append(benchmark(Pipeline([
  ('feature_selection', LinearSVC(penalty="l1", dual=False, tol=1e-3)),
  ('classification', LinearSVC())
])))

# make some plots

indices = np.arange(len(results))

results = [[x[i] for x in results] for i in range(4)]

clf_names, score, training_time, test_time = results
training_time = np.array(training_time) / np.max(training_time)
test_time = np.array(test_time) / np.max(test_time)

plt.figure(figsize=(12, 8))
plt.title("Score")
plt.barh(indices, score, .2, label="score", color='r')
plt.barh(indices + .3, training_time, .2, label="training time", color='g')
plt.barh(indices + .6, test_time, .2, label="test time", color='b')
plt.yticks(())
plt.legend(loc='best')
plt.subplots_adjust(left=.25)
plt.subplots_adjust(top=.95)
plt.subplots_adjust(bottom=.05)

for i, c in zip(indices, clf_names):
    plt.text(-.3, i, c)

plt.show()

Perceptron
________________________________________________________________________________
Training: 
Perceptron(alpha=0.0001, class_weight=None, eta0=1.0, fit_intercept=True,
      n_iter=50, n_jobs=1, penalty=None, random_state=0, shuffle=True,
      verbose=0, warm_start=False)
train time: 0.015s
test time:  0.000s
accuracy:   0.444
dimensionality: 100
density: 1.000000
qqq [[-0.5335122  -0.16511454  0.00374413  0.48295704 -0.1601605   0.13537801
  -0.02309218 -0.55704241  0.08029599 -0.3415274  -0.04028362 -0.51794795
   0.16182623 -0.18308933 -0.47915366  0.00272184 -0.44229246 -0.03284175
   0.09967849 -0.13049082 -0.08857961 -0.59862664 -0.06061582 -0.26512979
  -0.29895648 -0.23065064  0.10973617 -0.53932572 -0.1779774  -0.28218344
  -0.17013736 -0.07778489 -0.04678277  0.03071914  0.01696396 -0.50072726
  -0.25707113  0.14075555  0.22639308 -0.08603145  0.2474953  -0.20903327
  -0.25046047 -0.33029966 -0.27934157  0.20697446 -0.30893246 -0.51634666
   0.19834124 -0.03078869  

