Preparing Data
==============

In [1]:
import json
import cPickle as pickle
import numpy as np
import pandas as pd
from pandas import factorize

In [2]:
!ls bossa/*json

bossa/control_tasks.json       bossa/tasks_export.json
bossa/control_tasks_runs.json  bossa/tasks_runs_export.json


BOSSA Results
-------------

Processing `results_bossa.json` to get a *dictionary* with keys the task ids, and values in as the average value of the scores. To do that, we first convert scores from categorical (`neg`, `neu`, `pos`) to a numeric scale.

In [3]:
bossa_results = pd.read_json("bossa/tasks_runs_export.json")
bossa_results.rename(columns={"created": "start_time", "id": "result_id", "info": "score"}, inplace=True)
bossa_results[['start_time']]= bossa_results[['start_time']].apply(pd.to_datetime, dayfirst=True)
bossa_results[['finish_time']]= bossa_results[['finish_time']].apply(pd.to_datetime, dayfirst=True)
bossa_results['score'] = pd.Categorical(bossa_results['score'], categories=['vneg', 'neg', 'neu', 'pos', 'vpos'])
bossa_results['score'].cat.rename_categories([-2, -1, 0, 1, 2], inplace=True)
# Normalize everything to -1, 0, 1
# bossa_results['score'] = bossa_results['score'].astype(float).apply(lambda x: -1 if x < 0 else 1 if x > 0 else 0)
bossa_results["seconds"] = (bossa_results["finish_time"] - bossa_results["start_time"]).astype('timedelta64[us]') / 1e6
bossa_results = bossa_results[["result_id", "seconds", "task_id", "score"]]
bossa_results.ix[[50]]

Unnamed: 0,result_id,seconds,task_id,score
50,11203,2.5e-05,52775,1


The information about the sentence comes in a dictionary inside the cells of the serie `info`, so we expand it.

In [4]:
bossa_tasks = pd.read_json("bossa/tasks_export.json")
bossa_tasks[['created']]= bossa_tasks[['created']].apply(pd.to_datetime, dayfirst=True)
bossa_tasks.rename(columns={'id': 'task_id'}, inplace=True)
bossa_tasks = bossa_tasks[['task_id', 'info']]
bossa_tasks.ix[[50]]

Unnamed: 0,task_id,info
50,52851,"{u'search_words': u'founder', u'appears_in_sen..."


And finally we merge the `DataFrame` with the scores with the one containing the sentences.

In [5]:
bossa_tasks_scores = pd.merge(bossa_results, bossa_tasks, on='task_id')
bossa_tasks_scores.ix[[50]]

Unnamed: 0,result_id,seconds,task_id,score,info
50,11195,2.1e-05,52776,2,"{u'search_words': u'executive', u'appears_in_s..."


Let's now expand the column `info` into as many new columns as keys has the dictionary `info`.

In [6]:
bossa_tasks_scores.ix[50].info.keys()

[u'search_words',
 u'appears_in_sentence',
 u'url',
 u'media',
 u'appears_in_noun_phrases',
 u'noun_phrases',
 u'sentence_id',
 u'text',
 u'sentence',
 u'pub_date',
 u'is_company']

In [7]:
def json_to_series(info):
    keys, values = zip(*info.iteritems())
    return pd.Series(values, index=keys)

bossa_info = bossa_tasks_scores["info"].apply(json_to_series)
bossa_info.reset_index()
bossa = pd.concat([bossa_tasks_scores, bossa_info], axis=1)
bossa.pop("info")
# bossa['id'] = bossa['id'].astype(float)
bossa.ix[50:53]

Unnamed: 0,result_id,seconds,task_id,score,search_words,appears_in_sentence,url,media,appears_in_noun_phrases,noun_phrases,sentence_id,text,sentence,pub_date,is_company
50,11195,2.1e-05,52776,2,executive,0,http://dealbook.nytimes.com/2013/05/17/a-toeho...,nyt,0,"[chinese investors, overseas companies, politi...",14,Chinese investors are increasingly opting to b...,Chinese investors are increasingly opting to b...,2013-05-17T11:47:51Z,0
51,11205,1.8e-05,52776,-1,executive,0,http://dealbook.nytimes.com/2013/05/17/a-toeho...,nyt,0,"[chinese investors, overseas companies, politi...",14,Chinese investors are increasingly opting to b...,Chinese investors are increasingly opting to b...,2013-05-17T11:47:51Z,0
52,11207,1.7e-05,52776,1,executive,0,http://dealbook.nytimes.com/2013/05/17/a-toeho...,nyt,0,"[chinese investors, overseas companies, politi...",14,Chinese investors are increasingly opting to b...,Chinese investors are increasingly opting to b...,2013-05-17T11:47:51Z,0
53,11209,1.7e-05,52776,-2,executive,0,http://dealbook.nytimes.com/2013/05/17/a-toeho...,nyt,0,"[chinese investors, overseas companies, politi...",14,Chinese investors are increasingly opting to b...,Chinese investors are increasingly opting to b...,2013-05-17T11:47:51Z,0


Aggregate
---------

We now aggregate calculating the average per `sentence_id` using a group by. In the process, we lose the source of the data, that's why we first have to save it.

In [8]:
bossa.to_csv("sentiment/scores_ungrouped.csv", encoding="utf8")

Finally, we aggregate and create a new `DataFrame` for the different sentences and their score.

In [9]:
sentences = bossa.groupby(['sentence'])[['score']].aggregate(np.average)
sentences.to_csv("sentiment/scores.csv", encoding="utf8")
print(sentences.count())
sentences[1001:1004]

score    8996
dtype: int64


Unnamed: 0_level_0,score
sentence,Unnamed: 1_level_1
"'We must hope after so much prevarication that this time Google's proposals represent a genuine attempt to address the concerns identified,' said David Wood, the legal counsel for Icomp, an industry group backed by Microsoft and a number of other companies.",-0.333333
"'We must push our leaders to step up and commit to action,' said Hugh Evans, the founder and chief executive of the charity.",-0.285714
"'We need them to tell the story of how we are making decisions and putting the organization together,' said George Postolos, the Astros' president and chief executive, who added that the team would not want a broadcaster who was uncomfortable explaining the front office's strategy.",-0.666667


In [10]:
from collections import Counter

def majority(series): #receives a Pandas Series
    return Counter(map(lambda x: 1 if x > 0 else -1 if x < 0 else 0, series)).most_common(1)[0][0]

# score_calculate = np.average
score_calculate = majority

sentences = bossa.groupby(['sentence'])[['score']].aggregate(score_calculate)

Create the tranining and testing sets (data and labels) from a randomized version of the set of assessed sentences.

In [11]:
sentences.reset_index().count()

sentence    8996
score       8996
dtype: int64

In [12]:
raw_scores = sentences.reset_index()
scores = raw_scores
print('Zero:', len(scores[scores.score==0]))
print('Non-zero:', len(scores[scores.score!=0]))

('Zero:', 2415)
('Non-zero:', 6581)


We could consider 3 classes, but it toruns out that using binary classficication seems to produce better results. Still, try multi-classs classifiers is something worth trying.

In [13]:
raw_scores = sentences.reset_index()
scores = raw_scores
scores = scores[scores.score!=0]  # We ignore the neutral sentences
scores['sentiment'] = scores['score'].apply(lambda s: 'pos' if s > 0 else 'neg')
# scores['sentiment'] = scores['score'].apply(lambda s: 'pos' if s > 0 else 'neg' if s < 0 else 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [14]:
len(scores)

6581

In [15]:
document_df = scores[['sentence', 'sentiment']]
document_df = document_df.reindex(np.random.permutation(document_df.index))

In [16]:
document_df.head()

Unnamed: 0,sentence,sentiment
5236,"Mike Davis, the U.S.G.A. executive director, s...",pos
2644,But he has also suggested that federal regulat...,pos
2493,Both the Apple and Google stores offer several...,pos
7423,The commission also considered Google's partne...,pos
8586,Video provided by Chevron shows young men flat...,neg


#SCIKIT-LEARN

In [29]:
from __future__ import print_function

import logging
import numpy as np
from optparse import OptionParser
import sys
from time import time
import matplotlib.pyplot as plt

from nltk.corpus import stopwords
from sklearn import metrics
from sklearn.cross_validation import cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import PassiveAggressiveClassifier, Perceptron, RidgeClassifier, SGDClassifier
from sklearn.metrics import precision_recall_fscore_support
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.utils.extmath import density

In [19]:
# Display progress logs on stdout
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s')

In [20]:
# from sklearn.cross_validation import train_test_split

# categories = [
#     'pos',
#     'neg',
# ]
# print("Loading sentences for categories:")
# print(categories)
# data_train, data_test = train_test_split(document_df, train_size=0.9, test_size=0.1, random_state=100)
# print('Data loaded:')
# print('Train set: {} samples'.format(len(data_train)))
# print('Test set: {} samples'.format(len(data_test)))

In [21]:
# len(document_df)

In [22]:
# data_train['sentiment'].value_counts()

In [23]:
# data_test['sentiment'].value_counts()

In [24]:
# train_labels = np.array(map(lambda x: 1 if x == 'pos' else 0, data_train['sentiment']))
# train_labels

In [25]:
# test_labels = np.array(map(lambda x: 1 if x == 'pos' else 0, data_test['sentiment']))
# test_labels

In [26]:
def size_mb(docs):
    return sum(len(s.encode('utf-8')) for s in docs) / 1e6

# data_train_size_mb = size_mb(data_train['sentence'])
# data_test_size_mb = size_mb(data_test['sentence'])

# print("%d documents - %0.3fMB (training set)" % (
#     len(data_train), data_train_size_mb))
# print("%d documents - %0.3fMB (test set)" % (
#     len(data_test), data_test_size_mb))
# print("%d categories" % len(categories))
# print()

In [None]:
X = scores['sentence']
y = scores['sentiment']

In [None]:
K = 10
parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    'vect__max_features': (None, 1000, 5000, 10000, 50000),
    'vect__ngram_range': ((1, 1), (1, 2), (2, 2), (1, 3), (2, 3), (3, 3)),  # unigrams or bigrams or trigrams
    'vect__stop_words': (None, stopwords.words('english')),
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    'clf__alpha': (1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6),
#     'clf__penalty': ('l1', 'l2', 'elasticnet'),
#     'clf__n_iter': (10, 50, 80),
#     'clf__loss': ('log', 'modified_huber'),
}
csf_list = [
    (RidgeClassifier(tol=1e-2, solver="lsqr"), "Ridge Classifier"),
    (Perceptron(n_iter=50), "Perceptron"),
    (PassiveAggressiveClassifier(n_iter=50), "Passive-Aggressive"),
    (KNeighborsClassifier(n_neighbors=10), "kNN"),
    (RandomForestClassifier(n_estimators=100), "Random forest")
]
results = []
for clf, name in csf_list:
    pipeline = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', clf),
    ])
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)
    skf = StratifiedKFold(y, K)
    cross_val_score(grid_search, X, y, cv=skf, n_jobs=-1, verbose=1)

In [36]:
# split a training set and a test set
y_train = train_labels
y_test = test_labels

print("Extracting features from the training data using a sparse vectorizer")
t0 = time()
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                             stop_words='english')
X_train = vectorizer.fit_transform(data_train.sentence.tolist())
duration = time() - t0
print("done in %fs at %0.3fMB/s" % (duration, data_train_size_mb / duration))
print("n_samples: %d, n_features: %d" % X_train.shape)
print()

print("Extracting features from the test data using the same vectorizer")
t0 = time()
X_test = vectorizer.transform(data_test.sentence.tolist())
duration = time() - t0
print("done in %fs at %0.3fMB/s" % (duration, data_test_size_mb / duration))
print("n_samples: %d, n_features: %d" % X_test.shape)
print()

Extracting features from the training data using a sparse vectorizer
done in 0.398901s at 2.688MB/s
n_samples: 5922, n_features: 18088

Extracting features from the test data using the same vectorizer
done in 0.027828s at 4.383MB/s
n_samples: 659, n_features: 18088



In [37]:
# mapping from integer feature name to original token string
feature_names = vectorizer.get_feature_names()
opts_select_chi2 = 100

print("Extracting %d best features by a chi-squared test" %
      opts_select_chi2)
t0 = time()
ch2 = SelectKBest(chi2, k=opts_select_chi2)
X_train = ch2.fit_transform(X_train, y_train)
X_test = ch2.transform(X_test)
if feature_names:
    # keep selected feature names
    feature_names = [feature_names[i] for i
                     in ch2.get_support(indices=True)]
print("done in %fs" % (time() - t0))
print()

feature_names = np.asarray(feature_names)

Extracting 1000 best features by a chi-squared test
done in 0.022103s



In [38]:
# feature_names

In [39]:
def trim(s):
    """Trim string to fit on terminal (assuming 80-column display)"""
    return s if len(s) < 80 else s[:75] + "..."

In [40]:
X_train0 = X_train
X_test0 = X_test

###############################################################################
# Benchmark classifiers
def benchmark(clf):
    print('_' * 80)
    print("Training: ")
    print(clf)
    
#     if type(clf) == GaussianNB:
#         X_train = X_train0.todense()
#         X_test = X_test0.todense()
#     else:
#         X_train = X_train0
#         X_test = X_test0
    
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    print("test time:  %0.3fs" % test_time)

    score = metrics.accuracy_score(y_test, pred)
    print("accuracy:   %0.3f" % score)

#     if hasattr(clf, 'coef_'):
#         print("dimensionality: %d" % clf.coef_.shape[1])
#         print("density: %f" % density(clf.coef_))

#         print('qqq',clf.coef_)
        
#         if feature_names is not None:
#             print("top 10 keywords per class:")
#             for i, category in enumerate(categories):
#                 print('>>>',i, category)
#                 print('www',np.argsort(clf.coef_[i]))
#                 top10 = np.argsort(clf.coef_[i])[-10:]
#                 print(trim("%s: %s"
#                       % (category, " ".join(feature_names[top10]))))
#         print()

#     print("classification report:")
#     print(metrics.classification_report(y_test, pred,
#                                         target_names=categories))

#     print("confusion matrix:")
#     print(metrics.confusion_matrix(y_test, pred))

#     print()
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time