Preparing Data
==============

In [9]:
import json
import cPickle as pickle
import numpy as np
import pandas as pd
from pandas import factorize

In [10]:
!ls bossa/*json

bossa/tasks_export.json  bossa/tasks_runs_export.json


BOSSA Results
-------------

Processing `results_bossa.json` to get a *dictionary* with keys the task ids, and values in as the average value of the scores. To do that, we first convert scores from categorical (`neg`, `neu`, `pos`) to a numeric scale.

In [11]:
bossa_results = pd.read_json("bossa/tasks_runs_export.json")
bossa_results.rename(columns={"created": "start_time", "id": "result_id", "info": "score"}, inplace=True)
bossa_results[['start_time']]= bossa_results[['start_time']].apply(pd.to_datetime, dayfirst=True)
bossa_results[['finish_time']]= bossa_results[['finish_time']].apply(pd.to_datetime, dayfirst=True)
bossa_results['score'] = pd.Categorical(bossa_results['score'], categories=['vneg', 'neg', 'neu', 'pos', 'vpos'])
bossa_results['score'].cat.rename_categories([-2, -1, 0, 1, 2], inplace=True)
# Normalize everything to -1, 0, 1
# bossa_results['score'] = bossa_results['score'].astype(float).apply(lambda x: -1 if x < 0 else 1 if x > 0 else 0)
bossa_results["seconds"] = (bossa_results["finish_time"] - bossa_results["start_time"]).astype('timedelta64[us]') / 1e6
bossa_results = bossa_results[["result_id", "seconds", "task_id", "score"]]
bossa_results.ix[[50]]

Unnamed: 0,result_id,seconds,task_id,score
50,11203,2.5e-05,52775,1


The information about the sentence comes in a dictionary inside the cells of the serie `info`, so we expand it.

In [12]:
bossa_tasks = pd.read_json("bossa/tasks_export.json")
bossa_tasks[['created']]= bossa_tasks[['created']].apply(pd.to_datetime, dayfirst=True)
bossa_tasks.rename(columns={'id': 'task_id'}, inplace=True)
bossa_tasks = bossa_tasks[['task_id', 'info']]
bossa_tasks.ix[[50]]

Unnamed: 0,task_id,info
50,52851,"{u'search_words': u'founder', u'appears_in_sen..."


And finally we merge the `DataFrame` with the scores with the one containing the sentences.

In [13]:
bossa_tasks_scores = pd.merge(bossa_results, bossa_tasks, on='task_id')
bossa_tasks_scores.ix[[50]]

Unnamed: 0,result_id,seconds,task_id,score,info
50,11195,2.1e-05,52776,2,"{u'search_words': u'executive', u'appears_in_s..."


Let's now expand the column `info` into as many new columns as keys has the dictionary `info`.

In [14]:
bossa_tasks_scores.ix[50].info.keys()

[u'search_words',
 u'appears_in_sentence',
 u'url',
 u'media',
 u'appears_in_noun_phrases',
 u'noun_phrases',
 u'sentence_id',
 u'text',
 u'sentence',
 u'pub_date',
 u'is_company']

In [15]:
def json_to_series(info):
    keys, values = zip(*info.iteritems())
    return pd.Series(values, index=keys)

bossa_info = bossa_tasks_scores["info"].apply(json_to_series)
bossa_info.reset_index()
bossa = pd.concat([bossa_tasks_scores, bossa_info], axis=1)
bossa.pop("info")
# bossa['id'] = bossa['id'].astype(float)
bossa.ix[50:53]

Unnamed: 0,result_id,seconds,task_id,score,search_words,appears_in_sentence,url,media,appears_in_noun_phrases,noun_phrases,sentence_id,text,sentence,pub_date,is_company
50,11195,2.1e-05,52776,2,executive,0,http://dealbook.nytimes.com/2013/05/17/a-toeho...,nyt,0,"[chinese investors, overseas companies, politi...",14,Chinese investors are increasingly opting to b...,Chinese investors are increasingly opting to b...,2013-05-17T11:47:51Z,0
51,11205,1.8e-05,52776,-1,executive,0,http://dealbook.nytimes.com/2013/05/17/a-toeho...,nyt,0,"[chinese investors, overseas companies, politi...",14,Chinese investors are increasingly opting to b...,Chinese investors are increasingly opting to b...,2013-05-17T11:47:51Z,0
52,11207,1.7e-05,52776,1,executive,0,http://dealbook.nytimes.com/2013/05/17/a-toeho...,nyt,0,"[chinese investors, overseas companies, politi...",14,Chinese investors are increasingly opting to b...,Chinese investors are increasingly opting to b...,2013-05-17T11:47:51Z,0
53,11209,1.7e-05,52776,-2,executive,0,http://dealbook.nytimes.com/2013/05/17/a-toeho...,nyt,0,"[chinese investors, overseas companies, politi...",14,Chinese investors are increasingly opting to b...,Chinese investors are increasingly opting to b...,2013-05-17T11:47:51Z,0


Aggregate
---------

We now aggregate calculating the average per `sentence_id` using a group by. In the process, we lose the source of the data, that's why we first have to save it.

In [16]:
bossa.to_csv("sentiment/scores_ungrouped.csv", encoding="utf8")

Finally, we aggregate and create a new `DataFrame` for the different sentences and their score.

In [17]:
sentences = bossa.groupby(['sentence'])[['score']].aggregate(np.average)
sentences.to_csv("sentiment/scores.csv", encoding="utf8")
print sentences.count()
sentences[1001:1004]

score    8996
dtype: int64


Unnamed: 0_level_0,score
sentence,Unnamed: 1_level_1
"'We must hope after so much prevarication that this time Google's proposals represent a genuine attempt to address the concerns identified,' said David Wood, the legal counsel for Icomp, an industry group backed by Microsoft and a number of other companies.",-0.333333
"'We must push our leaders to step up and commit to action,' said Hugh Evans, the founder and chief executive of the charity.",-0.285714
"'We need them to tell the story of how we are making decisions and putting the organization together,' said George Postolos, the Astros' president and chief executive, who added that the team would not want a broadcaster who was uncomfortable explaining the front office's strategy.",-0.666667


Sentence Classifier
-------------------

In [18]:
from nltk.corpus import stopwords
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import precision_recall_fscore_support
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

Create the tranining and testing sets (data and labels) from a randomized version of the set of assessed sentences.

In [19]:
sentences.reset_index().count()

sentence    8996
score       8996
dtype: int64

We could consider 3 classes, but it toruns out that using binary classficication seems to produce better results. Still, try multi-classs classifiers is something worth trying.

In [20]:
scores[scores.sentiment=='neg'].sentiment.count()

NameError: name 'scores' is not defined

In [None]:
np.random.permutation([1,2,3,4,5])

In [21]:
raw_scores = sentences.reset_index()
scores = raw_scores
scores = scores[scores.score!=0]  # We ignore the neutral sentences
scores['sentiment'] = scores['score'].apply(lambda s: 'pos' if s > 0 else 'neg')
percentage = 0.85  #  percentage for training, rest for for testing
# We split to have enough representativenesss for both positive and negative sentiments
sent_min = min(
    scores[scores.sentiment=='pos'].sentiment.count(),
    scores[scores.sentiment=='neg'].sentiment.count(),
)
scores = scores[["sentence", "sentiment"]]
train_data = np.array([])
train_labels = np.array([])
test_data = np.array([])
test_labels = np.array([])
for sent in ('pos', 'neg'):
    sent_scores = scores[scores['sentiment']==sent]
    sent_scores = sent_scores.reindex(np.random.permutation(sent_scores.index))
    sent_sentences_count = int(sent_scores['sentence'].count())
    sent_train = sent_scores[["sentence", "sentiment"]][:int(sent_sentences_count * percentage)]
    sent_test = sent_scores[["sentence", "sentiment"]][int(sent_sentences_count * percentage) + 1:]
    print sent, sent_min, sent_train.sentiment.count(), sent_test.sentiment.count()
    train_data = np.append(train_data, sent_train["sentence"])
    train_labels = np.append(train_labels, sent_train["sentiment"])
    test_data = np.append(test_data, sent_test["sentence"])
    test_labels = np.append(test_labels, sent_test["sentiment"])

pos 2939 4281 755
neg 2939 2498 440


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Extract feacture vectors from training data, and create and train a `MultinomialNB` classifier.

In [22]:
sentence_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB()),
])
sentence_clf = sentence_clf.fit(train_data, train_labels)

Evaluating the performance of the classifier.

In [23]:
# np.mean(sentence_clf.predict(test_data) == test_labels)
sentence_clf.score(test_data, test_labels)

0.64184100418410039

A performance of 64% is not very high, so we try now with a different pipeline using a `SGDClassifier` classifier.

In [24]:
sgdc_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                           alpha=1e-3, n_iter=1000)),
])
sgdc_clf = sgdc_clf.fit(train_data, train_labels)
sgdc_clf.score(test_data, test_labels)

0.63263598326359838

Not an improve really. Let's try now a grid search to find the best combination of calssifier and parameters.

In [25]:
from pprint import pprint
from time import time

pipeline = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier()),
])
parameters = {'vect__ngram_range': [(1, 1), (1, 2), (1, 3), (2, 3)],
              'vect__stop_words': (None, stopwords.words('spanish')),
              'tfidf__use_idf': (True, False),
              'clf__alpha': (1e-1, 1e-2, 1e-3, 1e-4),
              'clf__penalty': ('l1', 'l2', 'elasticnet'),
              'clf__loss': ('log', 'modified_huber')  #, 'squared_hinge', 'perceptron', 'squared_loss', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive')
}
parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    'vect__max_features': (None, 5000, 10000, 50000),
    'vect__ngram_range': ((1, 1), (1, 2), (1, 3), (2, 3)),  # unigrams or bigrams or trigrams
    'vect__stop_words': (None, stopwords.words('english')),
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    'clf__alpha': (0.00001, 0.000001),
    'clf__penalty': ('l1', 'l2', 'elasticnet'),
    'clf__n_iter': (10, 50, 80),
    'clf__loss': ('log', 'modified_huber'),
}

grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)
print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
#print("parameters:")
#pprint(parameters)
t0 = time()
grid_search.fit(train_data, train_labels)
print("done in %0.3fs" % (time() - t0))
print()

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Performing grid search...
('pipeline:', ['vect', 'tfidf', 'clf'])
Fitting 3 folds for each of 384 candidates, totalling 1152 fits


[Parallel(n_jobs=-1)]: Done   1 jobs       | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done  50 jobs       | elapsed:   17.8s
[Parallel(n_jobs=-1)]: Done 200 jobs       | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 450 jobs       | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 800 jobs       | elapsed:  5.5min
[Parallel(n_jobs=-1)]: Done 1138 out of 1152 | elapsed:  7.8min remaining:    5.8s
[Parallel(n_jobs=-1)]: Done 1152 out of 1152 | elapsed:  7.9min finished


done in 474.295s
()
Best score: 0.679
Best parameters set:
	clf__alpha: 0.001
	clf__loss: 'modified_huber'
	clf__penalty: 'l2'
	tfidf__use_idf: True
	vect__ngram_range: (1, 1)
	vect__stop_words: None


In [1]:
from pprint import pprint
from time import time

pipeline = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier()),
])
parameters = {'vect__ngram_range': [(1, 1), (1, 2), (1, 3), (2, 3)],
              'vect__stop_words': (None, stopwords.words('english')),
              'tfidf__use_idf': (True, False),
              'clf__alpha': (1e-1, 1e-2, 1e-3, 1e-4),
              'clf__penalty': ('l1', 'l2', 'elasticnet'),
              'clf__loss': ('log', 'modified_huber')  #, 'squared_hinge', 'perceptron', 'squared_loss', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive')
}
parameters2 = {
    'vect__max_df': (0.5, 0.75, 1.0),
    'vect__max_features': (None, 5000, 10000, 50000),
    'vect__ngram_range': ((1, 1), (1, 2), (1, 3), (2, 3)),  # unigrams or bigrams or trigrams
    'vect__stop_words': (None, stopwords.words('english')),
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    'clf__alpha': (0.00001, 0.000001),
    'clf__penalty': ('l1', 'l2', 'elasticnet'),
    'clf__n_iter': (10, 50, 80),
    'clf__loss': ('log', 'modified_huber'),
}

grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)
print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
#print("parameters:")
#pprint(parameters)
t0 = time()
grid_search.fit(train_data, train_labels)
print("done in %0.3fs" % (time() - t0))
print()

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

NameError: name 'Pipeline' is not defined

In [None]:
parameters = {'vect__ngram_range': [(1, 1), (1, 2), (1, 3), (2, 3)],
              'vect__stop_words': (None, stopwords.words('spanish')),
              'tfidf__use_idf': (True, False),
              'clf__alpha': (1e-1, 1e-2, 1e-3, 1e-4),
              'clf__penalty': ('l1', 'l2', 'elasticnet'),
              'clf__loss': ('log', 'modified_huber')  #, 'squared_hinge', 'perceptron', 'squared_loss', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive')
}
gs_clf = GridSearchCV(sgdc_clf, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(train_data, train_labels)
best_parameters, score, _ = max(gs_clf.grid_scores_, key=lambda x: x[1])
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, best_parameters[param_name]))
score

clf__alpha: 0.0001
clf__loss: 'log'
clf__penalty: 'l2'
tfidf__use_idf: True
vect__ngram_range: (1, 1)
vect__stop_words: None


0.66956778285882879

Set the best parameters for `SDGClassifier` and run a cross-validation.

In [None]:
sgdc_clf.set_params(**best_parameters)
sgdc_clf = sgdc_clf.fit(train_data, train_labels)
scores =  cross_val_score(sgdc_clf, train_data, train_labels, cv=5)
print("scores: %s  mean: %f  std: %f" % (str(scores), np.mean(scores), np.std(score)))
sgdc_clf.score(test_data, test_labels)

scores: [ 0.67133382  0.6740413   0.67625369  0.67601476  0.6797048 ]  mean: 0.675470  std: 0.000000


0.69790794979079496

Let's save it for later.

In [None]:
sgdc_scores =  cross_val_score(sgdc_clf, train_data, train_labels, cv=5)
print("scores: %s  mean: %f  std: %f" % (str(sgdc_scores), np.mean(sgdc_scores), np.std(sgdc_scores)))
with open("sentiment/sgdc_clf.pickle", "wb") as sgdc_file:
    pickle.dump(sgdc_clf, sgdc_file)
sgdc_clf.score(test_data, test_labels)

scores: [ 0.67133382  0.6740413   0.67625369  0.67601476  0.6797048 ]  mean: 0.675470  std: 0.002757


0.69790794979079496

Let's take a look to the precision, recall and F-score of the classifier.

In [None]:
precision, recall, fscore, support = precision_recall_fscore_support(test_labels, sgdc_clf.predict(test_data))
precision, recall, fscore

(array([ 0.6803653 ,  0.70184426]),
 array([ 0.33863636,  0.90728477]),
 array([ 0.4522003 ,  0.79145003]))

Try several more classifiers.

In [None]:
for classifier in [SGDClassifier, LinearSVC, MultinomialNB, BernoulliNB]:
    sentence_clf = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf', classifier()),
    ])
    sentence_clf = sentence_clf.fit(train_data, train_labels)
    scores =  cross_val_score(sentence_clf, train_data, train_labels, cv=5)
    print(classifier.__name__)
    print("\tscores: %s  mean: %f  std: %f" % (str(scores), np.mean(scores), np.std(score)))
    print("\tscore: %s" % sentence_clf.score(test_data, test_labels))

SGDClassifier
	scores: [ 0.36845984  0.36873156  0.36873156  0.36826568  0.36826568]  mean: 0.368491  std: 0.000000
	score: 0.36820083682
LinearSVC
	scores: [ 0.66764923  0.6600295   0.65634218  0.66715867  0.66568266]  mean: 0.663372  std: 0.000000
	score: 0.684518828452
MultinomialNB
	scores: [ 0.64112012  0.6379056   0.6379056   0.64280443  0.6398524 ]  mean: 0.639918  std: 0.000000
	score: 0.638493723849
BernoulliNB
	scores: [ 0.66322771  0.67182891  0.66666667  0.66346863  0.67232472]  mean: 0.667503  std: 0.000000
	score: 0.680334728033


Let's run `GridSeachCV` to find the best parameters for the ones showing better socres. Staring from `LinearSVC`

In [None]:
sentence_clf = Pipeline([('vect', CountVectorizer(strip_accents='unicode', decode_error='replace', stop_words=stopwords.words('spanish'))),
                     ('tfidf', TfidfTransformer(use_idf=False, smooth_idf=True)),
                     ('clf', LinearSVC(C=1.0, loss='l1')),
])
sentence_clf = sentence_clf.fit(train_data, train_labels)
print sentence_clf.score(test_data, test_labels)
parameters = {'vect__ngram_range': [(1, 1), (1, 2), (1, 3), (2, 3), (1, 4), (2, 4), (3, 4), (1, 5), (2, 5), (3, 5), (4, 5)],
              'tfidf__use_idf': (True, False),
              'tfidf__smooth_idf': (True, False),
              'clf__C': (10, 1, 1e-1, 1e-2),
              'clf__loss': ('l1', 'l2'),
}
gs_clf = GridSearchCV(sentence_clf, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(train_data, train_labels)
best_parameters, score, _ = max(gs_clf.grid_scores_, key=lambda x: x[1])
for param_name in sorted(parameters.keys()):
    print(u"%s: %r" % (param_name, best_parameters[param_name]))
score

0.687866108787
clf__C: 1
clf__loss: 'l1'
tfidf__smooth_idf: False
tfidf__use_idf: True
vect__ngram_range: (1, 1)


0.67001032600678567

It seems like `MultinomialNB` is also good, let's see the search grid.

In [None]:
sentence_clf = Pipeline([('vect', CountVectorizer(strip_accents='unicode', decode_error='replace', stop_words=stopwords.words('spanish'))),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB()),
])
sentence_clf = sentence_clf.fit(train_data, train_labels)
print sentence_clf.score(test_data, test_labels)
parameters = {'vect__ngram_range': [(1, 1), (1, 2), (1, 3), (2, 3), (1, 4), (2, 4), (3, 4), (1, 5), (2, 5), (3, 5), (4, 5)],
              'tfidf__use_idf': (True, False),
              'tfidf__smooth_idf': (True, False),
              'clf__alpha': (10, 1, 1e-1, 1e-2, 1e-3),
}
gs_clf = GridSearchCV(sentence_clf, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(train_data, train_labels)
best_parameters, score, _ = max(gs_clf.grid_scores_, key=lambda x: x[1])
for param_name in sorted(parameters.keys()):
    print(u"%s: %r" % (param_name, best_parameters[param_name]))
score

0.636820083682
clf__alpha: 0.1
tfidf__smooth_idf: True
tfidf__use_idf: True
vect__ngram_range: (1, 1)


0.66956778285882879

We create the two classifiers, run cross-validation, and save them as pickled objects.

In [None]:
multinb1_clf = Pipeline([('vect', CountVectorizer(strip_accents='unicode', decode_error='replace', stop_words=stopwords.words('spanish'))),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB()),
])
multinb2_clf = multinb1_clf.set_params(**best_parameters)
multinb_clf = multinb1_clf.fit(train_data, train_labels)
multinb_scores =  cross_val_score(multinb_clf, train_data, train_labels, cv=5)
print("scores: %s  mean: %f  std: %f" % (str(multinb_scores), np.mean(multinb_scores), np.std(multinb_scores)))
with open("sentiment/multinb_clf.pickle", "wb") as multinb_file:
    pickle.dump(multinb_clf, multinb_file)
multinb_clf.score(test_data, test_labels)

scores: [ 0.6705969   0.66961652  0.67330383  0.66273063  0.66568266]  mean: 0.668386  std: 0.003738


0.67029288702928869

In [None]:
precision, recall, fscore, support = precision_recall_fscore_support(train_labels, multinb_clf.predict(train_data))
precision, recall, fscore

(array([ 0.98076042,  0.92232376]),
 array([ 0.85708567,  0.99018921]),
 array([ 0.9147618 ,  0.95505238]))

Doing the same for `LinearSVC`.

In [None]:
linear1_clf = Pipeline([('vect', CountVectorizer(strip_accents='unicode', decode_error='replace', stop_words=stopwords.words('spanish'))),
                     ('tfidf', TfidfTransformer()),
                     ('clf', LinearSVC()),
])
clf__alpha = best_parameters.pop("clf__alpha")
linear2_clf = linear1_clf.set_params(**best_parameters)
linear_clf = linear2_clf.fit(train_data, train_labels)
linear_scores =  cross_val_score(linear_clf, train_data, train_labels, cv=5)
print("scores: %s  mean: %f  std: %f" % (str(linear_scores), np.mean(linear_scores), np.std(linear_scores)))
with open("sentiment/linear_clf.pickle", "wb") as linear_file:
    pickle.dump(linear_clf, linear_file)
linear_clf.score(test_data, test_labels)

scores: [ 0.66617539  0.66519174  0.65486726  0.67158672  0.66568266]  mean: 0.664701  std: 0.005431


0.67866108786610879

In [None]:
precision, recall, fscore, support = precision_recall_fscore_support(train_labels, multinb_clf.predict(train_data))
precision, recall, fscore

(array([ 0.98076042,  0.92232376]),
 array([ 0.85708567,  0.99018921]),
 array([ 0.9147618 ,  0.95505238]))

Let's try with `BernoulliNB`, the one that showed the best results so far.

In [None]:
sentence_clf = Pipeline([('vect', CountVectorizer(strip_accents='unicode', decode_error='replace', stop_words=stopwords.words('spanish'))),
                     ('tfidf', TfidfTransformer()),
                     ('clf', BernoulliNB()),
])
sentence_clf = sentence_clf.fit(train_data, train_labels)
print sentence_clf.score(test_data, test_labels)
parameters = {'vect__ngram_range': [(1, 1), (1, 2), (1, 3), (2, 3), (1, 4), (2, 4), (3, 4), (1, 5), (2, 5), (3, 5), (4, 5)],
              'tfidf__use_idf': (True, False),
              'tfidf__smooth_idf': (True, False),
              'clf__alpha': (10, 1, 1e-1, 1e-2, 1e-3),
}
gs_clf = GridSearchCV(sentence_clf, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(train_data, train_labels)
best_parameters, score, _ = max(gs_clf.grid_scores_, key=lambda x: x[1])
for param_name in sorted(parameters.keys()):
    print(u"%s: %r" % (param_name, best_parameters[param_name]))
score

0.680334728033
clf__alpha: 1
tfidf__smooth_idf: True
tfidf__use_idf: True
vect__ngram_range: (1, 1)


0.66337217878743182

In [30]:
bernoulli1_clf = Pipeline([('vect', CountVectorizer(strip_accents='unicode', decode_error='replace', stop_words=stopwords.words('spanish'))),
                     ('tfidf', TfidfTransformer()),
                     ('clf', BernoulliNB()),
])
bernoulli2_clf = bernoulli1_clf.set_params(**best_parameters)
bernoulli_clf = bernoulli2_clf.fit(train_data, train_labels)
multinb_scores =  cross_val_score(multinb_clf, train_data, train_labels, cv=5)
print("scores: %s  mean: %f  std: %f" % (str(multinb_scores), np.mean(multinb_scores), np.std(multinb_scores)))
with open("sentiment/bernoulli_clf.pickle", "wb") as multinb_file:
    pickle.dump(multinb_clf, multinb_file)
multinb_clf.score(test_data, test_labels)

scores: [ 0.6705969   0.66961652  0.67330383  0.66273063  0.66568266]  mean: 0.668386  std: 0.003738


0.67029288702928869

In [31]:
precision, recall, fscore, support = precision_recall_fscore_support(train_labels, multinb_clf.predict(train_data))
precision, recall, fscore

(array([ 0.98076042,  0.92232376]),
 array([ 0.85708567,  0.99018921]),
 array([ 0.9147618 ,  0.95505238]))