Preparing Data
==============

In [1]:
import json
import cPickle as pickle
import numpy as np
import pandas as pd
from pandas import factorize

In [2]:
!ls bossa/*json

bossa/tasks_export.json  bossa/tasks_runs_export.json


BOSSA Results
-------------

Processing `results_bossa.json` to get a *dictionary* with keys the task ids, and values in as the average value of the scores. To do that, we first convert scores from categorical (`neg`, `neu`, `pos`) to a numeric scale.

In [3]:
bossa_results = pd.read_json("bossa/tasks_runs_export.json")
bossa_results.rename(columns={"created": "start_time", "id": "result_id", "info": "score"}, inplace=True)
bossa_results[['start_time']]= bossa_results[['start_time']].apply(pd.to_datetime, dayfirst=True)
bossa_results[['finish_time']]= bossa_results[['finish_time']].apply(pd.to_datetime, dayfirst=True)
bossa_results['score'] = pd.Categorical(bossa_results['score'], categories=['vneg', 'neg', 'neu', 'pos', 'vpos'])
bossa_results['score'].cat.rename_categories([-2, -1, 0, 1, 2], inplace=True)
# Normalize everything to -1, 0, 1
# bossa_results['score'] = bossa_results['score'].astype(float).apply(lambda x: -1 if x < 0 else 1 if x > 0 else 0)
bossa_results["seconds"] = (bossa_results["finish_time"] - bossa_results["start_time"]).astype('timedelta64[us]') / 1e6
bossa_results = bossa_results[["result_id", "seconds", "task_id", "score"]]
bossa_results.ix[[50]]

Unnamed: 0,result_id,seconds,task_id,score
50,11203,2.5e-05,52775,1


The information about the sentence comes in a dictionary inside the cells of the serie `info`, so we expand it.

In [4]:
bossa_tasks = pd.read_json("bossa/tasks_export.json")
bossa_tasks[['created']]= bossa_tasks[['created']].apply(pd.to_datetime, dayfirst=True)
bossa_tasks.rename(columns={'id': 'task_id'}, inplace=True)
bossa_tasks = bossa_tasks[['task_id', 'info']]
bossa_tasks.ix[[50]]

Unnamed: 0,task_id,info
50,52851,"{u'search_words': u'founder', u'appears_in_sen..."


And finally we merge the `DataFrame` with the scores with the one containing the sentences.

In [5]:
bossa_tasks_scores = pd.merge(bossa_results, bossa_tasks, on='task_id')
bossa_tasks_scores.ix[[50]]

Unnamed: 0,result_id,seconds,task_id,score,info
50,11195,2.1e-05,52776,2,"{u'search_words': u'executive', u'appears_in_s..."


Let's now expand the column `info` into as many new columns as keys has the dictionary `info`.

In [6]:
bossa_tasks_scores.ix[50].info.keys()

[u'search_words',
 u'appears_in_sentence',
 u'url',
 u'media',
 u'appears_in_noun_phrases',
 u'noun_phrases',
 u'sentence_id',
 u'text',
 u'sentence',
 u'pub_date',
 u'is_company']

In [7]:
def json_to_series(info):
    keys, values = zip(*info.iteritems())
    return pd.Series(values, index=keys)

bossa_info = bossa_tasks_scores["info"].apply(json_to_series)
bossa_info.reset_index()
bossa = pd.concat([bossa_tasks_scores, bossa_info], axis=1)
bossa.pop("info")
# bossa['id'] = bossa['id'].astype(float)
bossa.ix[50:53]

Unnamed: 0,result_id,seconds,task_id,score,search_words,appears_in_sentence,url,media,appears_in_noun_phrases,noun_phrases,sentence_id,text,sentence,pub_date,is_company
50,11195,2.1e-05,52776,2,executive,0,http://dealbook.nytimes.com/2013/05/17/a-toeho...,nyt,0,"[chinese investors, overseas companies, politi...",14,Chinese investors are increasingly opting to b...,Chinese investors are increasingly opting to b...,2013-05-17T11:47:51Z,0
51,11205,1.8e-05,52776,-1,executive,0,http://dealbook.nytimes.com/2013/05/17/a-toeho...,nyt,0,"[chinese investors, overseas companies, politi...",14,Chinese investors are increasingly opting to b...,Chinese investors are increasingly opting to b...,2013-05-17T11:47:51Z,0
52,11207,1.7e-05,52776,1,executive,0,http://dealbook.nytimes.com/2013/05/17/a-toeho...,nyt,0,"[chinese investors, overseas companies, politi...",14,Chinese investors are increasingly opting to b...,Chinese investors are increasingly opting to b...,2013-05-17T11:47:51Z,0
53,11209,1.7e-05,52776,-2,executive,0,http://dealbook.nytimes.com/2013/05/17/a-toeho...,nyt,0,"[chinese investors, overseas companies, politi...",14,Chinese investors are increasingly opting to b...,Chinese investors are increasingly opting to b...,2013-05-17T11:47:51Z,0


Aggregate
---------

We now aggregate calculating the average per `sentence_id` using a group by. In the process, we lose the source of the data, that's why we first have to save it.

In [8]:
bossa.to_csv("sentiment/scores_ungrouped.csv", encoding="utf8")

Finally, we aggregate and create a new `DataFrame` for the different sentences and their score.

In [9]:
sentences = bossa.groupby(['sentence'])[['score']].aggregate(np.average)
sentences.to_csv("sentiment/scores.csv", encoding="utf8")
print sentences.count()
sentences[1001:1004]

score    8996
dtype: int64


Unnamed: 0_level_0,score
sentence,Unnamed: 1_level_1
"'We must hope after so much prevarication that this time Google's proposals represent a genuine attempt to address the concerns identified,' said David Wood, the legal counsel for Icomp, an industry group backed by Microsoft and a number of other companies.",-0.333333
"'We must push our leaders to step up and commit to action,' said Hugh Evans, the founder and chief executive of the charity.",-0.285714
"'We need them to tell the story of how we are making decisions and putting the organization together,' said George Postolos, the Astros' president and chief executive, who added that the team would not want a broadcaster who was uncomfortable explaining the front office's strategy.",-0.666667


Sentence Classifier
-------------------

In [10]:
from nltk.corpus import stopwords
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import precision_recall_fscore_support
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

Create the tranining and testing sets (data and labels) from a randomized version of the set of assessed sentences.

In [11]:
sentences.reset_index().count()

sentence    8996
score       8996
dtype: int64

We could consider 3 classes, but it toruns out that using binary classficication seems to produce better results. Still, try multi-classs classifiers is something worth trying.

In [12]:
raw_scores = sentences.reset_index()
scores = raw_scores
scores = scores[scores.score!=0]  # We ignore the neutral sentences
scores['sentiment'] = scores['score'].apply(lambda s: 'pos' if s > 0 else 'neg')
percentage = 0.85  #  percentage for training, rest for for testing
# We split to have enough representativenesss for both positive and negative sentiments
sent_min = min(
    scores[scores.sentiment=='pos'].sentiment.count(),
    scores[scores.sentiment=='neg'].sentiment.count(),
)
scores = scores[["sentence", "sentiment"]]
train_data = np.array([])
train_labels = np.array([])
test_data = np.array([])
test_labels = np.array([])
for sent in ('pos', 'neg'):
    sent_scores = scores[scores['sentiment']==sent]
    sent_scores = sent_scores.reindex(np.random.permutation(sent_scores.index))
    sent_sentences_count = int(sent_scores['sentence'].count())
    sent_train = sent_scores[["sentence", "sentiment"]][:int(sent_sentences_count * percentage)]
    sent_test = sent_scores[["sentence", "sentiment"]][int(sent_sentences_count * percentage) + 1:]
    print sent, sent_min, sent_train.sentiment.count(), sent_test.sentiment.count()
    train_data = np.append(train_data, sent_train["sentence"])
    train_labels = np.append(train_labels, sent_train["sentiment"])
    test_data = np.append(test_data, sent_test["sentence"])
    test_labels = np.append(test_labels, sent_test["sentiment"])

pos 2939 4281 755
neg 2939 2498 440


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


#NLTK

In [13]:
document_df = scores[['sentence', 'sentiment']]
document_df = document_df.reindex(np.random.permutation(document_df.index))

In [14]:
document_df.head()

Unnamed: 0,sentence,sentiment
6638,"Slawomir Sierakowski is a sociologist, a found...",pos
8815,While swapping notes on China's latest 'airpoc...,neg
4601,"It was a stunning fall for Mr. Diamond, the on...",neg
6740,Started in Ghana by Danish entrepreneur Erik E...,pos
6632,Six of the women had been teachers; Clair is a...,neg


In [15]:
original_documents = [(r[1]['sentence'], r[1]['sentiment']) for r in document_df.iterrows()]

In [16]:
pos_file = open('positive-words.txt')
all_pos_words = [w.strip().lower() for w in pos_file]

neg_file = open('negative-words.txt')
all_neg_words = [w.strip().lower() for w in neg_file]

In [17]:
search_terms_f = open('search_terms.txt')
search_terms = []
for t in search_terms:
    l = t.strip().split()
    search_terms.extend(l)

In [18]:
import string

import nltk
from nltk.corpus import stopwords

###Options

In [19]:
#stopwords_english
stopwords_english = stopwords.words('english')
#stopwords_domain
stopwords_domain = search_terms + ["'s", '--']
#stopwords_english
punctuation = string.punctuation

In [20]:
def document_features1(document):      
    document_words = set(document)
    features = {}
    for word in word_features:
        features[word] = word in document_words
    return features

In [21]:
def document_features2(document):      
    document_words = list(document)
    features = {}
    for word in document_words:
        if word in word_features:
            features[word] = features.get(word, 0) + 1
    return features

In [22]:
def document_features3(document):
    word_features3 = list(pos_words) + list(neg_words)
    document_words = set(document)
    features = {}
    for word in word_features3:
        features[word] = word in document_words
    return features

In [23]:
options = {}

#Option 1
opt1 = {
    'length': 1000,
    'reverse': False,
    'stopwords_domain_in_features': False,
    'document_features': document_features1,
    'stopwords_english': stopwords_english,
    'stopwords_domain': stopwords_domain,
    'punctuation': punctuation,
    'numbers': False,
}
options[1] = opt1

#Option 2
opt2 = {
    'length': 1000,
    'reverse': False,
    'stopwords_domain_in_features': False,
    'document_features': document_features1,
    'stopwords_english': stopwords_english,
    'stopwords_domain': stopwords_domain,
    'punctuation': [],
    'numbers': False,
}
options[2] = opt2

#Option 3
opt3 = {
    'length': 1000,
    'reverse': False,
    'stopwords_domain_in_features': False,
    'document_features': document_features1,
    'stopwords_english': stopwords_english,
    'stopwords_domain': [],
    'punctuation': punctuation,
    'numbers': False,
}
options[3] = opt3

#Option 4
opt4 = {
    'length': 1000,
    'reverse': False,
    'stopwords_domain_in_features': False,
    'document_features': document_features1,
    'stopwords_english': [],
    'stopwords_domain': stopwords_domain,
    'punctuation': punctuation,
    'numbers': False,
}
options[4] = opt4

#Option 5
opt5 = {
    'length': 1000,
    'reverse': False,
    'stopwords_domain_in_features': False,
    'document_features': document_features1,
    'stopwords_english': stopwords_english,
    'stopwords_domain': [],
    'punctuation': [],
    'numbers': False,
}
options[5] = opt5

#Option 6
opt6 = {
    'length': 1000,
    'reverse': False,
    'stopwords_domain_in_features': False,
    'document_features': document_features1,
    'stopwords_english': [],
    'stopwords_domain': [],
    'punctuation': punctuation,
    'numbers': False,
}
options[6] = opt6

#Option 7
opt7 = {
    'length': 1000,
    'reverse': False,
    'stopwords_domain_in_features': False,
    'document_features': document_features1,
    'stopwords_english': [],
    'stopwords_domain': stopwords_domain,
    'punctuation': [],
    'numbers': False,
}
options[7] = opt7

#Option 8
opt8 = {
    'length': 1000,
    'reverse': False,
    'stopwords_domain_in_features': False,
    'document_features': document_features1,
    'stopwords_english': [],
    'stopwords_domain': [],
    'punctuation': [],
    'numbers': False,
}
options[8] = opt8

#Option 9
opt9 = {
    'length': 1000,
    'reverse': False,
    'stopwords_domain_in_features': False,
    'document_features': document_features1,
    'stopwords_english': stopwords_english,
    'stopwords_domain': stopwords_domain,
    'punctuation': punctuation,
    'numbers': True,
}
options[9] = opt9

#Option 10
opt10 = {
    'length': 1000,
    'reverse': False,
    'stopwords_domain_in_features': False,
    'document_features': document_features1,
    'stopwords_english': stopwords_english,
    'stopwords_domain': stopwords_domain,
    'punctuation': [],
    'numbers': True,
}
options[10] = opt10

#Option 11
opt11 = {
    'length': 1000,
    'reverse': False,
    'stopwords_domain_in_features': False,
    'document_features': document_features1,
    'stopwords_english': stopwords_english,
    'stopwords_domain': [],
    'punctuation': punctuation,
    'numbers': True,
}
options[11] = opt11

#Option 12
opt12 = {
    'length': 1000,
    'reverse': False,
    'stopwords_domain_in_features': False,
    'document_features': document_features1,
    'stopwords_english': [],
    'stopwords_domain': stopwords_domain,
    'punctuation': punctuation,
    'numbers': True,
}
options[12] = opt12

#Option 13
opt13 = {
    'length': 1000,
    'reverse': False,
    'stopwords_domain_in_features': False,
    'document_features': document_features1,
    'stopwords_english': stopwords_english,
    'stopwords_domain': [],
    'punctuation': [],
    'numbers': True,
}
options[13] = opt13

#Option 14
opt14 = {
    'length': 1000,
    'reverse': False,
    'stopwords_domain_in_features': False,
    'document_features': document_features1,
    'stopwords_english': [],
    'stopwords_domain': [],
    'punctuation': punctuation,
    'numbers': True,
}
options[14] = opt14

#Option 15
opt15 = {
    'length': 1000,
    'reverse': False,
    'stopwords_domain_in_features': False,
    'document_features': document_features1,
    'stopwords_english': [],
    'stopwords_domain': stopwords_domain,
    'punctuation': [],
    'numbers': True,
}
options[15] = opt15

#Option 16
opt16 = {
    'length': 1000,
    'reverse': False,
    'stopwords_domain_in_features': False,
    'document_features': document_features1,
    'stopwords_english': [],
    'stopwords_domain': [],
    'punctuation': [],
    'numbers': True,
}
options[16] = opt16

#Option 17
opt17 = {
    'length': 1000,
    'reverse': False,
    'stopwords_domain_in_features': True,
    'document_features': document_features1,
    'stopwords_english': stopwords_english,
    'stopwords_domain': [],
    'punctuation': punctuation,
    'numbers': False,
}
options[17] = opt17

#Option 18
opt18 = {
    'length': 1000,
    'reverse': False,
    'stopwords_domain_in_features': True,
    'document_features': document_features1,
    'stopwords_english': stopwords_english,
    'stopwords_domain': [],
    'punctuation': [],
    'numbers': False,
}
options[18] = opt18

#Option 19
opt19 = {
    'length': 1000,
    'reverse': False,
    'stopwords_domain_in_features': True,
    'document_features': document_features1,
    'stopwords_english': [],
    'stopwords_domain': [],
    'punctuation': punctuation,
    'numbers': False,
}
options[19] = opt19

#Option 20
opt20 = {
    'length': 1000,
    'reverse': False,
    'stopwords_domain_in_features': True,
    'document_features': document_features1,
    'stopwords_english': [],
    'stopwords_domain': [],
    'punctuation': [],
    'numbers': False,
}
options[20] = opt20

#Option 21
opt21 = {
    'length': 1000,
    'reverse': False,
    'stopwords_domain_in_features': True,
    'document_features': document_features1,
    'stopwords_english': stopwords_english,
    'stopwords_domain': [],
    'punctuation': punctuation,
    'numbers': True,
}
options[21] = opt21

#Option 22
opt22 = {
    'length': 1000,
    'reverse': False,
    'stopwords_domain_in_features': True,
    'document_features': document_features1,
    'stopwords_english': stopwords_english,
    'stopwords_domain': [],
    'punctuation': [],
    'numbers': True,
}
options[22] = opt22

#Option 23
opt23 = {
    'length': 1000,
    'reverse': False,
    'stopwords_domain_in_features': True,
    'document_features': document_features1,
    'stopwords_english': [],
    'stopwords_domain': [],
    'punctuation': punctuation,
    'numbers': True,
}
options[23] = opt23

#Option 24
opt24 = {
    'length': 1000,
    'reverse': False,
    'stopwords_domain_in_features': True,
    'document_features': document_features1,
    'stopwords_english': [],
    'stopwords_domain': [],
    'punctuation': [],
    'numbers': True,
}
options[24] = opt24

In [24]:
def filt(x):
    c1 = x not in options[opt]['stopwords_english']
    c2 = x not in options[opt]['stopwords_domain']
    c3 = x not in options[opt]['punctuation']
    c4 = True if options[opt]['numbers'] else not x.isdigit()
    
    cs = [c1, c2, c3, c4]
    return all(cs)
 
def preprocess(text):
    text = text.lower()
    tokens = nltk.word_tokenize(text)
    filtered_words = filter(filt, tokens)
    return filtered_words

def print_option(opt):
    d = dict(opt)
    for k in d:
        if (type(d[k]) == list or type(d[k]) == str) and len(d[k]) > 0:
            d[k] = '[...]'
        elif hasattr(d[k], '__call__'):
            d[k] = d[k].func_name
    print(d)

In [25]:
best_option = None
best_accuracy = 0
for opt in options:
    print('=======================================================')
    print('RUNNING THE NaiveBayesClassifier WITH THE NEXT OPTIONS:')
    print_option(options[opt])
    print('\n')
    documents = [(preprocess(d[0]), d[1]) for d in original_documents]
    all_words = [w.lower() for d in original_documents for w in nltk.word_tokenize(d[0])]
    filtered_words = filter(filt, all_words)
    freq_dist = nltk.FreqDist(filtered_words)

    most_common_words = [word for word, freq in freq_dist.most_common()]
    most_common_words = filter(filt, most_common_words)
    if options[opt]['stopwords_domain_in_features']:
        most_common_words = list(set(most_common_words).difference(search_terms))
    if options[opt]['reverse']:
        most_common_words.reverse()
    word_features = most_common_words[:options[opt]['length']]
    print('----------------')
    print('Total features:')
    print(options[opt]['length'])
    print('Top 10 features:')
    print(word_features[:10])
    print('----------------')
    
    if options[opt]['document_features'] == document_features3:
        pos_words = set(all_pos_words).intersection(word_features)
        print('Total positive words:')
        print(len(pos_words))
        print('Top 10 positive words:')
        print list(pos_words)[:10]

        neg_words = set(all_neg_words).intersection(word_features)
        print('Total positive words:')
        print(len(neg_words))
        print('Top 10 positive words:')
        print list(neg_words)[:10]
    
    featureset = [(options[opt]['document_features'](d[0]), d[1]) for d in documents]
    size = int(len(featureset) * 0.9)
    train_set = featureset[:size]
    test_set = featureset[size:]
    
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    accuracy = nltk.classify.accuracy(classifier, test_set)
    print('****************')
    print('ACCURACY = {}'.format(accuracy))
    print('****************')
    classifier.show_most_informative_features(10)
    print('\n\n\n\n\n')
    
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_option = options[opt]

print('**************** BEST RESULT ****************')
print('BEST OPTION:')
print(best_option)
print('ACCURACY =', best_accuracy)
print('*********************************************')

RUNNING THE NaiveBayesClassifier WITH THE NEXT OPTIONS:
{'reverse': False, 'stopwords_domain_in_features': False, 'punctuation': '[...]', 'length': 1000, 'numbers': False, 'document_features': 'document_features1', 'stopwords_domain': '[...]', 'stopwords_english': '[...]'}


----------------
Total features:
1000
Top 10 features:
[u'executive', u'said', u'chief', u'apple', u'manager', u'company', u'mr.', u'new', u'founder', u'like']
----------------
****************
ACCURACY = 0.660401002506
****************
Most Informative Features
                 comment = True              neg : pos    =     10.8 : 1.0
                    debt = True              neg : pos    =      7.9 : 1.0
                 charges = True              neg : pos    =      7.3 : 1.0
                declined = True              neg : pos    =      6.5 : 1.0
                 related = True              neg : pos    =      6.4 : 1.0
                   space = True              pos : neg    =      6.2 : 1.0
           