Preparing Data
==============

In [1]:
import json
import cPickle as pickle
import numpy as np
import pandas as pd
from pandas import factorize

In [2]:
!ls bossa/*json

bossa/control_tasks.json       bossa/tasks_export.json
bossa/control_tasks_runs.json  bossa/tasks_runs_export.json


BOSSA Results
-------------

Processing `results_bossa.json` to get a *dictionary* with keys the task ids, and values in as the average value of the scores. To do that, we first convert scores from categorical (`neg`, `neu`, `pos`) to a numeric scale.

In [3]:
bossa_results = pd.read_json("bossa/control_tasks_runs.json")
bossa_results.rename(columns={"created": "start_time", "id": "result_id", "info": "score"}, inplace=True)
bossa_results[['start_time']]= bossa_results[['start_time']].apply(pd.to_datetime, dayfirst=True)
bossa_results[['finish_time']]= bossa_results[['finish_time']].apply(pd.to_datetime, dayfirst=True)
bossa_results['score'] = pd.Categorical(bossa_results['score'], categories=['vneg', 'neg', 'neu', 'pos', 'vpos'])
bossa_results['score'].cat.rename_categories([-2, -1, 0, 1, 2], inplace=True)
# Normalize everything to -1, 0, 1
# bossa_results['score'] = bossa_results['score'].astype(float).apply(lambda x: -1 if x < 0 else 1 if x > 0 else 0)
bossa_results["seconds"] = (bossa_results["finish_time"] - bossa_results["start_time"]).astype('timedelta64[us]') / 1e6
bossa_results = bossa_results[["result_id", "seconds", "task_id", "score"]]
bossa_results.ix[[50]]

Unnamed: 0,result_id,seconds,task_id,score
50,60199,2.1e-05,61882,1


The information about the sentence comes in a dictionary inside the cells of the serie `info`, so we expand it.

In [4]:
bossa_tasks = pd.read_json("bossa/control_tasks.json")
bossa_tasks[['created']]= bossa_tasks[['created']].apply(pd.to_datetime, dayfirst=True)
bossa_tasks.rename(columns={'id': 'task_id'}, inplace=True)
bossa_tasks = bossa_tasks[['task_id', 'info']]
bossa_tasks.ix[[50]]

Unnamed: 0,task_id,info
50,61838,"{u'id': u'76', u'sentence': u'The house, which..."


And finally we merge the `DataFrame` with the scores with the one containing the sentences.

In [5]:
bossa_tasks_scores = pd.merge(bossa_results, bossa_tasks, on='task_id')
bossa_tasks_scores.ix[[50]]

Unnamed: 0,result_id,seconds,task_id,score,info
50,60496,2e-05,61779,-1,"{u'id': u'17', u'sentence': u'The veteran, Eri..."


Let's now expand the column `info` into as many new columns as keys has the dictionary `info`.

In [6]:
bossa_tasks_scores.ix[50].info.keys()

[u'id', u'sentence']

In [7]:
def json_to_series(info):
    keys, values = zip(*info.iteritems())
    return pd.Series(values, index=keys)

bossa_info = bossa_tasks_scores["info"].apply(json_to_series)
bossa_info.reset_index()
bossa = pd.concat([bossa_tasks_scores, bossa_info], axis=1)
bossa.pop("info")
# bossa['id'] = bossa['id'].astype(float)
bossa.ix[50:53]

Unnamed: 0,result_id,seconds,task_id,score,id,sentence
50,60496,2e-05,61779,-1,17,"The veteran, Eric Harroun, 31, of Phoenix, had..."
51,60097,2.7e-05,61780,1,18,The Mets' four-game sweep of the Yankees was b...
52,60297,2.2e-05,61780,1,18,The Mets' four-game sweep of the Yankees was b...
53,60497,2.3e-05,61780,0,18,The Mets' four-game sweep of the Yankees was b...


Aggregate
---------

We now aggregate calculating the average per `sentence_id` using a group by. In the process, we lose the source of the data, that's why we first have to save it.

In [8]:
bossa.to_csv("sentiment/control_scores_ungrouped.csv", encoding="utf8")

Finally, we aggregate and create a new `DataFrame` for the different sentences and their score.

In [11]:
sentences = bossa.groupby(['sentence'])[['score']].aggregate(np.average)
sentences.to_csv("sentiment/control_scores.csv", encoding="utf8")
print sentences.count()
sentences[51:54]

score    199
dtype: int64


Unnamed: 0_level_0,score
sentence,Unnamed: 1_level_1
Dark floors and sunken rooms on the first floor could be challenging for small children and visitors.,-0.666667
"Dozens of cars were scorched, and a ghostly layer of ash coated the shelves of a supermarket",-1.0
"Ecuador's government expressed ""outrage and dismay"" and warned the rebels, believed to be members of the Revolutionary Armed Forces of Colombia, or FARC, that it would be ""relentless"" in the defense of its territory.",0.333333


In [10]:
from collections import Counter

def majority(series): #receives a Pandas Series
    return Counter(map(lambda x: 1 if x > 0 else -1 if x < 0 else 0, series)).most_common(1)[0][0]

# score_calculate = np.average
score_calculate = majority

sentences = bossa.groupby(['sentence'])[['score']].aggregate(score_calculate)

Sentence Classifier
-------------------

In [11]:
from nltk.corpus import stopwords
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import precision_recall_fscore_support
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

Create the tranining and testing sets (data and labels) from a randomized version of the set of assessed sentences.

In [12]:
sentences.reset_index().count()

sentence    8996
score       8996
dtype: int64

We could consider 3 classes, but it toruns out that using binary classficication seems to produce better results. Still, try multi-classs classifiers is something worth trying.

In [13]:
raw_scores = sentences.reset_index()
scores = raw_scores
scores = scores[scores.score!=0]  # We ignore the neutral sentences
scores['sentiment'] = scores['score'].apply(lambda s: 'pos' if s > 0 else 'neg')
# percentage = 0.85  #  percentage for training, rest for for testing
# # We split to have enough representativenesss for both positive and negative sentiments
# sent_min = min(
#     scores[scores.sentiment=='pos'].sentiment.count(),
#     scores[scores.sentiment=='neg'].sentiment.count(),
# )
# scores = scores[["sentence", "sentiment"]]
# train_data = np.array([])
# train_labels = np.array([])
# test_data = np.array([])
# test_labels = np.array([])
# for sent in ('pos', 'neg'):
#     sent_scores = scores[scores['sentiment']==sent]
#     sent_scores = sent_scores.reindex(np.random.permutation(sent_scores.index))
#     sent_sentences_count = int(sent_scores['sentence'].count())
#     sent_train = sent_scores[["sentence", "sentiment"]][:int(sent_sentences_count * percentage)]
#     sent_test = sent_scores[["sentence", "sentiment"]][int(sent_sentences_count * percentage) + 1:]
#     print sent, sent_min, sent_train.sentiment.count(), sent_test.sentiment.count()
#     train_data = np.append(train_data, sent_train["sentence"])
#     train_labels = np.append(train_labels, sent_train["sentiment"])
#     test_data = np.append(test_data, sent_test["sentence"])
#     test_labels = np.append(test_labels, sent_test["sentiment"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


#NLTK

In [14]:
document_df = scores[['sentence', 'sentiment']]
document_df = document_df.reindex(np.random.permutation(document_df.index))

In [15]:
document_df.head()

Unnamed: 0,sentence,sentiment
6881,Ten miles north of the Robert Mondavi Winery i...,pos
6936,That re-created surface filled with objects in...,pos
5593,Mr. Wheeler served from 1992 to 2004 as the ch...,pos
5248,"Mitch Nash, a Lenox, Mass., based art director...",pos
1599,"Alcoa, based in Pittsburgh, released its earni...",neg


In [16]:
original_documents = [(r[1]['sentence'], r[1]['sentiment']) for r in document_df.iterrows()]

In [17]:
pos_file = open('positive-words.txt')
all_pos_words = [w.strip().lower() for w in pos_file]

neg_file = open('negative-words.txt')
all_neg_words = [w.strip().lower() for w in neg_file]

In [18]:
search_terms_f = open('search_terms.txt')
search_terms = []
for t in search_terms_f:
    l = t.strip().split()
    search_terms.extend(l)

In [19]:
import string

import nltk
from nltk.corpus import stopwords

###Options

In [20]:
#stopwords_english
stopwords_english = stopwords.words('english')
#stopwords_domain
stopwords_domain = search_terms + ["'s", '--']
#stopwords_english
punctuation = string.punctuation

In [21]:
def document_features1(document):      
    document_words = set(document)
    features = {}
    for word in word_features:
        features[word] = word in document_words
    return features

In [22]:
def document_features2(document):      
    document_words = list(document)
    features = {}
    for word in document_words:
        if word in word_features:
            features[word] = features.get(word, 0) + 1
    return features

In [23]:
def document_features3(document):
    word_features3 = list(pos_words) + list(neg_words)
    document_words = set(document)
    features = {}
    for word in word_features3:
        features[word] = word in document_words
    return features

In [24]:
# Filter stopwords, punctuation marks, and numbers
# 'stopwords_english': stopwords_english,
# 'stopwords_domain': stopwords_domain,
# 'punctuation': punctuation,
# 'numbers': True,
#
# Do not filter stopwords, punctuation marks, or numbers
# 'stopwords_english': [],
# 'stopwords_domain': [],
# 'punctuation': [],
# 'numbers': False,

options = {}

#Option 1
opt1 = {
    'length': 1000,
    'reverse': False,
    'stopwords_domain_in_features': False,
    'document_features': document_features1,
    'stopwords_english': stopwords_english,
    'stopwords_domain': stopwords_domain,
    'punctuation': punctuation,
    'numbers': True,
}
options[1] = opt1

#Option 2
opt2 = {
    'length': 1000,
    'reverse': False,
    'stopwords_domain_in_features': False,
    'document_features': document_features1,
    'stopwords_english': stopwords_english,
    'stopwords_domain': stopwords_domain,
    'punctuation': [],
    'numbers': True,
}
options[2] = opt2

#Option 3
opt3 = {
    'length': 1000,
    'reverse': False,
    'stopwords_domain_in_features': False,
    'document_features': document_features1,
    'stopwords_english': stopwords_english,
    'stopwords_domain': [],
    'punctuation': punctuation,
    'numbers': True,
}
options[3] = opt3

#Option 4
opt4 = {
    'length': 1000,
    'reverse': False,
    'stopwords_domain_in_features': False,
    'document_features': document_features1,
    'stopwords_english': [],
    'stopwords_domain': stopwords_domain,
    'punctuation': punctuation,
    'numbers': True,
}
options[4] = opt4

#Option 5
opt5 = {
    'length': 1000,
    'reverse': False,
    'stopwords_domain_in_features': False,
    'document_features': document_features1,
    'stopwords_english': stopwords_english,
    'stopwords_domain': [],
    'punctuation': [],
    'numbers': True,
}
options[5] = opt5

#Option 6
opt6 = {
    'length': 1000,
    'reverse': False,
    'stopwords_domain_in_features': False,
    'document_features': document_features1,
    'stopwords_english': [],
    'stopwords_domain': [],
    'punctuation': punctuation,
    'numbers': True,
}
options[6] = opt6

#Option 7
opt7 = {
    'length': 1000,
    'reverse': False,
    'stopwords_domain_in_features': False,
    'document_features': document_features1,
    'stopwords_english': [],
    'stopwords_domain': stopwords_domain,
    'punctuation': [],
    'numbers': True,
}
options[7] = opt7

#Option 8
opt8 = {
    'length': 1000,
    'reverse': False,
    'stopwords_domain_in_features': False,
    'document_features': document_features1,
    'stopwords_english': [],
    'stopwords_domain': [],
    'punctuation': [],
    'numbers': True,
}
options[8] = opt8

#Option 9
opt9 = {
    'length': 1000,
    'reverse': False,
    'stopwords_domain_in_features': False,
    'document_features': document_features1,
    'stopwords_english': stopwords_english,
    'stopwords_domain': stopwords_domain,
    'punctuation': punctuation,
    'numbers': False,
}
options[9] = opt9

#Option 10
opt10 = {
    'length': 1000,
    'reverse': False,
    'stopwords_domain_in_features': False,
    'document_features': document_features1,
    'stopwords_english': stopwords_english,
    'stopwords_domain': stopwords_domain,
    'punctuation': [],
    'numbers': False,
}
options[10] = opt10

#Option 11
opt11 = {
    'length': 1000,
    'reverse': False,
    'stopwords_domain_in_features': False,
    'document_features': document_features1,
    'stopwords_english': stopwords_english,
    'stopwords_domain': [],
    'punctuation': punctuation,
    'numbers': False,
}
options[11] = opt11

#Option 12
opt12 = {
    'length': 1000,
    'reverse': False,
    'stopwords_domain_in_features': False,
    'document_features': document_features1,
    'stopwords_english': [],
    'stopwords_domain': stopwords_domain,
    'punctuation': punctuation,
    'numbers': False,
}
options[12] = opt12

#Option 13
opt13 = {
    'length': 1000,
    'reverse': False,
    'stopwords_domain_in_features': False,
    'document_features': document_features1,
    'stopwords_english': stopwords_english,
    'stopwords_domain': [],
    'punctuation': [],
    'numbers': False,
}
options[13] = opt13

#Option 14
opt14 = {
    'length': 1000,
    'reverse': False,
    'stopwords_domain_in_features': False,
    'document_features': document_features1,
    'stopwords_english': [],
    'stopwords_domain': [],
    'punctuation': punctuation,
    'numbers': False,
}
options[14] = opt14

#Option 15
opt15 = {
    'length': 1000,
    'reverse': False,
    'stopwords_domain_in_features': False,
    'document_features': document_features1,
    'stopwords_english': [],
    'stopwords_domain': stopwords_domain,
    'punctuation': [],
    'numbers': False,
}
options[15] = opt15

#Option 16
opt16 = {
    'length': 1000,
    'reverse': False,
    'stopwords_domain_in_features': False,
    'document_features': document_features1,
    'stopwords_english': [],
    'stopwords_domain': [],
    'punctuation': [],
    'numbers': False,
}
options[16] = opt16

#Option 17
opt17 = {
    'length': 1000,
    'reverse': False,
    'stopwords_domain_in_features': True,
    'document_features': document_features1,
    'stopwords_english': stopwords_english,
    'stopwords_domain': [],
    'punctuation': punctuation,
    'numbers': True,
}
options[17] = opt17

#Option 18
opt18 = {
    'length': 1000,
    'reverse': False,
    'stopwords_domain_in_features': True,
    'document_features': document_features1,
    'stopwords_english': stopwords_english,
    'stopwords_domain': [],
    'punctuation': [],
    'numbers': True,
}
options[18] = opt18

#Option 19
opt19 = {
    'length': 1000,
    'reverse': False,
    'stopwords_domain_in_features': True,
    'document_features': document_features1,
    'stopwords_english': [],
    'stopwords_domain': [],
    'punctuation': punctuation,
    'numbers': True,
}
options[19] = opt19

#Option 20
opt20 = {
    'length': 1000,
    'reverse': False,
    'stopwords_domain_in_features': True,
    'document_features': document_features1,
    'stopwords_english': [],
    'stopwords_domain': [],
    'punctuation': [],
    'numbers': True,
}
options[20] = opt20

#Option 21
opt21 = {
    'length': 1000,
    'reverse': False,
    'stopwords_domain_in_features': True,
    'document_features': document_features1,
    'stopwords_english': stopwords_english,
    'stopwords_domain': [],
    'punctuation': punctuation,
    'numbers': False,
}
options[21] = opt21

#Option 22
opt22 = {
    'length': 1000,
    'reverse': False,
    'stopwords_domain_in_features': True,
    'document_features': document_features1,
    'stopwords_english': stopwords_english,
    'stopwords_domain': [],
    'punctuation': [],
    'numbers': False,
}
options[22] = opt22

#Option 23
opt23 = {
    'length': 1000,
    'reverse': False,
    'stopwords_domain_in_features': True,
    'document_features': document_features1,
    'stopwords_english': [],
    'stopwords_domain': [],
    'punctuation': punctuation,
    'numbers': False,
}
options[23] = opt23

#Option 24
opt24 = {
    'length': 1000,
    'reverse': False,
    'stopwords_domain_in_features': True,
    'document_features': document_features1,
    'stopwords_english': [],
    'stopwords_domain': [],
    'punctuation': [],
    'numbers': False,
}
options[24] = opt24

################################################################################################

#Option 25
opt25 = {
    'length': 1000,
    'reverse': False,
    'stopwords_domain_in_features': False,
    'document_features': document_features2,
    'stopwords_english': stopwords_english,
    'stopwords_domain': stopwords_domain,
    'punctuation': punctuation,
    'numbers': True,
}
options[25] = opt25

#Option 26
opt26 = {
    'length': 1000,
    'reverse': False,
    'stopwords_domain_in_features': False,
    'document_features': document_features3,
    'stopwords_english': stopwords_english,
    'stopwords_domain': stopwords_domain,
    'punctuation': punctuation,
    'numbers': True,
}
options[26] = opt26

#Option 27
opt27 = {
    'length': 1000,
    'reverse': False,
    'stopwords_domain_in_features': False,
    'document_features': document_features2,
    'stopwords_english': [],
    'stopwords_domain': [],
    'punctuation': [],
    'numbers': False,
}
options[27] = opt27

#Option 28
opt28 = {
    'length': 1000,
    'reverse': False,
    'stopwords_domain_in_features': False,
    'document_features': document_features3,
    'stopwords_english': [],
    'stopwords_domain': [],
    'punctuation': [],
    'numbers': False,
}
options[28] = opt28

#Option 29
opt29 = {
    'length': 1000,
    'reverse': False,
    'stopwords_domain_in_features': True,
    'document_features': document_features2,
    'stopwords_english': [],
    'stopwords_domain': [],
    'punctuation': [],
    'numbers': False,
}
options[29] = opt29

#Option 30
opt30 = {
    'length': 1000,
    'reverse': False,
    'stopwords_domain_in_features': True,
    'document_features': document_features3,
    'stopwords_english': [],
    'stopwords_domain': [],
    'punctuation': [],
    'numbers': False,
}
options[30] = opt30

In [25]:
def filt(x):
    c1 = x not in options[opt]['stopwords_english']
    c2 = x not in options[opt]['stopwords_domain']
    c3 = x not in options[opt]['punctuation']
    c4 = True if options[opt]['numbers'] else not x.isdigit()
    
    cs = [c1, c2, c3, c4]
    return all(cs)
 
def preprocess(text):
    text = text.lower()
    tokens = nltk.word_tokenize(text)
    filtered_words = filter(filt, tokens)
    return filtered_words

def print_option(opt):
    d = dict(opt)
    for k in d:
        if (type(d[k]) == list or type(d[k]) == str) and len(d[k]) > 0:
            d[k] = ['...']
        elif hasattr(d[k], '__call__'):
            d[k] = d[k].func_name
    print(d)

In [26]:
#%%prun -l4
from collections import defaultdict
from random import shuffle

best_option = None
best_accuracy = 0
best_index = 0
d_accuracies = defaultdict(list)

all_words = [w.lower() for d in original_documents for w in nltk.word_tokenize(d[0])]

for i in range(10):
    shuffle(original_documents)
    for opt in options:
        print('=======================================================')
        print('RUNNING THE NaiveBayesClassifier WITH THE NEXT OPTIONS:')
        print('\nOption {}'.format(opt))
        print_option(options[opt])
        print(score_calculate.func_name)
        print('\niteration: {}\n'.format(i))
        
        documents = [(preprocess(d[0]), d[1]) for d in original_documents]
        filtered_words = filter(filt, all_words)
        freq_dist = nltk.FreqDist(filtered_words)
        most_common_words = [word for word, freq in freq_dist.most_common()]
        most_common_words = filter(filt, most_common_words)

        if options[opt]['stopwords_domain_in_features']:
            most_common_words = list(set(most_common_words).difference(search_terms))
        if options[opt]['reverse']:
            most_common_words.reverse()
        
        word_features = most_common_words[:options[opt]['length']]
        print('----------------')
        print('Total features:')
        print(options[opt]['length'])
        print('Top 10 features:')
        print(word_features[:10])
        print('----------------')

        if options[opt]['document_features'] == document_features3:
            pos_words = set(all_pos_words).intersection(word_features)
            print('Total positive words:')
            print(len(pos_words))
            print('Top 10 positive words:')
            print list(pos_words)[:10]

            neg_words = set(all_neg_words).intersection(word_features)
            print('Total positive words:')
            print(len(neg_words))
            print('Top 10 positive words:')
            print list(neg_words)[:10]

        featureset = [(options[opt]['document_features'](d[0]), d[1]) for d in documents]
        size = int(len(featureset) * 0.9)
        train_set = featureset[:size]
        test_set = featureset[size:]

        classifier = nltk.NaiveBayesClassifier.train(train_set)
        accuracy = nltk.classify.accuracy(classifier, test_set)
        print('****************')
        print('ACCURACY = {}'.format(accuracy))
        print('****************')
        classifier.show_most_informative_features(10)
        print('\n\n\n\n\n')
        d_accuracies[opt].append(accuracy)

        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_option = options[opt]
            best_index = opt

print('**************** BEST RESULT ****************')
print('BEST OPTION: {}'.format(best_index))
print(best_option)
print('ACCURACY =', best_accuracy)
print('*********************************************')

RUNNING THE NaiveBayesClassifier WITH THE NEXT OPTIONS:

Option 1
{'reverse': False, 'stopwords_domain_in_features': False, 'punctuation': ['...'], 'length': 1000, 'numbers': True, 'document_features': 'document_features1', 'stopwords_domain': ['...'], 'stopwords_english': ['...']}
majority

iteration: 0

----------------
Total features:
1000
Top 10 features:
[u'said', u'chief', u'mr.', u'like', u'would', u'director', u'one', u'year', u'also', u'last']
----------------
****************
ACCURACY = 0.702579666161
****************
Most Informative Features
                 accused = True              neg : pos    =     25.6 : 1.0
                 comment = True              neg : pos    =      8.4 : 1.0
                   force = True              neg : pos    =      8.1 : 1.0
                declined = True              neg : pos    =      7.7 : 1.0
                   built = True              pos : neg    =      7.3 : 1.0
                    rose = True              pos : neg    =      

In [29]:
np.array(d_accuracies[12]).mean()

0.70136570561456746

In [30]:
np.array(d_accuracies[12]).std()

0.010768555386470502

In [37]:
np.array(d_accuracies[16]).mean()

0.69317147192716244

In [35]:
np.array(d_accuracies[16]).std()

0.020311178005161891

In [41]:
for k in options:
    print('Option {}'.format(k))
    print(np.array(d_accuracies[k]).mean())
    print(np.array(d_accuracies[k]).std())
    print('--------')

Option 1
0.691350531108
0.0168320784508
--------
Option 2
0.689226100152
0.0150801314098
--------
Option 3
0.686494688923
0.0175027263797
--------
Option 4
0.701517450683
0.0122161848635
--------
Option 5
0.685584218513
0.0158977855584
--------
Option 6
0.691957511381
0.0205948862851
--------
Option 7
0.699544764795
0.0150219953515
--------
Option 8
0.691198786039
0.0191012024305
--------
Option 9
0.691957511381
0.0184730534671
--------
Option 10
0.690895295903
0.016273553167
--------
Option 11
0.686494688923
0.0149943785457
--------
Option 12
0.701365705615
0.0107685553865
--------
Option 13
0.686949924127
0.0186350110829
--------
Option 14
0.692564491654
0.0177249672236
--------
Option 15
0.698786039454
0.013005003861
--------
Option 16
0.693171471927
0.0203111780052
--------
Option 17
0.658573596358
0.021265976378
--------
Option 18
0.658573596358
0.0216735308909
--------
Option 19
0.658877086495
0.0185948087187
--------
Option 20
0.658877086495
0.0189140351452
--------
Option 21
0.

In [None]:
import cPickle as pickle

with open("sentiment/nltk_NaiveBayesClassifier_majority.pickle", "wb") as nltk_file:
    pickle.dump(???, nltk_file)