In [1]:
#Import needed libraries and get data
%matplotlib inline

import pandas as pd
import numpy as np
import re
import nltk
import urllib
from matplotlib import pyplot as plt
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from nltk.tag.perceptron import PerceptronTagger

train = pd.read_table('train.tsv')
train["Length"] = train['Phrase'].apply(lambda x: len(x.split()))
ordered = train.sort(['SentenceId', 'Length'], ascending=[1, 0])
train_sentences = train.groupby('SentenceId').first().reset_index()


max_length = max(ordered['Length'])
bins = [0, 1, max_length/15, max_length/4, max_length/2, max_length]
group_names = ['SingleWord', 'SmallPhrase', 'Phrase', 'LongPhrase', 'Sentences']
categories = pd.cut(ordered['Length'], bins, labels=group_names)
ordered['categories'] = pd.cut(ordered['Length'], bins, labels=group_names)


sentences = ordered[ordered['categories'] == 'Sentences'].reset_index()
longphrase = ordered[ordered['categories'] == 'LongPhrase'].reset_index()
phrase = ordered[ordered['categories'] == 'Phrase'].reset_index()
smallphrase = ordered[ordered['categories'] == 'SmallPhrase'].reset_index()
singleword = ordered[ordered['categories'] == 'SingleWord'].reset_index()

tagger = PerceptronTagger()

singleword['POS'] = 'N/A'
print bins[::-1]
print len(sentences), len(longphrase), len(phrase), len(smallphrase), len(singleword)
ordered



[52, 26, 13, 3, 1, 0]
4193 20141 68215 46979 16531


Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment,Length,categories
0,1,1,A series of escapades demonstrating the adage ...,1,37,Sentences
27,28,1,"is also good for the gander , some of which oc...",2,23,LongPhrase
28,29,1,"is also good for the gander , some of which oc...",2,22,LongPhrase
31,32,1,"good for the gander , some of which occasional...",2,20,LongPhrase
32,33,1,"for the gander , some of which occasionally am...",2,19,LongPhrase
33,34,1,"the gander , some of which occasionally amuses...",1,18,LongPhrase
38,39,1,some of which occasionally amuses but none of ...,2,15,LongPhrase
1,2,1,A series of escapades demonstrating the adage ...,2,14,LongPhrase
5,6,1,of escapades demonstrating the adage that what...,2,12,Phrase
43,44,1,occasionally amuses but none of which amounts ...,2,12,Phrase


In [4]:
def review_to_words( raw_review ):
    letters_only = re.sub("[^a-zA-Z]", " ", raw_review) 
    word = letters_only.lower()
    #tagset = None
    #token = nltk.word_tokenize(word)
    #pos = nltk.tag._pos_tag(token, tagset, tagger)
    #return word, pos
    return word

In [5]:
num_reviews_single = singleword["Phrase"].size
clean_single_reviews = []

for i in xrange(0, num_reviews_single):
    cleaned = review_to_words(singleword['Phrase'][i])
    #cleaned, pos = review_to_words(singleword["Phrase"][i])
    #singleword['POS'][i] = str([x[1] for x in pos])
    clean_single_reviews.append(cleaned)
    print 'cleaning ', i, cleaned
print clean_single_reviews
singleword

cleaning  0 a
cleaning  1 series
cleaning  2 of
cleaning  3 escapades
cleaning  4 demonstrating
cleaning  5 the
cleaning  6 adage
cleaning  7 that
cleaning  8 what
cleaning  9 is
cleaning  10 good
cleaning  11 for
cleaning  12 goose
cleaning  13 also
cleaning  14 gander
cleaning  15  
cleaning  16 some
cleaning  17 which
cleaning  18 occasionally
cleaning  19 amuses
cleaning  20 but
cleaning  21 none
cleaning  22 amounts
cleaning  23 to
cleaning  24 much
cleaning  25 story
cleaning  26  
cleaning  27 this
cleaning  28 quiet
cleaning  29 introspective
cleaning  30 and
cleaning  31 entertaining
cleaning  32 independent
cleaning  33 worth
cleaning  34 seeking
cleaning  35 even
cleaning  36 fans
cleaning  37 ismail
cleaning  38 merchant
cleaning  39  s
cleaning  40 work
cleaning  41 i
cleaning  42 suspect
cleaning  43 would
cleaning  44 have
cleaning  45 hard
cleaning  46 time
cleaning  47 sitting
cleaning  48 through
cleaning  49 one
cleaning  50 positively
cleaning  51 thrilling
cleaning

Unnamed: 0,index,PhraseId,SentenceId,Phrase,Sentiment,Length,categories,POS
0,3,4,1,A,2,1,SingleWord,
1,4,5,1,series,2,1,SingleWord,
2,6,7,1,of,2,1,SingleWord,
3,8,9,1,escapades,2,1,SingleWord,
4,11,12,1,demonstrating,2,1,SingleWord,
5,13,14,1,the,2,1,SingleWord,
6,14,15,1,adage,2,1,SingleWord,
7,16,17,1,that,2,1,SingleWord,
8,18,19,1,what,2,1,SingleWord,
9,20,21,1,is,2,1,SingleWord,


In [30]:
#singleword.to_csv('singleword.csv', header=True, index=True)

In [6]:
singleword = pd.read_csv('singleword.csv')
singleword

Unnamed: 0.1,Unnamed: 0,index,PhraseId,SentenceId,Phrase,Sentiment,Length,categories,POS
0,0,3,4,1,A,2,1,SingleWord,['DT']
1,1,4,5,1,series,2,1,SingleWord,['NN']
2,2,6,7,1,of,2,1,SingleWord,['IN']
3,3,8,9,1,escapades,2,1,SingleWord,['NNS']
4,4,11,12,1,demonstrating,2,1,SingleWord,['VBG']
5,5,13,14,1,the,2,1,SingleWord,['DT']
6,6,14,15,1,adage,2,1,SingleWord,['NN']
7,7,16,17,1,that,2,1,SingleWord,['IN']
8,8,18,19,1,what,2,1,SingleWord,['WP']
9,9,20,21,1,is,2,1,SingleWord,['VBZ']


In [7]:
vectorizer = CountVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None,   \
                             max_features = 9000) 
single_data_features = vectorizer.fit_transform(clean_single_reviews)
single_data_features = single_data_features.toarray()

In [8]:
vocab = vectorizer.get_feature_names()
print len(vocab), vocab



In [8]:
# Initialize a Random Forest classifier with 20 trees
#forest_single = RandomForestClassifier(n_estimators = 20) 

# Fit the forest to the training set, using the bag of words as 
# features and the sentiment labels as the response variable
#
# This may take a few minutes to run
#forest_single = forest_single.fit(single_data_features, singleword["Sentiment"])
print 'done'

done


In [10]:
import cPickle

#with open('forest.cpickle', 'wb') as f:
#    cPickle.dump(forest_single, f)

In [11]:
with open('forest.cpickle', 'rb') as f:
    forest_single = cPickle.load(f)

In [2]:
test = pd.read_table('test.tsv')
test["Length"] = test['Phrase'].apply(lambda x: len(x.split()))
ordered = test.sort(['SentenceId', 'Length'], ascending=[1, 0])

max_length = max(ordered['Length'])
bins = [0, 1, max_length/15, max_length/4, max_length/2, max_length]
group_names = ['SingleWord', 'SmallPhrase', 'Phrase', 'LongPhrase', 'Sentences']
categories = pd.cut(ordered['Length'], bins, labels=group_names)
ordered['categories'] = pd.cut(ordered['Length'], bins, labels=group_names)


sentences_test = ordered[ordered['categories'] == 'Sentences'].reset_index()
longphrase_test = ordered[ordered['categories'] == 'LongPhrase'].reset_index()
phrase_test = ordered[ordered['categories'] == 'Phrase'].reset_index()
smallphrase_test = ordered[ordered['categories'] == 'SmallPhrase'].reset_index()
singleword_test = ordered[ordered['categories'] == 'SingleWord'].reset_index()
singleword_test['POS'] = 'N/A'

num_reviews_single = singleword_test["Phrase"].size
clean_single_reviews = []

  app.launch_new_instance()


In [22]:
# for i in xrange(0, num_reviews_single):
#     cleaned, pos = review_to_words(singleword_test["Phrase"][i])
#     singleword_test['POS'][i] = str([x[1] for x in pos])
#     singleword_test['Phrase'][i] = cleaned
#     clean_single_reviews.append(cleaned)
#     print 'cleaning ', i, cleaned

#singleword_test.to_csv('singleword_test.csv', header=True, index=True)

In [2]:
singleword_test = pd.read_csv('singleword_test.csv')
singleword_test

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,index,PhraseId,SentenceId,Phrase,Length,categories,POS,Sentiment
0,0,0,0,2,156063,8545,an,1,SingleWord,['DT'],3
1,1,1,1,7,156068,8545,intermittently,1,SingleWord,['RB'],2
2,2,2,2,8,156069,8545,pleasing,1,SingleWord,['VBG'],2
3,3,3,3,9,156070,8545,but,1,SingleWord,['CC'],2
4,4,4,4,11,156072,8545,mostly,1,SingleWord,['RB'],3
5,5,5,5,12,156073,8545,routine,1,SingleWord,['NN'],2
6,6,6,6,13,156074,8545,effort,1,SingleWord,['NN'],2
7,7,7,7,14,156075,8545,,1,SingleWord,[],2
8,8,8,8,16,156077,8546,kidman,1,SingleWord,['NN'],2
9,9,9,9,20,156081,8546,is,1,SingleWord,['VBZ'],2


In [15]:
print "Applying tree to single words...\n" 
single_data_features = vectorizer.fit_transform(clean_single_reviews)
single_data_features = single_data_features.toarray()
result_single = forest_single.predict(single_data_features)

Applying tree to single words...



ValueError: empty vocabulary; perhaps the documents only contain stop words

In [14]:
singleword_test['Sentiment'] = result_single.tolist()
singleword_test

NameError: name 'result_single' is not defined

In [17]:
print singleword.PhraseId.values 

[     4      5      7 ..., 156047 156059 156060]


In [3]:
def basic_clean( raw_review ):
    letters_only = re.sub("[^a-zA-Z]", " ", raw_review) 
    words = letters_only.lower().split()
    return words

test_sentences = ordered.groupby('SentenceId').first().reset_index()

remaining = ordered[~ordered['PhraseId'].isin(test_sentences['PhraseId'])].reset_index()
test_phrases = remaining[remaining.categories != 'SingleWord'].reset_index()

print len(remaining), len(test_phrases), len(ordered), len(test_sentences), max(ordered.SentenceId)
test_phrases['Sentiment'] = '2'

num_reviews_phrases = test_phrases["Phrase"].size
clean_phrases_reviews = []

#for i in xrange(0, num_reviews_phrases):
#    print 'cleaning ', i, 'of ', num_reviews_phrases
#    test_phrases['Phrase'][i] = basic_clean(test_phrases['Phrase'][i])
test_phrases

147531 131001 156060 8529 8544


Unnamed: 0,level_0,index,PhraseId,SentenceId,Phrase,Sentiment,Length,categories
0,0,27,28,1,"is also good for the gander , some of which oc...",2,23,LongPhrase
1,1,28,29,1,"is also good for the gander , some of which oc...",2,22,LongPhrase
2,2,31,32,1,"good for the gander , some of which occasional...",2,20,LongPhrase
3,3,32,33,1,"for the gander , some of which occasionally am...",2,19,LongPhrase
4,4,33,34,1,"the gander , some of which occasionally amuses...",2,18,LongPhrase
5,5,38,39,1,some of which occasionally amuses but none of ...,2,15,LongPhrase
6,6,1,2,1,A series of escapades demonstrating the adage ...,2,14,LongPhrase
7,7,5,6,1,of escapades demonstrating the adage that what...,2,12,Phrase
8,8,43,44,1,occasionally amuses but none of which amounts ...,2,12,Phrase
9,9,7,8,1,escapades demonstrating the adage that what is...,2,11,Phrase


In [62]:
#test_phrases.to_csv('test_phrases.csv', header=True, index=True)
#test_phrases = pd.read_csv('test_phrases.csv')
#test_phrases

Unnamed: 0.1,Unnamed: 0,level_0,index,PhraseId,SentenceId,Phrase,Length,categories,Sentiment
0,0,0,1,156062,8545,"['an', 'intermittently', 'pleasing', 'but', 'm...",7,Phrase,2
1,1,1,3,156064,8545,"['intermittently', 'pleasing', 'but', 'mostly'...",6,Phrase,2
2,2,2,4,156065,8545,"['intermittently', 'pleasing', 'but', 'mostly'...",5,Phrase,2
3,3,3,5,156066,8545,"['intermittently', 'pleasing', 'but']",3,SmallPhrase,2
4,4,4,6,156067,8545,"['intermittently', 'pleasing']",2,SmallPhrase,2
5,5,5,10,156071,8545,"['mostly', 'routine']",2,SmallPhrase,2
6,6,14,17,156078,8546,"['is', 'really', 'the', 'only', 'thing', 'that...",42,Sentences,2
7,7,15,18,156079,8546,"['is', 'really', 'the', 'only', 'thing', 'that...",41,Sentences,2
8,8,16,22,156083,8546,"['the', 'only', 'thing', 'that', 's', 'worth',...",39,Sentences,2
9,9,17,28,156089,8546,"['that', 's', 'worth', 'watching', 'in', 'birt...",36,Sentences,2


In [3]:
import csv
all_sentiments = []
phrases_size = test_phrases['Phrase'].size

negations = ['however','but','although','not','no','neither','never','noone','nobody','none','nor','nothing','nowhere','hardly','scarcely','isn','wasn','didn']

#test_phrases["Phrase"] = test_phrases['Phrase'].apply(lambda x: )

new_file = open('all_sentiments.txt', 'w')
test_phrases

for i in xrange(0,phrases_size):
    print 'index ',i,' of',phrases_size
    raw_phrase = test_phrases['Phrase'][i]
    phrase = basic_clean(raw_phrase)
    sentiments = []
    for j in phrase:
        word = singleword_test[singleword_test.Phrase == j]
        pos_str = str(word.POS).partition('[')[-1].rpartition(']')[0]
        try:
            sentiments.append([int(word.Sentiment),'NEGATE']) if j in negations else sentiments.append([int(word.Sentiment),pos_str])
        except:
            pass
    new_file.write("%s\n" % sentiments)
    all_sentiments.append(sentiments)
new_file.close()
all_sentiments

NameError: name 'test_phrases' is not defined

In [3]:
#test_phrases['SentPOS'] = all_sentiments
#test_phrases.to_csv('test_phrases.csv', header=True, index=True)
test_phrases = pd.read_csv('test_phrases.csv')
test_phrases

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,level_0,index,PhraseId,SentenceId,Phrase,Length,categories,Sentiment,SentPOS
0,0,0,0,1,156062,8545,"['an', 'intermittently', 'pleasing', 'but', 'm...",7,Phrase,2,"[[3, ""'DT'""], [2, ""'RB'""], [2, ""'VBG'""], [2, '..."
1,1,1,1,3,156064,8545,"['intermittently', 'pleasing', 'but', 'mostly'...",6,Phrase,2,"[[2, ""'RB'""], [2, ""'VBG'""], [2, 'NEGATE'], [3,..."
2,2,2,2,4,156065,8545,"['intermittently', 'pleasing', 'but', 'mostly'...",5,Phrase,2,"[[2, ""'RB'""], [2, ""'VBG'""], [2, 'NEGATE'], [3,..."
3,3,3,3,5,156066,8545,"['intermittently', 'pleasing', 'but']",3,SmallPhrase,2,"[[2, ""'RB'""], [2, ""'VBG'""], [2, 'NEGATE']]"
4,4,4,4,6,156067,8545,"['intermittently', 'pleasing']",2,SmallPhrase,2,"[[2, ""'RB'""], [2, ""'VBG'""]]"
5,5,5,5,10,156071,8545,"['mostly', 'routine']",2,SmallPhrase,2,"[[3, ""'RB'""], [2, ""'NN'""]]"
6,6,6,14,17,156078,8546,"['is', 'really', 'the', 'only', 'thing', 'that...",42,Sentences,2,"[[2, ""'VBZ'""], [2, ""'RB'""], [2, ""'DT'""], [2, ""..."
7,7,7,15,18,156079,8546,"['is', 'really', 'the', 'only', 'thing', 'that...",41,Sentences,2,"[[2, ""'VBZ'""], [2, ""'RB'""], [2, ""'DT'""], [2, ""..."
8,8,8,16,22,156083,8546,"['the', 'only', 'thing', 'that', 's', 'worth',...",39,Sentences,2,"[[2, ""'DT'""], [2, ""'RB'""], [2, ""'NN'""], [2, ""'..."
9,9,9,17,28,156089,8546,"['that', 's', 'worth', 'watching', 'in', 'birt...",36,Sentences,2,"[[2, ""'IN'""], [2, ""'NN'""], [3, ""'NN'""], [2, ""'..."


In [7]:
import re, string

test_phrases['SentPOS'].tolist()
pattern = re.compile('[\W_]+')
length = test_phrases['SentPOS'].size

predicted_sent = []

for i in xrange(0,length/3):
    print i
    raw_order = pattern.sub(' ', test_phrases['SentPOS'][i])
    order = raw_order.replace('\[\[\]\]', '').split()
    str_to_int = [int(i) if i.isdigit() else i for i in order]
    test = [tuple(str_to_int[i:i+2]) for i in range(0, len(str_to_int),2)]
    overall_sent = 0
    reverse = {0:4,1:3,2:2,3:1,4:0}
    rev = False
    affected = ['RB','RBR','RBS','JJS','JJR','JJ']
    for pair in test:
        try:
            init_val = pair[0]
            #print len(pair),init_val,'--------------'
            if pair[1] == 'NEGATE':
                print 'reversing'
                val = reverse[pair[0]]
                rev = not rev
            else:
                if rev:
                    if pair[1] in affected:
                        print 'affected'
                        val = reverse[init_val]
                else:
                    val = init_val
            overall_sent += val
            sentiment = int(round(overall_sent/float(len(test))))
        except: 
            pass
    test_phrases['Sentiment'][i] = sentiment
    if sentiment != 2:
        print sentiment
    predicted_sent.append(sentiment)
test_phrase
                    
            

35311
reversing
affected
affected
affected
reversing
35312
reversing
affected
affected
affected
reversing
35313
reversing
affected
affected
35314
reversing
affected
affected
35315
reversing
affected
affected
35316
reversing
affected
affected
35317
reversing
affected
affected
35318
reversing
affected
affected
35319
reversing
affected
affected
35320
35321
35322
reversing
affected
affected
35323
35324
35325
35326
35327
3
35328
35329
3
35330
35331
35332
35333
3
35334
35335
35336
35337
3
35338
reversing
affected
affected
affected
35339
reversing
affected
affected
affected
35340
reversing
affected
affected
affected
35341
reversing
affected
affected
affected
35342
35343
reversing
affected
35344
35345
reversing
affected
35346
35347
35348
35349
35350
35351
35352
35353
35354
3
35355
3
35356
35357
35358
35359
35360
3
35361
35362
35363
35364
3
35365
35366
3
35367
3
35368
35369
35370
3
35371
35372
3
35373
35374
35375
35376
35377
35378
35379
35380
35381
reversing
35382
reversing
35383
35384
35385
35

NameError: name 'test_phrase' is not defined

In [9]:
not_two = test_phrases[test_phrases.Sentiment != 2]
not_two

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,level_0,index,PhraseId,SentenceId,Phrase,Length,categories,Sentiment,SentPOS
