In [16]:
import re
import urllib
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import nltk
from nltk.corpus import stopwords # Import the stop word list

def review_to_words( raw_review ):
    letters_only = re.sub("[^a-zA-Z]", " ", raw_review) 
    words = letters_only.lower().split()                             
    stops = set(stopwords.words("english"))                  
    meaningful_words = [w for w in words if not w in stops]   
    return( " ".join( meaningful_words ))   

train_df = pd.read_table('train.tsv')
test_df = pd.read_table('test.tsv')

test_df['Sentiment'] = -1
train_sentences = train_df.groupby('SentenceId').first().reset_index()
test_sentences = test_df.groupby('SentenceId').first().reset_index()
train_sentences["Phrase"] = train_sentences["Phrase"].apply(review_to_words)
test_sentences["Phrase"] = test_sentences["Phrase"].apply(review_to_words)

In [17]:
from sklearn.feature_extraction.text import CountVectorizer

 
vectorizer = CountVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None,   \
                             max_features = 5000) 

train_data_features = vectorizer.fit_transform(train_sentences["Phrase"].tolist())
test_data_features = vectorizer.fit_transform(test_sentences["Phrase"].tolist())

train_data_features = train_data_features.toarray()
test_data_features = test_data_features.toarray()
# tmp = train_data_features
# for i in range(0, tmp.shape[1]):
#     if tmp[0][i] > 0:
#         print "-------------------"
#         print i
#         print tmp[0][i]
#         print "-------------------"
        
# vocab = vectorizer.get_feature_names()
# # Sum up the counts of each vocabulary word
# dist = np.sum(train_data_features, axis=0)

# # For each, print the vocabulary word and the number of times it 
# # appears in the data set
# for tag, count in zip(vocab, dist):
#     print count, tag

In [18]:
from sklearn import cross_validation
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB()
# scores = cross_validation.cross_val_score(clf, train_data_features[0:len(train_sentences["Sentiment"])], train_sentences["Sentiment"], cv=10)
scores = cross_validation.cross_val_score(clf, train_data_features, train_sentences["Sentiment"], cv=10)
print scores.mean()

0.366982636043


In [19]:
# Use the random forest to make sentiment label predictions
clf = clf.fit(train_data_features, train_sentences["Sentiment"])
result = clf.predict(test_data_features)

# Copy the results to a pandas dataframe with an "id" column and
# a "sentiment" column
output = pd.DataFrame(data={"PhraseId":test_sentences["PhraseId"], "Sentiment":result})

In [None]:
# used to download .txt files that contain negative and positive words
# urllib.urlretrieve('http://www.unc.edu/~ncaren/haphazard/negative.txt','negative.txt')
# urllib.urlretrieve('http://www.unc.edu/~ncaren/haphazard/positive.txt','positive.txt')

In [24]:
merged

Unnamed: 0,PhraseId,Sentiment
0,156061,4
1,156076,0
2,156154,4
3,156178,0
4,156219,1
5,156250,2
6,156272,4
7,156324,2
8,156362,1
9,156405,2


In [25]:
negWords = open("negative.txt").read().split("\n")
posWords = open("positive.txt").read().split("\n")


test_df["containsSpaces"] = test_df["Phrase"].apply(lambda x: ' ' not in x)
sw = test_df[test_df.containsSpaces == True]

def get_sentiment_single_word(word):
    if word in negWords:
        return 1
    elif word in posWords:
        return 3
    elif word in posWords and word in negWords:
        return 2
    else:
        return 2
    
sw['Sentiment'] = sw['Phrase'].apply(get_sentiment_single_word)

sw_output = sw[['PhraseId', 'Sentiment']]

merged = pd.merge(output, sw_output, how='outer')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [26]:
# tmp = test_df[['PhraseId', 'Sentiment']]
remaining = test_df[~test_df['PhraseId'].isin(merged['PhraseId'])].reset_index()

In [27]:
def get_sentiment_phrase(phrase):
    phrase_sentiment = []
    for word in phrase.split():
        if word in negWords:
            phrase_sentiment.append(0)
        elif word in posWords:
            phrase_sentiment.append(4)
        elif word in posWords and word in negWords:
            phrase_sentiment.append(2)
        else:
            phrase_sentiment.append(2)
    if len(phrase_sentiment):
        return sum(phrase_sentiment)/len(phrase_sentiment)
    else:
        return 2

remaining['Sentiment'] = remaining['Phrase'].apply(get_sentiment_phrase)

In [30]:
remaining_output = remaining[['PhraseId', 'Sentiment']]
final = pd.merge(merged, remaining_output, how='outer')
final = final.sort(['PhraseId'])
final['PhraseId'] = final['PhraseId'].apply(lambda x: int(x))
final['Sentiment'] = final['Sentiment'].apply(lambda x: 2)
final.to_csv( "ZG_rotten_tomatoes_baseline.csv", index=False, quoting=3 )

  app.launch_new_instance()


In [29]:
final

Unnamed: 0,PhraseId,Sentiment
0,156061,2
13324,156062,2
3310,156063,2
13325,156064,2
13326,156065,2
13327,156066,2
13328,156067,2
3311,156068,2
3312,156069,2
3313,156070,2


In [None]:
%matplotlib inline

plt.hist(final.Sentiment)
plt.show()
for i in range(0, 5):
    print i, len(final[final.Sentiment == i]) / float(len(final))

In [13]:
final

Unnamed: 0,PhraseId,Sentiment
0,156061,2
21113,156062,2
3310,156063,2
21114,156064,2
21115,156065,2
21116,156066,2
21117,156067,2
3311,156068,2
3312,156069,2
3313,156070,2
