In [1]:
import urllib

# Remote locations of the UMICH-SI650 data files.
test_data_url = "https://dl.dropboxusercontent.com/u/8082731/datasets/UMICH-SI650/testdata.txt"
train_data_url = "https://dl.dropboxusercontent.com/u/8082731/datasets/UMICH-SI650/training.txt"

# Names under which the downloads are stored locally.
test_data_file_name = 'test_data.csv'
train_data_file_name = 'train_data.csv'

# Download both files with urllib (test file first, then the training file)
# and keep whatever urlretrieve returns for each of them.
test_data_f, train_data_f = map(
    urllib.urlretrieve,
    (test_data_url, train_data_url),
    (test_data_file_name, train_data_file_name)
)

In [2]:
import pandas as pd

# Both files are read the same way: tab-separated, no header row, and
# quoting=3 (csv.QUOTE_NONE in the standard csv module).
_tsv_options = dict(header=None, delimiter="\t", quoting=3)

test_data_df = pd.read_csv(test_data_file_name, **_tsv_options)
train_data_df = pd.read_csv(train_data_file_name, **_tsv_options)

# The test file is named as a single text column; the training file as a
# label column followed by the text column.
test_data_df.columns = ["Text"]
train_data_df.columns = ["Sentiment", "Text"]

In [3]:
# (rows, columns) of the labelled training frame.
train_data_df.shape

(7086, 2)

In [4]:
# (rows, columns) of the unlabelled test frame.
test_data_df.shape

(33052, 1)

In [5]:
# Preview the first rows of the training data (Sentiment label and Text).
train_data_df.head()

Unnamed: 0,Sentiment,Text
0,1,The Da Vinci Code book is just awesome.
1,1,this was the first clive cussler i've ever rea...
2,1,i liked the Da Vinci Code a lot.
3,1,i liked the Da Vinci Code a lot.
4,1,I liked the Da Vinci Code but it ultimatly did...


In [6]:
# Class balance: number of training rows per Sentiment label.
train_data_df.Sentiment.value_counts()

1    3995
0    3091
Name: Sentiment, dtype: int64

In [7]:
import numpy as np

# Mean number of space-separated tokens per training sentence.
tokens_per_sentence = [len(sentence.split(" ")) for sentence in train_data_df.Text]
np.mean(tokens_per_sentence)

10.886819079875812

In [8]:
import re, nltk
from sklearn.feature_extraction.text import CountVectorizer        
from nltk.stem.porter import PorterStemmer

In [9]:
# Shared Porter stemmer instance; tokenize() below passes it to stem_tokens().
stemmer = PorterStemmer()
def stem_tokens(tokens, stemmer):
    """Return a list holding ``stemmer.stem(token)`` for every token, in order.

    tokens  -- iterable of tokens to stem
    stemmer -- any object exposing a ``stem(token)`` method
    """
    return [stemmer.stem(token) for token in tokens]

def tokenize(text):
    """Turn raw text into a list of stemmed word tokens.

    Every character that is not an ASCII letter is replaced by a space,
    the result is split with ``nltk.word_tokenize`` and the tokens are
    stemmed with the module-level ``stemmer`` via ``stem_tokens``.
    """
    letters_only = re.sub("[^a-zA-Z]", " ", text)
    return stem_tokens(nltk.word_tokenize(letters_only), stemmer)
######## 

# Bag-of-words extractor: word-level analysis through the custom `tokenize`
# callable, lower-casing enabled, the built-in 'english' stop-word list and
# a vocabulary capped at 85 features.
# NOTE(review): the vocabulary printed later in this notebook contains stems
# such as 'thi', 'wa', 'ha' and 'becaus', which suggests the stop-word list
# does not match the stemmed tokens -- confirm whether that is intended.
vectorizer = CountVectorizer(
    analyzer='word',
    tokenizer=tokenize,
    lowercase=True,
    stop_words='english',
    max_features=85
)

In [10]:
# Try the Porter stemmer on a handful of inflected words.
plurals = ['caresses', 'flies', 'dies', 'mules', 'denied']
singles = list(map(stemmer.stem, plurals))
print(singles)

[u'caress', u'fli', u'die', u'mule', u'deni']


In [11]:
import nltk

# nltk.word_tokenize (called by tokenize) relies on the Punkt tokenizer
# models. Requesting that package by name keeps this cell non-interactive;
# a bare nltk.download() opens the interactive downloader, which blocks an
# unattended "Run All".
# NOTE(review): recent NLTK releases ship these models as 'punkt_tab' --
# confirm the package id against the installed NLTK version.
nltk.download('punkt')

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [13]:
# Sanity check of the two pre-stemming steps used by tokenize():
# replace every non-letter with a space, then split into word tokens.
text = re.sub("[^a-zA-Z]", " ", "i am onzali suba")
tokens = nltk.word_tokenize(text)
print(tokens)

['i', 'am', 'onzali', 'suba']


In [14]:
# Learn the vocabulary and build the count matrix over the training
# sentences followed by the test sentences, so row i < len(train_data_df)
# corresponds to training row i.
all_sentences = train_data_df.Text.tolist() + test_data_df.Text.tolist()
corpus_data_features = vectorizer.fit_transform(all_sentences)

In [15]:
# Convert the vectorizer output into a dense array and show its
# (documents, features) shape.
corpus_data_features_nd = corpus_data_features.toarray()
corpus_data_features_nd.shape


(40138, 85)

In [16]:
# Inspect the dense count matrix (one row per sentence, one column per term).
corpus_data_features_nd

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [17]:
# Terms the fitted vectorizer kept as features, in column order.
vocab = vectorizer.get_feature_names()
print(vocab)

[u'aaa', u'amaz', u'angelina', u'awesom', u'beauti', u'becaus', u'boston', u'brokeback', u'citi', u'code', u'cool', u'cruis', u'd', u'da', u'drive', u'francisco', u'friend', u'fuck', u'geico', u'good', u'got', u'great', u'ha', u'harri', u'harvard', u'hate', u'hi', u'hilton', u'honda', u'imposs', u'joli', u'just', u'know', u'laker', u'left', u'like', u'littl', u'london', u'look', u'lot', u'love', u'm', u'macbook', u'make', u'miss', u'mission', u'mit', u'mountain', u'movi', u'need', u'new', u'oh', u'onli', u'pari', u'peopl', u'person', u'potter', u'purdu', u'realli', u'right', u'rock', u's', u'said', u'san', u'say', u'seattl', u'shanghai', u'stori', u'stupid', u'suck', u't', u'thi', u'thing', u'think', u'time', u'tom', u'toyota', u'ucla', u've', u'vinci', u'wa', u'want', u'way', u'whi', u'work']


In [20]:
# Total occurrences of every vocabulary term over the whole corpus
# (column-wise sum of the count matrix).
dist = corpus_data_features_nd.sum(axis=0)

# Print "<count> <term>" for each vocabulary entry.
for count, tag in zip(dist, vocab):
    print("%s %s" % (count, tag))

1179 aaa
485 amaz
1765 angelina
3170 awesom
2146 beauti
1694 becaus
2190 boston
2000 brokeback
423 citi
2003 code
481 cool
2031 cruis
439 d
2087 da
433 drive
1926 francisco
477 friend
452 fuck
1085 geico
773 good
571 got
1178 great
776 ha
2094 harri
2103 harvard
4492 hate
794 hi
2086 hilton
2192 honda
1098 imposs
1764 joli
1054 just
896 know
2019 laker
425 left
4080 like
507 littl
2233 london
811 look
421 lot
10334 love
1568 m
1059 macbook
631 make
1098 miss
1101 mission
1340 mit
2081 mountain
1207 movi
1220 need
459 new
551 oh
674 onli
2094 pari
1018 peopl
454 person
2093 potter
1167 purdu
2126 realli
661 right
475 rock
3914 s
495 said
2038 san
627 say
2019 seattl
1189 shanghai
467 stori
2886 stupid
4614 suck
1455 t
1705 thi
662 thing
1524 think
781 time
2117 tom
2028 toyota
2008 ucla
774 ve
2001 vinci
3703 wa
1656 want
932 way
547 whi
512 work


In [21]:
# train_test_split moved to sklearn.model_selection; the old
# sklearn.cross_validation module was deprecated and later removed, so try
# the new location first and fall back for older installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# remember that corpus_data_features_nd contains all of our
# original train and test data, so we need to exclude
# the unlabeled test entries: only the first len(train_data_df) rows
# have a Sentiment label.
# 85% of the labelled rows are used for fitting, the rest are held out;
# random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test  = train_test_split(
        corpus_data_features_nd[0:len(train_data_df)], 
        train_data_df.Sentiment,
        train_size=0.85, 
        random_state=1234)



In [22]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier with default settings on the
# training split; fit() hands back the estimator that is stored in log_model.
log_model = LogisticRegression().fit(X_train, y_train)

In [23]:
# Predict labels for the held-out rows produced by train_test_split.
y_pred = log_model.predict(X_test)

In [25]:
from sklearn.metrics import classification_report
# Per-class precision, recall, F1 and support on the held-out rows.
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          0       0.98      0.99      0.98       467
          1       0.99      0.98      0.99       596

avg / total       0.98      0.98      0.98      1063



In [26]:
import random

# Retrain on every labelled row: the first len(train_data_df) rows of the
# feature matrix are the training sentences, the remainder the test ones.
n_labelled = len(train_data_df)
log_model = LogisticRegression()
log_model = log_model.fit(X=corpus_data_features_nd[:n_labelled], y=train_data_df.Sentiment)

# Predict a label for each unlabelled test sentence.
test_pred = log_model.predict(corpus_data_features_nd[n_labelled:])

# Sample some predictions to eyeball. A dedicated, seeded generator makes the
# same rows appear on every run without touching the global random state,
# and the sample size is capped so fewer than 15 predictions cannot raise
# ValueError.
sampler = random.Random(1234)
spl = sampler.sample(range(len(test_pred)), min(15, len(test_pred)))

# Print "<predicted label> <text>"; spl holds positions, hence .iloc.
for text, sentiment in zip(test_data_df.Text.iloc[spl], test_pred[spl]):
    print("%s %s" % (sentiment, text))

1 Geico would be great, and I really hope that works out.
0 BOSTON SUCKS!!
1 i'd love to see the clips and lakers in the second round, though the winner would just be a stepping stone for the mavs or spurs...
0 Stupid UCLA, deserves a good poking..
0 cause obviously toyota dealer is cheating on my feelings..
0 I actually found a cab driver who didn't know a particular street in London ( this is amazing for anyone who knows taxi drivers in London with their prodigious memory for addresses ).
0 the stupid honda lol or a BUG!..
0 Ok, I'm gonna be honest, Lakers suck, the only reason they're this far is because of their size advantage, and Suns are a bad defensive team.
1 I love the London Little People, btw.
0 You are a fucking bitch and I think I may hate you even more than I hate Paris Hilton...
0 I really, really hate TOM CRUISE.
1 I'm not crazy about HK either, but Shanghai is sounding awesome.
1 for as long as I can remember I've wanted Honda....
0 For those who say he doesn't deserv