# Load data

In [1]:
import os
import pandas as pd
import numpy as np
from KaggleWord2VecUtility import KaggleWord2VecUtility

# Location of the labeled training set (tab-separated values).
data_file = 'data/labeledTrainData.tsv'
# header=0 takes column names from the first row. quoting=3 is csv.QUOTE_NONE,
# so double quotes in the file are kept as literal characters in the values
# instead of being interpreted as field quoting.
data = pd.read_csv(data_file, header=0, delimiter='\t', quoting=3)

In [2]:
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score as AUC

# Split the row *positions* of `data` into a training part (80%) and a
# validation part (the remainder); random_state pins the shuffle so the same
# split is produced on every run.
train_i, vali_i = train_test_split(np.arange(len(data)), train_size=0.8, random_state=44)

# train_i / vali_i come from np.arange, i.e. they are positions, not labels.
# Select with the purely positional .iloc: the deprecated .ix indexer switches
# between label and position lookup depending on the index type, which would
# pick the wrong rows if `data` ever carried a non-default index.
train = data.iloc[train_i]
vali = data.iloc[vali_i]

In [3]:
# Sanity check: (rows, columns) of the training and validation subsets.
train.shape, vali.shape

((20000, 3), (5000, 3))

# Parse train and validation reviews

In [4]:
print "Parsing train reviews..."

clean_train_reviews = []
for review in train['review']:
    clean_train_reviews.append(" ".join(KaggleWord2VecUtility.review_to_wordlist(review)))

clean_vali_reviews = []
for review in vali['review']:
    clean_vali_reviews.append( " ".join(KaggleWord2VecUtility.review_to_wordlist(review)))  

Parsing train reviews...




 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))


In [8]:
# Show the first cleaned review next to the first raw training row for a
# side-by-side comparison.
clean_train_reviews[0], train.head(1)

(u'there is no reason to see this movie a good plot idea is handled very badly in the middle of the movie everything changes and from there on nothing makes much sense the reason for the killings are not made clear the acting is awful nick stahl obviously needs a better director he was excellent in in the bedroom but here he is terrible amber benson from buffy has to change her character someday even those of you who enjoy gratuitous sex and violence will be disappointed even though the movie was minutes which is too short for a good movie but too long for this one there are no deleted scenes in the dvd which means they never bothered to fill in the missing parts to the characters don t spend the time on this one',
              id  sentiment                                             review
 10730  "5008_1"          0  "There is no reason to see this movie. A good ...)

In [14]:
# Number of cleaned training reviews (one entry is appended per row of `train`).
print len(clean_train_reviews)

20000


# Vectorize using TF-IDF

In [12]:
# TF-IDF over word 1- to 3-grams, keeping at most 4000 features, with
# sublinear term-frequency scaling (tf replaced by 1 + log(tf)).
vectorizer = TfidfVectorizer(max_features = 4000, ngram_range = (1, 3), sublinear_tf = True)

In [13]:
# NOTE(review): the recorded output of this cell is a MemoryError, so these
# TF-IDF features were not produced in the saved run; both names are
# reassigned later from a HashingVectorizer.
# Fit the vocabulary and IDF weights on the training text only, then apply the
# fitted vectorizer unchanged to the validation text.
train_data_features = vectorizer.fit_transform(clean_train_reviews)
vali_data_features = vectorizer.transform(clean_vali_reviews)

MemoryError: 

## Avoid the memory issue by using HashingVectorizer

In [16]:
from sklearn.feature_extraction.text import HashingVectorizer

# Hash tokens into a fixed 40000-column feature space. No fit step is called:
# the vectorizer is used through transform() only, on both splits.
hv = HashingVectorizer(n_features=40000)
# These assignments overwrite the names used by the TF-IDF cell above.
train_data_features = hv.transform(clean_train_reviews)
vali_data_features = hv.transform(clean_vali_reviews)

In [18]:
def train_and_eval_auc(model, train_x, train_y, test_x, test_y):
    """Fit `model` on the training data and return its AUC on the test data.

    The model is fitted in place via ``model.fit(train_x, train_y)``. Scores
    are taken from column 1 of ``model.predict_proba(test_x)`` (presumably the
    positive-class probability for a binary classifier -- confirm against the
    model's class order) and compared with `test_y` using ``AUC``, the
    notebook's alias for ``sklearn.metrics.roc_auc_score``.

    Parameters:
        model: estimator exposing ``fit`` and ``predict_proba``.
        train_x, train_y: training features and labels.
        test_x, test_y: evaluation features and true labels.

    Returns:
        The value returned by ``AUC(test_y, scores)``.
    """
    model.fit(train_x, train_y)
    scores = model.predict_proba(test_x)[:, 1]
    return AUC(test_y, scores)

# Predict using a logistic regression classifier

In [19]:
lr = LogisticRegression()
auc = train_and_eval_auc(lr, train_data_features, train["sentiment"], vali_data_features, vali["sentiment"].values)

print "Logistic regression AUC: ", auc

Logistic regression AUC:  0.926312727496


# Submit

In [21]:
# Unlabeled test set, read with the same options as the training file
# (first row is the header, tab-separated, quote characters kept literally).
test_file = 'data/testData.tsv'
test = pd.read_csv(test_file, header=0, delimiter="\t", quoting=3)

# Clean each test review the same way as the train/validation reviews.
clean_test_reviews = [
    " ".join(KaggleWord2VecUtility.review_to_wordlist(review))
    for review in test['review']
]

# Reuse the already-constructed hashing vectorizer for the test features.
test_x = hv.transform(clean_test_reviews)

In [20]:
# Display the vectorizer's repr to inspect its full parameter set.
hv

HashingVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
         dtype=<type 'numpy.float64'>, encoding=u'utf-8', input=u'content',
         lowercase=True, n_features=40000, ngram_range=(1, 1),
         non_negative=False, norm=u'l2', preprocessor=None,
         stop_words=None, strip_accents=None,
         token_pattern=u'(?u)\\b\\w\\w+\\b', tokenizer=None)

In [45]:
# Column 1 of predict_proba for every test review -- the same column that
# train_and_eval_auc scores during validation.
p = lr.predict_proba(test_x)[:, 1]

In [48]:
# Adds (or overwrites) a "sentiment" column on `test` in place, holding the
# predicted probabilities rather than hard 0/1 labels.
test["sentiment"] = p

In [49]:
# Preview the first rows, now including the predicted "sentiment" column.
test.head()

Unnamed: 0,id,review,sentiment
0,"""12311_10""","""Naturally in a film who's main themes are of ...",0.927147
1,"""8348_2""","""This movie is a disaster within a disaster fi...",0.284623
2,"""5828_4""","""All in all, this is a movie for kids. We saw ...",0.670801
3,"""7186_2""","""Afraid of the Dark left me with the impressio...",0.433388
4,"""12128_7""","""A very accurate depiction of small time mob l...",0.841837


In [51]:
# Destination of the submission file.
output_file = 'data/pred_submit.csv'
# Write only the id and predicted sentiment columns, without the row index.
# quoting=3 is csv.QUOTE_NONE: no quoting is added on output, so the quote
# characters already contained in the id values are written through as-is.
test[['id','sentiment']].to_csv(output_file, index = False, quoting = 3)