In [1]:
import pandas as pd

In [9]:
train = pd.read_csv('../../Datasets/Kaggle/IMDB/labeledTrainData.tsv', delimiter='\t')
test = pd.read_csv('../../Datasets/Kaggle/IMDB/testData.tsv', delimiter='\t')
train.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [10]:
test.head()

Unnamed: 0,id,review
0,12311_10,Naturally in a film who's main themes are of m...
1,8348_2,This movie is a disaster within a disaster fil...
2,5828_4,"All in all, this is a movie for kids. We saw i..."
3,7186_2,Afraid of the Dark left me with the impression...
4,12128_7,A very accurate depiction of small time mob li...


In [32]:
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords

In [33]:
def review_to_text(review, remove_stopwords):
    raw_text = BeautifulSoup(review, 'html').get_text()
    letters = re.sub('[^a-zA-Z]', ' ', raw_text)
    words = letters.lower().split()
    if remove_stopwords:
        stop_words = set(stopwords.words('english'))
        words = [w for w in words if w not in stop_words]
    return words

In [34]:
X_train = []
for review in train['review']:
    X_train.append(' '.join(review_to_text(review, True))) 
y_train = train['sentiment'] 

X_test = []
for review in test['review']:
    X_test.append(' '.join(review_to_text(review, True)))



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


In [26]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV

In [35]:
pip_count = Pipeline([('count_vec', CountVectorizer(analyzer='word')), ('mnb', MultinomialNB())])
pip_tfidf = Pipeline([('tfidf_vec', TfidfVectorizer(analyzer='word')), ('mnb', MultinomialNB())])

params_count = {'count_vec__binary':[True, False], 'count_vec__ngram_range':[(1, 1), (1, 2)], 'mnb__alpha':[0.1, 1.0, 10.0]}
params_tfidf = {'tfidf_vec__binary':[True, False], 'tfidf_vec__ngram_range':[(1, 1), (1, 2)], 'mnb__alpha':[0.1, 1.0, 10.0]}

gs_count = GridSearchCV(pip_count, params_count, cv=4, n_jobs=-1, verbose=1)
gs_tfidf = GridSearchCV(pip_tfidf, params_tfidf, cv=4, n_jobs=-1, verbose=1)
gs_count.fit(X_train, y_train)

print(gs_count.best_score_)
print(gs_count.best_params_)

Fitting 4 folds for each of 12 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:  1.6min finished


0.88216
{'count_vec__binary': True, 'count_vec__ngram_range': (1, 2), 'mnb__alpha': 1.0}


In [37]:
count_y_predict = gs_count.predict(X_test)

gs_tfidf = GridSearchCV(pip_tfidf, params_tfidf, cv=4, n_jobs=-1, verbose=1)
gs_tfidf.fit(X_train, y_train)
print(gs_tfidf.best_score_)
print(gs_tfidf.best_params_)

Fitting 4 folds for each of 12 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:  1.9min finished


0.88712
{'mnb__alpha': 0.1, 'tfidf_vec__binary': True, 'tfidf_vec__ngram_range': (1, 2)}


In [38]:
tfidf_y_predict = gs_tfidf.predict(X_test)
submission_count = pd.DataFrame({'id': test['id'], 'sentiment': count_y_predict})
submission_tfidf = pd.DataFrame({'id': test['id'], 'sentiment': tfidf_y_predict})

submission_count.to_csv('./submission_count.csv', index=False)
submission_tfidf.to_csv('./submission_tfidf.csv', index=False)

In [39]:
unlabeled_train = pd.read_csv('../../Datasets/Kaggle/IMDB/unlabeledTrainData.tsv', delimiter='\t', quoting=3)
unlabeled_train.head()

Unnamed: 0,id,review
0,"""9999_0""","""Watching Time Chasers, it obvious that it was..."
1,"""45057_0""","""I saw this film about 20 years ago and rememb..."
2,"""15561_0""","""Minor Spoilers<br /><br />In New York, Joan B..."
3,"""7161_0""","""I went to see this film with a great deal of ..."
4,"""43971_0""","""Yes, I agree with everyone on this site this ..."


In [40]:
import nltk.data

In [41]:
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [45]:
def review_to_sentences(review, tokenizer):
    raw_sentences = tokenizer.tokenize(review.strip())
    sentences = []
    for raw_sentence in raw_sentences:
        if len(raw_sentence) > 0:
            sentences.append(review_to_text(raw_sentence, False))
    return sentences

In [47]:
corpora = []    
for review in unlabeled_train['review']:
    corpora += review_to_sentences(review, tokenizer)



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup


In [49]:
num_features = 300
min_word_count = 20
num_workers = 4
context = 10
downsampling = 1e-3

In [50]:
from gensim.models import word2vec

model = word2vec.Word2Vec(corpora, workers=num_workers, size=num_features, min_count=min_word_count, window=context, sample=downsampling)
model.init_sims(replace=True)
model_name = './300features_20minwords_10context'
model.save(model_name)

In [51]:
from gensim.models import Word2Vec

model = Word2Vec.load('./300features_20minwords_10context')
model.most_similar('man')

[('woman', 0.616397500038147),
 ('lad', 0.5815320014953613),
 ('lady', 0.577421247959137),
 ('person', 0.5396988987922668),
 ('soldier', 0.5393463373184204),
 ('monk', 0.5319982767105103),
 ('guy', 0.5270613431930542),
 ('boy', 0.5121172070503235),
 ('businessman', 0.5077206492424011),
 ('chap', 0.5066514611244202)]

In [61]:
import numpy as np  

def makeFeatureVec(words, model, num_features):
    featureVec = np.zeros((num_features,),dtype="float32")
    nwords = 0.
    index2word_set = set(model.index2word) # AttributeError: 'Word2Vec' object has no attribute 'index2word'
        if word in index2word_set: 
            nwords = nwords + 1.
            featureVec = np.add(featureVec,model[word])
    featureVec = np.divide(featureVec,nwords)
    return featureVec

def getAvgFeatureVecs(reviews, model, num_features):
    counter = 0
    reviewFeatureVecs = np.zeros((len(reviews),num_features),dtype="float32")
    for review in reviews:
        reviewFeatureVecs[counter] = makeFeatureVec(review, model, num_features)
        counter += 1
    return reviewFeatureVecs

In [62]:
clean_train_reviews = []
for review in train["review"]:
    clean_train_reviews.append( review_to_text( review, remove_stopwords=True ))
trainDataVecs = getAvgFeatureVecs( clean_train_reviews, model, num_features )

clean_test_reviews = []
for review in test["review"]:
    clean_test_reviews.append( review_to_text( review, remove_stopwords=True ))
testDataVecs = getAvgFeatureVecs( clean_test_reviews, model, num_features )



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


AttributeError: 'Word2Vec' object has no attribute 'index2word'

In [58]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.grid_search import GridSearchCV

gbc = GradientBoostingClassifier()
params_gbc = {'n_estimators': [10, 100, 500], 'learning_rate': [0.01, 0.1, 1.0], 'max_depth': [2, 3, 4]}
gs = GridSearchCV(gbc, params_gbc, cv=4, n_jobs=-1, verbose=1)
gs.fit(trainDataVecs, y_train)

print(gs.best_score_)
print(gs.best_params_)

result = gs.predict(testDataVecs)
output = pd.DataFrame(data={"id": test["id"], "sentiment": result})
output.to_csv( "../submission_w2v.csv", index=False, quoting=3)

NameError: name 'trainDataVecs' is not defined