In [1]:
import numpy as np
import shorttext
import pandas as pd

Using TensorFlow backend.


# Data

In [2]:
# source: https://www.kaggle.com/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
# Read the IMDB review CSV into a DataFrame (columns: review, sentiment).
# NOTE(review): absolute, machine-specific path; adjust before re-running elsewhere.
imdb_csv_path = '/data/hok/testdata/imdb/IMDB Dataset.csv'
imdbdata = pd.read_csv(imdb_csv_path)

In [3]:
# Class balance of the full dataset: number of reviews per sentiment label.
imdbdata.groupby('sentiment').count()

Unnamed: 0_level_0,review
sentiment,Unnamed: 1_level_1
negative,25000
positive,25000


In [4]:
# Class balance of the first 100 rows only (the subset used for training below).
imdbdata.head(100).groupby('sentiment').count()

Unnamed: 0_level_0,review
sentiment,Unnamed: 1_level_1
negative,58
positive,42


In [5]:
# Keep only the first 100 reviews; every later cell works on this subset.
# The name `imdbdata` is rebound, so the full frame is no longer reachable
# without re-running the load cell.
imdbdata = imdbdata.head(100)
imdbdata

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
95,Daniel Day-Lewis is the most versatile actor a...,positive
96,My guess would be this was originally going to...,negative
97,"Well, I like to watch bad horror B-Movies, cau...",negative
98,"This IS the worst movie I have ever seen, as w...",negative


# Data Conversion Pipeline

In [6]:
import re

In [7]:
def remove_htmltags(s):
    """Return `s` with simple, attribute-less HTML tags (e.g. `<br />`, `</p>`) removed.

    A tag is `<`, optional `/`, a word-character name, optional trailing
    whitespace or `/`, then `>`. Tags carrying attributes are not matched.
    """
    # The trailing class used to be [\s|/]; inside [...] a `|` is a literal
    # pipe, not alternation, so text such as `<br|>` was stripped as a tag.
    return re.sub(r'<(/)*(\w)+([\s/])*>', '', s)


def remove_specialchars(s):
    """Replace every character that is not an ASCII letter or whitespace with a space."""
    return re.sub(r'[^A-Za-z\s]', ' ', s)


def remove_consecutivespaces(s):
    """Collapse each run of whitespace (spaces, tabs, newlines) into one space."""
    return re.sub(r'\s+', ' ', s)


def convert_tolowercase(s):
    """Return `s` lower-cased."""
    return s.lower()


def strip_string(s):
    """Return `s` without leading or trailing whitespace."""
    return s.strip()

In [8]:
# Cleaning steps handed to shorttext's text_preprocessor.
# Presumably applied in list order (tags removed before punctuation) - confirm
# against the shorttext documentation.
pipeline = [
    remove_htmltags,
    remove_specialchars,
    remove_consecutivespaces,
    convert_tolowercase,
    strip_string,
]

In [9]:
# Build one callable from the cleaning steps; it is applied to each raw review
# string when the training dictionary is assembled.
preprocess = shorttext.utils.text_preprocessor(pipeline)

# Word2Vec Model and C-LSTM Classifier

In [10]:
# Load a pretrained word2vec model from a local binary file
# (by its filename, the GoogleNews 300-dimensional vectors - confirm).
# NOTE(review): absolute, machine-specific path; adjust before re-running elsewhere.
wmodel = shorttext.utils.load_word2vec_model('/data/hok/langmodel/word2vec/GoogleNews-vectors-negative300.bin')

In [11]:
from collections import defaultdict

# Group the preprocessed review texts by sentiment label:
#   {label: [preprocessed review, ...]}
trainclassdict = defaultdict(list)
for text, label in imdbdata[['review', 'sentiment']].itertuples(index=False, name=None):
    trainclassdict[label].append(preprocess(text))

# Convert to a plain dict so later lookups of unknown labels do not
# silently create empty entries.
trainclassdict = dict(trainclassdict)

In [12]:
# Build the C-LSTM network. The number of output labels is derived from the
# training dictionary (2 for this data) rather than hard-coded, so it stays
# correct if the label set changes; the embedding size comes from the loaded
# word2vec model.
# NOTE(review): the maximum sequence length is left at the library default;
# long reviews may be truncated - confirm against the shorttext documentation.
kmodel = shorttext.classifiers.frameworks.CLSTMWordEmbed(len(trainclassdict), vecsize=wmodel.vector_size)

In [13]:
# Wrap the word2vec model in shorttext's embedded-vector neural-network
# classifier; the network itself is supplied at training time.
classifier = shorttext.classifiers.VarNNEmbeddedVecClassifier(wmodel)

In [14]:
# Fit the C-LSTM network on the 100-review training dictionary
# (the recorded log shows 10 epochs).
# NOTE(review): no random seed is set anywhere in the notebook, so scores
# will likely differ between runs.
classifier.train(trainclassdict, kmodel)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [15]:
# Score a short negative example. The text goes through the same `preprocess`
# function as the training reviews so that inference input matches the
# training input (for this already-clean literal the cleaning steps change nothing).
classifier.score(preprocess('this movie is bad'))

{'positive': 0.00097388605, 'negative': 0.99902606}

In [16]:
# Score a short positive example. The text goes through the same `preprocess`
# function as the training reviews so that inference input matches the
# training input (for this already-clean literal the cleaning steps change nothing).
classifier.score(preprocess('this movie is fabulous'))

{'positive': 0.41388372, 'negative': 0.58611625}

In [18]:
# Persist the trained classifier as a single compact file, written relative
# to the current working directory.
classifier.save_compact_model('imdb_clstm.bin')