In [1]:
import pandas as pd
import os
df = pd.DataFrame()
labels = {'pos':1, 'neg':0}
for s in ('test','train'):
    for l in ('pos','neg'):
        path = './aclImdb/%s/%s' % (s,l)
        for file in os.listdir(path):
            with open(os.path.join(path, file), 'r') as infile:
                txt = infile.read()
                df = df.append([[txt, labels[l]]], ignore_index = True)
df.columns = ['review', 'sentiment']
    

In [2]:
df.head()

Unnamed: 0,review,sentiment
0,I went and saw this movie last night after bei...,1
1,Actor turned director Bill Paxton follows up h...,1
2,As a recreational golfer with some knowledge o...,1
3,"I saw this film in a sneak preview, and it is ...",1
4,Bill Paxton has taken the true story of the 19...,1


In [3]:
import numpy as np
np.random.seed(0)
df = df.reindex(np.random.permutation(df.index))
df.to_csv('./movie_data.csv', index=False)

In [4]:
df1 = pd.read_csv('./movie_data.csv')
df1.head()

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0
3,hi for all the people who have seen this wonde...,1
4,"I recently bought the DVD, forgetting just how...",0


In [5]:
from sklearn.feature_extraction.text import CountVectorizer
count = CountVectorizer(ngram_range=(1,2))
docs = np.array(['I love watching Tv'
                , 'There is a good show on tv'
                , 'I love watching tv because there are good shows on tv'])
bag = count.fit_transform(docs)

In [6]:
count.vocabulary_

{'are': 0,
 'are good': 1,
 'because': 2,
 'because there': 3,
 'good': 4,
 'good show': 5,
 'good shows': 6,
 'is': 7,
 'is good': 8,
 'love': 9,
 'love watching': 10,
 'on': 11,
 'on tv': 12,
 'show': 13,
 'show on': 14,
 'shows': 15,
 'shows on': 16,
 'there': 17,
 'there are': 18,
 'there is': 19,
 'tv': 20,
 'tv because': 21,
 'watching': 22,
 'watching tv': 23}

In [7]:
from sklearn.feature_extraction.text import TfidfTransformer
Tfidf = TfidfTransformer()

t = Tfidf.fit_transform(bag)

In [8]:
print(t.toarray())

[[ 0.          0.          0.          0.          0.          0.          0.
   0.          0.          0.46609584  0.46609584  0.          0.          0.
   0.          0.          0.          0.          0.          0.          0.361965
   0.          0.46609584  0.46609584]
 [ 0.          0.          0.          0.          0.258401    0.33976626
   0.          0.33976626  0.33976626  0.          0.          0.258401
   0.258401    0.33976626  0.33976626  0.          0.          0.258401    0.
   0.33976626  0.20067143  0.          0.          0.        ]
 [ 0.25800541  0.25800541  0.25800541  0.25800541  0.19621977  0.
   0.25800541  0.          0.          0.19621977  0.19621977  0.19621977
   0.19621977  0.          0.          0.25800541  0.25800541  0.19621977
   0.25800541  0.          0.30476431  0.25800541  0.19621977  0.19621977]]


In [9]:
import re
def preprocessor(text):
    text = re.sub('<[^>]*>', '', text)
   # emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = re.sub('[\W]+', ' ', text.lower()) # + \'.join(emoticons).replace('-', '')
    return text

preprocessor('Hello <i>World</i>.!')

'hello world '

In [10]:
df['review'] = df['review'].apply(preprocessor)

In [11]:
def tokenizer(text):
    return text.split()
tokenizer(docs[1])

['There', 'is', 'a', 'good', 'show', 'on', 'tv']

In [12]:
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
def tokenizer_porter(text):
    return [stemmer.stem(s) for s in text.split()]

In [13]:
tokenizer_porter(docs[2])

['I',
 'love',
 'watch',
 'tv',
 'becaus',
 'there',
 'are',
 'good',
 'show',
 'on',
 'tv']

In [14]:
import nltk
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/OlivierDeMeulder/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [15]:
from nltk.corpus import stopwords
stop = stopwords.words('english')

In [16]:
X_train = df.loc[:25000, 'review'].values
y_train = df.loc[:25000, 'sentiment'].values
X_test = df.loc[25000:, 'review'].values
y_test = df.loc[25000:, 'sentiment'].values

from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(strip_accents=None, lowercase=False, preprocessor=None)
param_grid = [{'vect__ngram_range': [(1,1)]
               , 'vect__stop_words': [stop, None]
               , 'vect__tokenizer': [tokenizer, tokenizer_porter]
               , 'clf__penalty': ['l1','l2']
               ,'clf__C': [1.0, 10.0, 100.0]}]
#              ,{' vect__ngram_range': [(1,1)]
#                , 'vect__stop_words': [stop, None]
#                , 'vect__tokenizer': [tokenizer, tokenizer_porter]
#                , 'vect__use_idf':[ False]
#                , 'vect__norm':[ None]
#                , 'clf__penalty': ['l1','l2']
#                , 'clf__C': [1.0, 10.0, 100.0]}]
lr_tfidf = Pipeline([('vect', tfidf), ('clf', LogisticRegression(random_state=0))])

gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid, scoring='accuracy', cv =5, verbose=1, n_jobs =-1)
gs_lr_tfidf.fit( X_train, y_train)


Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  3.1min
Process ForkPoolWorker-4:
Process ForkPoolWorker-5:
Process ForkPoolWorker-7:
Process ForkPoolWorker-1:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/Users/OlivierDeMeulder/anaconda/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/Users/OlivierDeMeulder/anaconda/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/Users/OlivierDeMeulder/anaconda/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/Users/OlivierDeMeulder/anaconda/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/OlivierDeMeulder/anaconda/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/Users/OlivierDeMeulder/anaconda/lib/python3.5/m

KeyboardInterrupt: 

In [19]:
tfidf = TfidfVectorizer(strip_accents=None, lowercase=False, preprocessor=None, ngram_range=(1,2), stop_words=stop
                       , tokenizer=tokenizer_porter, use_idf=False, norm=None)
clf = LogisticRegression(random_state=0, C=10.0, penalty='l2')
tfidf_clf = Pipeline([('vect', tfidf), ('clf', clf)])
tfidf_clf.fit(X_train, y_train)

Pipeline(steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), norm=None, preprocessor=None, smooth_idf=True,
 ...nalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [20]:
negreview1 = "The best thing I can say about Fifty Shades Darker is that it is one of the best comedies of 2017. Charlie Hunnam, is probably somewhere spiking the football that he backed out of this franchise before it ruined his career. There aren't enough buzzwords to describe how bad this is, you simply have to see it to believe it. If you aren't going to laugh at this film or get some Amazon recommendations for butt plugs this Valentine's Day, don't get suckered into seeing this film. Just watch porn and maintain the ounce of dignity you have left."
negreview2 = "Compared to the actual book, this movie was an embarrassment. The plot was horribly summarized and not accurately portrayed. They saved a lot of big moments for the last movie as does every other Hollywood creation. Was very disappointed with this."
negreview3 = "I honestly feel bad for anyone who likes this Movie and thinks its kinky. obviously you have a non existent sex life. Watch porn, at least the dialogue would be more believable and the sex scenes more accurate to Doms and Subs"
posreview1 = "I don't have any idea what the critics are watching, but I really liked the movie!! First movie was more getting to know each other and getting to know BDSM..but this one is more relationship and commitment ...and yes..there is still plenty of sex ;-) .....One scene REALLY made me smile...Ana says a line that Dakota Johnson's Mom said in one of my favorite movies...Working Girl....Music was wonderful again too.....and pretty true to the book..."
posreview2 = "Alot of hot and steamy scenes. Could have done with a little less of that and more of the story line but it was better than the first one. of course this takes without saying the books were better. must reads"
posreview3 = "Was interesting. A lot more humor and outside the bedroom action in this one. Nice to get back story on Christian. Still a lot of kinky sex scenes... the elevator!! ;) I really enjoyed this movie"

qreview1 = "I did not really love it, but i did not hate it either."
qreview2 = "i did not enjoy this movie, it was dreadful"
qreview3 = "This is one of those moview where it is really hard to say if you like it or not."
tfidf_clf.predict(np.array([negreview1,negreview2,negreview3,posreview1,posreview2,posreview3, qreview1, qreview2, qreview3]))
tfidf_clf.predict_proba(np.array([negreview1,negreview2,negreview3,posreview1,posreview2,posreview3, qreview1, qreview2, qreview3]))


array([[ 0.77015477,  0.22984523],
       [ 0.9986276 ,  0.0013724 ],
       [ 0.94348985,  0.05651015],
       [ 0.00706379,  0.99293621],
       [ 0.3101002 ,  0.6898998 ],
       [ 0.03026029,  0.96973971],
       [ 0.26022643,  0.73977357],
       [ 0.22525168,  0.77474832],
       [ 0.37977367,  0.62022633]])