In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
!pip install pyprind

Collecting pyprind
  Downloading https://files.pythonhosted.org/packages/1e/30/e76fb0c45da8aef49ea8d2a90d4e7a6877b45894c25f12fb961f009a891e/PyPrind-2.11.2-py3-none-any.whl
Installing collected packages: pyprind
Successfully installed pyprind-2.11.2


In [3]:
import pyprind
import os

In [7]:
basepath = "aclImdb"
labels = {"pos": 1, "neg": 0}
pbar = pyprind.ProgBar(50000)
df = pd.DataFrame()
for s in ("test", "train"):
    for l in ("pos", "neg"):
        path = os.path.join(basepath, s, l)
        for file in os.listdir(path):
            with open(os.path.join(path, file), "r", encoding = "utf-8") as infile:
                txt = infile.read()
            df = df.append([[txt, labels[l]]], ignore_index = True)
            pbar.update()
        
df.columns = ["review", "sentiment"]

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:05:21


In [9]:
df.head()

Unnamed: 0,review,sentiment
0,"Based on an actual story, John Boorman shows t...",1
1,This is a gem. As a Film Four production - the...,1
2,"I really like this show. It has drama, romance...",1
3,This is the best 3-D experience Disney has at ...,1
4,"Of the Korean movies I've seen, only three had...",1


In [10]:
np.random.seed(0)
df = df.reindex(np.random.permutation(df.index))
df.to_csv("movie_data.csv", index = False, encoding = "utf-8")

In [4]:
df = pd.read_csv("movie_data.csv", encoding = "utf-8")
df.head(3)

Unnamed: 0,review,sentiment
0,My family and I normally do not watch local mo...,1
1,"Believe it or not, this was at one time the wo...",0
2,"After some internet surfing, I found the ""Home...",0


In [5]:
from sklearn.feature_extraction.text import CountVectorizer

In [6]:
count = CountVectorizer()
docs = np.array(["The sun is shining",
                 "The weather is sweet",
                 "The sun is shining, the weather is sweet, and one and one is two"])
bag = count.fit_transform(docs)

In [7]:
print(count.vocabulary_)

{'the': 6, 'sun': 4, 'is': 1, 'shining': 3, 'weather': 8, 'sweet': 5, 'and': 0, 'one': 2, 'two': 7}


In [8]:
print(bag.toarray())

[[0 1 0 1 1 0 1 0 0]
 [0 1 0 0 0 1 1 0 1]
 [2 3 2 1 1 1 2 1 1]]


In [9]:
from sklearn.feature_extraction.text import TfidfTransformer

In [10]:
tfidf = TfidfTransformer(use_idf = True, norm = "l2", smooth_idf = True)
np.set_printoptions(precision = 2)
print(tfidf.fit_transform(count.fit_transform(docs)).toarray())

[[0.   0.43 0.   0.56 0.56 0.   0.43 0.   0.  ]
 [0.   0.43 0.   0.   0.   0.56 0.43 0.   0.56]
 [0.5  0.45 0.5  0.19 0.19 0.19 0.3  0.25 0.19]]


In [11]:
df.loc[0, "review"][-900:]

'My family and I normally do not watch local movies for the simple reason that they are poorly made, they lack the depth, and just not worth our time.<br /><br />The trailer of "Nasaan ka man" caught my attention, my daughter in law\'s and daughter\'s so we took time out to watch it this afternoon. The movie exceeded our expectations. The cinematography was very good, the story beautiful and the acting awesome. Jericho Rosales was really very good, so\'s Claudine Barretto. The fact that I despised Diether Ocampo proves he was effective at his role. I have never been this touched, moved and affected by a local movie before. Imagine a cynic like me dabbing my eyes at the end of the movie? Congratulations to Star Cinema!! Way to go, Jericho and Claudine!!'

In [12]:
import re

In [13]:
def preprocessor(text):
    text = re.sub("<[^>]*>", "", text)
    emoticons = re.findall("(?::|;|=)(?:-)?(?:\)|\(|D|P)", text)
    text = (re.sub("[\W]+", " ", text.lower()) + " ".join(emoticons).replace("-", ""))
    return text

In [14]:
preprocessor(df.loc[0, "review"][-900:])

'my family and i normally do not watch local movies for the simple reason that they are poorly made they lack the depth and just not worth our time the trailer of nasaan ka man caught my attention my daughter in law s and daughter s so we took time out to watch it this afternoon the movie exceeded our expectations the cinematography was very good the story beautiful and the acting awesome jericho rosales was really very good so s claudine barretto the fact that i despised diether ocampo proves he was effective at his role i have never been this touched moved and affected by a local movie before imagine a cynic like me dabbing my eyes at the end of the movie congratulations to star cinema way to go jericho and claudine '

In [15]:
preprocessor("</a>This :) is :( a test :-)!")

'this is a test :) :( :)'

In [29]:
df["review"] = df["review"].apply(preprocessor)

In [30]:
def tokenizer(text):
    return text.split()

In [31]:
tokenizer("runners like running and thus they run")

['runners', 'like', 'running', 'and', 'thus', 'they', 'run']

In [32]:
!pip install nltk



In [33]:
from nltk.stem.porter import PorterStemmer

In [34]:
porter = PorterStemmer()
def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]
tokenizer_porter("runners like running and thus they run")

['runner', 'like', 'run', 'and', 'thu', 'they', 'run']

In [35]:
import nltk

In [36]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /Users/mito/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [37]:
from nltk.corpus import stopwords

In [38]:
stop = stopwords.words("english")
[w for w in tokenizer_porter("a runner likes running and runs a lot")[-10:] if w not in stop]

['runner', 'like', 'run', 'run', 'lot']

In [39]:
X_train = df.loc[:25000, "review"].values
y_train = df.loc[:25000, "sentiment"].values
X_test = df.loc[25000:, "review"].values
y_test = df.loc[25000:, "sentiment"].values

In [40]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

In [41]:
tfidf = TfidfVectorizer(strip_accents = None, 
                        lowercase = False,
                        preprocessor = None)
param_grid = [{"vect__ngram_range": [(1, 1)],
               "vect__stop_words": [stop, None],
               "vect__tokenizer": [tokenizer, tokenizer_porter],
               "clf__penalty": ["l1", "l2"],
               "clf__C": [1.0, 10.0, 100.0]},
              {"vect__ngram_range": [(1,1)],
               "vect__stop_words": [stop, None],
               "vect__tokenizer": [tokenizer, tokenizer_porter],
               "vect__use_idf": [False],
               "vect__norm": [None],
               "clf__penalty": ["l1", "l2"],
               "clf__C": [1.0, 10.0, 100.0]}]
lr_tfidf = Pipeline([("vect", tfidf),
                     ("clf", LogisticRegression(random_state = 0))])
gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid, scoring = "accuracy",
                           cv = 5, verbose = 1, n_jobs = -1)
gs_lr_tfidf.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


KeyboardInterrupt: 

In [None]:
print("Best parameter set: %s " % gs_lr_tfidf.best_params_)
print("CV accuracy: %.3f" % gs_lr_tfidf.best_score_)
clf = gs_lr_tfidf.best_estimator_
print("Test accuracy: %.3f" % clf.score(X_test, y_test))

In [42]:
stopwords = stopwords.words("english")
def tokenizer(text):
    text = re.sub("<[^>]*>", "", text)
    emoticons = re.findall("(?::|;|=)(?:-)?(?:\)|\(|D|P)", text.lower())
    text = re.sub("[\W]+", " ", text.lower()) + " ".join(emoticons).replace("-", "")
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized

In [43]:
def stream_docs(path):
    with open(path, "r", encoding = "utf-8") as csv:
        next(csv)
        for line in csv:
            text, label = line[:-3], int(line[-2])
            yield text, label

In [44]:
next(stream_docs(path = "movie_data.csv"))

('"My family and I normally do not watch local movies for the simple reason that they are poorly made, they lack the depth, and just not worth our time.<br /><br />The trailer of ""Nasaan ka man"" caught my attention, my daughter in law\'s and daughter\'s so we took time out to watch it this afternoon. The movie exceeded our expectations. The cinematography was very good, the story beautiful and the acting awesome. Jericho Rosales was really very good, so\'s Claudine Barretto. The fact that I despised Diether Ocampo proves he was effective at his role. I have never been this touched, moved and affected by a local movie before. Imagine a cynic like me dabbing my eyes at the end of the movie? Congratulations to Star Cinema!! Way to go, Jericho and Claudine!!"',
 1)

In [45]:
def get_minibatch(doc_stream, size):
    docs, y = [], []
    try:
        for _ in range(size):
            text, label = next(doc_stream)
            docs.append(text)
            y.append(label)
    except:
        return None, None
    return docs, y

In [46]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier

In [48]:
vect = HashingVectorizer(decode_error = "ignore",
                         n_features = 2**21,
                         preprocessor = None,
                         tokenizer = tokenizer)
clf = SGDClassifier(loss = "log", random_state = 1, max_iter = 1)
doc_stream = stream_docs(path = "movie_data.csv")

In [49]:
import pyprind

In [50]:
pbar = pyprind.ProgBar(45)
classes = np.array([0,1])
for _ in range(45):
    X_train, y_train = get_minibatch(doc_stream, size = 1000)
    if not X_train:
        break
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes = classes)
    pbar.update()

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:01:33


In [51]:
X_test, y_test = get_minibatch(doc_stream, size = 5000)
X_test = vect.transform(X_test)
print("Accuracy: %.3f" %clf.score(X_test, y_test))

Accuracy: 0.868


In [52]:
count = CountVectorizer(stop_words = "english",
                        max_df = .1,
                        max_features = 5000)
X = count.fit_transform(df["review"].values)

In [53]:
from sklearn.decomposition import LatentDirichletAllocation

In [55]:
lda = LatentDirichletAllocation(n_components = 10,
                                random_state = 123,
                                learning_method = "batch")
X_topics = lda.fit_transform(X)

In [57]:
lda.components_.shape

(10, 5000)

In [58]:
n_top_words = 5
feature_names = count.get_feature_names()
for topic_idx, topic in enumerate(lda.components_):
    print("Topic %d:" % (topic_idx + 1))
    print(" ".join([feature_names[i] for i in topic.argsort() [:-n_top_words-1: -1]]))

Topic 1:
comedy original horror black version
Topic 2:
worst minutes script guy money
Topic 3:
book dvd read version watched
Topic 4:
family performance father beautiful mother
Topic 5:
series episode tv kids episodes
Topic 6:
murder police wife woman john
Topic 7:
documentary camera audience human sense
Topic 8:
music song songs musical dance
Topic 9:
horror effects budget gore special
Topic 10:
action war game fight hero


In [59]:
horror = X_topics[:, 5].argsort()[::-1]
for iter_idx, movie_idx in enumerate(horror[:3]):
    print("\nHorror movie #%d:" % (iter_idx + 1))
    print(df["review"][movie_idx][:300], "...")


Horror movie #1:
 spoilers when undercover brooklyn north det eddie santos nestor serrano was to meet his drug supplier tito zapatti larry romano in the williamsburg section of brooklyn in a buy and bust operation with tito being the one who gets busted that things went haywire with both det santos and tito ending u ...

Horror movie #2:
 spoilers extremely brutal police drama set in san francisco involving a sting operation that goes terribly wrong a cop det falon sam elliott mistakenly and savagely beats to death an undercover policeman winch mike watson thinking that he murdered his partner det sam levinson mike burstyn a partner ...

Horror movie #3:
this first rate western tale of the gold rush brings great excitement romance and james stewart to the screen the far country is the only one out of all five stewart mann westerns that is often overlooked stewart yet again puts a new look on the ever present personalities he had in the five stewart  ...


In [60]:
import pickle

In [61]:
dest = os.path.join("movieclassifier", "pkl_objects")
pickle.dump(stop, open(os.path.join(dest, "stopwords.pkl"), "wb"), protocol = 4)
pickle.dump(clf, open(os.path.join(dest, "classifier.pkl"), "wb"), protocol = 4)