# Preparing the IMDb movie review data for text processing

In [3]:
import tarfile
with tarfile.open('aclImdb_v1.tar.gz', 'r:gz') as tar: tar.extractall()

In [9]:
import pyprind
import pandas as pd
import os
basepath = 'aclImdb' # change the `basepath` to the directory of the unzipped movie dataset
labels = {'pos': 1, 'neg': 0}
pbar = pyprind.ProgBar(50000)
df = pd.DataFrame()

In [10]:
for s in ('test', 'train'):
    for l in ('pos', 'neg'):
        path = os.path.join(basepath, s, l)
        for file in os.listdir(path):
            with open(os.path.join(path, file), 'r', encoding='utf-8') as infile:
                txt = infile.read()
            df = df.append([[txt, labels[l]]], ignore_index=True)
            pbar.update()                
df.columns = ['review', 'sentiment']

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:07:36


In [11]:
df.head()

Unnamed: 0,review,sentiment
0,I went and saw this movie last night after bei...,1
1,Actor turned director Bill Paxton follows up h...,1
2,As a recreational golfer with some knowledge o...,1
3,"I saw this film in a sneak preview, and it is ...",1
4,Bill Paxton has taken the true story of the 19...,1


In [12]:
import numpy as np
np.random.seed(0)
df = df.reindex(np.random.permutation(df.index))
df.to_csv('movie_data.csv', index=False, encoding='utf-8')

In [73]:
df = pd.read_csv('movie_data.csv', encoding='utf-8')
df.head(3)

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0


In [75]:
df.review[0]

'In 1974, the teenager Martha Moxley (Maggie Grace) moves to the high-class area of Belle Haven, Greenwich, Connecticut. On the Mischief Night, eve of Halloween, she was murdered in the backyard of her house and her murder remained unsolved. Twenty-two years later, the writer Mark Fuhrman (Christopher Meloni), who is a former LA detective that has fallen in disgrace for perjury in O.J. Simpson trial and moved to Idaho, decides to investigate the case with his partner Stephen Weeks (Andrew Mitchell) with the purpose of writing a book. The locals squirm and do not welcome them, but with the support of the retired detective Steve Carroll (Robert Forster) that was in charge of the investigation in the 70\'s, they discover the criminal and a net of power and money to cover the murder.<br /><br />"Murder in Greenwich" is a good TV movie, with the true story of a murder of a fifteen years old girl that was committed by a wealthy teenager whose mother was a Kennedy. The powerful and rich famil

In [37]:
# Transforming words into feature vectors
from sklearn.feature_extraction.text import CountVectorizer
count = CountVectorizer()
docs = np.array(['The sun is shining','The weather is sweet','The sun is shining and the weather is sweet'])
bag = count.fit_transform(docs)
print(count.vocabulary_)
print(bag.toarray())

{'the': 5, 'sun': 3, 'is': 1, 'shining': 2, 'weather': 6, 'sweet': 4, 'and': 0}
[[0 1 1 1 0 1 0]
 [0 1 0 0 1 1 1]
 [1 2 1 1 1 2 1]]


In [44]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
cunt = TfidfVectorizer()
bag = count.fit_transform(docs)
tfidf = TfidfTransformer(use_idf=True, norm='l2', smooth_idf=True)
np.set_printoptions(precision=2)
#ag = count.fit_transform(docs)
print(tfidf.fit_transform(bag).toarray())

[[0.   0.37 0.61 0.61 0.   0.37 0.  ]
 [0.   0.37 0.   0.   0.61 0.37 0.61]
 [0.55 0.38 0.32 0.32 0.32 0.38 0.32]]


In [19]:
# Cleaning text data
df.loc[0, 'review'][-50:]

'is seven.<br /><br />Title (Brazil): Not Available'

In [45]:
import re
def preprocessor(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)',text)
    text = (re.sub('[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', ''))
    return text

In [46]:
preprocessor(df.loc[0, 'review'][-50:])

'zation my vote is seven title brazil not available'

In [47]:
preprocessor("</a>This :) is :( a test :-)!")

'this is a test :) :( :)'

In [83]:
df['review'] = df['review'].apply(preprocessor)
df.review[0]

'in 1974 the teenager martha moxley maggie grace moves to the high class area of belle haven greenwich connecticut on the mischief night eve of halloween she was murdered in the backyard of her house and her murder remained unsolved twenty two years later the writer mark fuhrman christopher meloni who is a former la detective that has fallen in disgrace for perjury in o j simpson trial and moved to idaho decides to investigate the case with his partner stephen weeks andrew mitchell with the purpose of writing a book the locals squirm and do not welcome them but with the support of the retired detective steve carroll robert forster that was in charge of the investigation in the 70 s they discover the criminal and a net of power and money to cover the murder murder in greenwich is a good tv movie with the true story of a murder of a fifteen years old girl that was committed by a wealthy teenager whose mother was a kennedy the powerful and rich family used their influence to cover the mur

# Processing documents into tokens

In [100]:
import nltk
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     C:\Users\Nida\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\abc.zip.
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     C:\Users\Nida\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\alpino.zip.
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     C:\Users\Nida\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\biocreative_ppi.zip.
[nltk_data]    | Downloading package brown to
[nltk_data]    |     C:\Users\Nida\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\brown.zip.
[nltk_data]    | Downloading package brown_tei to
[nltk_data]    |     C:\Users\Nida\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\brown_tei.zip.
[nltk_data]    | Downloading package cess_cat to
[nltk_data]    |     C:\Users\Nida\AppData\Roaming\nltk_data...
[nltk_data] 

[nltk_data]    |   Unzipping corpora\pros_cons.zip.
[nltk_data]    | Downloading package qc to
[nltk_data]    |     C:\Users\Nida\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\qc.zip.
[nltk_data]    | Downloading package reuters to
[nltk_data]    |     C:\Users\Nida\AppData\Roaming\nltk_data...
[nltk_data]    | Downloading package rte to
[nltk_data]    |     C:\Users\Nida\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\rte.zip.
[nltk_data]    | Downloading package semcor to
[nltk_data]    |     C:\Users\Nida\AppData\Roaming\nltk_data...
[nltk_data]    | Downloading package senseval to
[nltk_data]    |     C:\Users\Nida\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\senseval.zip.
[nltk_data]    | Downloading package sentiwordnet to
[nltk_data]    |     C:\Users\Nida\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\sentiwordnet.zip.
[nltk_data]    | Downloading package sentence_polarity to
[nltk_data]    |     C:\U

[nltk_data]    |   Unzipping corpora\nonbreaking_prefixes.zip.
[nltk_data]    | Downloading package vader_lexicon to
[nltk_data]    |     C:\Users\Nida\AppData\Roaming\nltk_data...
[nltk_data]    | Downloading package porter_test to
[nltk_data]    |     C:\Users\Nida\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping stemmers\porter_test.zip.
[nltk_data]    | Downloading package wmt15_eval to
[nltk_data]    |     C:\Users\Nida\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping models\wmt15_eval.zip.
[nltk_data]    | Downloading package mwa_ppdb to
[nltk_data]    |     C:\Users\Nida\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping misc\mwa_ppdb.zip.
[nltk_data]    | 
[nltk_data]  Done downloading collection all


True

In [156]:
from nltk.stem.porter import PorterStemmer

def tokenizer(text): 
    return text.split()
# tokenizer('runners like running and thus they run')

def tokenizer_porter(text):
    porter = PorterStemmer()
    return [porter.stem(word) for word in text.split()]

tokenizer_porter('runners like running and thus they run')
# tokenizer_porter(df.values['review'])

['runner', 'like', 'run', 'and', 'thu', 'they', 'run']

In [177]:
#df=df.drop("tokenized_sentences", axis=1)
df.head()

Unnamed: 0,review,sentiment
0,in 1974 the teenager martha moxley maggie grac...,1
1,ok so i really like kris kristofferson and his...,0
2,spoiler do not read this if you think about w...,0
3,hi for all the people who have seen this wonde...,1
4,i recently bought the dvd forgetting just how ...,0


In [178]:
from nltk.tokenize import WhitespaceTokenizer, TreebankWordTokenizer, WordPunctTokenizer 
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
stop = stopwords.words('english')

df['token'] = df['review'].apply(word_tokenize)

In [181]:
tokenizer=WhitespaceTokenizer()
df['token'] = df['review'].apply(tokenizer.tokenize)

In [195]:
tokenizer=TreebankWordTokenizer()
df['token'] = df['review'].apply(tokenizer.tokenize)

In [197]:
tokenizer=PorterStemmer()
df['token'] = df['review'].apply(tokenizer.stem)

In [198]:
tokenizer=WordPunctTokenizer()
df['token'] = df['review'].apply(tokenizer.tokenize)

In [208]:
#df['token'].astype(str)
#df.dtypes
df.head()

Unnamed: 0,review,sentiment,token
0,in 1974 the teenager martha moxley maggie grac...,1,"[in, 1974, the, teenager, martha, moxley, magg..."
1,ok so i really like kris kristofferson and his...,0,"[ok, so, i, really, like, kris, kristofferson,..."
2,spoiler do not read this if you think about w...,0,"[spoiler, do, not, read, this, if, you, think,..."
3,hi for all the people who have seen this wonde...,1,"[hi, for, all, the, people, who, have, seen, t..."
4,i recently bought the dvd forgetting just how ...,0,"[i, recently, bought, the, dvd, forgetting, ju..."


# Training a logistic regression model for document classification

In [55]:
X_train = df.loc[:25000, 'review'].values
y_train = df.loc[:25000, 'sentiment'].values
X_test = df.loc[25000:, 'review'].values
y_test = df.loc[25000:, 'sentiment'].values

In [64]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(strip_accents=None, lowercase=False, preprocessor=None)
param_grid = [{'vect__ngram_range': [(1,1)], 
               'vect__stop_words': [stop, None],
               'vect__tokenizer': [tokenizer, tokenizer_porter], 
               'clf__penalty': ['l1', 'l2'], 
               'clf__C': [1.0, 10.0, 100.0]},
              {'vect__ngram_range': [(1,1)], 
               'vect__stop_words': [stop, None], 
               'vect__tokenizer': [tokenizer, tokenizer_porter],
               'vect__use_idf':[False], 
               'vect__norm':[None], 
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]}
             ]
lr_tfidf = Pipeline([('vect', tfidf), ('clf', LogisticRegression(random_state=0, solver='liblinear'))])

In [65]:
gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid, scoring='accuracy', cv=5, verbose=1, n_jobs=1)
gs_lr_tfidf.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))


KeyboardInterrupt: 

In [None]:
print('Best parameter set: %s ' % gs_lr_tfidf.best_params_)
#Best parameter set: {'clf__C': 10.0, 'vect__stop_words': None,
#'clf__penalty': 'l2', 'vect__tokenizer': <function tokenizer at
#0x7f6c704948c8>, 'vect__ngram_range': (1, 1)}

In [None]:
print('CV Accuracy: %.3f' % gs_lr_tfidf.best_score_)
clf = gs_lr_tfidf.best_estimator_
print('Test Accuracy: %.3f' % clf.score(X_test, y_test))

# Working with bigger data – online algorithms and out-of-core learning

In [None]:
import numpy as np
import re
from nltk.corpus import stopwords
stop = stopwords.words('english')
def tokenizer(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)',
    text.lower())
    text = re.sub('[\W]+', ' ', text.lower()) \
    + ' '.join(emoticons).replace('-', '')
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized

In [None]:
def stream_docs(path):
    with open(path, 'r', encoding='utf-8') as csv:
    next(csv) # skip header
    for line in csv:
    text, label = line[:-3], int(line[-2])
    yield text, label

In [None]:
next(stream_docs(path='movie_data.csv'))
#('"In 1974, the teenager Martha Moxley ... ',1)

In [None]:
def get_minibatch(doc_stream, size):
    docs, y = [], []
    try:
    for _ in range(size):
    text, label = next(doc_stream)
    docs.append(text)
    y.append(label)
    except StopIteration:
    return None, None
    return docs, y

In [None]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier
vect = HashingVectorizer(decode_error='ignore',
    n_features=2**21,
    preprocessor=None,
    tokenizer=tokenizer)
clf = SGDClassifier(loss='log', random_state=1, n_iter=1)
doc_stream = stream_docs(path='movie_data.csv')

In [None]:
import pyprind
pbar = pyprind.ProgBar(45)
classes = np.array([0, 1])
for _ in range(45):
    X_train, y_train = get_minibatch(doc_stream, size=1000)
    if not X_train:
    break
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes=classes)
    pbar.update()

In [None]:
X_test, y_test = get_minibatch(doc_stream, size=5000)
X_test = vect.transform(X_test)
print('Accuracy: %.3f' % clf.score(X_test, y_test))

In [None]:
clf = clf.partial_fit(X_test, y_test)

# LDA with scikit-learn

In [None]:
import pandas as pd
df = pd.read_csv('movie_data.csv', encoding='utf-8')

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
count = CountVectorizer(stop_words='english',
    max_df=.1,
    max_features=5000)
X = count.fit_transform(df['review'].values)

In [None]:
from sklearn.decomposition import LatentDirichletAllocation
lda = LatentDirichletAllocation(n_topics=10,
    random_state=123,
    learning_method='batch')
X_topics = lda.fit_transform(X)

In [None]:
lda.components_.shape

In [None]:
n_top_words = 5
feature_names = count.get_feature_names()
for topic_idx, topic in enumerate(lda.components_):
    print("Topic %d:" % (topic_idx + 1))
    print(" ".join([feature_names[i]
    for i in topic.argsort()\
    [:-n_top_words - 1:-1]]))

In [None]:
horror = X_topics[:, 5].argsort()[::-1]
for iter_idx, movie_idx in enumerate(horror[:3]):
    print('\nHorror movie #%d:' % (iter_idx + 1))
    print(df['review'][movie_idx][:300], '...')
'''
Horror movie #1:
House of Dracula works from the same basic premise as House of
Frankenstein from the year before; namely that Universal's three most
famous monsters; Dracula, Frankenstein's Monster and The Wolf Man are
appearing in the movie together. Naturally, the film is rather messy
therefore, but the fact that ...
Horror movie #2:
Okay, what the hell kind of TRASH have I been watching now? "The
Witches' Mountain" has got to be one of the most incoherent and insane
Spanish exploitation flicks ever and yet, at the same time, it's also
strangely compelling. There's absolutely nothing that makes sense here
and I even doubt there ...
Horror movie #3:
<br /><br />Horror movie time, Japanese style. Uzumaki/Spiral was a
total freakfest from start to finish. A fun freakfest at that, but at
times it was a tad too reliant on kitsch rather than the horror. The
story is difficult to summarize succinctly: a carefree, normal teenage
girl starts coming fac ...
'''