In [None]:
#PREPARING DATA

In [21]:
import pyprind
import pandas as pd
import os
import sys
# change the 'basepath' to the directory of the
# unzipped movie dataset
basepath = 'aclImdb'

# Map each sentiment subdirectory name to its integer class label.
labels = {'pos': 1, 'neg': 0}

# Progress bar with 50,000 iterations, which is the number of
# documents we are going to read in.
pbar = pyprind.ProgBar(50000, stream=sys.stdout)

# Iterate over the test and train subdirectories of the main dataset
# directory and read every text file in their pos and neg subdirectories.
# The rows are collected in a plain list and the DataFrame is built once at
# the end: appending to a DataFrame inside the loop (via the private
# `_append` method) copies the whole frame on every iteration, which is
# quadratic in the number of files.
records = []
for split in ('test', 'train'):
    for label_name in ('pos', 'neg'):
        path = os.path.join(basepath, split, label_name)
        # sorted() makes the read order independent of the OS listing order.
        for filename in sorted(os.listdir(path)):
            with open(os.path.join(path, filename),
                      'r', encoding='utf-8') as infile:
                records.append([infile.read(), labels[label_name]])
            pbar.update()

df = pd.DataFrame(records, columns=['review', 'sentiment'])

In [22]:
import numpy as np

# Seed NumPy's global random generator so the shuffle is reproducible.
np.random.seed(0)

# The reviews were read directory by directory, so the rows are sorted;
# shuffle them by reindexing with a random permutation of the index.
shuffled_index = np.random.permutation(df.index)
df = df.reindex(shuffled_index)

# Store the shuffled movie review dataset as a CSV file (without the index),
# then load it back from that file.
df.to_csv('movie_data.csv', index=False, encoding='utf-8')
df = pd.read_csv('movie_data.csv', encoding='utf-8')

# the following column renaming is necessary on some computers:
column_names = {"0": "review", "1": "sentiment"}
df = df.rename(columns=column_names)
df.head(3)


Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0


In [23]:
df.shape  # sanity check: expect 50000 rows (one per review file read) and 2 columns

(50000, 2)

In [24]:
#BAG OF WORDS MODEL

In [25]:
# Build a bag-of-words model from the word counts of each document using
# scikit-learn's CountVectorizer, demonstrated on a small sample dataset.
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

# Three sample documents. The third one is written as a single literal; note
# that there is no space between "sweet," and "and".
sample_sentences = [
    'The sun is shining',
    'The weather is sweet',
    'The sun is shining, the weather is sweet,and one and one is two',
]
docs = np.array(sample_sentences)

count = CountVectorizer()
# Learn the vocabulary from the documents and count the words per document.
bag = count.fit_transform(docs)



In [26]:
# Show the vocabulary of the bag-of-words model: each word mapped to an integer index
print(count.vocabulary_)

{'the': 6, 'sun': 4, 'is': 1, 'shining': 3, 'weather': 8, 'sweet': 5, 'and': 0, 'one': 2, 'two': 7}


In [27]:
# Each row is one sentence (document) and each column one vocabulary word;
# an entry is the number of times that word occurs in that sentence.
print(bag.toarray())

[[0 1 0 1 1 0 1 0 0]
 [0 1 0 0 0 1 1 0 1]
 [2 3 2 1 1 1 2 1 1]]


In [28]:
# The model above is a 1-gram (unigram) model: the text is split into individual words.
# A 2-gram (bigram) model would instead split the text into pairs of consecutive words.



In [29]:
# TfidfTransformer takes the raw term frequencies produced by CountVectorizer
# as input and transforms them into tf-idfs.

from sklearn.feature_extraction.text import TfidfTransformer

# With smooth_idf=True scikit-learn computes
#   idf(t) = ln((1 + n_docs) / (1 + df(t))) + 1
# so for a term that occurs in every document the log term is ln(1) = 0 and
# the idf is 1: such terms are down-weighted relative to rarer terms, but not
# removed. norm='l2' applies L2 normalization to each document row.
tfidf = TfidfTransformer(smooth_idf=True,
                         norm='l2',
                         use_idf=True)
np.set_printoptions(precision=2)

term_counts = count.fit_transform(docs)
tfidf_weights = tfidf.fit_transform(term_counts)
print(tfidf_weights.toarray())


[[0.   0.43 0.   0.56 0.56 0.   0.43 0.   0.  ]
 [0.   0.43 0.   0.   0.   0.56 0.43 0.   0.56]
 [0.5  0.45 0.5  0.19 0.19 0.19 0.3  0.25 0.19]]


In [30]:
#as seen above, 'is' (the 2nd column) has the highest raw count (3) in the 3rd document,
#yet its tf-idf there is only 0.45, lower than that of the less frequent words 'and' and 'one' (0.5),
#because 'is' also occurs in the first and second documents and is thus unlikely to contain any useful discriminatory information


In [31]:
#CLEANING TEXT DATA OF MOVIE REVIEW

#stripping unwanted characters

#removing punctuation except for emoticon characters such as :)

import re
def preprocessor(text):
    """Clean one raw review string.

    Steps:
      1. Remove HTML markup (anything between '<' and '>').
      2. Collect emoticons such as ':)', ';-(' or '=D' before they are lost.
      3. Lowercase the text and replace every run of non-word characters
         with a single space.
      4. Append the collected emoticons, separated by spaces and with the
         '-' nose removed, directly to the end of the cleaned text.

    Parameters
    ----------
    text : str
        The raw review text.

    Returns
    -------
    str
        The cleaned text followed by the emoticons found in it.

    Notes
    -----
    No separator is inserted between the cleaned text and the first
    emoticon, so when the text ends in a word character the emoticon is
    glued to it, e.g. 'good :) movie' becomes 'good movie:)'.
    """
    # All patterns are raw strings: '\)', '\(' and '\W' are not valid Python
    # string escapes, so the non-raw literals emit an invalid-escape-sequence
    # warning on recent Python versions. The pattern values are unchanged.
    text = re.sub(r'<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = (re.sub(r'[\W]+', ' ', text.lower()) +
            ' '.join(emoticons).replace('-', ''))
    return text

# Confirm that the preprocessor works properly on the last 50 characters of the first review
preprocessor(df.loc[0, 'review'][-50:])


  emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', #finding emoticon
  text = (re.sub('[\W]+', ' ', text.lower()) +  #removed all non-word characters from the text via the regex [\W]+ and converted the text into lowercase characters


'is seven title brazil not available'

In [32]:
# Second check on a synthetic string: the HTML tag is removed and the three
# emoticons are moved to the end of the text, with the '-' dropped from ':-)'.
preprocessor("</a>This :) is :( a test :-)!")

'this is a test :) :( :)'

In [33]:
import numpy as np

In [34]:
# Apply the preprocessor to every review in the movie review dataset.
# Note: this overwrites the 'review' column, so the raw text is no longer in `df` afterwards.

df['review'] = df['review'].apply(preprocessor)

In [35]:
#processing documents into tokens using stemming by importing NLTK(natural language toolkit)

In [36]:
pip install nltk




In [37]:
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()


def tokenizer_porter(text):
    """Split `text` on whitespace and return the list of Porter-stemmed words."""
    return list(map(porter.stem, text.split()))


tokenizer_porter('runners like running and thus they run')

['runner', 'like', 'run', 'and', 'thu', 'they', 'run']

In [38]:
#as seen above, the tokenizer reduces words to their root form, e.g. 'running' is converted to 'run' (stems need not be real words: 'thus' becomes 'thu')

In [39]:
# Removing stop words (e.g. is, and, has)

import nltk
from nltk.corpus import stopwords

# Fetch the stop-word lists, then take the English one.
nltk.download('stopwords')
stop = stopwords.words('english')

# Stem a sample sentence and keep only the stems that are not stop words.
stemmed_words = tokenizer_porter('a runner likes running and runs a lot')
list(filter(lambda stem: stem not in stop, stemmed_words))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lssb2\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['runner', 'like', 'run', 'run', 'lot']

In [40]:
# First, we divide the DataFrame of cleaned text documents into 25,000
# documents for training and 25,000 documents for testing.
#
# The split is positional (.iloc, end-exclusive). Label-based .loc slices
# include BOTH endpoints, so df.loc[:25000] would select 25,001 rows and
# row 25000 would end up in the training set and the test set at once.
n_train = 25000

X_train = df.iloc[:n_train]['review'].values
y_train = df.iloc[:n_train]['sentiment'].values
X_test = df.iloc[n_train:]['review'].values
y_test = df.iloc[n_train:]['sentiment'].values

In [41]:
# Next, we use a GridSearchCV object to find the optimal set of parameters
# for our logistic regression model using 5-fold stratified cross-validation.

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

# The reviews were already cleaned and lowercased by `preprocessor`, so the
# vectorizer's own lowercasing / accent stripping / preprocessing is disabled.
tfidf = TfidfVectorizer(preprocessor=None,
                        lowercase=False,
                        strip_accents=None)

# Vectorizer settings shared by both sub-grids: unigrams, whitespace tokenizer.
shared_vect_params = {
    'vect__ngram_range': [(1, 1)],
    'vect__tokenizer': [str.split],
}

# Sub-grid 1: vectorizer defaults for idf/normalization, no stop-word
# removal, L2 penalty.
tfidf_grid = dict(shared_vect_params)
tfidf_grid.update({
    'vect__stop_words': [None],
    'clf__penalty': ['l2'],
    'clf__C': [1.0, 10.0],
})

# Sub-grid 2: idf and normalization switched off, with and without stop-word
# removal, L2 and L1 penalties.
raw_tf_grid = dict(shared_vect_params)
raw_tf_grid.update({
    'vect__stop_words': [stop, None],
    'vect__use_idf': [False],
    'vect__norm': [None],
    'clf__penalty': ['l2', 'l1'],
    'clf__C': [1.0, 10.0],
})

small_param_grid = [tfidf_grid, raw_tf_grid]

# liblinear handles both the 'l2' and the 'l1' penalty searched over above.
lr_tfidf = Pipeline([
    ('vect', tfidf),
    ('clf', LogisticRegression(solver='liblinear')),
])

gs_lr_tfidf = GridSearchCV(
    lr_tfidf,
    small_param_grid,
    scoring='accuracy',
    cv=5,
    verbose=2,
    n_jobs=-1,
)
gs_lr_tfidf.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits




In [42]:
# from above, we replaced CountVectorizer and TfidfTransformer from the previous 
# subsection with TfidfVectorizer, which combines CountVectorizer with the TfidfTransformer

In [43]:
# Print the best parameter set found by the grid search
print(f'Best parameter set: {gs_lr_tfidf.best_params_}')

Best parameter set: {'clf__C': 10.0, 'clf__penalty': 'l2', 'vect__ngram_range': (1, 1), 'vect__stop_words': None, 'vect__tokenizer': <method 'split' of 'str' objects>}


In [44]:
# Report the average 5-fold cross-validation accuracy on the training dataset,
# then the classification accuracy of the best estimator on the test dataset.
cv_accuracy = gs_lr_tfidf.best_score_
print(f'CV Accuracy: {cv_accuracy:.3f}')

clf = gs_lr_tfidf.best_estimator_
test_accuracy = clf.score(X_test, y_test)
print(f'Test Accuracy: {test_accuracy:.3f}')


CV Accuracy: 0.897
Test Accuracy: 0.899


In [45]:
# Spot check on a one-word review; label 0 is 'neg' and label 1 is 'pos'
gs_lr_tfidf.predict(["bad"])

array([0], dtype=int64)

In [46]:
import pickle

In [47]:
# Persist the fitted grid search to disk.
data = {"model": gs_lr_tfidf}
with open('saved_steps.pkl', 'wb') as outfile:
    pickle.dump(data, outfile)

# Read the model back from the file so that `lr_loaded` really is the
# deserialized object (and the save/load round trip is exercised), instead
# of an alias of the in-memory model.
# NOTE: only unpickle files you trust; this one was written just above.
with open('saved_steps.pkl', 'rb') as infile:
    loaded_data = pickle.load(infile)

lr_loaded = loaded_data["model"]


In [48]:
# Run the same one-word spot check through `lr_loaded`
y_pred = lr_loaded.predict(["bad"])
y_pred

array([0], dtype=int64)