In [19]:
import pyprind
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import nltk
import os
import sys
import re

In [5]:
np.random.seed(0)

base_path = 'data/aclImdb'

labels = {'pos': 1, 'neg': 0}

pbar = pyprind.ProgBar(50000, stream=sys.stdout)

df = pd.DataFrame()


for s in ('test', 'train'):
    for l in ('pos', 'neg'):
        path = os.path.join(base_path, s, l)
        for file in sorted(os.listdir(path)):
            with open(os.path.join(path, file), 'r', encoding='utf-8') as infile:
                txt = infile.read()
            df = df.append([[txt, labels[l]]], ignore_index=True)
            
            pbar.update()
            
df.columns = ['review', 'sentiment']

df = df.reindex(np.random.permutation(df.index))

df.to_csv('data/movie_data.csv', index=False, encoding='utf-8')

  df = df.append([[txt, labels[l]]], ignore_index=True)


In [6]:
df.head()

Unnamed: 0,review,sentiment
11841,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
19602,OK... so... I really like Kris Kristofferson a...,0
45519,"***SPOILER*** Do not read this, if you think a...",0
25747,hi for all the people who have seen this wonde...,1
42642,"I recently bought the DVD, forgetting just how...",0


## Bag-of-Words Model


The idea behind bag-of-words is quite simple and can be summarized as follows:

- We create a vocabulary of unique tokens—for example, words—from the entire set of documents.
- We construct a feature vector from each document that contains the counts of how often each
word occurs in the particular document.



In [7]:
# Uni-gram model
count = CountVectorizer()

# Bi-gram model
# count = CountVectorizer(ngram_range=(2,2))

docs = np.array(['The sun is shining',
                 'The weather is sweet',
                 'The sun is shining, the weather is sweet and one and one is two'
                 ])


bag = count.fit_transform(docs)

# Lets print the content of the vocabulary
print(count.vocabulary_)

{'the': 6, 'sun': 4, 'is': 1, 'shining': 3, 'weather': 8, 'sweet': 5, 'and': 0, 'one': 2, 'two': 7}


## Assessing word relevancy via term frequency-inverse document frequency

When we are analyzing text data, we often encounter words that occur across multiple documents
from both classes. These frequently occurring words typically don’t contain useful or discriminatory
information.

In [8]:
tf_idf = TfidfTransformer(use_idf=True,
                          norm='l2',
                          smooth_idf=True
                          )


np.set_printoptions(precision=2)


# perform tf_idf on count vectorizer
print(tf_idf.fit_transform(count.fit_transform(docs)).toarray())

[[0.   0.43 0.   0.56 0.56 0.   0.43 0.   0.  ]
 [0.   0.43 0.   0.   0.   0.56 0.43 0.   0.56]
 [0.5  0.45 0.5  0.19 0.19 0.19 0.3  0.25 0.19]]


> The word 'is' had the largest term frequency in the third
document, being the most frequently occurring word. However, after transforming the same feature
vector into tf-idfs, the word 'is' is now associated with a relatively small tf-idf (0.45) in the third
document, since it is also present in the first and second document and thus is unlikely to contain
any useful discriminatory information.

## Cleaning Text Data

The first important step—before we build our bag-of-words model—is to clean the text data by stripping it of all unwanted characters.

In [9]:
def preprocessor(text):
    # Remove the html mark-up
    text = re.sub('<[^>]*>', '', text)
    # Remove all emoticons
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = (re.sub('[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', ''))
    return text



# Applying the pre-processor
df['review'] = df['review'].apply(preprocessor)

## Processing documents into tokens

After successfully preparing the movie review dataset, we now need to think about how to split the
text corpora into individual elements. One way to tokenize documents is to split them into individual
words by splitting the cleaned documents at their whitespace characters:

In [10]:
def tokenizer(text):
    return text.split()

In the context of tokenization, another useful technique is word stemming, which is the process of
transforming a word into its root form. It allows us to map related words to the same stem.

In [11]:
porter = PorterStemmer()


def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]




> While stemming can create non-real words, such as 'thu' (from 'thus'), as shown in
the previous example, a technique called lemmatization aims to obtain the canonical
(grammatically correct) forms of individual words—the so-called lemmas. However, lemmatization
is computationally more difficult and expensive compared to stemming and,
in practice, it has been observed that stemming and lemmatization have little impact on
the performance of text classification

In [12]:
# Removing stopwords
nltk.download('stopwords')

stop = stopwords.words('english')

[word for word in tokenizer_porter('a runner likes running running and runs a lot') if word not in stop]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\INNO\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['runner', 'like', 'run', 'run', 'run', 'lot']

## Training a Logistic Regression Model for Document Classification

In [13]:
X_train = df.loc[:25000, 'review'].values
y_train = df.loc[:25000, 'sentiment'].values

X_test = df.loc[25000: , 'review'].values
y_test = df.loc[25000: , 'sentiment'].values


# Use the Grid  search CV to find the optimal set of parameters for logistic regression
tf_idf = TfidfVectorizer(strip_accents=None,
                         lowercase=False,
                         preprocessor=None
                         )


small_param_grid = [
                    {
                            'vect__ngram_range': [(1, 1)],
                            'vect__stop_words': [None],
                            'vect__tokenizer': [tokenizer, tokenizer_porter],
                            'clf__penalty': ['l2'],
                            'clf__C': [1.0, 10.0]
                    },
                    {
                            'vect__ngram_range': [(1, 1)],
                            'vect__stop_words': [stop, None],
                            'vect__tokenizer': [tokenizer],
                            'vect__use_idf':[False],
                            'vect__norm':[None],
                            'clf__penalty': ['l2'],
                            'clf__C': [1.0, 10.0]
                    }
                   ]


lr_tfidf = Pipeline([
                ('vect', tf_idf),
                ('clf', LogisticRegression(solver='liblinear'))
                   ])


gs_lr_tf_idf = GridSearchCV(estimator=lr_tfidf, param_grid=small_param_grid,
                            scoring='accuracy', cv=5, verbose=2, n_jobs=-1
                            )

gs_lr_tf_idf.fit(X_train, y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


> Note that for the logistic regression classifier, we are using the LIBLINEAR solver as it can perform
better than the default choice ('lbfgs') for relatively large datasets.

In [14]:
print(f'Best parameter set: {gs_lr_tf_idf.best_params_}')

Best parameter set: {'clf__C': 10.0, 'clf__penalty': 'l2', 'vect__ngram_range': (1, 1), 'vect__stop_words': None, 'vect__tokenizer': <function tokenizer at 0x0000021BE3245430>}


> As you can see in the preceding output, we obtained the best grid search results using the regular
tokenizer without Porter stemming, no stop word library, and tf-idfs in combination with a logistic
regression classifier that uses L2-regularization with the regularization strength C of 10.0.


In [15]:
print(f'CV Training Accuracy: {gs_lr_tf_idf.best_score_:.3f}')

CV Training Accuracy: 0.891


In [16]:
clf = gs_lr_tf_idf.best_estimator_

print(f'Test Accuracy: {clf.score(X_test, y_test):.3f}')

Test Accuracy: 0.897


## Topic modeling with latent Dirichlet allocation

Topic modeling describes the broad task of assigning topics to unlabeled text documents. For example,
a typical application is the categorization of documents in a large text corpus of newspaper articles.
In applications of topic modeling, we then aim to assign category labels to those articles, for example,
sports, finance, world news, politics, and local news. Thus, in the context of the broad categories of
machine learning, consider topic modeling as a clustering task, a subcategory of unsupervised learning.


> Please note that `Latent Dirichlet Allocation` is NOT `Linear Discriminant Analysis`

In [18]:
count = CountVectorizer(stop_words=stop,
                        max_df = 0.1,
                        max_features=5000
                        )

X = count.fit_transform(df['review'].values)

> Notice that we set the maximum document frequency of words to be considered to 10 percent (max_
df=.1) to exclude words that occur too frequently across documents. The rationale behind the removal
of frequently occurring words is that these might be common words appearing across all documents
that are, therefore, less likely to be associated with a specific topic category of a given document.
Also, we limited the number of words to be considered to the most frequently occurring 5,000 words
(max_features=5000), to limit the dimensionality of this dataset to improve the inference performed
by LDA. However, both max_df=.1 and max_features=5000 are hyperparameter values chosen arbitrarily,
and readers are encouraged to tune them while comparing the results.

In [20]:
lda = LatentDirichletAllocation(n_components=10,
                                random_state=123,
                                learning_method='batch'
                                )


X_topics = lda.fit_transform(X)

>By setting learning_method='batch', we let the lda estimator do its estimation based on all available
training data (the bag-of-words matrix) in one iteration, which is slower than the alternative 'online'
learning method, but can lead to more accurate results (setting learning_method='online' is analogous
to online or mini-batch learning).

In [21]:
n_top_words = 5

feature_names = count.get_feature_names_out()

for topic_idx, topic in enumerate(lda.components_):
    print(f'Topic {(topic_idx + 1)}:')
    print(' '.join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))
    

Topic 1: 
comedy jokes humor original comic
Topic 2: 
action john fight western role
Topic 3: 
horror effects gore game blood
Topic 4: 
performance music role excellent wonderful
Topic 5: 
series book tv episode dvd
Topic 6: 
sex feel kind believe maybe
Topic 7: 
cinema human documentary yet art
Topic 8: 
war police murder men american
Topic 9: 
worst guy money minutes terrible
Topic 10: 
family father mother girl wife


In [34]:
war_topic = X_topics[:7].argsort()[::-1]


for iter_idx, movie_idx in enumerate(war_topic[:2]):
    print(f'\War Topic #{(iter_idx + 1)}:')
    print(df['review'][movie_idx])
    print('\n')

\War Topic #1:
0    i went and saw this movie last night after bei...
4    bill paxton has taken the true story of the 19...
8    this movie is amazing because the fact that th...
2    as a recreational golfer with some knowledge o...
9     quitting may be as much about exiting a pre o...
6    maybe i m reading into this too much but i won...
1    actor turned director bill paxton follows up h...
5    i saw this film on september 1st 2005 in india...
7    i felt this film did have many good qualities ...
3    i saw this film in a sneak preview and it is d...
Name: review, dtype: object


\War Topic #2:
1    actor turned director bill paxton follows up h...
3    i saw this film in a sneak preview and it is d...
0    i went and saw this movie last night after bei...
6    maybe i m reading into this too much but i won...
8    this movie is amazing because the fact that th...
5    i saw this film on september 1st 2005 in india...
9     quitting may be as much about exiting a pre o...
2    