<a href="https://colab.research.google.com/github/michaelajao/topic-modelling-with-LDA/blob/master/topic_modelling_IMDB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import re
# for numerical analysis
import numpy as np 
# to store and process in a dataframe
import pandas as pd 
# to create word clouds
from wordcloud import WordCloud, STOPWORDS 

import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)
import nltk
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk import pos_tag

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [2]:
df = pd.read_csv("/content/drive/My Drive/crawler/IMDB Dataset.csv")
df['ID'] = [x for x in range(1, len(df.values)+1)]
df.head()

Unnamed: 0,review,sentiment,ID
0,One of the other reviewers has mentioned that ...,positive,1
1,A wonderful little production. <br /><br />The...,positive,2
2,I thought this was a wonderful way to spend ti...,positive,3
3,Basically there's a family where a little boy ...,negative,4
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,5


In [3]:
# Lemmatization is the process of grouping together the different inflected forms of a word so they can be analysed as a single item
lem = WordNetLemmatizer()

# this function loops through the words by properly with the use of a vocabulary and morphological analysis of words, normally aiming to remove inflectional endings only and to return the base or dictionary form of a word
def stop_lemmatize(doc):
    tokens = nltk.word_tokenize(doc)
    tmp = ""
    for w in tokens:
        if w not in stop:
            tmp += lem.lemmatize(w, get_wordnet_pos(w)) + " "
    return tmp

In [4]:
# function to tag first charater lemmatize accepts
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)
    

In [5]:
stop = stopwords.words('english')

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(stop_lemmatize(token))
    return result

In [6]:
processed_docs = df['review'].map(preprocess)
processed_docs[:10]

0    [reviewer , mention , watch , episode , hooked...
1    [wonderful , little , production , film , tech...
2    [thought , wonderful , spend , time , summer ,...
3    [basically , family , little , jake , think , ...
4    [petter , mattei , love , time , money , visua...
5    [probably , time , favorite , movie , story , ...
6    [sure , like , resurrection , date , seahunt ,...
7    [amaze , fresh , innovative , idea , air , yea...
8    [encourage , positive , comment , film , look ...
9    [like , original , wrench , laughter , like , ...
Name: review, dtype: object

In [7]:
dictionary = gensim.corpora.Dictionary(processed_docs)
count = 0
for k, v in dictionary.iteritems():
    print(k, v, dictionary[k])
    count += 1
    if count > 10:
        break

0  
1 accustom  accustom 
2 agenda  agenda 
3 agreement  agreement 
4 appeal  appeal 
5 aryan  aryan 
6 audience  audience 
7 away  away 
8 bitch  bitch 
9 brutality  brutality 
10 call  call 


In [8]:
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [9]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[4310]

[(39, 1),
 (88, 1),
 (113, 1),
 (150, 1),
 (184, 1),
 (227, 1),
 (441, 1),
 (634, 2),
 (642, 1),
 (777, 1),
 (1091, 1),
 (1355, 1),
 (1792, 4),
 (2616, 1),
 (3207, 1),
 (3436, 1),
 (4094, 1),
 (4754, 1),
 (5139, 1),
 (5726, 1),
 (5906, 4),
 (6266, 1)]

In [10]:
bow_doc_4310 = bow_corpus[4310]
for i in range(len(bow_doc_4310)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_4310[i][0], 
                                               dictionary[bow_doc_4310[i][0]], 
bow_doc_4310[i][1]))

Word 39 ("go ") appears 1 time.
Word 88 ("show ") appears 1 time.
Word 113 ("concern ") appears 1 time.
Word 150 ("time ") appears 1 time.
Word 184 ("love ") appears 1 time.
Word 227 ("instead ") appears 1 time.
Word 441 ("positive ") appears 1 time.
Word 634 ("relationship ") appears 2 time.
Word 642 ("stereotype ") appears 1 time.
Word 777 ("especially ") appears 1 time.
Word 1091 ("culture ") appears 1 time.
Word 1355 ("express ") appears 1 time.
Word 1792 ("american ") appears 4 time.
Word 2616 ("manner ") appears 1 time.
Word 3207 ("allows ") appears 1 time.
Word 3436 ("fictional ") appears 1 time.
Word 4094 ("community ") appears 1 time.
Word 4754 ("plague ") appears 1 time.
Word 5139 ("humorous ") appears 1 time.
Word 5726 ("outcome ") appears 1 time.
Word 5906 ("african ") appears 4 time.
Word 6266 ("undesirable ") appears 1 time.


In [11]:
from gensim import corpora, models
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]
from pprint import pprint
for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.04387515478473986),
 (1, 0.12152012766136024),
 (2, 0.10218726083118981),
 (3, 0.11813930243915195),
 (4, 0.06609045079801779),
 (5, 0.13461248540772194),
 (6, 0.04468545483405076),
 (7, 0.08423763300596263),
 (8, 0.10593650300462389),
 (9, 0.10606725353057242),
 (10, 0.04816121778078706),
 (11, 0.08889905318025004),
 (12, 0.06226718332872643),
 (13, 0.08154234559512293),
 (14, 0.11809759539084436),
 (15, 0.06174669799607301),
 (16, 0.04792185290435016),
 (17, 0.09730847898279013),
 (18, 0.11382201689103133),
 (19, 0.07948455629483547),
 (20, 0.09899780709388842),
 (21, 0.1282001411512331),
 (22, 0.04888268357002287),
 (23, 0.07576842559866823),
 (24, 0.11342248671392115),
 (25, 0.06603373744475663),
 (26, 0.10661056741866674),
 (27, 0.05964405706530004),
 (28, 0.05474953197144464),
 (29, 0.10327717744045185),
 (30, 0.046076324401779015),
 (31, 0.0378890969244496),
 (32, 0.11001346051640104),
 (33, 0.06096112126337572),
 (34, 0.1912882022424186),
 (35, 0.12348332542600231),
 (36

In [12]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)

In [13]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.008*"time " + 0.008*"story " + 0.007*"character " + 0.006*"like " + 0.006*"people " + 0.005*"world " + 0.005*"watch " + 0.005*"great " + 0.005*"life " + 0.004*"work "
Topic: 1 
Words: 0.006*"great " + 0.006*"horror " + 0.006*"time " + 0.006*"star " + 0.005*"scene " + 0.005*"best " + 0.005*"like " + 0.005*"musical " + 0.005*"music " + 0.004*"good "
Topic: 2 
Words: 0.012*"story " + 0.009*"life " + 0.009*"character " + 0.007*"love " + 0.006*"performance " + 0.005*"time " + 0.004*"young " + 0.004*"like " + 0.004*"woman " + 0.004*"best "
Topic: 3 
Words: 0.006*"like " + 0.005*"scene " + 0.005*"western " + 0.004*"great " + 0.004*"director " + 0.004*"work " + 0.003*"time " + 0.003*"character " + 0.003*"shot " + 0.003*"john "
Topic: 4 
Words: 0.013*"character " + 0.008*"like " + 0.006*"go " + 0.006*"time " + 0.005*"kill " + 0.005*"" + 0.005*"story " + 0.005*"get " + 0.004*"scene " + 0.004*"plot "
Topic: 5 
Words: 0.011*"like " + 0.010*"character " + 0.010*"scene " + 0.007*"

In [14]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.004*"columbo " + 0.002*"scooby " + 0.002*"muppets " + 0.002*"great " + 0.002*"sinatra " + 0.002*"freddy " + 0.002*"preminger " + 0.002*"gandhi " + 0.002*"muppet " + 0.002*"like "
Topic: 1 Word: 0.002*"character " + 0.002*"good " + 0.002*"like " + 0.002*"watch " + 0.002*"story " + 0.002*"time " + 0.002*"" + 0.002*"think " + 0.002*"people " + 0.002*"great "
Topic: 2 Word: 0.002*"good " + 0.002*"like " + 0.002*"story " + 0.002*"great " + 0.002*"scene " + 0.002*"character " + 0.002*"plot " + 0.002*"look " + 0.002*"time " + 0.002*"watch "
Topic: 3 Word: 0.002*"character " + 0.002*"story " + 0.002*"great " + 0.002*"love " + 0.002*"good " + 0.002*"like " + 0.002*"time " + 0.002*"watch " + 0.002*"scene " + 0.002*"life "
Topic: 4 Word: 0.002*"like " + 0.002*"watch " + 0.002*"good " + 0.002*"great " + 0.002*"time " + 0.002*"story " + 0.002*"" + 0.002*"character " + 0.002*"people " + 0.002*"see "
Topic: 5 Word: 0.002*"love " + 0.002*"great " + 0.002*"story " + 0.002*"life " + 0.0

In [15]:
processed_docs[4310]

['especially ',
 'african ',
 'american ',
 'time ',
 'movie ',
 'express ',
 'show ',
 'concern ',
 'go ',
 'african ',
 'american ',
 'relationship ',
 'allows ',
 'culture ',
 'fictional ',
 'humorous ',
 'manner ',
 'positive ',
 'african ',
 'american ',
 'relationship ',
 'outcome ',
 'instead ',
 'undesirable ',
 'stereotype ',
 'plague ',
 'african ',
 'american ',
 'community ',
 'love ',
 'film ']

In [16]:
for index, score in sorted(lda_model[bow_corpus[4310]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.5503628253936768	 
Topic: 0.008*"time " + 0.008*"story " + 0.007*"character " + 0.006*"like " + 0.006*"people " + 0.005*"world " + 0.005*"watch " + 0.005*"great " + 0.005*"life " + 0.004*"work "

Score: 0.4229656159877777	 
Topic: 0.012*"story " + 0.009*"life " + 0.009*"character " + 0.007*"love " + 0.006*"performance " + 0.005*"time " + 0.004*"young " + 0.004*"like " + 0.004*"woman " + 0.004*"best "


In [17]:
for index, score in sorted(lda_model_tfidf[bow_corpus[4310]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))


Score: 0.9699935913085938	 
Topic: 0.002*"character " + 0.002*"good " + 0.002*"like " + 0.002*"watch " + 0.002*"story " + 0.002*"time " + 0.002*"" + 0.002*"think " + 0.002*"people " + 0.002*"great "


In [18]:
unseen_document = "The movie was very touching and heart whelming"
bow_vector = dictionary.doc2bow(preprocess(unseen_document))
for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

Score: 0.6999570727348328	 Topic: 0.012*"story " + 0.009*"life " + 0.009*"character " + 0.007*"love " + 0.006*"performance "
Score: 0.033342648297548294	 Topic: 0.010*"great " + 0.010*"love " + 0.010*"like " + 0.009*"series " + 0.009*"time "
Score: 0.03334103524684906	 Topic: 0.008*"time " + 0.008*"story " + 0.007*"character " + 0.006*"like " + 0.006*"people "
Score: 0.033340658992528915	 Topic: 0.009*"like " + 0.008*"life " + 0.006*"come " + 0.006*"time " + 0.005*"good "
Score: 0.03333787992596626	 Topic: 0.006*"great " + 0.006*"horror " + 0.006*"time " + 0.006*"star " + 0.005*"scene "
Score: 0.033336907625198364	 Topic: 0.013*"character " + 0.008*"like " + 0.006*"go " + 0.006*"time " + 0.005*"kill "
Score: 0.03333652392029762	 Topic: 0.028*"like " + 0.022*"good " + 0.019*"watch " + 0.012*"think " + 0.012*"time "
Score: 0.03333616629242897	 Topic: 0.006*"like " + 0.005*"scene " + 0.005*"western " + 0.004*"great " + 0.004*"director "
Score: 0.03333616629242897	 Topic: 0.011*"like " + 0