In [1]:
import os
import logging
import pandas as pd
import gensim
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
from smart_open import open
from gensim import corpora, utils
from gensim.models import LdaMulticore
from gensim.test.utils import datapath
from gensim.corpora import Dictionary
from gensim.test.utils import common_corpus, common_dictionary
from nltk.stem import PorterStemmer
from nltk.corpus import PlaintextCorpusReader, stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer

# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# logging.root.setLevel(level=logging.DEBUG)

In [2]:
# Restore the persisted token dictionary from the notebook's working directory.
dct = Dictionary.load('dictionary.dict')

# Restore the trained LDA model. The path is resolved by gensim.test.utils.datapath
# NOTE(review): presumably this points into gensim's bundled test-data folder,
# i.e. wherever the training notebook saved the model -- confirm.
lda = LdaMulticore.load(datapath('lda_model'))

In [3]:
# Lazily filled holder for the NLP helpers used by preprocess(). Building them once
# matters because preprocess() is applied to every review in the dataset.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the shared (tokenizer, stoplist, stemmer) triple, building it on first use."""
    if 'bundle' not in _PREPROCESS_RESOURCES:
        # Assign the finished tuple in one step so a failure while building
        # (e.g. while reading the stop-word list) leaves the cache empty.
        _PREPROCESS_RESOURCES['bundle'] = (
            RegexpTokenizer(r'[a-zA-Z\'\-\_]+'),
            frozenset(stopwords.words('english')),
            PorterStemmer(),
        )
    return _PREPROCESS_RESOURCES['bundle']


def preprocess(document):
    """Turn a raw review string into a list of stemmed tokens.

    Steps, in order:
      1. lower-case the text;
      2. tokenize into runs of ASCII letters, apostrophes, hyphens and underscores
         (digits and other punctuation act as separators);
      3. drop the token 'br' (what remains of '<br />' markup after tokenizing);
      4. drop English stop words (NLTK list);
      5. Porter-stem each remaining token;
      6. drop stems of length 1.

    Parameters
    ----------
    document : str
        Raw review text.

    Returns
    -------
    list of str
        Stemmed tokens in their original order (duplicates kept).
    """
    tokenizer, stoplist, stemmer = _get_preprocess_resources()

    tokens = []
    for token in tokenizer.tokenize(document.lower()):
        # Stop-word filtering happens on the raw token, before stemming.
        if token == 'br' or token in stoplist:
            continue
        stemmed = stemmer.stem(token)
        # The length filter is applied to the stem, not the raw token.
        if len(stemmed) > 1:
            tokens.append(stemmed)

    return tokens

In [4]:
# Load the IMDB reviews. The path is assembled with os.path.join rather than written
# with literal backslashes: in a normal (non-raw) string, backslash-d and backslash-I
# are invalid escape sequences, and a backslash separator only resolves on Windows.
df = pd.read_csv(os.path.join('..', 'data', 'IMDB Dataset.csv'))

# Remove fully duplicated rows before the (expensive) text preprocessing.
df = df.drop_duplicates()

# Tokenized / stemmed version of every review, stored next to the raw text.
df['clean_review'] = df['review'].apply(preprocess)

In [5]:
# Pull the three columns out as plain Python lists that share the same positions:
# raw review text, preprocessed token lists, and sentiment labels.
reviews, X, y = (
    df[column].tolist()
    for column in ('review', 'clean_review', 'sentiment')
)

In [6]:
# Sanity check: print the loaded dictionary's summary, then display the number of
# preprocessed reviews (the last expression is shown as the cell result).
print(dct)
len(X)

Dictionary(28564 unique tokens: ['accustom', 'agenda', 'agreement', 'appeal', 'around']...)


49582

In [7]:
# Position (in the reviews / X / y lists) of the review examined by the cells below.
doc_number = 3

In [8]:
# Bag-of-words encoding of the selected review, built with the loaded dictionary.
X_test_corpus = dct.doc2bow(X[doc_number])

# Show the raw review text, an empty line, and then its sentiment label.
print(reviews[doc_number], y[doc_number], sep='\n\n')

Basically there's a family where a little boy (Jake) thinks there's a zombie in his closet & his parents are fighting all the time.<br /><br />This movie is slower than a soap opera... and suddenly, Jake decides to become Rambo and kill the zombie.<br /><br />OK, first of all when you're going to make a film you must Decide if its a thriller or a drama! As a drama the movie is watchable. Parents are divorcing & arguing like in real life. And then we have Jake with his closet which totally ruins all the film! I expected to see a BOOGEYMAN similar movie, and instead i watched a drama with some meaningless thriller spots.<br /><br />3 out of 10 just for the well playing parents & descent dialogs. As for the shots with Jake: just ignore them.

negative


In [9]:
# Topic distribution of the selected review (minimum_probability=0.1 threshold),
# ranked from the most to the least probable topic.
vector = sorted(
    lda.get_document_topics(X_test_corpus, minimum_probability=0.1),
    key=lambda topic_prob: topic_prob[1],
    reverse=True,
)
print(vector)

[(290, 0.23004244), (110, 0.1850052), (53, 0.13856857), (64, 0.11945973)]


In [11]:
# For every topic kept in `vector`, print a table of its top terms with their
# probabilities, followed by an empty line.
for topic in vector:
    # topic is a (topic_id, probability) pair; only the id is needed here.
    top_terms_id = lda.get_topic_terms(topic[0])

    # Translate term ids into the corresponding words via the model's id2word mapping.
    top_terms_word = [
        (lda.id2word[term_id], term_prob)
        for term_id, term_prob in top_terms_id
    ]

    df2 = pd.DataFrame(top_terms_word, columns=['WORD', 'PROBABILITY'])
    print(df2, end='\n\n')

    WORD  PROBABILITY
0  zombi     0.062570
1   dead     0.017973
2   film     0.017109
3    one     0.013389
4  spoof     0.013118
5   like     0.011950
6   make     0.009026
7   even     0.008601
8   gore     0.008356
9    bad     0.008310

       WORD  PROBABILITY
0       mom     0.049819
1    mother     0.036669
2       son     0.013870
3       cri     0.012544
4      like     0.011545
5    parent     0.011190
6     adopt     0.010693
7      time     0.010354
8  children     0.008849
9       see     0.008806

      WORD  PROBABILITY
0    opera     0.059522
1     soap     0.043055
2     film     0.019657
3    actor     0.012600
4     love     0.011325
5    watch     0.010059
6      one     0.009476
7  charact     0.007805
8    salli     0.007007
9     role     0.006722

       WORD  PROBABILITY
0    murder     0.035963
1      rape     0.031389
2       sex     0.022480
3     scene     0.021242
4        de     0.020979
5      kill     0.020349
6  thriller     0.020042
7    victim     

In [12]:
# Topic-term matrix of the loaded model; its shape is displayed as the cell result.
# NOTE(review): the recorded output (500, 28564) suggests rows are topics and columns
# are the 28564 dictionary tokens -- confirm.
matrix = lda.get_topics()
matrix.shape

(500, 28564)

In [None]:
# NOTE(review): everything below is wrapped in a string literal, so none of it runs.
# It reads like a single-topic variant of the per-topic term table (highest-probability
# topic only). Because the string is the cell's last expression, running the cell
# would just echo the text as its result.
'''
# getting topic id with highest probability
top_topic = max(vector, key=lambda x:x[1])
print(top_topic)

# getting word representation of topic
top_terms_id = lda.get_topic_terms(top_topic[0])
print(top_terms_id)

# converting ids to words
top_terms_word = [(lda.id2word[id], prob) for id, prob in top_terms_id]

df2 = pd.DataFrame(top_terms_word, columns =['word', 'probability'])
df2
'''