In [1]:
import os
import logging
import pandas as pd
import gensim
from collections import defaultdict
from smart_open import open
from gensim import corpora, utils
from gensim.models import LdaMulticore
from gensim.test.utils import datapath
from gensim.corpora import Dictionary
from gensim.test.utils import common_corpus, common_dictionary
from nltk.stem import PorterStemmer
from nltk.corpus import PlaintextCorpusReader, stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer

# Attach a root handler that stamps every record with time and severity.
logging.basicConfig(level=logging.INFO, format='%(asctime)s : %(levelname)s : %(message)s')
# The root level is set again here, so DEBUG (not INFO) is the level that
# actually applies -- this is why DEBUG records show up in the cell outputs.
logging.getLogger().setLevel(logging.DEBUG)

In [2]:
# Restore the saved gensim Dictionary from the working directory.
dct = Dictionary.load('dictionary.dict')

# Restore the trained LDA model. datapath() resolves inside gensim's own
# test_data folder (see the logged path), so the model file lives in the
# installed package rather than next to this notebook.
# NOTE(review): gensim's load presumably unpickles the file -- only load
# model files from a trusted source.
lda = LdaMulticore.load(datapath('lda_model'))

2020-11-18 15:19:52,197 : INFO : loading Dictionary object from dictionary.dict
2020-11-18 15:19:52,198 : DEBUG : {'uri': 'dictionary.dict', 'mode': 'rb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 'opener': None, 'ignore_ext': False, 'transport_params': None}
2020-11-18 15:19:52,219 : INFO : loaded dictionary.dict
2020-11-18 15:19:52,220 : INFO : loading LdaMulticore object from C:\Users\Terolli\anaconda3\lib\site-packages\gensim\test\test_data\lda_model
2020-11-18 15:19:52,220 : DEBUG : {'uri': 'C:\\Users\\Terolli\\anaconda3\\lib\\site-packages\\gensim\\test\\test_data\\lda_model', 'mode': 'rb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 'opener': None, 'ignore_ext': False, 'transport_params': None}
2020-11-18 15:19:52,221 : INFO : loading expElogbeta from C:\Users\Terolli\anaconda3\lib\site-packages\gensim\test\test_data\lda_model.expElogbeta.npy with mmap=None
2020-11-18 15:19:52,229 : INFO : settin

In [4]:
# Tokenizer, stop-word set and stemmer shared by every preprocess() call.
# Filled lazily on first use so that defining this cell stays side-effect free.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Build the tokenizer, stop-word set and stemmer once and reuse them."""
    if not _PREPROCESS_RESOURCES:
        # Build everything first and publish in one step, so a failure while
        # reading the stop-word corpus cannot leave a half-filled cache behind.
        built = {
            'tokenizer': RegexpTokenizer(r'[a-zA-Z]+'),
            'stoplist': frozenset(stopwords.words('english')),
            'stemmer': PorterStemmer(),
        }
        _PREPROCESS_RESOURCES.update(built)
    return _PREPROCESS_RESOURCES


def preprocess(document):
    """Convert a raw review string into a list of stemmed tokens.

    Steps, in order: lower-case the text, keep runs of ASCII letters as
    tokens, drop the token 'br' and English stop words, apply the Porter
    stemmer, and finally drop stems of length 1.

    Parameters
    ----------
    document : str
        Raw text of one document.

    Returns
    -------
    list of str
        Stemmed tokens in their original order (duplicates are kept).
    """
    resources = _get_preprocess_resources()
    tokenizer = resources['tokenizer']
    stoplist = resources['stoplist']
    stemmer = resources['stemmer']

    words = tokenizer.tokenize(document.lower())

    # 'br' is presumably residue of HTML <br /> tags in the reviews -- TODO confirm.
    # Stop words are matched on the unstemmed token, before stemming.
    stems = (
        stemmer.stem(word)
        for word in words
        if word != 'br' and word not in stoplist
    )

    # The length filter applies to the stem, not to the original word.
    return [stem for stem in stems if len(stem) > 1]

In [21]:
# Read the review dataset, drop exact duplicate rows, and add a column holding
# the token list that preprocess() produces for each raw review.
df = (
    pd.read_csv('IMDB Dataset.csv')
    .drop_duplicates()
    .assign(clean_review=lambda frame: frame['review'].apply(preprocess))
)

In [6]:
# Token lists and sentiment labels as plain Python lists, in row order.
X, y = df['clean_review'].to_list(), df['sentiment'].to_list()

# Raw review text in the same order, so a document can be shown unprocessed.
reviews = df['review'].to_list()

In [7]:
# Summary of the loaded dictionary; printed explicitly because only the last
# expression of a cell is echoed automatically.
print(dct)
# Number of preprocessed reviews; shown as the cell's result.
len(X)

Dictionary(68996 unique tokens: ['accustom', 'agenda', 'agreement', 'appeal', 'around']...)


49582

In [12]:
# Position of the review to inspect; used to index X, reviews and y below.
doc_number = 123

In [14]:
# Bag-of-words encoding of the selected review's tokens, using the loaded dictionary.
X_test_corpus = dct.doc2bow(X[doc_number])

# Show the raw review and its sentiment label, separated by one blank line.
print(reviews[doc_number], y[doc_number], sep='\n\n')

Ah yes the 1980s , a time of Reaganomics and Sly , Chuck and a host of other action stars hiding in a remote jungle blowing away commies . At the time I couldn`t believe how movies like RAMBO , MISSING IN ACTION and UNCOMMON VALOR ( And who can forget the ridiculous RED DAWN ? ) made money at the box office , they`re turgid action crap fests with a rather off putting right wing agenda and they have dated very badly . TROMA`S WAR is a tongue in cheek take on these type of movies but you`ve got to ask yourself did they need spoofing in the first place ? Of course not . TROMA`S WAR lacks any sort of sophistication - though it does make the point that there`s no real difference between right wing tyrants and left wing ones - and sometimes feels more like a grade z movie than a send up . Maybe it is ?

negative


In [15]:
# topic probability distribution of unseen document
vector = lda[X_test_corpus]
vector.sort(key = lambda tup: tup[1], reverse=True)
print(vector)

[(33, 0.38143355), (12, 0.35148922), (65, 0.101080865), (43, 0.08851358), (56, 0.035207417), (6, 0.030651374)]


In [18]:
# Keep only the topics whose probability for this document is at least 0.1.
top_topics = list(filter(lambda pair: pair[1] >= 0.1, vector))
print(top_topics)

[(33, 0.38143355), (12, 0.35148922), (65, 0.101080865)]


In [20]:
# For each retained topic, print a table of its top terms and their weights.
for topic in top_topics:
    # topic is a (topic id, probability) pair; only the id is needed here.
    top_terms_id = lda.get_topic_terms(topic[0])

    # Translate term ids into their word form through the model's id2word mapping.
    top_terms_word = [
        (lda.id2word[term_id], weight)
        for term_id, weight in top_terms_id
    ]

    df2 = pd.DataFrame(top_terms_word, columns=['WORD', 'PROBABILITY'])
    # One blank line after every table.
    print(df2, end='\n\n')

     WORD  PROBABILITY
0    film     0.013625
1     one     0.013470
2    like     0.006132
3    time     0.006012
4    movi     0.005167
5   scene     0.004627
6     get     0.004366
7  keaton     0.004163
8    play     0.003813
9    take     0.003535

     WORD  PROBABILITY
0    movi     0.066134
1    like     0.014655
2   watch     0.014458
3     one     0.013850
4     bad     0.011245
5    good     0.010090
6  realli     0.009346
7    time     0.009193
8     see     0.008894
9    make     0.008801

       WORD  PROBABILITY
0       war     0.028516
1   soldier     0.013621
2       one     0.008126
3   russian     0.006540
4  militari     0.006310
5     world     0.006218
6      film     0.006209
7      armi     0.006153
8      movi     0.004372
9       man     0.004098



In [None]:
# Disabled scratch cell: everything below sits inside a string literal, so none
# of it runs as code. The snippet is a single-topic variant of the loop above:
# it picks the one topic with the highest probability and tabulates its terms.
'''
# getting topic id with highest probability
top_topic = max(vector, key=lambda x:x[1])
print(top_topic)

# getting word representation of topic
top_terms_id = lda.get_topic_terms(top_topic[0])
print(top_terms_id)

# converting ids to words
top_terms_word = [(lda.id2word[id], prob) for id, prob in top_terms_id]

df2 = pd.DataFrame(top_terms_word, columns =['word', 'probability'])
df2
'''