In [1]:
import os
import logging
import pandas as pd
import gensim
from collections import defaultdict
from smart_open import open
from gensim import corpora, utils
from gensim.models import LdaMulticore
from gensim.test.utils import datapath
from gensim.corpora import Dictionary
from gensim.test.utils import common_corpus, common_dictionary
from nltk.stem import PorterStemmer
from nltk.corpus import PlaintextCorpusReader, stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer

# Log line layout: timestamp, severity, message.
LOG_FORMAT = '%(asctime)s : %(levelname)s : %(message)s'

# basicConfig is asked for INFO, but the root logger is then lowered to DEBUG,
# so DEBUG is the threshold that actually applies to everything logged below.
# NOTE(review): the two levels disagree - confirm DEBUG verbosity is intended.
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)
logging.getLogger().setLevel(logging.DEBUG)

In [2]:
# Where the persisted artefacts live. The dictionary path is relative to the
# working directory; datapath() resolves the model name inside gensim's own
# test-data folder.
DICTIONARY_PATH = 'dictionary.dict'
LDA_MODEL_PATH = datapath('lda_model')

# Restore the gensim Dictionary and the LDA model from disk.
dct = Dictionary.load(DICTIONARY_PATH)
lda = LdaMulticore.load(LDA_MODEL_PATH)

2020-11-18 15:19:52,197 : INFO : loading Dictionary object from dictionary.dict
2020-11-18 15:19:52,198 : DEBUG : {'uri': 'dictionary.dict', 'mode': 'rb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 'opener': None, 'ignore_ext': False, 'transport_params': None}
2020-11-18 15:19:52,219 : INFO : loaded dictionary.dict
2020-11-18 15:19:52,220 : INFO : loading LdaMulticore object from C:\Users\Terolli\anaconda3\lib\site-packages\gensim\test\test_data\lda_model
2020-11-18 15:19:52,220 : DEBUG : {'uri': 'C:\\Users\\Terolli\\anaconda3\\lib\\site-packages\\gensim\\test\\test_data\\lda_model', 'mode': 'rb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 'opener': None, 'ignore_ext': False, 'transport_params': None}
2020-11-18 15:19:52,221 : INFO : loading expElogbeta from C:\Users\Terolli\anaconda3\lib\site-packages\gensim\test\test_data\lda_model.expElogbeta.npy with mmap=None
2020-11-18 15:19:52,229 : INFO : settin

In [4]:
# Lazily-populated cache of the NLTK helpers shared by every preprocess() call.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the shared tokenizer, stop-word set and stemmer, building them on first use."""
    if not _PREPROCESS_RESOURCES:
        # All three values are built before update() runs, so a failure while
        # loading the stop words leaves the cache empty and the next call retries.
        _PREPROCESS_RESOURCES.update(
            tokenizer=RegexpTokenizer(r'[a-zA-Z]+'),
            stoplist=frozenset(stopwords.words('english')),
            stemmer=PorterStemmer(),
        )
    return _PREPROCESS_RESOURCES


def preprocess(document):
    """Turn one raw review into a list of stemmed tokens.

    Steps, in order: lower-case the text, keep only runs of ASCII letters,
    drop the literal token 'br' and English stop words, Porter-stem what is
    left, and finally discard stems that are a single character long.

    Parameters
    ----------
    document : str
        Raw text of one review. Any non-string value (e.g. the float NaN
        pandas uses for a missing cell, or None) is treated as an empty
        document instead of raising AttributeError on ``.lower()``.

    Returns
    -------
    list of str
        The stemmed tokens in their original order; ``[]`` for non-string input.
    """
    # Missing / non-text cells have nothing to tokenize.
    if not isinstance(document, str):
        return []

    # The tokenizer, stop-word set and stemmer are identical for every
    # document, so they are built once and reused rather than per call.
    resources = _get_preprocess_resources()
    stoplist = resources['stoplist']
    stem = resources['stemmer'].stem

    tokens = resources['tokenizer'].tokenize(document.lower())

    # 'br' is presumably residue of HTML <br /> tags in the reviews - TODO confirm.
    # Filtering happens before stemming so stop words are matched in raw form.
    stems = (
        stem(token)
        for token in tokens
        if token != 'br' and token not in stoplist
    )

    # The length check is applied to the stem, not to the original token.
    return [token for token in stems if len(token) > 1]

In [5]:
# Location of the IMDB reviews CSV. The original machine-specific path is kept
# as the default; set the IMDB_DATASET_CSV environment variable to read the
# file from elsewhere without editing this cell.
IMDB_DATASET_CSV = os.environ.get(
    "IMDB_DATASET_CSV",
    "C:\\Users\\Terolli\\Desktop\\sentiment-analysis-movie-reviews\\llda\\IMDB Dataset.csv",
)

# Read the reviews and drop fully duplicated rows, then add a 'clean_review'
# column holding the token list preprocess() produces for each 'review' text.
df = pd.read_csv(IMDB_DATASET_CSV).drop_duplicates()
df['clean_review'] = df['review'].apply(preprocess)

In [6]:
# Pull three columns out as plain Python lists that share the same row order:
# raw review text, its token list, and the sentiment label.
reviews, X, y = (
    df[column].tolist()
    for column in ('review', 'clean_review', 'sentiment')
)

In [7]:
# Sanity check: print the loaded dictionary's summary, then show how many
# preprocessed reviews there are (the bare len(X) is the cell's displayed value).
print(dct)
len(X)

Dictionary(68996 unique tokens: ['accustom', 'agenda', 'agreement', 'appeal', 'around']...)


49582

In [9]:
# Position (in the de-duplicated lists) of the review to inspect.
SAMPLE_INDEX = 100

# Bag-of-words encoding of the chosen review's tokens using the loaded dictionary.
X_test_corpus = dct.doc2bow(X[SAMPLE_INDEX])

# Show the raw text followed by its sentiment label, one per line.
print(reviews[SAMPLE_INDEX], y[SAMPLE_INDEX], sep='\n')

This short film that inspired the soon-to-be full length feature - Spatula Madness - is a hilarious piece that contends against similar cartoons yielding multiple writers. The short film stars Edward the Spatula who after being fired from his job, joins in the fight against the evil spoons. This premise allows for some funny content near the beginning, but is barely present for the remainder of the feature. This film's 15-minute running time is absorbed by some odd-ball comedy and a small musical number. Unfortunately not much else lies below it. The plot that is set up doesn't really have time to show. But it's surely follows it plot better than many high-budget Hollywood films. This film is worth watching at least a few times. Take it for what it is, and don't expect a deep story.
positive


In [10]:
# Ask the model for the topic mixture of the sample document, then order the
# (topic_id, probability) pairs from most to least probable.
topic_mixture = lda[X_test_corpus]
vector = sorted(topic_mixture, key=lambda pair: pair[1], reverse=True)
print(vector)

[(71, 0.7810817), (17, 0.114477724), (27, 0.091832176)]


In [11]:
# For each topic assigned to the sample document (most probable first), print a
# table of that topic's top terms and their probabilities, then a blank line.
for topic_id, _topic_probability in vector:
    # get_topic_terms yields (term_id, probability); map each id to its word.
    term_rows = [
        (lda.id2word[term_id], term_probability)
        for term_id, term_probability in lda.get_topic_terms(topic_id)
    ]

    print(pd.DataFrame(term_rows, columns=['word', 'probability']))
    print()

     word  probability
0    film     0.031236
1     one     0.009177
2    movi     0.007480
3    like     0.006537
4  horror     0.006221
5    look     0.005731
6    make     0.005664
7   scene     0.005532
8    plot     0.005263
9    good     0.005164

      word  probability
0     danc     0.042316
1    music     0.030529
2    kelli     0.017657
3     sing     0.016779
4     song     0.014599
5   number     0.014367
6  cartoon     0.012870
7      cat     0.012810
8    jerri     0.010733
9  sinatra     0.009390

     word  probability
0    show     0.071591
1    like     0.018460
2   watch     0.014236
3   funni     0.009991
4     one     0.009176
5    time     0.009123
6      tv     0.009071
7  episod     0.008236
8   great     0.007751
9    love     0.007412



In [None]:
# NOTE(review): everything below is wrapped in a triple-quoted string, so running
# this cell only evaluates that string - none of the statements inside execute.
# The text sketches a single-topic variant: take the most probable topic from
# `vector` and tabulate its top terms. Consider deleting the cell or restoring
# it as real code.
'''
# getting topic id with highest probability
top_topic = max(vector, key=lambda x:x[1])
print(top_topic)

# getting word representation of topic
top_terms_id = lda.get_topic_terms(top_topic[0])
print(top_terms_id)

# converting ids to words
top_terms_word = [(lda.id2word[id], prob) for id, prob in top_terms_id]

df2 = pd.DataFrame(top_terms_word, columns =['word', 'probability'])
df2
'''