In [1]:
import pickle
import pandas as pd
from nltk.tokenize import RegexpTokenizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
def preprocess(document):
    document = document.lower()
    tk = RegexpTokenizer(r'[a-zA-Z]+')
    tokens = [token for token in tk.tokenize(document)]
    tokens = [token for token in tokens if token != 'br']

    # These preprocesing steps increase perplexity
    
    # stoplist = set(stopwords.words('english'))
    # tokens = [token for token in tokens if token not in stoplist]
    # porter = PorterStemmer()
    # tokens = [porter.stem(token) for token in tokens]

    return ' '.join(tokens)

In [3]:
# loading the trained model
llda_model = pickle.load(open('llda_model.pkl', 'rb'))

# preparing test set
df = pd.read_csv('IMDB Dataset.csv')
df = df.drop_duplicates()
df['clean_review'] = df['review'].apply(preprocess)


X = df['clean_review'].tolist()
y = df['sentiment'].tolist()

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

In [4]:
print(len(X_test))
print(len(X_train))

9917
39665


In [5]:
# picking a test instance
id_num = 100
unseen_doc = X_test[id_num]
print(unseen_doc)

# true label
y_test[id_num]

i saw this movie when i was about years old and i liked it but it wasn t until i watched it again at the age of that i really understood it for what it is a cartoon about a criminal dog with a real heart of gold adopts a little girl in order to exploit her for her talents to talk to animals the dog star charlie b barkin is murdered by his formal business partner carface who is absolutely diabolical by the way his soul then goes to where else but heaven only to find a golden watch that is really his life s time which charlie being the sneaky but lovable cad that he is steals and rewinds sending him back to earth once back on earth charlie goes about seeking revenge on the evil carface this is how he comes upon young anne marie the lonely little orphan that can talk to animals whom charlie plans to scam for her talents in order to get back at his enemy carface but scoundrel charlie actually comes to care for young anne marie and his plans unfoil as he must make up his mind to do what is 

'positive'

In [35]:
# predict labels of unseen document
topics = llda_model.inference(document=unseen_doc, iteration=50, times=20)

In [37]:
df = pd.DataFrame(topics, columns =['topic', 'probability'])
df.head()

Unnamed: 0,topic,probability
0,positive,0.643678
1,common_topic,0.356289
2,negative,3.3e-05


In [38]:
test_num = 6000

In [39]:
y_pred_proba = [llda_model.inference(document=unseen_doc, iteration=50, times=20) for unseen_doc in X_test[:test_num]]

In [40]:
pickle.dump(y_pred_proba, open('y_pred_proba.pkl', 'wb'))

In [41]:
y_pred_proba = pickle.load(open('y_pred_proba.pkl', 'rb'))

In [42]:
for x in y_pred_proba:
    print(x)

[('negative', 0.7580088713652045), ('common_topic', 0.24192072097444198), ('positive', 7.040766035344644e-05)]
[('positive', 0.5694287895865906), ('common_topic', 0.3136385121623096), ('negative', 0.11693269825109988)]
[('common_topic', 0.6342507547561619), ('positive', 0.305272910243918), ('negative', 0.06047633499992013)]
[('negative', 0.5594434515457217), ('common_topic', 0.44009866623340177), ('positive', 0.0004578822208764752)]
[('positive', 0.6196214828199937), ('common_topic', 0.30570764778609566), ('negative', 0.07467086939391057)]
[('positive', 0.5777713605024726), ('common_topic', 0.3947586903945421), ('negative', 0.027469949102985232)]
[('common_topic', 0.8844307044662238), ('negative', 0.11550351904229432), ('positive', 6.577649148194436e-05)]
[('common_topic', 0.9997436883250033), ('negative', 0.00012815583749839805), ('positive', 0.00012815583749839805)]
[('negative', 0.7375021182850363), ('common_topic', 0.2604643280799864), ('positive', 0.002033553634977122)]
[('negativ

[('common_topic', 0.6399161681875736), ('negative', 0.36004016941012096), ('positive', 4.3662402305374835e-05)]
[('common_topic', 0.5161515221938183), ('negative', 0.483616081803393), ('positive', 0.00023239600278875203)]
[('positive', 0.4738703480961528), ('common_topic', 0.3910202323034666), ('negative', 0.13510941960038061)]
[('common_topic', 0.605758915161138), ('positive', 0.39419817190919626), ('negative', 4.291292966570828e-05)]
[('negative', 0.5661419557670895), ('common_topic', 0.43383621160186014), ('positive', 2.1832631050367882e-05)]
[('negative', 0.8215852913403323), ('positive', 0.1783591623618286), ('common_topic', 5.5546297839249025e-05)]
[('negative', 0.5148153934281101), ('common_topic', 0.38618965786595844), ('positive', 0.09899494870593135)]
[('positive', 0.5345037795659595), ('negative', 0.4654149394456636), ('common_topic', 8.128098837681866e-05)]
[('positive', 0.874363164471787), ('negative', 0.12554070941074688), ('common_topic', 9.612611746611554e-05)]
[('negat

In [43]:
y_pred = [[tup for tup in doc_prob if any(i in tup for i in ['negative', 'positive'])] for doc_prob in y_pred_proba]

In [44]:
for x in y_pred:
    print(x)

[('negative', 0.7580088713652045), ('positive', 7.040766035344644e-05)]
[('positive', 0.5694287895865906), ('negative', 0.11693269825109988)]
[('positive', 0.305272910243918), ('negative', 0.06047633499992013)]
[('negative', 0.5594434515457217), ('positive', 0.0004578822208764752)]
[('positive', 0.6196214828199937), ('negative', 0.07467086939391057)]
[('positive', 0.5777713605024726), ('negative', 0.027469949102985232)]
[('negative', 0.11550351904229432), ('positive', 6.577649148194436e-05)]
[('negative', 0.00012815583749839805), ('positive', 0.00012815583749839805)]
[('negative', 0.7375021182850363), ('positive', 0.002033553634977122)]
[('negative', 0.3962175163891902), ('positive', 0.3066075555345942)]
[('negative', 0.713832548303374), ('positive', 0.2860713255791598)]
[('negative', 0.27910076955705015), ('positive', 0.14730655032418347)]
[('negative', 0.6933919824052784), ('positive', 0.3060081975407378)]
[('negative', 0.430222115787583), ('positive', 0.10483146740106566)]
[('positi

[('positive', 0.8025811566753293), ('negative', 0.007972514645641547)]
[('negative', 7.405761682589055e-05), ('positive', 7.405761682589055e-05)]
[('negative', 0.8853026844052586), ('positive', 7.872156183578682e-05)]
[('negative', 0.6853437608275998), ('positive', 4.949759936643074e-05)]
[('negative', 0.5034486446024702), ('positive', 0.0751751055980324)]
[('negative', 0.8324064237955383), ('positive', 6.248828344685373e-05)]
[('negative', 0.21985928159551393), ('positive', 0.06406390520023278)]
[('positive', 0.5903104873594155), ('negative', 0.21061232336825916)]
[('positive', 0.6766658192808519), ('negative', 0.00016947716295229217)]
[('negative', 0.4853285150303825), ('positive', 0.0778560201972699)]
[('negative', 0.24380550208035767), ('positive', 0.1972303297522201)]
[('negative', 0.8520914504153494), ('positive', 7.351319561861352e-05)]
[('negative', 0.36004016941012096), ('positive', 4.3662402305374835e-05)]
[('negative', 0.483616081803393), ('positive', 0.00023239600278875203)

In [45]:
y_pred = [x[0][0] for x in y_pred]

In [46]:
accuracy = accuracy_score(y_test[:test_num], y_pred)

In [47]:
print(accuracy)

0.7861666666666667
