## Demonstration of Building Dictionary and Transition Probability Matrix

In [1]:
import pandas as pd
import numpy as np
import warnings
# NOTE(review): 'ignore' with no category/module filter suppresses every warning
# raised anywhere in the notebook, deprecation warnings from libraries included.
warnings.filterwarnings('ignore')

### Notes on the dataframe below:
- At this point I have already tokenized the data I will be using to train my models
- Model_1 will be trained on the Summary data (shorter sentences)
- Model_2 will be trained on the Review data (each review is much longer than a summary)

In [2]:
# Read the gzip-compressed pickle of pre-tokenized data and keep only the two
# token-list columns this notebook works with.
# NOTE(review): the path is absolute and specific to one Windows machine.
df = pd.read_pickle(
    r'C:\Users\nishi\source\repos\HiddenMarkovModel\archives\my_df.pkl',
    compression='gzip',
)[['tokenized_Summary', 'tokenized_Reviews']]
df.head()

Unnamed: 0,tokenized_Summary,tokenized_Reviews
0,"[Good, Quality, Dog, Food]","[I, have, bought, several, of, the, Vitality, ..."
1,"[Not, as, Advertised]","[Product, arrived, labeled, as, Jumbo, Salted,..."
2,"[``, Delight, '', says, it, all]","[This, is, a, confection, that, has, been, aro..."
3,"[Cough, Medicine]","[If, you, are, looking, for, the, secret, ingr..."
4,"[Great, taffy]","[Great, taffy, at, a, great, price, ., There, ..."


In [3]:
# Flatten every tokenized summary into a single list of tokens
# (original order preserved, duplicates kept).
words = [
    token
    for summary_tokens in df['tokenized_Summary']
    for token in summary_tokens
]

### Creating the dictionary dataframe
- This dataframe will serve as a dictionary
- Column "lead" stores the first word of each adjacent word pair
- Column "follow" stores the word that immediately follows the corresponding lead word
- Because each "follow" word depends only on the single "lead" word before it, this results in a first-order (bigram) Markov model

In [4]:
# Build two parallel lists of adjacent-token pairs: lead[k] is a token and
# follow[k] is the token that comes immediately after it in the same summary.
lead = []
follow = []

for tokens in df['tokenized_Summary']:
    # tokens[:-1] and tokens[1:] line up as (current, next) pairs; both slices
    # are empty for summaries with fewer than two tokens, so those add nothing.
    lead.extend(tokens[:-1])
    follow.extend(tokens[1:])

In [5]:
# One row per observed (lead, follow) occurrence, in order of appearance.
dict_df = pd.DataFrame({'lead': lead, 'follow': follow})
# freq = how many times this exact (lead, follow) pair occurs overall.
# A single column is selected before transform() so the result is one Series:
# selecting columns with a bare tuple (['lead','follow']) is no longer accepted
# by current pandas, and a two-column result cannot be assigned to one column.
dict_df['freq'] = dict_df.groupby(['lead', 'follow'])['lead'].transform('count')

In [6]:
# Preview the first rows of the (lead, follow, freq) table.
dict_df.head()

Unnamed: 0,lead,follow,freq
0,Good,Quality,162
1,Quality,Dog,53
2,Dog,Food,1008
3,Not,as,1216
4,as,Advertised,28


In [7]:
# (rows, columns) before de-duplication: one row per adjacent-token occurrence.
dict_df.shape

(2171388, 3)

In [8]:
# Collapse repeated (lead, follow, freq) rows to a single row each, then
# discard any row that still contains a missing value.
dict_df = dict_df.drop_duplicates().dropna()
dict_df.shape

(416254, 3)

In [9]:
# Keep only the first 50,000 rows by position (order of first appearance).
# NOTE(review): presumably done to keep the pivoted matrix small — confirm.
small_dict_df = dict_df.iloc[:50000]

### Creating a Transition Probability Matrix from our Dictionary
- Since I used pandas, making a Transition Probability Matrix was fairly easy (use pivot() on the dataframe)
- Now we have "lead" words as rows and "follow" words as columns; each value is the probability of transitioning from "lead" to "follow"

In [10]:
# Wide table of pair counts: one row per lead word, one column per follow word;
# pairs that never occur in small_dict_df are NaN.
prob_matrix = small_dict_df.pivot(index='lead', columns='follow', values='freq')
# Total count of all observed followers for each lead word (NaN is skipped).
sum_words = prob_matrix.sum(axis=1)
# Divide every row by its own total so each row sums to 1. div(..., axis=0)
# aligns sum_words with the row index in a single vectorised operation instead
# of calling a Python lambda once per column.
prob_matrix = prob_matrix.div(sum_words, axis=0)

In [11]:
# Preview the transition matrix; NaN marks (lead, follow) pairs absent from small_dict_df.
prob_matrix.head()

follow,!,#,$,%,&,','','Bit-O-Honey,'Chowdah,'Delicious,...,zest,zing,ziwi,zots,zuckerman,{,~,~~,«,»
lead,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
!,0.880344,,,,,0.00016,0.003327,,,,...,,,,,,,0.000658,,,
#,,,,,,,,,,,...,,,,,,,,,,
$,0.031933,,0.611765,,,,,,,,...,,,,,,,,,,
%,,,,,,,,,,,...,,,,,,,,,,
&,,,,,,,,,,,...,,,,,,,,,,


In [12]:
# End words are used to terminate generated sentences (an attempt to make the
# sentences read as logically reasonable): any token whose last character is
# '.', '!' or '?', except the bare '.' token.
# str.endswith() is False for an empty token (word[-1] would raise IndexError),
# and dict.fromkeys() removes repeats while keeping first-seen order, so the
# result is still a list but membership checks scan far fewer entries.
end_words = list(dict.fromkeys(
    word
    for word in words
    if word != '.' and word.endswith(('.', '!', '?'))
))

### Heuristic to generate new sentences using the Transition Probability Matrix

In [13]:
from numpy.random import choice


def make_a_sentence(start, max_len=30, max_retries=100):
    """Generate a sentence by a random walk over the global ``prob_matrix``.

    Starting from ``start``, the next word is repeatedly sampled from the
    row of ``prob_matrix`` belonging to the current word (missing entries
    count as probability 0). A sampled word found in the global
    ``end_words`` ends the sentence, but only once the sentence already
    holds more than two words; earlier end words, and the literal token
    'EndWord', are rejected and a new word is drawn from the same row.

    Parameters
    ----------
    start : str
        First word of the sentence.
    max_len : int, optional
        Maximum number of words in the sentence (default 30).
    max_retries : int, optional
        Maximum number of consecutive rejected draws before giving up and
        returning the sentence built so far (default 100). This bounds the
        loop when every reachable follower is rejected.

    Returns
    -------
    str
        The generated words joined by single spaces. If the current word
        has no row in ``prob_matrix`` (including an unknown ``start``),
        generation stops and the words collected so far are returned.
    """
    # Set membership is O(1); the end-word collection may be large.
    terminal_words = set(end_words)
    vocabulary = list(prob_matrix.columns)

    word = start
    sentence = [word]
    rejected = 0
    while len(sentence) < max_len:
        # A word that never appears as a "lead" has no outgoing transitions.
        if word not in prob_matrix.index:
            break
        probabilities = prob_matrix.loc[word].fillna(0).values
        # numpy choice picks a follower according to the transition probabilities
        next_word = choice(a=vocabulary, p=probabilities)

        is_terminal = next_word in terminal_words
        if next_word == 'EndWord' or (is_terminal and len(sentence) <= 2):
            # Rejected draw: resample from the same row, but not forever.
            rejected += 1
            if rejected >= max_retries:
                break
            continue

        rejected = 0
        sentence.append(next_word)
        if is_terminal:
            break
        word = next_word
    return ' '.join(sentence)

### generate a sentence

In [19]:
# 'Food' is the starting word. Sampling inside make_a_sentence is not seeded,
# so the printed sentence differs from run to run.
sentence = make_a_sentence('Food')
print(sentence)

Food I like it 's like crazy for a joke ?


## Comments:
- This is just a demonstration of how the approach came together
- This will be developed into reusable modules that can output persistent data structures and use them