# Metadata

```
Course:  DS 5001
Module:  03 Lab
Topic:   Simple Language Model
Author:  R.C. Alvarado
Date:    31 January 2023
```

**Purpose**: Demonstrates use of simple langauge model based on Berkeley restaurant data (from Jurafsky and Martin).

In [160]:
import pandas as pd
import numpy as np

We convert these data into Pandas dataframes and use them to predict and generate sentences.

<img src="lm-data.png">

In [48]:
data1 = dict(
    w0 = "i, want, to, eat, chinese, food, lunch, spend".split(", "),
    n = [2533, 927, 2417, 746, 158, 1093, 341, 278]
)

In [54]:
df1 = pd.DataFrame(data1).set_index(['w0'])
df1.n = df1.n.astype('int')

In [151]:
df1['p'] = df1.n / df1.n.sum()

In [153]:
df1

Unnamed: 0_level_0,n,p
w0,Unnamed: 1_level_1,Unnamed: 2_level_1
i,2533,0.298246
want,927,0.109149
to,2417,0.284587
eat,746,0.087837
chinese,158,0.018604
food,1093,0.128694
lunch,341,0.040151
spend,278,0.032733


In [161]:
data2 = [row.split(", ") for row in """
i, i, 5
i, want, 837
i, eat, 9
i, spend, 2
want, i, 2
want, to, 608
want, eat, 1
want, chinese, 6
want, food, 6
want, lunch, 5
want, spend, 1
to, i, 2
to, to, 4
to, eat, 686
to, chinese, 2
to, lunch, 6
to, spend, 211
eat, to, 2
eat, chinese, 16
eat, food, 2
eat, lunch, 42
chinese, i, 1
chinese, food, 82
chinese, lunch, 1
food, i, 15
food, to, 15
food, chinese, 1
food, food, 4
lunch, i, 2
lunch, food, 1
spend, i, 1
spend, to, 1
""".split("\n")[1:-1]]

In [162]:
df2 = pd.DataFrame(data2, columns = ['w0', 'w1', 'n']).set_index(['w0','w1'])#.unstack(fill_value=0)

In [163]:
df2.n = df2.n.astype('int')

In [165]:
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,n
w0,w1,Unnamed: 2_level_1
i,i,5
i,want,837
i,eat,9
i,spend,2
want,i,2
want,to,608
want,eat,1
want,chinese,6
want,food,6
want,lunch,5


In [166]:
df2['p'] = df2.n / df1.n

In [169]:
df2.sort_values('p', ascending=False).style.background_gradient(cmap='YlGnBu')

Unnamed: 0_level_0,Unnamed: 1_level_0,n,p
w0,w1,Unnamed: 2_level_1,Unnamed: 3_level_1
want,to,608,0.655879
chinese,food,82,0.518987
i,want,837,0.330438
to,eat,686,0.283823
to,spend,211,0.087298
eat,lunch,42,0.0563
eat,chinese,16,0.021448
food,to,15,0.013724
food,i,15,0.013724
want,chinese,6,0.006472


In [170]:
df3 = df2.p.unstack(fill_value=0) #(df2.n / df1.n).unstack(fill_value=0)

In [171]:
df3.style.background_gradient(cmap='YlGnBu', axis=None)

w1,chinese,eat,food,i,lunch,spend,to,want
w0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
chinese,0.0,0.0,0.518987,0.006329,0.006329,0.0,0.0,0.0
eat,0.021448,0.0,0.002681,0.0,0.0563,0.0,0.002681,0.0
food,0.000915,0.0,0.00366,0.013724,0.0,0.0,0.013724,0.0
i,0.0,0.003553,0.0,0.001974,0.0,0.00079,0.0,0.330438
lunch,0.0,0.0,0.002933,0.005865,0.0,0.0,0.0,0.0
spend,0.0,0.0,0.0,0.003597,0.0,0.0,0.003597,0.0
to,0.000827,0.283823,0.0,0.000827,0.002482,0.087298,0.001655,0.0
want,0.006472,0.001079,0.006472,0.002157,0.005394,0.001079,0.655879,0.0


In [159]:
N = 20
M = 10

# Generate N sentences
for i in range(N):
    print(str(i+1).zfill(2), end = ' ')
    w0 = df1.sample(weights='n').index[0]
    ll = np.log(df1.loc[w0].p)
    
    # Select M words for each sentence
    for j in range(M):
        print(w0, end=' ')
        w1 = df3.loc[w0].sample(weights=df3.loc[w0].values).index[0]
        ll += np.log(df3.loc[w0, w1])
        w0 = w1
    
    # Add log likelihood
    print(round(ll))

01 to eat chinese food to spend i want to eat -26
02 to eat lunch i want to spend i want food -31
03 eat lunch i want to eat chinese food to spend -30
04 i want to eat lunch food i want to eat -23
05 to chinese food i want to eat lunch food to -30
06 eat lunch i want to eat lunch food i want -28
07 want to eat chinese food i want to eat lunch -23
08 food i want to eat chinese food i want to -21
09 i want to spend to eat chinese food i want -22
10 to eat lunch i want to eat lunch i want -23
11 food i want to eat lunch food food i want -29
12 i want to spend to eat lunch food to eat -29
13 food food i want to spend to eat lunch food -36
14 to spend i want to eat food to eat lunch -32
15 i want to eat chinese food i want to eat -19
16 i want to spend i want to eat chinese food -22
17 spend i want to eat chinese food to eat lunch -31
18 to eat lunch i want to eat lunch i want -23
19 to eat lunch i want to eat lunch i want -23
20 food i want to spend to eat lunch food to -31


In [135]:
df1.sample(weights='n').index[0]

'to'

In [139]:
df3.loc['i','want']

0.3304382155546782