In [1]:
import pandas as pd
import numpy as np
from collections import Counter as ctr

In [50]:
data = pd.read_csv('timelymealsdiscreteannotated.csv', skiprows=1, names=['type','text', 'c3', 'c4', 'c5'], delimiter=',', encoding='ISO-8859-1')

In [51]:
data['clean'] = data.text.apply(lambda x: x.lower().split())

In [52]:
data.drop(['c3', 'c4', 'c5'], axis=1, inplace=True)

In [53]:
data

Unnamed: 0,type,text,clean
0,Valid,Generate me a meal plan for 3 days,"[generate, me, a, meal, plan, for, 3, days]"
1,Valid,Give me a meal plan for a week and 2 meals per...,"[give, me, a, meal, plan, for, a, week, and, 2..."
2,Valid,Meal plan for the week,"[meal, plan, for, the, week]"
3,Valid,Give me a meal plan for the day,"[give, me, a, meal, plan, for, the, day]"
4,Valid,Generate me a meal plan for the weekend,"[generate, me, a, meal, plan, for, the, weekend]"
...,...,...,...
203,Invalid,Help me eat more vegetables.,"[help, me, eat, more, vegetables.]"
204,Invalid,What meals freeze well for later?,"[what, meals, freeze, well, for, later?]"
205,Invalid,Suggest food that helps with energy levels.,"[suggest, food, that, helps, with, energy, lev..."
206,Invalid,Recommend comfort food dishes.,"[recommend, comfort, food, dishes.]"


In [54]:
data.groupby('type').count()

Unnamed: 0_level_0,text,clean
type,Unnamed: 1_level_1,Unnamed: 2_level_1
Invalid,59,59
Valid,149,149


In [56]:
type_ctr = ctr(data.type)

type_ctr['Invalid']/len(data), type_ctr['Valid']/len(data)

(0.28365384615384615, 0.7163461538461539)

In [57]:
test = data.sample(frac=0.1)
train = data[~data.index.isin(test.index)]

train.shape, test.shape

((187, 3), (21, 3))

### Prior probability

In [58]:
ham_spam_ctr = ctr(train.type)

def Pa(X=''):
    return ham_spam_ctr[X] / len(train)

In [60]:
Pa('Valid')

0.7058823529411765

### Normalization

In [61]:
words_ctr = ctr([word for row in train.clean for word in row])

def Pb(W=''):
    if W not in words_ctr: return 0.000001
    return words_ctr[W] / sum(words_ctr.values())


words_ctr

Counter({'for': 149,
         'a': 145,
         'plan': 121,
         'meal': 111,
         'the': 77,
         'next': 54,
         'meals': 47,
         'days': 29,
         'me': 25,
         'week': 24,
         'with': 24,
         'days,': 19,
         'generate': 17,
         'create': 17,
         'suggest': 15,
         'i': 15,
         'what': 13,
         'give': 12,
         'and': 12,
         'seven': 12,
         'on': 12,
         'five': 11,
         'including': 11,
         'weekend': 10,
         'three': 10,
         '7': 9,
         'focusing': 9,
         'four': 9,
         'make': 9,
         'can': 9,
         'day': 8,
         'week,': 8,
         'provide': 8,
         'how': 8,
         'schedule': 7,
         'one': 7,
         'dinners': 7,
         '3': 6,
         'healthy': 6,
         'design': 6,
         'dinner': 6,
         'outline': 6,
         'lunches': 6,
         'to': 6,
         'all': 6,
         'six': 6,
         'is': 6,
         'b

### Maximum Likelihood

In [64]:
words_ham_spam = {}

for col_name in list(set(data.type)):
    sub_df = train[train.type == col_name] #masking data type to only be ham sub-datatype
    words_ham_spam[col_name] = []
    for row in sub_df.clean:
        for word in row:
            words_ham_spam[col_name].append(word)
    words_ham_spam[col_name] = ctr(words_ham_spam[col_name])

def Pba(W='', X=''):
    t = words_ham_spam[X]
    if W not in t: return 0.0000001
    return t[W] / sum(t.values())


words_ham_spam['Valid']

Counter({'for': 134,
         'a': 123,
         'plan': 120,
         'meal': 110,
         'the': 72,
         'next': 54,
         'meals': 38,
         'days': 29,
         'week': 24,
         'with': 20,
         'days,': 19,
         'generate': 17,
         'me': 16,
         'create': 15,
         'and': 12,
         'seven': 12,
         'five': 11,
         'including': 11,
         'weekend': 10,
         'three': 10,
         'on': 10,
         'give': 9,
         '7': 9,
         'suggest': 9,
         'focusing': 9,
         'four': 9,
         'day': 8,
         'week,': 8,
         'provide': 8,
         'one': 7,
         'dinners': 7,
         '3': 6,
         'schedule': 6,
         'design': 6,
         'outline': 6,
         'lunches': 6,
         'six': 6,
         'balanced': 5,
         'daily': 5,
         'healthy': 5,
         'monday': 5,
         'snacks': 5,
         'family': 5,
         'make': 5,
         'all': 5,
         'per': 4,
         'upcoming

In [69]:
Pba(W='the', X='Valid')

0.05741626794258373

In [70]:
def Pab(X='', W=''):
    return Pba(W, X) * Pa(X) / Pb(W)

In [79]:
Pab(X='Valid', W='the')

0.8469009178268959

In [80]:
def Ps(T, X=''):
    return np.prod([Pab(X=X, W=word) for word in T])

In [83]:
Ps(['make', 'meal', 'plan', 'for', 'week'], 'Valid')

np.float64(0.29927178109619446)

### Evaluate

In [84]:
test['Valid'] = test.clean.apply(lambda x: Ps(x, 'Valid'))
test['Invalid'] = test.clean.apply(lambda x: Ps(x, 'Invalid'))

In [85]:
test['winner'] = test.Valid /  test.Invalid
test['winner'] = test.winner.apply(lambda x: 'Valid' if x>1 else 'Invalid')

In [86]:
result = test.winner == test.type

In [87]:
sum(result) / len(test)

0.9523809523809523

In [88]:
test

Unnamed: 0,type,text,clean,Valid,Invalid,winner
133,Valid,Prepare a meal plan for the next four breakfasts,"[prepare, a, meal, plan, for, the, next, four,...",0.2875282,5.3468190000000005e-28,Valid
142,Valid,"Organize a meal plan for the next five days, f...","[organize, a, meal, plan, for, the, next, five...",0.1780212,9.546896e-40,Valid
43,Valid,Give me a vegetarian meal plan for 5 days,"[give, me, a, vegetarian, meal, plan, for, 5, ...",0.04919927,1.970926e-17,Valid
66,Valid,I need meals planned for the next three days,"[i, need, meals, planned, for, the, next, thre...",0.005795271,9.102109e-27,Valid
120,Valid,Schedule a meal plan for the next three breakf...,"[schedule, a, meal, plan, for, the, next, thre...",0.2464528,5.809421e-24,Valid
33,Valid,Create a meal plan for the next week with brea...,"[create, a, meal, plan, for, the, next, week, ...",0.07113362,2.754609e-35,Valid
11,Valid,Generate a high-protein meal plan for 7 days,"[generate, a, high-protein, meal, plan, for, 7...",0.02645997,2.522955e-24,Valid
178,Invalid,Find me a food blog.,"[find, me, a, food, blog.]",8.110317e-11,0.005073099,Invalid
115,Valid,Build a meal plan for five consecutive breakfasts,"[build, a, meal, plan, for, five, consecutive,...",0.3395063,4.09357e-25,Valid
68,Valid,Help me plan meals for the week,"[help, me, plan, meals, for, the, week]",0.1618151,1.030255e-11,Valid
