### Probabilities

In [1]:
import pandas as pd
import numpy as np

# Contingency table: one row per sport, one column per gender.
df = pd.DataFrame(
    {
        'male': [240, 200, 100],
        'female': [150, 50, 260],
    },
    index=['cricket', 'football', 'others'],
)

In [2]:
class probDF(object):
    """Contingency table with totals, plus a copy normalized by the grand total.

    ``self.df`` is a copy of the input frame extended with a ``'total'`` row
    and a ``'total'`` column. ``self.norm_df`` is that frame divided by the
    grand total, so its cells can be read as joint probabilities and its
    ``'total'`` row/column as marginal probabilities.
    """

    def __init__(self, df):
        """Build the totals table and its normalized version from ``df``.

        The input frame is copied, so the caller's frame is not modified.
        """
        table = df.copy()
        # Compute both margins before the table is enlarged.
        per_row = table.sum(axis=1)
        per_column = table.sum(axis=0)

        table.loc['total'] = per_column
        table['total'] = per_row
        # per_row has no 'total' label, so the corner cell is filled explicitly.
        table.loc['total', 'total'] = per_row.sum()

        self.df = table
        self.norm_df = table / table.loc['total', 'total']

    def marginal_prob(self, var, col=True):
        """Return the marginal probability of ``var``.

        ``var`` is looked up as a column label when ``col`` is true and as a
        row label otherwise. A one-line description is printed first.
        """
        print(f'marginal prob. for {var}')
        if col:
            row_label, col_label = 'total', var
        else:
            row_label, col_label = var, 'total'
        return self.norm_df.loc[row_label, col_label]

    def joint_prob(self, row, col):
        """Return the joint probability stored at (``row``, ``col``)."""
        print(f'joint prob for {row} and {col}')
        normalized = self.norm_df
        return normalized.loc[row, col]

    def conditional_prob(self, value, given, given_col=True):
        """Return the probability of ``value`` conditioned on ``given``.

        When ``given_col`` is true, ``given`` is a column label and ``value``
        a row label; otherwise ``given`` is a row label and ``value`` a
        column label. The joint cell is divided by the marginal of ``given``.
        """
        print(f'conditional prob of {value} given {given}')
        normalized = self.norm_df
        if not given_col:
            return normalized.loc[given, value] / normalized.loc[given, 'total']
        return normalized.loc[value, given] / normalized.loc['total', given]

    def show_df(self):
        """Return the table with totals (the stored frame, not a copy)."""
        return self.df

    def show_norm_df(self):
        """Return the normalized table (the stored frame, not a copy)."""
        return self.norm_df

In [3]:
prob_df = probDF(df)

In [4]:
prob_df.show_df()

Unnamed: 0,male,female,total
cricket,240,150,390.0
football,200,50,250.0
others,100,260,360.0
total,540,460,1000.0


In [5]:
prob_df.show_norm_df()

Unnamed: 0,male,female,total
cricket,0.24,0.15,0.39
football,0.2,0.05,0.25
others,0.1,0.26,0.36
total,0.54,0.46,1.0


In [6]:
prob_df.joint_prob('cricket', 'female')

joint prob for cricket and female


0.15

In [7]:
prob_df.joint_prob('others', 'male')

joint prob for others and male


0.1

In [8]:
prob_df.marginal_prob('male')

marginal prob. for male


0.54

In [9]:
prob_df.marginal_prob('cricket', False)

marginal prob. for cricket


0.39

In [10]:
prob_df.conditional_prob('cricket', 'male')

conditional prob of cricket given male


0.4444444444444444

In [11]:
prob_df.conditional_prob('male', 'cricket', False)  # NOTE: conditioning is not symmetric: P(male | cricket) != P(cricket | male)

conditional prob of male given cricket


0.6153846153846153

### Bayes

In [12]:
prob_spam = 0.03        # prior P(spam); passed to populate() as positive_prob
detection_rate = 0.99   # P(predicted spam | spam): find spam if there is spam
false_positive = 0.002  # P(predicted spam | not spam): classify not spam as spam

In [13]:
class bayesDF(object):
    """Table of joint probabilities (with totals) for applying Bayes' rule.

    Columns hold the true labels and rows hold the predictions. Labels are
    addressed by position, so their order matters:

    * ``columns``: negative label, positive label, ..., total (last)
    * ``rows``: positive prediction, negative prediction, ..., total (last)
    """

    def __init__(self, columns, rows):
        """Create an all-zero template table.

        Parameters
        ----------
        columns : sequence of labels
            True-label names: negative first, positive second, total last.
        rows : sequence of labels
            Prediction names: positive first, negative second, total last.

        Raises
        ------
        ValueError
            If fewer than three column or row labels are given; the
            positional lookups used by ``populate`` would then collide.
        """
        columns = list(columns)
        rows = list(rows)
        if len(columns) < 3 or len(rows) < 3:
            raise ValueError(
                'columns and rows each need at least three labels '
                '(two outcomes followed by a total)'
            )
        self.df = pd.DataFrame(np.zeros((len(rows), len(columns))),
                               columns=columns, index=rows)
        self.rows = rows
        self.cols = columns
        self.bayes_df = None
        self.flag = False

    def ret(self):
        """Return the populated table, or print a hint and return ``None``."""
        if not self.flag:
            print('Need to populated dataframe first')
        else:
            return self.bayes_df

    def populate(self, positive_prob, detection_rate, fp_rate):
        """Fill the table with joint probabilities and row totals.

        Parameters
        ----------
        positive_prob : float
            Prior probability of the positive true label.
        detection_rate : float
            Probability of a positive prediction given a positive true label.
        fp_rate : float
            Probability of a positive prediction given a negative true label.

        Raises
        ------
        ValueError
            If any argument is outside ``[0, 1]`` (NaN included). Nothing is
            modified in that case; a previously populated table is kept.
        """
        checks = (('positive_prob', positive_prob),
                  ('detection_rate', detection_rate),
                  ('fp_rate', fp_rate))
        for arg_name, arg_value in checks:
            if not 0 <= arg_value <= 1:
                raise ValueError(
                    f'{arg_name} must be a probability in [0, 1], got {arg_value!r}'
                )

        neg_col, pos_col, total_col = self.cols[0], self.cols[1], self.cols[-1]
        pred_pos, pred_neg, total_row = self.rows[0], self.rows[1], self.rows[-1]
        negative_prob = 1 - positive_prob

        # Build in a local frame so a failure cannot leave a half-filled table.
        table = self.df.copy()

        # Column marginals: the priors of the true labels.
        table.loc[total_row, pos_col] = positive_prob
        table.loc[total_row, neg_col] = negative_prob

        # Positive true label split into detected / missed.
        table.loc[pred_pos, pos_col] = positive_prob * detection_rate
        table.loc[pred_neg, pos_col] = positive_prob - table.loc[pred_pos, pos_col]

        # Negative true label split into false alarm / correct rejection.
        table.loc[pred_pos, neg_col] = negative_prob * fp_rate
        table.loc[pred_neg, neg_col] = negative_prob * (1 - fp_rate)

        # The total column is still zero here, so the row sum is the row marginal.
        table.loc[:, total_col] = table.sum(axis=1)

        self.bayes_df = table
        self.flag = True

    def bayes_rule(self, tl, pred):
        """Return P(``tl`` | ``pred``) from the populated table.

        The joint probability in cell (``pred``, ``tl``) is divided by the
        row total of ``pred``. The Bayes' rule formula is printed first.

        Raises
        ------
        RuntimeError
            If ``populate`` has not been called yet.
        """
        if self.bayes_df is None:
            raise RuntimeError('call populate() before bayes_rule()')
        print(f'P({tl}|{pred}) = P({pred}|{tl}) * P({tl}) / P({pred})')
        return self.bayes_df.loc[pred, tl] / self.bayes_df.loc[pred, self.cols[-1]]

In [14]:
# TL = true label (ground truth); "pred" = prediction.
# Order is important because bayesDF addresses labels by position:
#   columns (ground truth): negative, positive, total
#   rows (predictions):     positive, negative, total
columns = ['tl_not_spam', 'tl_spam', 'total']
rows = ['pred_spam', 'pred_not_spam', 'total']

bdf = bayesDF(columns, rows)
bdf.ret()

Need to populated dataframe first


In [15]:
bdf.populate(prob_spam, detection_rate, false_positive)
bdf.ret()

Unnamed: 0,tl_not_spam,tl_spam,total
pred_spam,0.00194,0.0297,0.03164
pred_not_spam,0.96806,0.0003,0.96836
total,0.97,0.03,1.0


In [16]:
bdf.bayes_rule('tl_spam', 'pred_spam')

P(tl_spam|pred_spam) = P(pred_spam|tl_spam) * P(tl_spam) / P(pred_spam)


0.9386852085967131

In [17]:
bdf.bayes_rule('tl_not_spam', 'pred_spam')

P(tl_not_spam|pred_spam) = P(pred_spam|tl_not_spam) * P(tl_not_spam) / P(pred_spam)


0.06131479140328699

In [18]:
bdf.bayes_rule('tl_spam', 'pred_not_spam')

P(tl_spam|pred_not_spam) = P(pred_not_spam|tl_spam) * P(tl_spam) / P(pred_not_spam)


0.00030980213970011327

In [19]:
bdf.bayes_rule('tl_not_spam', 'pred_not_spam')

P(tl_not_spam|pred_not_spam) = P(pred_not_spam|tl_not_spam) * P(tl_not_spam) / P(pred_not_spam)


0.9996901978602999

In [20]:
bdf.populate(0.4, detection_rate, false_positive)

In [21]:
bdf.ret()

Unnamed: 0,tl_not_spam,tl_spam,total
pred_spam,0.0012,0.396,0.3972
pred_not_spam,0.5988,0.004,0.6028
total,0.6,0.4,1.0


In [22]:
bdf.bayes_rule('tl_spam', 'pred_spam')

P(tl_spam|pred_spam) = P(pred_spam|tl_spam) * P(tl_spam) / P(pred_spam)


0.9969788519637462

In [23]:
bdf.bayes_rule('tl_not_spam', 'pred_spam')

P(tl_not_spam|pred_spam) = P(pred_spam|tl_not_spam) * P(tl_not_spam) / P(pred_spam)


0.003021148036253776

In [24]:
bdf.bayes_rule('tl_spam', 'pred_not_spam')

P(tl_spam|pred_not_spam) = P(pred_not_spam|tl_spam) * P(tl_spam) / P(pred_not_spam)


0.006635700066357006

In [25]:
bdf.bayes_rule('tl_not_spam', 'pred_not_spam')

P(tl_not_spam|pred_not_spam) = P(pred_not_spam|tl_not_spam) * P(tl_not_spam) / P(pred_not_spam)


0.993364299933643

In [26]:
# NOTE(review): same call and arguments as cell In [20]; it rebuilds the same table.
bdf.populate(0.4, detection_rate, false_positive)