### Maximum likelihood classifier based on Bayes' decision rule. The classifier is optimal with respect to minimizing the classification error probability.

### - The model assumes training samples were generated by a multivariate gaussian distribution

In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

In [2]:
import math

In [3]:
from sklearn.metrics import accuracy_score

+ Get the data
+ Split into x_train, x_test, y_train, y_test using train_test_split()
+ Compute $\mu_i$ and $\Sigma_i$ for some subset of x_train.
    + What is a subset? In case of breast cancer data, there are two: benign and malignant
    + x_train[y_train==0] -- benign cases, x_train[y_train==1] -- malignant cases
    + get_dist_params(data), where data -- is the subset of x_train. The function returns a tuple: ($\mu$, $\Sigma$)
+ Compute the prior probability $P(\omega_i)$
    + The size of the subset/ the size of x_train
+ Compute $g_i$
    + get_g(x,dist_params,log_prior), where x -- a data point, dist_params -- see above, log_prior = $\log(P(\omega_i))$. The function returns $g_i$ for the point x
    + g_scores = [(t,get_g(...)) for t in np.unique(y_train)], i.e. compute g scores for all target labels
    + y_pred = sorted(g_scores, key = lambda x:x[1])[-1][0], i.e. find the label with the highest score

#### Loading the data

In [4]:
# Load the iris data set: `df` holds the numeric features, `t_target` the integer class labels.
data = load_iris()
df = pd.DataFrame(data.data, columns=data.feature_names)
t_target = pd.Series(data.target)

# `df2` is an independent, human-readable copy of the features whose extra
# 'target' column carries the species name instead of the integer label.
df2 = df.copy(deep=True)
df2['target'] = t_target.replace([0, 1, 2], data.target_names)

In [5]:
# Preview the first ten rows of the labelled copy of the data.
df2.head(10)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
5,5.4,3.9,1.7,0.4,setosa
6,4.6,3.4,1.4,0.3,setosa
7,5.0,3.4,1.5,0.2,setosa
8,4.4,2.9,1.4,0.2,setosa
9,4.9,3.1,1.5,0.1,setosa


#### Splitting the data

In [6]:
# Hold out 30% of the samples for testing; stratifying on the labels keeps the
# class proportions the same in the training and test parts.
x_train, x_test, y_train, y_test = train_test_split(
    df,
    t_target,
    test_size=0.3,
    stratify=t_target,
)

#### Splitting the training data by class

In [7]:
# subsets[k] holds the training rows whose label equals k, one entry per class name.
subsets = [
    x_train[y_train == label]
    for label in range(len(data.target_names))
]

#### Function to compute $\mu_i$ and $\Sigma_i$ for each subset of x_train

In [8]:
def get_dist_params(data):
    """Estimate the Gaussian parameters of one class from its samples.

    Parameters
    ----------
    data : pandas.DataFrame or array-like, shape (n_samples, n_features)
        One sample per row, one feature per column.

    Returns
    -------
    tuple
        ``(mu, sigma)`` where ``mu`` is the per-feature mean vector and
        ``sigma`` is the sample covariance matrix between the features
        (``np.cov`` default normalisation).
    """
    # Convert once so DataFrames and plain arrays are handled identically;
    # calling `.mean()` on a raw ndarray would collapse it to a single scalar.
    samples = np.asarray(data, dtype=float)
    mu = samples.mean(axis=0)
    # rowvar=False: columns are the variables, rows are the observations.
    sigma = np.cov(samples, rowvar=False)
    return (mu, sigma)

#### Compute $\mu_i$ and $\Sigma_i$ for each subset of x_train

In [9]:
# Estimate the mean vector and covariance matrix of every class subset.
# means[k] and covs[k] correspond to subsets[k]; the parameters are computed
# once per subset and unpacked, instead of being estimated twice.
means = []
covs = []
for subset in subsets:
    mu, sigma = get_dist_params(subset)
    means.append(mu)
    covs.append(sigma)

#### Compute prior probabilities

In [10]:
# Prior of each class = number of training rows in its subset / total training rows.
prior_probs = [subset.shape[0] / len(x_train) for subset in subsets]

#### Compute the log of each prior probability

In [11]:
# Natural log of each class prior, in the same order as `subsets`.
log_prior = [math.log(prior_probs[idx]) for idx, _ in enumerate(subsets)]

#### Function to find the value of g = log(class-conditional probability density function) + log(prior probability for the class)

In [12]:
def get_g(X,means,cov,log_prior): # Function to find the g-score of the point X
    """Return the Gaussian discriminant score g of point ``X`` for one class.

    g = -1/2 (X - mu)^T Sigma^-1 (X - mu)
        - d/2 * log(2*pi) - 1/2 * log|Sigma| + log_prior

    Parameters
    ----------
    X : 1-D array of length d
        The data point to score.
    means : 1-D array of length d
        Class mean vector mu.
    cov : (d, d) array
        Class covariance matrix Sigma.
    log_prior : float
        Natural log of the class prior probability.

    Returns
    -------
    float
        The score g; the class with the largest score is the predicted one.

    Raises
    ------
    numpy.linalg.LinAlgError
        If ``cov`` is singular.
    ValueError
        If the determinant of ``cov`` is not positive.
    """
    diff = np.asarray(X) - np.asarray(means)
    # Solve Sigma * z = diff instead of forming the explicit inverse.
    mahalanobis_sq = np.matmul(diff, np.linalg.solve(cov, diff))
    # slogdet returns log|Sigma| directly, so a tiny determinant that would
    # underflow to 0.0 (and break math.log) is still handled correctly.
    sign, log_det = np.linalg.slogdet(cov)
    if sign <= 0:
        raise ValueError("math domain error: covariance determinant is not positive")
    result = -1/2*mahalanobis_sq
    result += (log_prior - len(means)/2*math.log(2*math.pi) - 1/2*log_det)
    return result

#### For each row of test data, calculate g-scores for each class and assign to the class with the highest g-score

In [13]:
# Predict a label for every row of the test data: score the row against each
# class and keep the label whose g-score is the largest.
class_labels = np.unique(y_train)
y_pred = []
for point in x_test.to_numpy():
    g_scores = [
        (label, get_g(point, means[label], covs[label], log_prior[label]))
        for label in class_labels
    ]
    ranked = sorted(g_scores, key=lambda pair: pair[1])
    y_pred.append(ranked[-1][0])

In [14]:
# Fraction of test samples whose predicted label matches the true label.
accuracy_score(y_test, y_pred)

0.9555555555555556