In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import requests
import io

In [2]:
df=pd.read_csv( "https://github.com/huynhthanh98/ML/raw/ML-2022/Lab04/lienminh.csv")

In [3]:
df.head()

Unnamed: 0,killsDiff,minionsKilledDiff,wardPlacedDiff,firstBlood,heralds,dragons,teamWins
0,3,-2,13,blue,none,none,red
1,0,-66,0,red,red,red,red
2,-4,-17,0,red,none,blue,red
3,-1,-34,28,red,blue,none,red
4,0,-15,58,red,none,red,red


## UTILS

In [4]:
def onehot(x):
    _dict = dict([(i[1],i[0]) for i in enumerate(np.sort(np.unique(x)))]) # adding sort to make sure target var are properly encoding 
    nb_classes = len(np.unique(x))
    _x = np.vectorize(_dict.get)(x)
    one_hot_vector = np.eye(nb_classes)[_x.astype(int)]
    return (one_hot_vector , _dict) 

## CALCULATE  P(X|Y) 

- For Categorical variable: 

In [5]:
def categorical_likelihood_x_given_y(x, y, label):
    '''
    - input: vector feature x, vector target y, label of target (0,1)
    - output: vector p(x|yi)
    '''
    
    filter_yi=(y==label)
    if label==1:
        y_temp = y
    else:
        y_temp = 1-y
    _x_onehot=onehot(x)[0]
    _p_x_given_y =np.dot(_x_onehot.T, (y_temp.reshape(-1, 1))) / y[filter_yi].shape[0] # k scalar with k is 
    _p_xj_given_y = np.dot(_x_onehot, _p_x_given_y).reshape(-1)
    return _p_xj_given_y

- For Numerical Variable

In [6]:
def gaussian_likelihood_x_given_y (x, y, label):
    '''
    - input: vector feature x, vector target y, lable of target (0,1)
    - output: vector p(x|yi)
    '''
    filter_yi=(y==label)
    mean, std = np.mean(x[filter_yi]), np.std(x[filter_yi])
    p=(1/(np.sqrt(2*np.pi)*std)) * np.exp(-np.square(x-mean)/(2*np.square(std)))
    return p

In [7]:
def likelihood_calc(y, X_categorical=None, X_numerical=None):
    # calculating the likelihood P(x|y) matrix by using categorical and gaussian likelihood function given each yi
    num_labels = len(list(np.unique(y)))
    num_observations=y.shape[0]
    _likelihood=np.zeros([num_observations, num_labels]) 
    
    for label in list(np.unique(y)):
        if isinstance(X_categorical, np.ndarray): 
            _Likelihood_X_categorical=np.apply_along_axis(
                lambda x: categorical_likelihood_x_given_y (x, y, label) ,0 , X_categorical)
        else: 
             _Likelihood_X_categorical=np.zeros([y.shape[0],0]) 
                
        if isinstance(X_numerical, np.ndarray):  
            _Likelihood_X_numerical=np.apply_along_axis(
                lambda x: gaussian_likelihood_x_given_y (x, y, label) ,0, X_numerical)
        else: 
            _Likelihood_X_numerical=np.zeros([y.shape[0],0]) 

        _likelihood_matrix=np.concatenate(
            [_Likelihood_X_categorical, _Likelihood_X_numerical], axis=1) 
        
        _likelihood[:, label]=_likelihood_matrix[:, 0]
        for var in range(1, _likelihood_matrix.shape[1]):
            _likelihood[:, label]*=_likelihood_matrix[:, var] 
            
    return _likelihood  

##  Build the Classifier, calculate  P(Y|X) 


**Posterior density ∝ Likelihood × Prior density**

In [8]:
def posterior_probability_calc(likelihood, prior, _dict_label2idx):
    '''
    Posterior density ∝ Likelihood × Prior density  
    '''
    
    prior_dict = dict([(_dict_label2idx.get(i), j)  for i, j in prior.items()]) 
    posterior = np.zeros([likelihood.shape[0], likelihood.shape[1]])
    
    for i in range(likelihood.shape[1]):
        posterior[:, i] =prior_dict.get(i)*likelihood[:, i] 
    return posterior 

In [9]:
def classifying(posterior, _y_idx_label):
    r=np.argmax(posterior, axis=1).reshape(-1)
    return  np.vectorize(lambda x: _y_idx_label.get(x))(r)

In [10]:
def naive_bayes_prediction(
    df, 
    target_variable, 
    prior, 
    categorical_variables=None,
    numerical_variables=None):
    
    # convert pandas table to seperated numpy array of categorical and numerical variable 
    if categorical_variables != None: 
        X_categorical=df[categorical_variables].values
    else:
        X_categorical=None
    if numerical_variables != None: 
        X_numerical=df[numerical_variables].values
    else:
        X_numerical=None
    target=df[target_variable].values
    
    _dict_label2idx = dict([(i[1],i[0]) for i in enumerate(np.sort(np.unique(target)))])
    _dict_idx2label = dict([(i[0],i[1]) for i in enumerate(np.sort(np.unique(target)))])     
    
    y = np.vectorize(lambda x: _dict_label2idx.get(x))(target).astype(int) 
    
    likelihood=likelihood_calc(y, X_categorical, X_numerical) 
    posterior=posterior_probability_calc(likelihood, prior, _dict_label2idx)
    y_pred=classifying(posterior, _dict_idx2label) 
    return y_pred
    

In [42]:
def get_accuracy(y_target, y_predict):
    '''
    y_predict_score_matrix: result is list if float in (0,1)
    '''
    acc_score=np.mean((y_target.reshape(-1)==y_predict))
    return acc_score  

## Prediction and Accuracy

In [43]:
categorical_variables=['firstBlood', 'heralds', 'dragons']
numerical_variables=['killsDiff', 'minionsKilledDiff', 'wardPlacedDiff'] 
target_variable='teamWins'
y_target=df[target_variable].values

- Chỉ sử dụng các biến categorical

In [45]:
prior={'blue': 0.5, 'red': 0.5} 
prior 
y_nb_categical_pred = naive_bayes_prediction(
    df, 
    target_variable, 
    prior,  
    categorical_variables, 
    numerical_variables=None)  
acc = get_accuracy(y_target, y_nb_categical_pred) 
print("Accuracy if using categical variable only: ", acc)

Accuracy if using categical variable only:  0.6298208320680231


- Chỉ sử dụng các biến continuous

In [49]:
prior={'blue': 0.5, 'red': 0.5} 
prior 
y_nb_gaussian_pred = naive_bayes_prediction(
    df, 
    target_variable, 
    prior,   
    numerical_variables)  
acc = get_accuracy(y_target, y_nb_gaussian_pred)  
print("Accuracy if using continuous variable only: ", acc) 

Accuracy if using continuous variable only:  0.7250733879947363


- Sử dụng cả biến categorical và continuous

In [50]:
prior={'blue': 0.5, 'red': 0.5} 
prior 
y_nb_pred = naive_bayes_prediction(
    df, 
    target_variable, 
    prior,  
    categorical_variables, 
    numerical_variables) 
acc =  get_accuracy(y, y_nb_pred) 
print("Accuracy if using both: ", get_accuracy(y, y_nb_pred))  

Accuracy if using both:  0.7116104868913857


## SKlearn

In [51]:
from sklearn.naive_bayes import MultinomialNB 

In [52]:
MultinomialNB()

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [53]:
df.columns.values

array(['killsDiff', 'minionsKilledDiff', 'wardPlacedDiff', 'firstBlood',
       'heralds', 'dragons', 'teamWins'], dtype=object)

In [54]:
X=df[[col for col in df.columns.values if col != 'teamWins']].values
X_categorical=df[categorical_variables].values  
X_numerical=df[numerical_variables].values    

In [55]:
y=df['teamWins'].values

In [56]:
from sklearn.naive_bayes import GaussianNB 
from sklearn.naive_bayes import BernoulliNB 
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
 

In [57]:
# df[categorical_variables]

In [58]:
label_encoder=LabelEncoder()
label_encoded=label_encoder.fit_transform(df['teamWins'])


In [59]:
encoder = OrdinalEncoder()
X_categorical_encoded = encoder.fit_transform(df[categorical_variables])


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  X_int = np.zeros((n_samples, n_features), dtype=np.int)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  X_mask = np.ones((n_samples, n_features), dtype=np.bool)


In [60]:
bernoulli_clf = BernoulliNB(class_prior=[0.5, 0.5]) 
bernoulli_clf.fit(X_categorical_encoded, y) 

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=[0.5, 0.5], fit_prior=True)

- Chỉ sử dụng biến categorical

In [61]:
y_pred_categorical = bernoulli_clf.predict(X_categorical_encoded)
np.mean(y_pred_categorical==y) 

0.6133211863548942

- Chỉ sử dụng biến continous 

In [62]:
gaussian_clf = GaussianNB(priors=[0.5, 0.5]) 
gaussian_clf.fit(X_numerical, y)
y_pred_gaussian = gaussian_clf.predict(X_numerical) 
np.mean(y_pred_gaussian==y) 

0.7079663933596518

In [63]:
y_prob_pred_categorical = bernoulli_clf.predict_proba(X_categorical_encoded) 

In [64]:
y_prob_pred_gaussian = gaussian_clf.predict_proba(X_numerical)

In [65]:
y_prob_pred_categorical

array([[0.4977507 , 0.5022493 ],
       [0.30427725, 0.69572275],
       [0.52018688, 0.47981312],
       ...,
       [0.30427725, 0.69572275],
       [0.7107041 , 0.2892959 ],
       [0.4977507 , 0.5022493 ]])

In [66]:
y_prob_pred_gaussian

array([[0.70490043, 0.29509957],
       [0.2087349 , 0.7912651 ],
       [0.17662248, 0.82337752],
       ...,
       [0.20840452, 0.79159548],
       [0.2703737 , 0.7296263 ],
       [0.53971591, 0.46028409]])

- Sử dụng cả categorical và numerical

In [67]:
y_prob_pred_combined=y_prob_pred_categorical*y_prob_pred_gaussian/0.5
y_pred_argmax = np.argmax(y_prob_pred_combined, axis=1) 
y_pred=label_encoder.inverse_transform(y_pred_argmax)  

In [68]:
np.mean(y_pred==y)

0.7100921145864966