In [7]:
import numpy as np
import pandas as pd
from sklearn import metrics

In [205]:
X_train = pd.read_csv('X_train.csv')
Y_train = pd.read_csv('Y_train.csv')
X_valid = pd.read_csv('X_valid.csv')
Y_valid = pd.read_csv('Y_valid.csv')

In [128]:
class NaiveBayes:
    def fit(self, X, y):
        n_samples, n_features = X.shape
        self._classes = np.unique(y)
        n_classes = len(self._classes)

        # calculate mean, var, and prior for each class
        self._mean = np.zeros((n_classes, n_features), dtype=np.float64)
        self._var = np.zeros((n_classes, n_features), dtype=np.float64)
        self._priors =  np.zeros(n_classes, dtype=np.float64)

        for idx, c in enumerate(self._classes):
            X_c = X[y==c]
            self._mean[idx, :] = X_c.mean(axis=0)
            self._var[idx, :] = X_c.var(axis=0)
            self._priors[idx] = X_c.shape[0] / float(n_samples)

    def predict(self, X):
        y_pred = [self._predict(x) for x in X]
        return np.array(y_pred)

    def _predict(self, x):
        posteriors = []

        # calculate posterior probability for each class
        for idx, c in enumerate(self._classes):
            prior = np.log(self._priors[idx])
            posterior = np.sum(np.log(self._pdf(idx, x)))
            posterior = prior + posterior
            posteriors.append(posterior)
            
        # return class with highest posterior probability
        return self._classes[np.argmax(posteriors)]
            

    def _pdf(self, class_idx, x):
        mean = self._mean[class_idx]
        var = self._var[class_idx]
        numerator = np.exp(- (x-mean)**2 / (2 * var))
        denominator = np.sqrt(2 * np.pi * var)
        return numerator / denominator


In [129]:
nb = NaiveBayes()
nb.fit(X_train, np.array(Y_train))

predictions = nb.predict(np.array(X_valid))

In [812]:
classes = Y_train["passenger_survived"].unique()
means = []
variances = []
priors = []
for c in classes:
    f = X_train[np.array(Y_train)==c]
    mean_s = f.mean()
    var = f.std()
    prior = f.shape[0]/len(X_train)
    means.append(np.array(mean_s))
    variances.append(np.array(var))
    priors.append(np.array(prior))

info = {
   0:list(zip(means[1],variances[1])),
   1:list(zip(means[0],variances[0]))
}
info

{0: [(0.6867816091954023, 0.4644701837487957),
  (0.9741379310344828, 0.15895227365661044),
  (0.6752873563218391, 0.46894163522342186),
  (0.14655172413793102, 0.3541677018104052),
  (0.7758620689655172, 0.4176137832812562),
  (0.07758620689655173, 0.2679045187444013)],
 1: [(0.49321266968325794, 0.5010889006795146),
  (0.8959276018099548, 0.30604764068857054),
  (0.36199095022624433, 0.48166719924323365),
  (0.6561085972850679, 0.4760837082320005),
  (0.665158371040724, 0.47300643594032665),
  (0.16289592760180996, 0.37010899258519975)]}

In [268]:
# Calculate Gaussian Probability Density Function 
def calculateGaussianProbability(x, mean, stdev): 
    expo = np.exp(-((x - mean)**2 / (2 * (stdev**2)))) 
    return (1 / (np.sqrt(2 * np.pi) * stdev)) * expo

In [168]:
# Calculate Class Probabilities 
def calculateClassProbabilities(info, test): 
    probabilities = {} 
    for classValue, classSummaries in info.items(): 
        probabilities[classValue] = 1
        for i in range(len(classSummaries)): 
            mean, std_dev = classSummaries[i] 
            x = test[i] 
            probabilities[classValue] *= calculateGaussianProbability(x, mean, std_dev) 
    return probabilities

In [787]:
def predict(info, test): 
    probabilities = calculateClassProbabilities(info, test) 
    print(probabilities)
    bestLabel, bestProb = None, -1
    for classValue, probability in probabilities.items(): 
        if bestLabel is None or probability > bestProb: 
            bestProb = probability 
            bestLabel = classValue 
    return bestLabel

In [788]:
# returns predictions for a set of examples 
def getPredictions(info, test): 
    predictions = [] 
    for i in range(len(test)): 
        result = predict(info, test[i]) 
        predictions.append(result) 
    return predictions

In [789]:
# prepare model 
test = np.array(X_valid)

# test model 
predictions = getPredictions(info, test)

{0: 0.03482045318425356, 1: 0.0861930993346523}
{0: 0.6243380624252367, 1: 0.07637604425946255}
{0: 1.3854667232098163, 1: 0.04213214270921613}
{0: 0.08276511415546238, 1: 0.08389438325802143}
{0: 4.361636338704468e-05, 1: 0.013336268989321987}
{0: 0.03482045318425356, 1: 0.0861930993346523}
{0: 0.05400755315024579, 1: 0.03750646725405985}
{0: 1.3854667232098163, 1: 0.04213214270921613}
{0: 0.037296753608422425, 1: 0.15208153957556392}
{0: 1.3854667232098163, 1: 0.04213214270921613}
{0: 1.3854667232098163, 1: 0.04213214270921613}
{0: 0.0016202219821762505, 1: 0.003694633746046651}
{0: 1.3854667232098163, 1: 0.04213214270921613}
{0: 0.2848680140361448, 1: 0.02013830589672188}
{0: 4.730273232287483e-05, 1: 0.0034226343688205994}
{0: 0.1198479993370759, 1: 0.02069009787281318}
{0: 0.0016202219821762505, 1: 0.003694633746046651}
{0: 0.0016202219821762505, 1: 0.003694633746046651}
{0: 0.26266784617720174, 1: 0.0784687569535738}
{0: 0.08276511415546238, 1: 0.08389438325802143}
{0: 2.28425649

In [304]:
from sklearn import metricsaccuracy_score

In [309]:
metrics.recall_score(Y_valid,predictions)

0.7884615384615384

### Teorema de Bayes

$P(y|X) = \frac{P(X|y).P(y)}{P(X)}$

* Para multiples caracteristicas del vector X

$X = (X_1,X_2,X_3.....X_n)$

$P(y|X) = \frac{P(X_1|y).P(X_2|y).P(X_3|y)...P(X_n|y).P(y)}{P(X)}$

In [507]:
rating_probs = data.groupby('passenger_survived').size().div(len(data))
rating_probs

passenger_survived
0    0.611599
1    0.388401
dtype: float64

In [683]:
columns = X_train.columns.values
s0 = []
s1 = []
p0 = []
p1 = []

for d in data[columns]:
    a = data.groupby([d, 'passenger_survived']).size().div(len(data)).div(rating_probs, axis=0, level='passenger_survived')
    #a = pd.crosstab(data[d],data.passenger_survived,normalize='columns')
    po = data.groupby(d).size().div(len(data))
    p0.append(po[0])
    p1.append(po[1])
    s0.append({
        0:a[0][0],
        1:a[0][1]
    })
    s1.append({
        0:a[1][0],
        1:a[1][1]
    })

In [738]:
probs_c = [s1,s0]
#probs_c[0]

[{0: 0.6867816091954023, 1: 0.49321266968325794},
 {0: 0.9741379310344828, 1: 0.8959276018099548},
 {0: 0.6752873563218391, 1: 0.3619909502262444},
 {0: 0.14655172413793105, 1: 0.6561085972850679},
 {0: 0.7758620689655172, 1: 0.6651583710407241},
 {0: 0.07758620689655171, 1: 0.16289592760180996}]

In [739]:
def predictPior(x,probs_priori):
    prob = 1
    for i,v in enumerate(probs_priori):
        prob *= v[x[i]]
    return prob

In [791]:
def predictP(x,classes,probs_c,piors):
    probabilities = []
    for i, c in enumerate(classes):
        prior = priors[i]
        prior_pred = predictPior(x,probs_c[i])
        probabilities.append(prior*prior_pred)
    print(probabilities,classes[np.argmax(probabilities)])
    return classes[np.argmax(probabilities)]

In [768]:
def getProbs(X,clasess,probs_c,priors):
    probabilities = []
    for d in X:
        prob = predictP(d,classes,probs_c,priors)
        probabilities.append(prob)
    return probabilities

In [820]:
X_test = pd.read_csv('X_test.csv')
Y_test = pd.read_csv('Y_test.csv')


In [821]:
pp = getProbs(np.array(X_test),classes,probs_c,priors)

[0.0005480811099228432, 0.0036313949092306604] 0
[0.00046987828488709527, 0.0054249728334208125] 0
[0.0016023411042392609, 0.002036801244565984] 0
[0.006930307516293635, 0.00011437712144834662] 1
[0.0034167567663925405, 0.0009043553219115799] 1
[0.0008765491639946585, 0.002761022388705836] 0
[0.004266862166385168, 0.00027646600993628926] 1
[0.005941458208538918, 0.0001708689889510281] 1
[0.0014237055294381675, 0.0011422662523591458] 1
[0.0024537461455688644, 0.0014632468357684315] 1
[0.0006542892842310314, 0.003352888202593236] 0
[0.003924289842531316, 0.0011125359193212968] 1
[0.0005480811099228432, 0.0036313949092306604] 0
[0.011472847368379016, 0.000624006642554349] 1
[0.004577417603093354, 0.0007447147474860405] 1
[0.004416679425356996, 0.0019837884034636694] 1
[0.00046987828488709527, 0.0054249728334208125] 0
[0.0012205644388124503, 0.0017064416133398728] 0
[0.00046987828488709527, 0.0054249728334208125] 0
[0.0016023411042392609, 0.002036801244565984] 0
[0.0008765491639946585, 0.0

In [824]:
from sklearn import metrics

metrics.recall_score(Y_test,pp)

0.7391304347826086

In [795]:
from sklearn.naive_bayes import GaussianNB

In [797]:
gnb = GaussianNB()
fitx = gnb.fit(X_train, Y_train.values.ravel())

In [825]:
pred = fitx.predict(X_test)

In [828]:
metrics.recall_score(Y_test,pred)

#fitx.predict_proba(X_valid)[0]

0.7246376811594203

In [819]:
print(np.exp(fitx.predict_log_proba(X_valid)[1]))
print(pred[1])

[0.92793499 0.07206501]
0
