In [None]:
from collections import Counter
from functools import partial
from linear_algebra import *
from stats import median, standard_deviation
from probability import normal_cdf
from gradient_descent import maximize_batch, maximize_stochastic, minimize_stochastic
from simple_linear_regression import total_sum_of_squares
from working_with_data import rescale
from machine_learning import train_test_split
import math, random
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

In [None]:
def precision(tp, fp, fn, tn):
    """Fraction of positive predictions that were correct.

    Args:
        tp: count of true positives.
        fp: count of false positives.
        fn: count of false negatives (unused; kept for a uniform signature).
        tn: count of true negatives (unused; kept for a uniform signature).

    Returns:
        tp / (tp + fp), or 0.0 when nothing was predicted positive
        (tp + fp == 0) instead of raising ZeroDivisionError.
    """
    predicted_positives = tp + fp
    if predicted_positives == 0:
        return 0.0
    return tp / predicted_positives

def recall(tp, fp, fn, tn):
    """Fraction of actual positives that were predicted positive.

    Args:
        tp: count of true positives.
        fp: count of false positives (unused; kept for a uniform signature).
        fn: count of false negatives.
        tn: count of true negatives (unused; kept for a uniform signature).

    Returns:
        tp / (tp + fn), or 0.0 when there are no actual positives
        (tp + fn == 0) instead of raising ZeroDivisionError.
    """
    actual_positives = tp + fn
    if actual_positives == 0:
        return 0.0
    return tp / actual_positives

def f1_score(tp, fp, fn, tn):
    """Harmonic mean of precision and recall, computed from raw counts.

    Uses the closed form 2*tp / (2*tp + fp + fn), which is algebraically
    equal to 2*p*r / (p + r) with p = tp/(tp+fp) and r = tp/(tp+fn) but
    needs only one division, so there is a single zero case to guard.

    Args:
        tp: count of true positives.
        fp: count of false positives.
        fn: count of false negatives.
        tn: count of true negatives (unused; kept for a uniform signature).

    Returns:
        The F1 score, or 0.0 when tp, fp and fn are all zero.
    """
    denominator = 2 * tp + fp + fn
    if denominator == 0:
        return 0.0
    return 2 * tp / denominator

def predict(x_i, beta):
    """Return the dot product of the feature vector x_i and the coefficients beta."""
    prediction = dot(x_i, beta)
    return prediction

def error(x_i, y_i, beta):
    """Return the residual: the observed y_i minus the model's prediction for x_i."""
    predicted = predict(x_i, beta)
    return y_i - predicted

def squared_error(x_i, y_i, beta):
    """Return the square of the residual for a single observation."""
    residual = error(x_i, y_i, beta)
    return residual ** 2

def squared_error_gradient(x_i, y_i, beta):
    """Gradient of the i-th squared error term with respect to beta.

    Args:
        x_i: feature vector of one observation.
        y_i: observed target for that observation.
        beta: current coefficient vector.

    Returns:
        A list with one entry per feature: -2 * x_ij * residual.
    """
    # The residual does not depend on j, so compute it once rather than
    # re-evaluating the prediction for every feature.
    residual = error(x_i, y_i, beta)
    return [-2 * x_ij * residual for x_ij in x_i]

def estimate_beta(x, y, alpha_0=0.001):
    """Fit linear-regression coefficients with minimize_stochastic.

    Args:
        x: sequence of feature vectors; x[0] determines how many
            coefficients are created.
        y: sequence of targets, aligned with x.
        alpha_0: value forwarded as the final positional argument of
            minimize_stochastic (presumably its initial step size; confirm
            against gradient_descent). Defaults to the previously
            hard-coded 0.001.

    Returns:
        Whatever minimize_stochastic returns for the squared-error
        objective, starting from a random beta drawn with random.random().
    """
    # One random starting coefficient per feature column.
    beta_initial = [random.random() for _ in x[0]]
    return minimize_stochastic(squared_error,
                               squared_error_gradient,
                               x, y,
                               beta_initial,
                               alpha_0)

def multiple_r_squared(x, y, beta):
    """Return 1 minus the ratio of residual sum of squares to total_sum_of_squares(y)."""
    residual_sum_of_squares = 0
    for x_i, y_i in zip(x, y):
        residual_sum_of_squares += error(x_i, y_i, beta) ** 2
    return 1.0 - residual_sum_of_squares / total_sum_of_squares(y)

def logistic(x):
    """Logistic (sigmoid) function 1 / (1 + e^-x), evaluated without overflow.

    Args:
        x: a real number.

    Returns:
        A float in [0.0, 1.0].
    """
    if x >= 0:
        # exp(-x) <= 1 here, so it cannot overflow.
        return 1.0 / (1 + math.exp(-x))
    # For negative x, exp(-x) can exceed the float range (OverflowError for
    # x below about -709); the equivalent form e^x / (1 + e^x) keeps the
    # exponent non-positive.
    exp_x = math.exp(x)
    return exp_x / (1 + exp_x)

def logistic_log_likelihood_i(x_i,y_i,beta):
    """Log-likelihood of one observation under the logistic model.

    Args:
        x_i: feature vector of the observation.
        y_i: observed label; 1 is treated as the positive class, anything
            else as the negative class.
        beta: coefficient vector.

    Returns:
        log(logistic(z)) when y_i == 1, otherwise log(1 - logistic(z)),
        where z = dot(x_i, beta). Evaluated through log1p so a saturated
        sigmoid (exactly 0.0 or 1.0 in floating point) no longer triggers
        a math domain error from log(0).
    """
    z = dot(x_i, beta)
    if y_i != 1:
        # log(1 - logistic(z)) == log(logistic(-z))
        z = -z
    # log(logistic(z)) = -log(1 + e^-z) = z - log(1 + e^z); pick the form
    # whose exponent is non-positive so exp() cannot overflow.
    if z >= 0:
        return -math.log1p(math.exp(-z))
    return z - math.log1p(math.exp(z))
    
def logistic_log_likelihood(x,y,beta):
    """Return the sum of the per-observation log-likelihoods over the paired x and y."""
    total = 0
    for x_i, y_i in zip(x, y):
        total += logistic_log_likelihood_i(x_i, y_i, beta)
    return total

def logistic_log_partial_ij(x_i,y_i,beta,j):
    """Partial derivative of the i-th log-likelihood term with respect to beta[j].

    Computed as (y_i - logistic(dot(x_i, beta))) * x_i[j].
    """
    residual = y_i - logistic(dot(x_i, beta))
    return residual * x_i[j]

def logistic_log_gradient_i(x_i,y_i,beta):
    """Gradient of the i-th log-likelihood term with respect to beta.

    Args:
        x_i: feature vector of the observation.
        y_i: observed label.
        beta: coefficient vector; the result has one entry per element of beta.

    Returns:
        A list whose j-th entry is (y_i - logistic(dot(x_i, beta))) * x_i[j].
    """
    # The residual is the same for every j; evaluate the dot product and the
    # logistic once instead of once per coefficient.
    residual = y_i - logistic(dot(x_i, beta))
    return [residual * x_i[j] for j in range(len(beta))]

def logistic_log_gradient(x,y,beta):
    """Gradient of the full log-likelihood: the per-observation gradients summed.

    Args:
        x: sequence of feature vectors.
        y: sequence of labels, aligned with x.
        beta: coefficient vector.

    Returns:
        The result of folding vector_add over the per-observation gradients.

    Raises:
        TypeError: if x and y yield no pairs (reduce has no initial value).
    """
    # reduce is not a builtin in Python 3 and the notebook never imports it
    # explicitly; import it here so this function does not depend on a
    # star-import happening to re-export it.
    from functools import reduce

    per_observation = [logistic_log_gradient_i(x_i, y_i, beta)
                       for x_i, y_i in zip(x, y)]
    return reduce(vector_add, per_observation)



In [None]:
# Load the comma-separated file without a header row: the file's first line
# is skipped and columns are addressed by integer position.
df = pd.read_csv(
    'titanic.csv',
    sep=',',
    skiprows=[0],
    header=None,
)


In [None]:
# Target vector: 1 when the fourth column equals 'yes', otherwise 0.
datay = [int(y[3] == 'yes') for y in df.values]

In [None]:
# Feature rows: [constant 1, class code, age flag, sex flag].
#   class code: '1st class' -> 1, '2nd class' -> 2, anything else -> 3
#   age flag:   'adults' -> 0, anything else -> 1
#   sex flag:   'man' -> 0, anything else -> 1
datax = [
    [
        1,
        {'1st class': 1, '2nd class': 2}.get(x[0], 3),
        int(x[1] != 'adults'),
        int(x[2] != 'man'),
    ]
    for x in df.values
]

In [None]:
# Seed the RNG so the random initial coefficients drawn inside estimate_beta
# are the same on every run.
random.seed(0)
# Fit the linear model on the complete dataset (no train/test split yet).
beta = estimate_beta(datax,datay)
# Display the fitted coefficients.
beta

In [None]:
# Report the R-squared of the linear fit, measured on the same data it was fitted on.
print("R2 = ", multiple_r_squared(datax,datay,beta))

In [None]:
# Seed the RNG so the split and the initial coefficients are reproducible.
random.seed(0)
# Split the data into train and test parts (0.33 is presumably the test
# fraction; confirm against machine_learning.train_test_split).
x_train, x_test, y_train, y_test = train_test_split(datax, datay, 0.33)
# Fit on the training portion only. Fitting on datax/datay here would let the
# model see the test rows, which makes the test-set metrics computed in the
# following cells meaningless.
beta = estimate_beta(x_train, y_train)

In [None]:
# Tally the confusion matrix of the linear model on the test split, treating
# a prediction of at least 0.5 as the positive class.
tp = fp = tn = fn = 0
for x_i, y_i in zip(x_test, y_test):
    predict1 = predict(x_i, beta)
    if y_i == 1:
        # Actual positive: either caught (tp) or missed (fn).
        if predict1 >= 0.5:
            tp += 1
        else:
            fn += 1
    else:
        # Actual negative: either falsely flagged (fp) or correctly rejected (tn).
        if predict1 >= 0.5:
            fp += 1
        else:
            tn += 1

In [None]:
# Summarize the confusion-matrix counts tallied in the previous cell.
precision1 = precision(tp, fp, fn, tn)
recall1 = recall(tp, fp, fn, tn)
f1 = f1_score(tp, fp, fn, tn)

In [None]:
# Show precision, recall and F1 of the linear model, in that order.
print(precision1, recall1,f1)

In [None]:
# Rescale the feature matrix with the project's rescale helper, then refit
# the linear model on the rescaled features.
rescaled_x = rescale(datax)
# NOTE(review): this overwrites the `beta` fitted in the earlier cells.
beta = estimate_beta(rescaled_x, datay)


In [None]:
# Linear-model prediction for every rescaled row, in dataset order.
predictions = [predict(row, beta) for row in rescaled_x]

In [None]:
# Scatter the linear-model predictions (x axis) against the actual 0/1
# outcomes (y axis). Axis labels are in Portuguese: "predicted" / "actual".
plt.scatter(predictions, datay)
plt.xlabel('predito')
plt.ylabel('real')
plt.show()

In [None]:
# Seed the RNG so the split is reproducible.
random.seed(0)
# Split the rescaled features for the logistic model. The following cells
# read the test labels as `Y_test` (capital Y), so that name is kept.
x_train, x_test, y_train, Y_test = train_test_split(rescaled_x, datay, 0.33)
# Also rebind the lowercase name: x_test was just replaced, and leaving
# y_test pointing at the labels of the earlier split would pair labels and
# features from two different splits.
y_test = Y_test

In [None]:
# Log-likelihood as a function of beta only, with the training data bound in.
# NOTE(review): `fn` is also the false-negative counter in the
# confusion-matrix cells; this binding replaces that integer until a later
# cell resets it, so these cells must be run in order.
fn = partial(logistic_log_likelihood, x_train, y_train)

In [None]:
# Gradient of the log-likelihood as a function of beta only, with the
# training data bound in.
gradient_fn = partial(logistic_log_gradient, x_train,y_train)

In [None]:
# Random starting point with one coefficient per feature column. The feature
# rows built for datax have four entries (constant, class, age, sex), so a
# hard-coded length of 3 leaves the last feature without a coefficient:
# logistic_log_gradient_i only produces one gradient entry per element of beta.
beta_0 = [random.random() for _ in x_train[0]]

In [None]:
# Maximize the training log-likelihood with the batch optimizer, starting
# from beta_0. Requires `fn` to still be the partial defined above, not the
# false-negative counter.
beta_hat = maximize_batch(fn, gradient_fn, beta_0)

In [None]:
# Display the coefficients found by the batch optimizer.
beta_hat

In [None]:
# Refit with the stochastic optimizer from the same starting point, using the
# per-observation likelihood and gradient.
# NOTE(review): this overwrites the batch result held in `beta_hat`.
beta_hat = maximize_stochastic(logistic_log_likelihood_i, logistic_log_gradient_i, x_train, y_train, beta_0)

In [None]:
# Display the coefficients found by the stochastic optimizer.
beta_hat

In [None]:
# Predicted probability for every test row under the fitted logistic model.
predictions = [logistic(dot(beta_hat, x_i)) for x_i in x_test]
# Scatter predicted probability (x axis) against the actual 0/1 outcome
# (y axis). Labels and title are in Portuguese: "predicted probability",
# "actual output", "Logistic regression predicted vs actual".
plt.scatter(predictions, Y_test)
plt.xlabel("probabilidade predita")
plt.ylabel("saida real")
plt.title("Regressao logistica predita x real")
plt.show()

In [None]:
# Tally the confusion matrix of the logistic model on the test split,
# treating a predicted probability of at least 0.5 as the positive class.
tp = fp = tn = fn = 0
for x_i, y_i in zip(x_test, Y_test):
    # Deliberately not named `predict`: that would rebind the module-level
    # predict() function to a float and break error(), estimate_beta() and
    # every earlier cell that calls predict() when they are re-run.
    predicted_prob = logistic(dot(beta_hat, x_i))
    if y_i == 1 and predicted_prob >= 0.5:
        tp += 1
    elif y_i == 1:
        fn += 1
    elif predicted_prob >= 0.5:
        fp += 1
    else:
        tn += 1


# Summarize the counts for the logistic model.
precision1 = precision(tp, fp, fn, tn)
recall1 = recall(tp, fp, fn, tn)


In [None]:
# Display the logistic model's precision on the test split.
precision1


In [None]:
# Display the logistic model's recall on the test split.
recall1

In [None]:

# F1 score of the logistic model, from the counts tallied above; displayed
# as the cell output.
f1 = f1_score(tp, fp, fn, tn)
f1