#Cargar las librerías

In [84]:
import numpy as np
import pandas as pd
import re
import seaborn as sns
import matplotlib.pyplot as plt
from google.colab import drive
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler

drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


# Cargar los datos

In [85]:
test_data = pd.read_csv('/content/gdrive/MyDrive/ITA/Proyecto/test.csv')
train_data = pd.read_csv('/content/gdrive/MyDrive/ITA/Proyecto/train.csv')

#Llenamos los datos vacios

In [86]:
#Llenamos los embarcamientos vacios con el de mayor cantidad
train_data["Embarked"].fillna("S", inplace = True)

#Llenamos la única tarifa vacía
test_data = test_data.fillna({'Fare': 8.0500})

# Primero extraemos los titulos de de la columna Name
train_data['Title'] = train_data['Name'].apply(lambda x: re.search(' ([A-Za-z]+)\.', x).group(1))

test_data['Title'] = test_data['Name'].apply(lambda x: re.search(' ([A-Za-z]+)\.', x).group(1))
#La funcion  (=re.search(' ([A-Za-z]+)\.', x) encuentra la relacion de un espacio, luego uno o más caracteres alfabéticos en mayúsculas o minúsculas, seguidos de un punto.

# Calculamos la media para data titulo
median_age_by_title = train_data.groupby('Title')['Age'].median()

# Ponemos las edades en donde falten
for title, median_age in median_age_by_title.items():
    train_data.loc[(train_data['Age'].isnull()) & (train_data['Title'] == title), 'Age'] = median_age

for title, median_age in median_age_by_title.items():
    test_data.loc[(test_data['Age'].isnull()) & (test_data['Title'] == title), 'Age'] = median_age

# Comenzamos a cambiar valores categoricos por numericos


In [87]:
#Cambiamos el sexo a valores binarios
train_data['Sex'] = train_data['Sex'].map({"male":1,"female":0})
test_data['Sex'] = test_data['Sex'].map({"male":1,"female":0})

#Cambiamos los datos de cabina a binarios si es que tenía o no
train_data['Cabin'] = np.where(train_data['Cabin'].isnull(), 0, 1)
test_data['Cabin'] = np.where(test_data['Cabin'].isnull(), 0, 1)

#Cambiamos el los embarcamientos a valores numéricos
replace_values = {"S":1, "C":2, "Q":3}
train_data['Embarked'] = train_data['Embarked'].map(replace_values)
test_data['Embarked'] = test_data['Embarked'].map(replace_values)

#Agrupamos las edades y las colocamos en un nuevo feature numerico

In [88]:
train_data['Childhood'] = train_data.loc[(train_data['Age'] >= 0) & (train_data['Age'] <= 9), 'Age']
train_data['Childhood'] = np.where(train_data['Childhood'].isnull(), 0, 1)
train_data['Adolescence'] = train_data.loc[(train_data['Age'] >= 10) & (train_data['Age'] <= 20), 'Age']
train_data['Adolescence'] = np.where(train_data['Adolescence'].isnull(), 0, 1)
train_data['Adulthood'] = train_data.loc[(train_data['Age'] >= 21) & (train_data['Age'] <= 39), 'Age']
train_data['Adulthood'] = np.where(train_data['Adulthood'].isnull(), 0, 1)
train_data['MiddleAge'] = train_data.loc[(train_data['Age'] >= 40) & (train_data['Age'] <= 59), 'Age']
train_data['MiddleAge'] = np.where(train_data['MiddleAge'].isnull(), 0, 1)
train_data['Elder'] = train_data.loc[(train_data['Age'] >= 60) & (train_data['Age'] <= 80), 'Age']
train_data['Elder'] = np.where(train_data['Elder'].isnull(), 0, 1)

test_data['Childhood'] = test_data.loc[(test_data['Age'] >= 0) & (test_data['Age'] <= 9), 'Age']
test_data['Childhood'] = np.where(test_data['Childhood'].isnull(), 0, 1)
test_data['Adolescence'] = test_data.loc[(test_data['Age'] >= 10) & (test_data['Age'] <= 20), 'Age']
test_data['Adolescence'] = np.where(test_data['Adolescence'].isnull(), 0, 1)
test_data['Adulthood'] = test_data.loc[(test_data['Age'] >= 21) & (test_data['Age'] <= 39), 'Age']
test_data['Adulthood'] = np.where(test_data['Adulthood'].isnull(), 0, 1)
test_data['MiddleAge'] = test_data.loc[(test_data['Age'] >= 40) & (test_data['Age'] <= 59), 'Age']
test_data['MiddleAge'] = np.where(test_data['MiddleAge'].isnull(), 0, 1)
test_data['Elder'] = test_data.loc[(test_data['Age'] >= 60) & (test_data['Age'] <= 80), 'Age']
test_data['Elder'] = np.where(test_data['Elder'].isnull(), 0, 1)

def intervalsAge(x):
    if x >= 0 and x <= 9:
        return 1
    elif x >= 10 and x <= 20:
        return 2
    elif x >= 21 and x <= 39:
        return 3
    elif x >= 40 and x <= 59:
        return 4
    else:
        return 5

train_data['AgeDesignations'] = train_data.apply(lambda row: intervalsAge(row['Age']), axis=1)
test_data['AgeDesignations'] = test_data.apply(lambda row: intervalsAge(row['Age']), axis=1)

#Guardamos los datos nuevos

In [89]:
#output = train_data
#output.to_csv('/content/drive/MyDrive/ITA/Proyecto/train_data_cleaned.csv', index=False)

# Filtramos los datos a utilizar para el método de predicción

In [90]:
new_train_data_x = train_data.filter(['Pclass','Sex','Fare','Cabin','Embarked','AgeDesignations'])
new_train_data_y = train_data['Survived']
new_test_data_x = test_data.filter(['Pclass','Sex','Fare','Cabin','Embarked','AgeDesignations'])

#Comenzamos con el método

In [91]:
#Escalamos
scaler = MinMaxScaler()
new_train_data_x = scaler.fit_transform(new_train_data_x)
#Lo siguiente no es necesariamente lo más eficiente, pero es un método para elegir la cantidad de datos vecinos
#Dividimos datos en muestra y testeo
X_train, X_test, y_train, y_test = train_test_split(new_train_data_x,new_train_data_y, stratify = new_train_data_y)
acc_list = []
for i in range(1,10):
    KNN = KNeighborsClassifier(n_neighbors = i)
    KNN.fit(X_train, y_train)
    pred = KNN.predict(X_test)
    acc = accuracy_score(y_test, pred, normalize=True) * float(100)
    acc_list.append(acc)
    print('\n accuracy for k=%d is %d'%(i,acc))
nk = acc_list.index(max(acc_list))

#Realizamos el metodo y el fit
KNN = KNeighborsClassifier(n_neighbors = nk)
KNN.fit(X_train, y_train)
#Realizamos la prediccion
pred = KNN.predict(new_test_data_x)
#Lo colocamos en un dataframe para hacerlo archivo
archivoEntrega = pd.DataFrame(test_data["PassengerId"])
archivoEntrega['Survived'] = pred
#Se exporta el archivo
archivoEntrega.to_csv('Predict.csv', index=False)


 accuracy for k=1 is 74

 accuracy for k=2 is 73

 accuracy for k=3 is 75

 accuracy for k=4 is 74

 accuracy for k=5 is 75

 accuracy for k=6 is 78

 accuracy for k=7 is 78

 accuracy for k=8 is 76

 accuracy for k=9 is 78


