# Import libraries

## Base

In [149]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import scipy as s
import sklearn

import IPython

import os
import random
import time

%run ../scripts/config.py

## Modelling

### Algorithms

In [150]:
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process
from xgboost import XGBClassifier

### Helpers

In [151]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics

## Viz

In [152]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
from pandas.plotting import scatter_matrix

In [153]:
#%matplotlib inline = show plots in Jupyter Notebook browser
%matplotlib inline
mpl.style.use('ggplot')
sns.set_style('white')
pylab.rcParams['figure.figsize'] = 12,8

# Set up

In [154]:
PATHS = setup_paths()

In [155]:
TRAIN_PATH = os.path.join(PATHS['data_raw'], 'train.csv')
TEST_PATH = os.path.join(PATHS['data_raw'], 'test.csv')


# Import data

In [156]:
df_data  = pd.read_csv(TRAIN_PATH)

df_sub = pd.read_csv(TEST_PATH)

In [157]:
data_cleaner = [df_data, df_sub]

# Exploration

https://www.kaggle.com/c/titanic/data

In [158]:
df_data.sample(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
749,750,0,3,"Connaghton, Mr. Michael",male,31.0,0,0,335097,7.75,,Q
241,242,1,3,"Murphy, Miss. Katherine ""Kate""",female,,1,0,367230,15.5,,Q
630,631,1,1,"Barkworth, Mr. Algernon Henry Wilson",male,80.0,0,0,27042,30.0,A23,S
165,166,1,3,"Goldsmith, Master. Frank John William ""Frankie""",male,9.0,0,2,363291,20.52,,S
601,602,0,3,"Slabenoff, Mr. Petco",male,,0,0,349214,7.9,,S


## Nulls values

In [159]:
print("Null values in training data:")
print(df_data.isnull().sum())
print("-"*50)
print("Null values in test data:")
print(df_test.isnull().sum())

Null values in training data:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
--------------------------------------------------
Null values in test data:
PassengerId        0
Pclass             0
Name               0
Sex                0
Age                0
SibSp              0
Parch              0
Ticket             0
Fare               0
Cabin            327
Embarked           0
FamilySize         0
IsAlone            0
Title              0
FareBin            0
AgeBin             0
Sex_Code           0
Embarked_Code      0
Title_Code         0
AgeBin_Code        0
FareBin_Code       0
dtype: int64


# Clean data

In [160]:
for dataset in data_cleaner:
    dataset['Age'] = dataset.groupby(['Pclass', 'Sex'])['Age'].transform(lambda x: x.fillna(x.median()))
    dataset['Embarked'] = dataset['Embarked'].fillna(dataset['Embarked'].mode()[0])
    dataset['Fare'] = dataset.groupby(['Pclass', 'Sex'])['Fare'].transform(lambda x: x.fillna(x.median()))

drop_columns = ['PassengerId','Cabin', 'Ticket']

df_data.drop(columns=drop_columns, axis=1, inplace=True)

# Feature Engineering

In [161]:
for dataset in data_cleaner:
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1 # Hermanos/Conyuges + /Padres/Hijos + La persona
    dataset['IsAlone'] = np.where(dataset['FamilySize'] == 1, 1, 0 )
    dataset['Title'] = dataset['Name'].str.split(", ", expand=True)[1].str.split(". ", expand=True)[0]
    dataset['FareBin'] = pd.qcut(dataset['Fare'], 4)
    dataset['AgeBin'] = pd.cut(dataset['Age'], bins= [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100])

    rare_titles = dataset.value_counts('Title').loc[lambda x: x < 10].index
    dataset['Title'] = dataset['Title'].apply(lambda x: 'Misc' if x in rare_titles else x)

df_data.sample(5)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,IsAlone,Title,FareBin,AgeBin
116,0,3,"Connors, Mr. Patrick",male,70.5,0,0,7.75,Q,1,1,Mr,"(-0.001, 7.91]","(70, 80]"
370,1,1,"Harder, Mr. George Achilles",male,25.0,1,0,55.44,C,2,0,Mr,"(31.0, 512.329]","(20, 30]"
502,0,3,"O'Sullivan, Miss. Bridget Mary",female,21.5,0,0,7.63,Q,1,1,Miss,"(-0.001, 7.91]","(20, 30]"
276,0,3,"Lindblom, Miss. Augusta Charlotta",female,45.0,0,0,7.75,S,1,1,Miss,"(-0.001, 7.91]","(40, 50]"
153,0,3,"van Billiard, Mr. Austin Blyler",male,40.5,0,2,14.5,S,3,0,Mr,"(14.454, 31.0]","(40, 50]"


In [162]:
label = LabelEncoder()

for dataset in data_cleaner:
    for code_column in ['Sex', 'Embarked', 'Title', 'AgeBin', 'FareBin']:
        dataset[f'{code_column}_Code'] = label.fit_transform(dataset[code_column])

In [163]:
# Definir variable objetivo
y_name = ['Survived']

# Variables originales con nombres entendibles
X_name = ['Sex','Pclass', 'Embarked', 'Title','SibSp', 'Parch', 'Age', 'Fare', 'FamilySize', 'IsAlone']

# Variables en formato numérico (codificadas para Machine Learning)
X_calc = ['Sex_Code', 'Pclass', 'Embarked_Code', 'Title_Code', 'SibSp', 'Parch', 'Age', 'Fare']

# Variables con binning (para agrupar variables continuas en categorías)
X_bin = ['Sex_Code', 'Pclass', 'Embarked_Code', 'Title_Code', 'FamilySize', 'AgeBin_Code', 'FareBin_Code']

# Variables después de One-Hot Encoding (se generarán con get_dummies más adelante)
X_dummy = None  # Se definirá dinámicamente

# Combinaciones de variables
X_y = y_name + X_name          # Variables originales
X_y_calc = y_name + X_calc     # Variables codificadas
X_y_bin = y_name + X_bin       # Variables con binning

# Generar variables dummy
df_dummy = pd.get_dummies(df_data[X_name])  # One-hot encoding
X_dummy = df_dummy.columns.tolist()
X_y_dummy = y_name + X_dummy

# Imprimir resultados
print("Variables originales (X_y):", X_y, '\n')
print("Variables codificadas (X_y_calc):", X_y_calc, '\n')
print("Variables con binning (X_y_bin):", X_y_bin, '\n')
print("Variables dummy (X_y_dummy):", X_y_dummy, '\n')

# Ver primeros datos transformados
df_dummy.head()

Variables originales (X_y): ['Survived', 'Sex', 'Pclass', 'Embarked', 'Title', 'SibSp', 'Parch', 'Age', 'Fare', 'FamilySize', 'IsAlone'] 

Variables codificadas (X_y_calc): ['Survived', 'Sex_Code', 'Pclass', 'Embarked_Code', 'Title_Code', 'SibSp', 'Parch', 'Age', 'Fare'] 

Variables con binning (X_y_bin): ['Survived', 'Sex_Code', 'Pclass', 'Embarked_Code', 'Title_Code', 'FamilySize', 'AgeBin_Code', 'FareBin_Code'] 

Variables dummy (X_y_dummy): ['Survived', 'Pclass', 'SibSp', 'Parch', 'Age', 'Fare', 'FamilySize', 'IsAlone', 'Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S', 'Title_Master', 'Title_Misc', 'Title_Miss', 'Title_Mr', 'Title_Mrs'] 



Unnamed: 0,Pclass,SibSp,Parch,Age,Fare,FamilySize,IsAlone,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Title_Master,Title_Misc,Title_Miss,Title_Mr,Title_Mrs
0,3,1,0,22.0,7.25,2,0,False,True,False,False,True,False,False,False,True,False
1,1,1,0,38.0,71.28,2,0,True,False,True,False,False,False,False,False,False,True
2,3,0,0,26.0,7.92,1,1,True,False,False,False,True,False,False,True,False,False
3,1,1,0,35.0,53.1,2,0,True,False,False,False,True,False,False,False,False,True
4,3,0,0,35.0,8.05,1,1,False,True,False,False,True,False,False,False,True,False


# Model

In [165]:
X_train_calc, X_test_calc, y_train_calc, y_test_calc = model_selection.train_test_split(df_data[y_name], df_data[X_y_calc], random_state=42)
