In [83]:
import pandas as pd 
import re
import seaborn as sns 
import numpy as np 
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,f1_score,roc_auc_score
from sklearn.preprocessing import StandardScaler

In [87]:
data_train=pd.read_csv('./Data/train.csv')
data_test=pd.read_csv('./Data/test.csv')
data_ids=data_test.PassengerId
data_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [88]:
X=data_train.drop('Survived',axis=1)
y=data_train['Survived']

In [89]:
#Procesamiento 
#1- Eliminacion de columnas
X=X.drop(['PassengerId','Name','Ticket'],axis=1)
data_test=data_test.drop(['PassengerId','Name','Ticket'],axis=1)

#2-Imputacion de nulos numericos
X['Age']=X['Age'].apply(lambda x: x*80 if x<1 else x)
data_test['Age']=data_test['Age'].apply(lambda x: x*80 if x<1 else x)
num_cols=X.select_dtypes(include='number').columns
num_imputer=SimpleImputer(strategy='median')
X[num_cols]=num_imputer.fit_transform(X[num_cols])
data_test[num_cols]=num_imputer.transform(data_test[num_cols])

#3-Imputacion de categoricos Embarked
cat_imputer=SimpleImputer(strategy='constant',fill_value='S')
X.Embarked.fillna('S',inplace=True)
data_test.Embarked.fillna('S',inplace=True)

#4-Dummies e inversion de Pclass
X['Pclass']=1/X['Pclass']
data_test['Pclass']=1/data_test['Pclass']

dummies_s=pd.get_dummies(X['Sex'],prefix='Sex').astype(int)
dummies_e=pd.get_dummies(X['Embarked'],prefix='E').astype(int)
X=pd.concat([X.drop(['Sex','Embarked'],axis=1),dummies_s,dummies_e],axis=1)

dummies_s=pd.get_dummies(data_test['Sex'],prefix='Sex').astype(int)
dummies_e=pd.get_dummies(data_test['Embarked'],prefix='E').astype(int)
data_test=pd.concat([data_test.drop(['Sex','Embarked'],axis=1),dummies_s,dummies_e],axis=1)

#5-Letra de cabina

cabs={'Z':0,'T':.25,'G':.5,'F':.75,'E':1,'D':1.25,'C':1.5,'B':1.75,'A':2}
X.Cabin.fillna('Z',inplace=True)
X['Cabin']=X['Cabin'].apply(lambda x: x[0])
X['Cabin']=X['Cabin'].map(cabs)

data_test.Cabin.fillna('Z',inplace=True)
data_test['Cabin']=data_test['Cabin'].apply(lambda x: x[0])
data_test['Cabin']=data_test['Cabin'].map(cabs)

#6-Menor

X['Menor']=X['Age'].apply(lambda x: 1 if x<18 else 0)
data_test['Menor']=data_test['Age'].apply(lambda x: 1 if x<18 else 0)

#7-Cambiando tipos
X['Age']=X['Age'].astype(int)
data_test['Age']=data_test['Age'].astype(int)


In [96]:
#FE-1
X['Solo']=X[['SibSp','Parch']].apply(lambda x: 1 if (x.SibSp + x.Parch) == 0 else 0,axis=1)
data_test['Solo']=data_test[['SibSp','Parch']].apply(lambda x: 1 if (x.SibSp + x.Parch) == 0 else 0,axis=1)

#FE-2

X['Fare']=X['Fare'].apply(lambda x: 1 if x>100 else .5 if x>30 else 0)
data_test['Fare']=data_test['Fare'].apply(lambda x: 1 if x>100 else .5 if x>30 else 0)

#FE-3


In [91]:
data_train.groupby('Survived').agg({
    'Fare': 'median',
    'Age': 'median',
    'Pclass': lambda x: x.mode().iloc[0],  # Utilizando una función lambda para calcular la moda
    'Sex':lambda x: x.mode().iloc[0]
})


Unnamed: 0_level_0,Fare,Age,Pclass,Sex
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,10.5,28.0,3,male
1,26.0,28.0,1,female


In [92]:
#Escalado
scaler=StandardScaler()
X=pd.DataFrame(scaler.fit_transform(X),columns=X.columns)
data_test=pd.DataFrame(scaler.transform(data_test),columns=data_test.columns)

In [93]:
X_train,X_eval,y_train,y_eval=train_test_split(X,y,test_size=.2,random_state=40,shuffle=True,stratify=y)

In [94]:
#RANDOM FOREST
rf=RandomForestClassifier(max_depth=4,random_state=40)
rf.fit(X_train,y_train)

print('Acc', accuracy_score(y_train,rf.predict(X_train)))
print('Acc', accuracy_score(y_eval,rf.predict(X_eval)))
print('F1', f1_score(y_eval,rf.predict(X_eval)))

Acc 0.8174157303370787
Acc 0.8491620111731844
F1 0.7906976744186046


In [95]:
pd.DataFrame(zip(rf.feature_importances_,X.columns),columns=['A','B']).sort_values(by=['A'],ascending=False)

Unnamed: 0,A,B
8,0.304639,Sex_male
7,0.257268,Sex_female
0,0.116421,Pclass
6,0.108169,Cabin
1,0.058934,Age
2,0.031188,SibSp
5,0.027901,Fare
3,0.025239,Parch
13,0.020588,Solo
12,0.017116,Menor


In [67]:
pred=rf.predict(data_test)

In [68]:
res=pd.DataFrame(zip(data_ids,pred),columns=['PassengerId','Survived'])
res.to_csv('res.csv',index=False)

In [69]:
#Resultados
mod=['Rf simple','Rf con columna Solo','Rf con Solo y Menor','Rf tratando Cabin']
acc=[.84,.855,.86,.849]
f1=[.785,.8,.8,.79]
public=[.7601,.746,.785,.782]

pruebas=pd.DataFrame(zip(mod,acc,f1,public),columns=['Modelo','Accuracy','F1-Score','Resultado Público'])
pruebas

Unnamed: 0,Modelo,Accuracy,F1-Score,Resultado Público
0,Rf simple,0.84,0.785,0.7601
1,Rf con columna Solo,0.855,0.8,0.746
2,Rf con Solo y Menor,0.86,0.8,0.785
3,Rf tratando Cabin,0.849,0.79,0.782
