# Mixed model and decision trees

In [14]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import SimpleImputer,MissingIndicator,KNNImputer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline

In [15]:
df_train=pd.read_csv("/home/ing/Bureau/tp_AP/titanic_train.csv")
df_test=pd.read_csv("/home/ing/Bureau/tp_AP/titanic_test.csv")

df_train.drop(columns=['Name','FullName','Ticket','PassengerId'],inplace=True)
df_test.drop(columns=['Name','FullName','Ticket','PassengerId'],inplace=True)
Age=df_train['Age']
Age2=df_test['Age']
X_train2=df_train
X_test2=df_test

**1st strategy for Missing values: mean strategy**

In [18]:
print(df_train.isna().sum()*100/len(df_train))
df_train['Age']=df_train['Age'].fillna(df_train['Age'].mean())
df_test['Age']=df_test['Age'].fillna(df_test['Age'].mean())

Survived     0.000000
Pclass       0.000000
Sex          0.000000
Age          0.000000
SibSp        0.000000
Parch        0.000000
Fare         0.000000
Cabin       75.885329
Embarked     0.168634
Child       20.236088
Fare2        0.000000
dtype: float64


In [24]:
df_train['Child']=[np.nan for _ in range(len(df_train))]
df_train.loc[df_train['Age']<=18,'Child']=1
df_train.loc[df_train['Age']>18,'Child']=0

fare=[]
for i,a in df_train['Fare'].items():
    if a<=10:
        fare.append(0)
    if (a<=20 and a>10):
        fare.append(1)
    if (a<=30 and a>20):
        fare.append(2)
    if (a>30):
        fare.append(3)
    
df_train['Fare2']=fare



df_train.loc[df_train['Sex']=='male','Sex']=int(1)
df_train.loc[df_train['Sex']=='female','Sex']=int(0)
df_train['Child']=df_train['Child'].astype(int)
#df_train.drop(columns=['Cabin'],inplace=True)
df_train.drop('Survived',axis=1,inplace=True)

In [23]:
df_test['Child']=[np.nan for _ in range(len(df_test))]
df_test.loc[df_test['Age']<=18,'Child']=1
df_test.loc[df_test['Age']>18,'Child']=0

fare=[]
for i,a in df_test['Fare'].items():
    if a<=10:
        fare.append(0)
    if (a<=20 and a>10):
        fare.append(1)
    if (a<=30 and a>20):
        fare.append(2)
    if (a>30):
        fare.append(3)
    
df_test['Fare2']=fare



df_test.loc[df_test['Sex']=='male','Sex']=int(1)
df_test.loc[df_test['Sex']=='female','Sex']=int(0)
df_test['Child']=df_test['Child'].astype(int)
#df_train.drop(columns=['Cabin'],inplace=True)
df_test.drop('Survived',axis=1,inplace=True)

In [7]:
X_train=df_train[['Child','Sex','Pclass','Fare2']]
y_train=df_train['Survived']
X_test=df_test[['Child','Sex','Pclass','Fare2']]
y_test=df_test['Survived']
X_train

Unnamed: 0,Child,Sex,Pclass,Fare2
0,0,1,1,3
1,0,0,1,3
2,0,0,3,0
3,0,1,3,2
4,0,1,3,0
...,...,...,...,...
588,0,1,2,1
589,0,0,1,2
590,0,0,3,2
591,0,1,1,2


In [8]:
gnb=GaussianNB()
gnb.fit(X_train,y_train)
gnb.score(X_test,y_test)

0.6912751677852349

## Comments
The results are better than the previous one. We won 6 points of score: there is clearly an improvement because we added new features to our train and test sets which improve the ability of our model to interprete the data.

## Decision tree

In [9]:
tree=DecisionTreeClassifier(random_state=0)
tree.fit(X_train,y_train)
tree.score(X_test,y_test)

0.7483221476510067

## Comments
The score obtained with decision  classifier is much better than the naive's bayes classifier

## performing the decision tree classifier and second missing values strategy

In [26]:
X_train2['Age']=Age
X_train2['Child']=[np.nan for _ in range(len(df_train))]
X_train2.loc[df_train['Age']<=18,'Child']=int(1)
X_train2.loc[df_train['Age']>18,'Child']=int(0)
X_train2['Cabin']=X_train2['Cabin'].factorize()[0]
X_train2.loc[X_train2['Cabin']==0,'Cabin']=np.nan
X_train2['Embarked']=X_train2['Embarked'].factorize()[0]
##
X_test2['Age']=Age2
X_test2['Child']=[np.nan for _ in range(len(df_test))]
X_test2.loc[df_test['Age']<=18,'Child']=int(1)
X_test2.loc[df_test['Age']>18,'Child']=int(0)
X_test2['Cabin']=X_test2['Cabin'].factorize()[0]
X_test2.loc[X_test2['Cabin']==0,'Cabin']=np.nan
X_test2['Embarked']=X_test2['Embarked'].factorize()[0]


In [32]:
model=make_pipeline(KNNImputer(),DecisionTreeClassifier(max_depth=3,random_state=0))
param={
    'knnimputer__n_neighbors':[1,2,3,4,5,6,7]
}
grid=GridSearchCV(model,param_grid=param,cv=5)

grid.fit(X_train2,y_train)
grid.best_score_

0.7958695342543798

In [33]:
best_model=grid.best_estimator_
best_model.score(X_test2,y_test)

0.7046979865771812

In [None]:
from sklearn.model_selection import learning_curve