<a href="https://colab.research.google.com/github/masaladosai/mlp/blob/main/titanic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Importing modules and initialising the dataframe**

In [117]:
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn as sk
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
# Show every column when a DataFrame is displayed (None means no column limit).
pd.options.display.max_columns = None

In [118]:
# Load seaborn's example Titanic dataset into a DataFrame.
df = sns.load_dataset(name="titanic")

In [119]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
# A count below 891 (e.g. age) reveals missing values.
df.describe()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [120]:
# Per-column dtype and non-null count, used to spot the columns that need
# imputation or encoding.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


# **Imputation** - use **drop** to handle missing data

> Missing values will cause errors in ML algorithms, so we drop the values that aren't relevant and fill the missing ages with the mean.



In [121]:
# Columns the notebook treats as redundant and removes before modelling.
redundant_data = [
    "alive",
    "who",
    "adult_male",
    "alone"
]

# Clean the frame in one non-mutating chain:
#   1. drop "deck" (only 203 of 891 values are present) and the redundant columns;
#   2. fill the missing ages with the mean age;
#   3. drop the few remaining rows that still contain a missing value.
# errors="ignore" keeps the cell idempotent: re-running it after the columns are
# already gone is a no-op instead of raising KeyError.
# NOTE(review): the age mean is computed on the full dataset, i.e. before the
# train/test split further down, so test rows influence the fill value.
# NOTE(review): "class" and "embark_town" are kept here, although they appear to
# carry the same information as "pclass" and "embarked" - confirm whether they
# should be dropped as well.
df = (
    df
    .drop(columns=["deck", *redundant_data], errors="ignore")
    .assign(age=lambda frame: frame["age"].fillna(frame["age"].mean()))
    .dropna()
)

# **Encoding** - one-hot encoder (`get_dummies()`)

In [122]:
# Names of every non-numeric column; these are the ones that get one-hot encoded.
cat_columns = df.select_dtypes(exclude=["number"]).columns

In [123]:
# Display the selected column names as a sanity check before encoding.
cat_columns

Index(['sex', 'embarked', 'class', 'embark_town'], dtype='object')

In [124]:
# One-hot encode the categorical columns. drop_first=True omits the first level
# of each column, so a k-level column becomes k-1 indicator columns.
# NOTE(review): the test features displayed further down show identical values
# for embarked_Q / embark_town_Queenstown and for embarked_S /
# embark_town_Southampton, so the encoded frame contains duplicated features.
df1 = pd.get_dummies(
    data=df,
    columns=cat_columns,
    drop_first=True,
)


# **Scaling** - StandardScaler



> Split the data into training and testing sets first, so that the scaler is fit on the training set only.



In [125]:
# Separate the target ("survived") from the model inputs (every other column).
Res = df1["survived"]
Feat = df1.drop(columns=["survived"])

In [126]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
Train_Feat, Test_Feat, Train_Res, Test_Res = train_test_split(
    Feat,
    Res,
    test_size=0.2,
    random_state=40,
)

In [127]:
# Scaler used to standardise the features; it is fit on the training split only
# in the next cell.
scalar = StandardScaler()

In [128]:
# Learn the scaling statistics from the training features only, then apply the
# same transformation to both splits so no test information leaks into the fit.
Train_Feat_Scaled = scalar.fit(Train_Feat).transform(Train_Feat)
Test_Feat_Scaled = scalar.transform(Test_Feat)

In [129]:
# Show the test features for reference. This is the original, unscaled frame;
# the scaled values live in Test_Feat_Scaled.
Test_Feat

Unnamed: 0,pclass,age,sibsp,parch,fare,sex_male,embarked_Q,embarked_S,class_Second,class_Third,embark_town_Queenstown,embark_town_Southampton
817,2,31.000000,1,1,37.0042,True,False,False,True,False,False,False
772,2,57.000000,0,0,10.5000,False,False,True,True,False,False,True
455,3,29.000000,0,0,7.8958,True,False,False,False,True,False,False
26,3,29.699118,0,0,7.2250,True,False,False,False,True,False,False
200,3,28.000000,0,0,9.5000,True,False,True,False,True,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...
698,1,49.000000,1,1,110.8833,True,False,False,False,False,False,False
237,2,8.000000,0,2,26.2500,False,False,True,True,False,False,True
786,3,18.000000,0,0,7.4958,False,False,True,False,True,False,True
145,2,19.000000,1,1,36.7500,True,False,True,True,False,False,True


In [130]:
# Per-feature standard deviation of the scaled test set (first five features).
# Values are close to, but not exactly, 1 because the scaler was fit on the
# training split.
Test_Feat_Scaled.std(axis=0)[:5]

array([1.03430525, 1.07360444, 0.88253769, 0.85316724, 1.16387759])

In [131]:
# Per-feature mean of the scaled training set (first five features); these
# should be zero up to floating-point error.
Train_Feat_Scaled.mean(axis=0)[:5]

array([-2.36410360e-16,  3.19794199e-16,  5.49646279e-17, -3.37282944e-17,
       -3.65389856e-17])

# **Training**

In [132]:
# Hyper-parameters of the multi-layer perceptron, gathered in one place:
# two hidden layers (32 then 16 units), logistic activation, Adam solver,
# at most 500 training iterations, and a fixed seed for reproducibility.
mlp_params = {
    "hidden_layer_sizes": (32, 16),
    "activation": "logistic",
    "solver": "adam",
    "max_iter": 500,
    "random_state": 42,
}
mlp = MLPClassifier(**mlp_params)

In [133]:
# Train the network on the scaled training features and their survival labels.
mlp.fit(X=Train_Feat_Scaled, y=Train_Res)

In [134]:
# Predict survival for the held-out passengers and compare with the true labels.
Res_Pred = mlp.predict(Test_Feat_Scaled)

# Overall accuracy first, then per-class precision / recall / F1.
test_accuracy = accuracy_score(Test_Res, Res_Pred)
test_report = classification_report(Test_Res, Res_Pred)
print("Accuracy:", test_accuracy)
print(test_report)

Accuracy: 0.797752808988764
              precision    recall  f1-score   support

           0       0.82      0.85      0.84       110
           1       0.75      0.71      0.73        68

    accuracy                           0.80       178
   macro avg       0.79      0.78      0.78       178
weighted avg       0.80      0.80      0.80       178

