In [38]:
import pandas as pd
import numpy as np
from scipy import stats
import statsmodels.formula.api as smf

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder

In [39]:
titanic_df = pd.read_csv('train_v1.csv')
titanic_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [40]:


def data_transformer(data_df):

    data_df['cabin_count'] = data_df['Cabin'].apply(lambda x : 0 if pd.isna(x) else len(x.split(' ')))
    data_df['cabin_mod'] = data_df['Cabin'].transform(lambda x : str(x)[0])
    data_df['Title'] = data_df['Name'].apply(lambda x : x.split(',')[1].split('.')[0].strip())

    return data_df


def null_transformer(data_df,column):

    if column == 'Age':

        data_df[column] = data_df[['Title','Sex',column]].groupby(['Title','Sex']).\
                            transform(lambda grp:grp.fillna(round(grp.mean(),0)))

    if data_df['Age'].isnull().sum() > 0 :

        data_df['Age'] = data_df['Age'].fillna(round(data_df['Age'].mean(),0))
        
    return data_df
    

def column_dropper(data_df):

    data_df.drop(columns=['Name','Ticket','Cabin','cabin_count','PassengerId'],inplace=True)

    return data_df


# def cat_encoder(data_df):

#     data_df = pd.concat([data_df,pd.get_dummies(data_df.Sex,drop_first=True)],axis=1)
#     data_df = pd.concat([data_df,pd.get_dummies(data_df.Embarked,drop_first=True)],axis=1)
#     data_df = pd.concat([data_df,pd.get_dummies(data_df.cabin_mod,drop_first=True)],axis=1)
#     data_df = pd.concat([data_df,pd.get_dummies(data_df.Title,drop_first=True)],axis=1)
#     data_df = pd.concat([data_df,pd.get_dummies(data_df.Pclass,drop_first=True)],axis=1)
#     data_df.drop(columns=['Sex','Embarked','cabin_mod','Title','PassengerId','Pclass'],inplace=True)

#     return data_df

    

In [41]:
titanic_df = data_transformer(titanic_df)

titanic_df.dropna(subset=['Embarked'],inplace=True)
titanic_df = column_dropper(titanic_df)
titanic_df = null_transformer(titanic_df,'Age')

titanic_df.reset_index(drop=True,inplace=True)
titanic_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,cabin_mod,Title
0,0,3,male,22.0,1,0,7.25,S,n,Mr
1,1,1,female,38.0,1,0,71.2833,C,C,Mrs
2,1,3,female,26.0,0,0,7.925,S,n,Miss
3,1,1,female,35.0,1,0,53.1,S,C,Mrs
4,0,3,male,35.0,0,0,8.05,S,n,Mr


In [42]:
titanic_df.isnull().sum()

Survived     0
Pclass       0
Sex          0
Age          0
SibSp        0
Parch        0
Fare         0
Embarked     0
cabin_mod    0
Title        0
dtype: int64

In [43]:
cat_enc = OneHotEncoder(handle_unknown='ignore',sparse=False)
X_tr = cat_enc.fit_transform(titanic_df[['Sex','Embarked','cabin_mod','Title','Pclass']])

In [44]:
X_tr.shape

(889, 34)

In [45]:


df1 = titanic_df[['Age','SibSp','Parch','Fare']]
df2 = pd.DataFrame(data=X_tr,columns=cat_enc.get_feature_names())
X = pd.concat([df1,df2],axis=1)
y = titanic_df.iloc[:,0]


In [46]:
X = X.values
y = y.values

In [47]:
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.model_selection import train_test_split

In [48]:
std_scale = StandardScaler()
X = std_scale.fit_transform(X)

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,shuffle=True,random_state=3,stratify=y)

In [49]:
logit_model2 = LogisticRegression(max_iter=2000)
logit_model2.fit(X_train,y_train)
print(logit_model2.score(X_train,y_train))
print(logit_model2.score(X_test,y_test))

0.8382559774964838
0.8426966292134831


In [50]:
forest_model = RandomForestClassifier(n_estimators=100,max_depth=6,min_samples_leaf=3,min_samples_split=7)
forest_model.fit(X_train,y_train)
print(forest_model.score(X_train,y_train))
print(forest_model.score(X_test,y_test))

0.8523206751054853
0.8426966292134831


In [51]:
svc_model = SVC(kernel='rbf',probability=True)
svc_model.fit(X_train,y_train)
print(svc_model.score(X_train,y_train))
print(svc_model.score(X_test,y_test))

0.8579465541490858
0.8258426966292135


In [52]:
knn_model2 = KNeighborsClassifier(n_neighbors=7,weights='uniform')
knn_model2.fit(X_train,y_train)
print(knn_model2.score(X_train,y_train))
print(knn_model2.score(X_test,y_test))

0.8565400843881856
0.8258426966292135


**Prediction**

In [56]:
test_df = pd.read_csv('test_v1.csv')
test_df.head()

ans_df = test_df.copy()
ans_df = ans_df.iloc[:,0]


In [57]:
test_df = data_transformer(test_df)
test_df = column_dropper(test_df)
test_df = null_transformer(test_df,'Age')
test_df.reset_index(drop=True,inplace=True)


In [59]:
test_df['Fare'].fillna(round(test_df['Fare'].mean(),0),inplace=True)

In [62]:
x_predict_enc = cat_enc.transform(test_df[['Sex','Embarked','cabin_mod','Title','Pclass']])
df1 = test_df[['Age','SibSp','Parch','Fare']]
df2 = pd.DataFrame(data=x_predict_enc,columns=cat_enc.get_feature_names())
x_predict = pd.concat([df1,df2],axis=1)


In [63]:
x_predict = x_predict.values

x_predict = std_scale.transform(x_predict)

In [64]:
predictions = forest_model.predict(x_predict)

In [65]:
predictions_df = pd.concat([ans_df,pd.DataFrame(data=predictions,columns=['Survived'])],axis=1)

In [66]:
predictions_df.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [69]:
predictions_df.to_csv('predictions.csv',index=False)