In [848]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [849]:
# Read train.csv and split the 'Survived' column off into Y, leaving the
# remaining columns in `train`.
train = pd.read_csv('train.csv')
Y = train.pop('Survived')
# Read test.csv as-is.
test = pd.read_csv('test.csv')

In [850]:
def fill_nan(df):
    """Impute missing values and integer-encode ``Cabin`` and ``Embarked``.

    The frame is modified in place and the same object is returned.

    - ``Age`` / ``Fare``: missing entries become the mean of that column in ``df``.
    - ``Cabin``: replaced by a 0/1 flag that is 1 when the cabin was missing.
      Because the original text is overwritten, calling this twice on the
      same frame turns the whole column into 0.
    - ``Embarked``: 'S' -> 1, 'C' -> 2, 'Q' -> 3; missing values and any
      other code fall back to 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Age', 'Cabin', 'Fare' and 'Embarked'.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, after modification.
    """
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    df['Cabin'] = df['Cabin'].isna().astype(int)
    df['Fare'] = df['Fare'].fillna(df['Fare'].mean())
    # map + astype(int) gives an integer column whether or not the frame has
    # missing ports, so frames with and without gaps end up with the same
    # dtype (and therefore the same dummy-column labels downstream).
    port_codes = {'S': 1, 'C': 2, 'Q': 3}
    df['Embarked'] = df['Embarked'].map(port_codes).fillna(1).astype(int)
    return df


In [851]:
def get_title(df):
    """Add a ``title`` column holding the honorific parsed from ``Name``.

    The title is the text between the first comma and the next comma or
    period in the name (e.g. "Braund, Mr. Owen Harris" -> "Mr"), with
    surrounding whitespace removed. Less common titles are then folded into
    'Mr', 'Mrs' or 'Miss'; any title not listed below is kept unchanged.

    The frame is modified in place and the same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a string column 'Name'.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with the extra 'title' column.
    """
    # Capture the segment after the first comma, stopping at the next comma or
    # period; strip so the result is "Mr" rather than " Mr".
    raw_title = df['Name'].str.extract(r'^[^,]*,([^,.]*)', expand=False).str.strip()

    mr_values = ['Don', 'Rev', 'Dr', 'Major', 'Col', 'Capt', 'Jonkheer', 'Sir']
    mrs_replace = ['Mme', 'Dona', 'the Countess', 'Lady']
    miss_replace = ['Ms', 'Mlle']

    # Whole-value lookup: 'Dona' maps straight to 'Mrs' instead of first being
    # partially rewritten by the 'Don' rule, and titles that merely contain
    # one of these strings are left alone.
    canonical = {}
    canonical.update(dict.fromkeys(mr_values, 'Mr'))
    canonical.update(dict.fromkeys(mrs_replace, 'Mrs'))
    canonical.update(dict.fromkeys(miss_replace, 'Miss'))

    df['title'] = raw_title.replace(canonical)
    return df


In [852]:
def ticket(df):
    """Replace ``Ticket`` with an integer code for its first space-separated token.

    Each ticket string is split on single spaces and only the first piece is
    kept; the distinct pieces are then numbered via a pandas categorical.
    The numbering is derived from the values present in ``df`` alone.

    The frame is modified in place and the same object is returned.
    """
    token_table = df['Ticket'].str.split(' ', expand=True)
    first_token = token_table[0]
    df['Ticket'] = pd.Categorical(first_token).codes
    return df

In [853]:
def family_members(df):
    """Add family-size features derived from ``Parch`` and ``SibSp``.

    - 'Family Size': ``Parch + SibSp``.
    - 'withFamily': the family size where it is below 1, otherwise 1.

    The frame is modified in place and the same object is returned.
    """
    relatives = df['Parch'] + df['SibSp']
    df['Family Size'] = relatives
    size = df['Family Size']
    is_below_one = size < 1
    # Keep the value wherever it is below 1; overwrite everything else with 1.
    df['withFamily'] = size.mask(~is_below_one, 1)
    return df

In [854]:
def getDummies(df):
    """One-hot encode 'Embarked', 'Sex', 'title' and 'Pclass'.

    Every indicator column is named ``<source column>_<value>`` (for example
    'Pclass_1', 'Sex_male'). The prefix keeps the labels unique and
    string-typed: without it the 'Embarked' and 'Pclass' indicators would
    both be labelled 1/2/3 and collide once placed side by side.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Embarked', 'Sex', 'title' and 'Pclass'.

    Returns
    -------
    pandas.DataFrame
        A new frame containing only the indicator columns, grouped in the
        order Embarked, Sex, title, Pclass. ``df`` is not modified.
    """
    encoded_columns = ['Embarked', 'Sex', 'title', 'Pclass']
    return pd.get_dummies(df[encoded_columns], columns=encoded_columns)

In [855]:
def preProcess(df):
    """Run the full feature pipeline on a raw passenger frame.

    Steps, in order: ``fill_nan`` -> ``get_title`` -> ``ticket`` ->
    ``family_members`` -> ``getDummies``. The indicator columns are appended
    to the processed frame, and the source columns 'Name', 'Sex', 'Pclass',
    'Embarked' and 'title' are removed.

    The helper steps modify the frame they receive, so the pipeline runs on
    a copy. The caller's ``df`` is left untouched, which makes it safe to
    call this function again on the same input (for example when the cell
    is re-run).

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame with the columns the helper steps read.

    Returns
    -------
    pandas.DataFrame
        A new frame with the engineered features.
    """
    data = fill_nan(df.copy())
    data = get_title(data)
    data = ticket(data)
    data = family_members(data)
    dummies = getDummies(data)
    final_df = pd.concat([data, dummies], axis=1)
    # NOTE(review): every other input column (identifier-like ones included)
    # is passed through unchanged - confirm that is intended for modelling.
    return final_df.drop(columns=['Name', 'Sex', 'Pclass', 'Embarked', 'title'])

In [856]:
# Apply the same preprocessing pipeline to each frame independently.
train_data = train.pipe(preProcess)
test_data = test.pipe(preProcess)

In [857]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split,cross_val_score

In [858]:
# Hold out 10% of the rows for evaluation; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    train_data,
    Y,
    test_size=0.10,
    random_state=0,
)

In [863]:
def gradient_boost_model(X_train,y_train,X_test,y_test,random_state=0):
    """Fit a gradient-boosting classifier and print its accuracy figures.

    Prints three lines, in order: accuracy on the training split, accuracy
    on the held-out split, and the array of 5-fold cross-validation scores
    computed on the training split.

    Parameters
    ----------
    X_train, y_train
        Features and labels used for fitting and for cross-validation.
    X_test, y_test
        Held-out features and labels used for the second printed score.
    random_state : int or None, default 0
        Seed forwarded to the classifier. The model draws a random row
        subsample (``subsample=0.3``) and a random feature subset
        (``max_features=8``) while fitting, so without a fixed seed the
        printed scores differ from run to run. Pass ``None`` to get
        unseeded behaviour.

    Returns
    -------
    None
    """
    gbc = GradientBoostingClassifier(
        learning_rate=0.01,
        subsample=0.3,
        max_features=8,
        n_estimators=500,
        random_state=random_state,
    ).fit(X_train, y_train)
    train_score = gbc.score(X_train, y_train)
    prediction = gbc.predict(X_test)
    test_score = accuracy_score(y_test, prediction)
    # cross_val_score refits fresh clones of the estimator on each fold.
    cross_val_scores = cross_val_score(gbc, X_train, y_train, cv=5)
    print(train_score)
    print(test_score)
    print(cross_val_scores)
gradient_boost_model(X_train,y_train,X_test,y_test)

0.8714107365792759
0.8666666666666667
[0.80745342 0.80625    0.83125    0.83125    0.85625   ]


In [860]:
def random_forest_model(X_train,y_train, X_test,y_test,random_state=0):
    """Fit a random-forest classifier and print its accuracy figures.

    Prints two lines, in order: accuracy on the training split and accuracy
    on the held-out split.

    Parameters
    ----------
    X_train, y_train
        Features and labels used for fitting.
    X_test, y_test
        Held-out features and labels used for the second printed score.
    random_state : int or None, default 0
        Seed forwarded to the classifier. A random forest draws random
        bootstrap samples and random candidate features
        (``max_features=1``), so without a fixed seed the printed scores
        differ from run to run. Pass ``None`` to get unseeded behaviour.

    Returns
    -------
    None
    """
    rf = RandomForestClassifier(
        n_estimators=1500,
        max_features=1,
        max_depth=10,
        random_state=random_state,
    ).fit(X_train, y_train)
    train_score = rf.score(X_train, y_train)
    prediction = rf.predict(X_test)
    test_score = accuracy_score(y_test, prediction)
    print(train_score)
    print(test_score)
random_forest_model(X_train,y_train,X_test,y_test)

0.9500624219725343
0.8444444444444444


In [862]:
def ada_boost_model(X_train,y_train,X_test,y_test,random_state=0):
    """Fit an AdaBoost classifier and print its accuracy figures.

    Prints two lines, in order: accuracy on the training split and accuracy
    on the held-out split.

    Parameters
    ----------
    X_train, y_train
        Features and labels used for fitting.
    X_test, y_test
        Held-out features and labels used for the second printed score.
    random_state : int or None, default 0
        Seed forwarded to the classifier so that repeated runs print the
        same scores. Pass ``None`` to get unseeded behaviour.

    Returns
    -------
    None
    """
    ada = AdaBoostClassifier(
        n_estimators=50,
        learning_rate=1,
        random_state=random_state,
    ).fit(X_train, y_train)
    train_score = ada.score(X_train, y_train)
    prediction = ada.predict(X_test)
    test_score = accuracy_score(y_test, prediction)
    print(train_score)
    print(test_score)
ada_boost_model(X_train,y_train,X_test,y_test)

0.8451935081148564
0.8555555555555555
