In [848]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [918]:
train = pd.read_csv('train.csv')
Y = train['Survived']
train.drop('Survived',axis=1,inplace = True)
test = pd.read_csv('test.csv')
test['Id'] = test['PassengerId']

In [898]:
def fill_nan(df):    
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    df['Cabin'] = df['Cabin'].isna().astype(int)
    df['Fare'] = df['Fare'].fillna(df['Fare'].mean())
    df['Embarked'] = df['Embarked'].replace({'S':1,'C':2,'Q':3}).fillna(1)   
    return df


In [899]:
def get_title(df): 
    df['title'] = df['Name'].str.split(',',expand = True)[1].str.split('.',expand = True)[0]
    mr_values =['Don','Rev','Dr','Major','Col','Capt','Jonkheer','Sir']
    mrs_replace = ['Mme','Dona','the Countess','Mra','Lady']
    miss_replace = ['Ms','Mlle']
    df['title'] = df['title'].replace(mr_values,'Mr',regex = True)
    df['title'] = df['title'].replace(mrs_replace,'Mrs',regex = True)
    df['title'] = df['title'].replace(miss_replace,'Miss',regex = True)
    return df


In [900]:
def ticket(df):
    df['Ticket'] = df['Ticket'].str.split(' ',expand = True)[0]
    df['Ticket'] = df['Ticket'].astype('category').cat.codes
    return df

In [901]:
def family_members(df):
    df['Family Size'] = df['Parch'] + df['SibSp']
    df['withFamily'] = df['Family Size'].where(df['Family Size'] < 1,1)
    return df

In [902]:
def getDummies(df):
    dummies_sex = pd.get_dummies(df['Sex'])
    dummies_embarked = pd.get_dummies(df['Embarked'])
    dummies_title = pd.get_dummies(df['title'])
    dummies_Pclass = pd.get_dummies(df['Pclass'])
    df = pd.concat([dummies_embarked,dummies_sex, dummies_title,dummies_Pclass],axis = 1)
    #df = pd.concat([dummies_sex, dummies_title,dummies_Pclass],axis = 1)
    return df

In [903]:
def preProcess(df):
    data = fill_nan(df)
    data = get_title(data)
    data = ticket(data)
    data = family_members(data)
    dummies = getDummies(data)
    final_df = pd.concat([data,dummies],axis= 1)
    final_df.drop(['Name','Sex','Pclass','Embarked','title','PassengerId'],inplace = True,axis = 1)
    #final_df.drop(['Name','Sex','Pclass','title'],inplace = True,axis = 1)
    return final_df

In [904]:
train_data  = preProcess(train)
test_data = preProcess(test)

In [905]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split,cross_val_score, GridSearchCV

In [906]:
X_train, X_test, y_train,y_test = train_test_split(train_data, Y,random_state = 0,test_size = 0.10)

In [917]:
def gradient_boost_model(X_train,y_train,X_test,y_test):
    gbc = GradientBoostingClassifier(learning_rate=0.01,subsample = 0.3,max_features = 7,n_estimators =500).fit(X_train,y_train)
    train_score = gbc.score(X_train,y_train)
    prediction = gbc.predict(X_test)
    test_score = accuracy_score(y_test,prediction)
    cross_val_scores = cross_val_score(gbc,X_train,y_train, cv = 5)
    print(train_score)
    print(test_score)
    print(cross_val_scores)
gradient_boost_model(X_train,y_train,X_test,y_test)

#{'learning_rate': 0.1, 'max_features': 7, 'n_estimators': 300, 'subsample': 1}

0.8639200998751561
0.8444444444444444
[0.80745342 0.8        0.8375     0.81875    0.85      ]


In [908]:
def random_forest_model(X_train,y_train, X_test,y_test):
    rf = RandomForestClassifier(n_estimators= 1500,max_features=1,max_depth=10).fit(X_train,y_train)
    train_score = rf.score(X_train,y_train)
    prediction =rf.predict(X_test)
    test_score = accuracy_score(y_test,prediction)
    cross_val_scores = cross_val_score(rf,X_train,y_train, cv = 5)
    print(train_score)
    print(test_score)
    print(cross_val_scores)
random_forest_model(X_train,y_train,X_test,y_test)

0.9463171036204744
0.8444444444444444
[0.81987578 0.81875    0.8375     0.81875    0.84375   ]


In [915]:
def ada_boost_model(X_train,y_train,X_test,y_test):
    ada = AdaBoostClassifier(n_estimators = 35,learning_rate = 1).fit(X_train,y_train)
    train_score = ada.score(X_train,y_train)
    prediction = ada.predict(X_test)
    test_score = accuracy_score(y_test,prediction)
    cross_val_scores = cross_val_score(ada, X_train,y_train,cv = 5)
    print(train_score)
    print(test_score)
    print(cross_val_scores)
ada_boost_model(X_train,y_train,X_test,y_test)

0.8451935081148564
0.8888888888888888
[0.81987578 0.80625    0.825      0.81875    0.84375   ]


In [890]:
param_grid = {'learning_rate':[0.001,0.01,0.05,0.08,0.1],'subsample': [0.3,0.5,0.8,1],'n_estimators':[300,500,800,1500],'max_features':[6,7,8,9,10]}
model = GradientBoostingClassifier()
def grid_searchCV(param_grid,model,X_train,y_train,X_test,y_test):
    gs = GridSearchCV(model,param_grid).fit(X_train,y_train)
    prediction = gs.predict(X_test)
    train_score = gs.score(X_train,y_train)
    test_accuracy = accuracy_score(y_test,prediction)
    best_params = gs.best_params_
    print(train_score)
    print(test_score)
    print(best_params)
grid_searchCV(param_grid,model,X_train,y_train,X_test,y_test)


0.9650436953807741
0.8116591928251121
{'learning_rate': 0.1, 'max_features': 7, 'n_estimators': 300, 'subsample': 1}
