In [78]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [79]:
train = pd.read_csv('train.csv')
Y = train['Survived']
train.drop('Survived',axis=1,inplace = True)
test = pd.read_csv('test.csv')
id = test['PassengerId']

In [80]:
def fill_nan(df):    
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    df['Cabin'] = df['Cabin'].isna().astype(int)
    df['Fare'] = df['Fare'].fillna(df['Fare'].mean())
    df['Embarked'] = df['Embarked'].replace({'S':1,'C':2,'Q':3}).fillna(1)   
    return df


In [81]:
def get_title(df): 
    df['title'] = df['Name'].str.split(',',expand = True)[1].str.split('.',expand = True)[0]
    mr_values =['Don','Rev','Dr','Major','Col','Capt','Jonkheer','Sir']
    mrs_replace = ['Mme','Dona','the Countess','Mra','Lady']
    miss_replace = ['Ms','Mlle']
    df['title'] = df['title'].replace(mr_values,'Mr',regex = True)
    df['title'] = df['title'].replace(mrs_replace,'Mrs',regex = True)
    df['title'] = df['title'].replace(miss_replace,'Miss',regex = True)
    return df


In [82]:
def ticket(df):
    df['Ticket'] = df['Ticket'].str.split(' ',expand = True)[0]
    df['Ticket'] = df['Ticket'].astype('category').cat.codes
    return df

In [83]:
def family_members(df):
    df['Family Size'] = df['Parch'] + df['SibSp']
    df['withFamily'] = df['Family Size'].where(df['Family Size'] < 1,1)
    return df

In [84]:
def getDummies(df):
    dummies_sex = pd.get_dummies(df['Sex'])
    dummies_embarked = pd.get_dummies(df['Embarked'])
    dummies_title = pd.get_dummies(df['title'])
    dummies_Pclass = pd.get_dummies(df['Pclass'])
    df = pd.concat([dummies_embarked,dummies_sex, dummies_title,dummies_Pclass],axis = 1)
    #df = pd.concat([dummies_sex, dummies_title,dummies_Pclass],axis = 1)
    return df

In [85]:
def preProcess(df):
    data = fill_nan(df)
    data = get_title(data)
    data = ticket(data)
    data = family_members(data)
    dummies = getDummies(data)
    final_df = pd.concat([data,dummies],axis= 1)
    final_df.drop(['Name','Sex','Pclass','Embarked','title','PassengerId'],inplace = True,axis = 1)
    #final_df.drop(['Name','Sex','Pclass','title'],inplace = True,axis = 1)
    return final_df

In [86]:
train_data  = preProcess(train)
test_data = preProcess(test)

In [87]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split,cross_val_score, GridSearchCV

In [88]:
X_train, X_test, y_train,y_test = train_test_split(train_data, Y,random_state = 0,test_size = 0.10)

In [53]:
def gradient_boost_model(X_train,y_train,X_test,y_test):
    gbc = GradientBoostingClassifier(learning_rate=0.1,subsample = 1,max_features = 7,n_estimators =300).fit(X_train,y_train)
    train_score = gbc.score(X_train,y_train)
    prediction = gbc.predict(X_test)
    test_score = accuracy_score(y_test,prediction)
    cross_val_scores = cross_val_score(gbc,X_train,y_train, cv = 5)
    print(train_score)
    print(test_score)
    print(cross_val_scores)
    return gbc
gradient_boost_model(X_train,y_train,X_test,y_test)

#{'learning_rate': 0.1, 'max_features': 7, 'n_estimators': 300, 'subsample': 1}

0.9450686641697877
0.8555555555555555
[0.81987578 0.825      0.8625     0.8375     0.84375   ]


GradientBoostingClassifier(max_features=7, n_estimators=300, subsample=1)

In [54]:
def random_forest_model(X_train,y_train, X_test,y_test):
    rf = RandomForestClassifier(n_estimators= 500,max_features=6,max_depth=10).fit(X_train,y_train)
    train_score = rf.score(X_train,y_train)
    prediction =rf.predict(X_test)
    test_score = accuracy_score(y_test,prediction)
    cross_val_scores = cross_val_score(rf,X_train,y_train, cv = 5)
    print(train_score)
    print(test_score)
    print(cross_val_scores)
    return rf
random_forest_model(X_train,y_train,X_test,y_test)
#{'max_depth': 10, 'max_features': 4, 'n_estimators': 500}


0.9538077403245943
0.8555555555555555
[0.83229814 0.8375     0.85       0.8125     0.85      ]


RandomForestClassifier(max_depth=10, max_features=6, n_estimators=500)

In [55]:
def ada_boost_model(X_train,y_train,X_test,y_test):
    ada = AdaBoostClassifier(n_estimators = 50,learning_rate = 1.5).fit(X_train,y_train)
    train_score = ada.score(X_train,y_train)
    prediction = ada.predict(X_test)
    test_score = accuracy_score(y_test,prediction)
    cross_val_scores = cross_val_score(ada, X_train,y_train,cv = 5)
    print(train_score)
    print(test_score)
    print(cross_val_scores)
    return ada
ada_boost_model(X_train,y_train,X_test,y_test)
#{'learning_rate': 1.5, 'n_estimators': 50}

0.8626716604244694
0.8666666666666667
[0.8447205 0.79375   0.8625    0.81875   0.85     ]


AdaBoostClassifier(learning_rate=1.5)

In [56]:
param_grid = {'learning_rate':[0.001,0.01,0.05,0.08,0.1],'subsample': [0.3,0.5,0.8,1],'n_estimators':[300,500,800,1500],'max_features':[6,7,8,9,10]}
model = GradientBoostingClassifier()
def grid_searchCV_gbc(param_grid,model,X_train,y_train,X_test,y_test):
    gs = GridSearchCV(model,param_grid).fit(X_train,y_train)
    prediction = gs.predict(X_test)
    train_score = gs.score(X_train,y_train)
    test_score = accuracy_score(y_test,prediction)
    best_params = gs.best_params_
    print(train_score)
    print(test_score)
    print(best_params)
grid_searchCV_gbc(param_grid,model,X_train,y_train,X_test,y_test)


KeyboardInterrupt: 

In [16]:
param_grid = {'n_estimators':[100,300,500,800,1500],'max_features':[1,4,6,7,8,9,10],'max_depth' : [None,1, 3, 5 ,7,10,12]}
model = RandomForestClassifier()
def grid_searchCV_rf(param_grid,model,X_train,y_train,X_test,y_test):
    gs = GridSearchCV(model,param_grid).fit(X_train,y_train)
    prediction = gs.predict(X_test)
    train_score = gs.score(X_train,y_train)
    test_score = accuracy_score(y_test,prediction)
    best_params = gs.best_params_
    print(train_score)
    print(test_score)
    print(best_params)
grid_searchCV_rf(param_grid,model,X_train,y_train,X_test,y_test)

0.947565543071161
0.8555555555555555
{'max_depth': 10, 'max_features': 4, 'n_estimators': 500}


In [30]:
param_grid = {'learning_rate':[0.05,0.1,0.5,0.8,1,1.5,1.8,2],'n_estimators':[10,20,35,45,50,55,60,70]}
model = AdaBoostClassifier()
def grid_searchCV_ada(param_grid,model,X_train,y_train,X_test,y_test):
    gs = GridSearchCV(model,param_grid).fit(X_train,y_train)
    prediction = gs.predict(X_test)
    train_score = gs.score(X_train,y_train)
    test_score = accuracy_score(y_test,prediction)
    best_params = gs.best_params_ 
    print(train_score)
    print(test_score)
    print(best_params)
grid_searchCV_ada(param_grid,model,X_train,y_train,X_test,y_test)

0.8626716604244694
0.8666666666666667
{'learning_rate': 1.5, 'n_estimators': 50}


In [31]:
## MODEL IN TEST DATA

In [93]:
def final_models(X,Y,X_test,id):
    gbc = GradientBoostingClassifier(learning_rate=0.1,subsample = 1,max_features = 7,n_estimators =300)
    rf =  RandomForestClassifier(n_estimators= 500,max_features=6,max_depth=10)
    ada = AdaBoostClassifier(n_estimators = 50,learning_rate = 1.5)
    models = [gbc,rf,ada]
    for model in models:
        model.fit(X,Y)
        pred = model.predict(X_test)
        submission = pd.DataFrame({'PassengerID': id, 'Survived': pred})
        submission.to_csv('submission{}.csv'.format(model),index=False)

    
    
    
    
    ##gbc = GradientBoostingClassifier(learning_rate=0.1,subsample = 1,max_features = 7,n_estimators =300).fit(X,Y)
    #gbc_prediction = gbc.predict(X_test)
    #rf = rRandomForestClassifier(n_estimators= 500,max_features=6,max_depth=10).fit(X,Y)
    #rf_prediction = rf.predict(X_test)
    #ada = AdaBoostClassifier(n_estimators = 50,learning_rate = 1.5).fit(X,Y)
    #ada_predict = ada.predict(X_test)


    #gbc_submission = pd.DataFrame({'PassengerId' : id,'Survived': gbc_prediction})
    #gbc_submission.to_csv('gbc_submission.csv',index = False)
    return 


In [94]:
final_models(train_data,Y,test_data,id)