In [1]:
# Core Modules
import pandas as pd
import numpy as np

# Basic modules for data visualization
import matplotlib.pyplot as plt
import seaborn as sns 
%matplotlib inline

In [2]:
test_df = pd.read_csv('clean_test.csv')
train_df = pd.read_csv('clean_train.csv')

In [3]:
test_df.shape

(418, 11)

In [4]:
train_df.shape

(891, 12)

In [5]:
final_df = pd.concat([train_df,test_df],axis=0)

In [6]:
final_df.shape

(1309, 12)

In [7]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1309 non-null   int64  
 1   Survived     891 non-null    float64
 2   Pclass       1309 non-null   int64  
 3   Name         1309 non-null   object 
 4   Sex          1309 non-null   object 
 5   Age          1309 non-null   float64
 6   SibSp        1309 non-null   int64  
 7   Parch        1309 non-null   int64  
 8   Ticket       1309 non-null   object 
 9   Fare         1309 non-null   float64
 10  Cabin        1309 non-null   object 
 11  Embarked     1309 non-null   object 
dtypes: float64(3), int64(4), object(5)
memory usage: 132.9+ KB


In [8]:
def category_onehot_multcols(multcolumns):
    df_final = final_df
    i = 0
    for fields in multcolumns:
        print(fields)
        df1 = pd.get_dummies(final_df[fields],drop_first=True)
        final_df.drop([fields],axis=1,inplace=True)
        if i==0:
            df_final = df1.copy()
        else:
            df_final = pd.concat([df_final,df1],axis=1)
        i = i + 1

    df_final = pd.concat([final_df,df_final],axis = 1)
    return df_final

In [9]:
cat_attributes = final_df[['Sex', 'Cabin', 'Embarked']]

In [10]:
final_df = category_onehot_multcols(cat_attributes)

Sex
Cabin
Embarked


In [11]:
final_df.shape

(1309, 198)

In [12]:
final_df = final_df.loc[:,~final_df.columns.duplicated()]
final_df.shape

(1309, 198)

In [13]:
df_train = final_df.iloc[:891,:]
df_test = final_df.iloc[891:,:]

In [14]:
df_train.shape

(891, 198)

In [15]:
df_test.shape

(418, 198)

In [16]:
df_test.drop(['Survived'],axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [17]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Columns: 198 entries, PassengerId to S
dtypes: float64(3), int64(4), object(2), uint8(189)
memory usage: 234.1+ KB


In [18]:
df_train = df_train.drop(['Name', 'Ticket'],axis=1)

In [25]:
df_test = df_test.drop(['Name', 'Ticket'],axis=1)

In [19]:
x_train = df_train.drop(['Survived'],axis=1)
y_train = df_train['Survived']

### First model (Score : 0.78468)

In [20]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

In [21]:
model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')

In [22]:
model.fit(x_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric='mlogloss', gamma=0, gpu_id=-1,
              grow_policy='depthwise', importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_bin=256, max_cat_to_onehot=4, max_delta_step=0, max_depth=6,
              max_leaves=0, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=0,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, ...)

In [23]:
df_test.shape

(418, 197)

In [24]:
df_train.shape

(891, 196)

In [26]:
model_pred = model.predict(df_test)

In [93]:
pred = pd.DataFrame(model_pred)
sub_df = pd.read_csv('../../Datasets/titanic/gender_submission.csv')
datasets = pd.concat([sub_df['PassengerId'],pred],axis=1)

In [94]:
datasets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   PassengerId  418 non-null    int64
 1   0            418 non-null    int32
dtypes: int32(1), int64(1)
memory usage: 5.0 KB


In [99]:
datasets.columns = ['PassengerId', 'Survived']
datasets.to_csv('submission.csv',index=False)

### Hyperparameter Tuning 1 (Score : 0.76794 (Downgrade))

In [28]:
from sklearn.model_selection import RandomizedSearchCV

In [30]:
model = XGBClassifier()

In [29]:
params = {
    "learning_rate" : [0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
    "max_depth" : [3, 4, 5, 6, 8, 10, 12, 15],
    "min_child_weight" : [1, 3, 5, 7],
    "gamma" : [0.0, 0.1, 0.2, 0.3, 0.4, 0.5],
    "subsample" : [0.60, 0.65, 0.70, 0.75, 0.80, 0.85, 0.90, 0.95, 1.0],
    "colsample_bytree" : [0.60, 0.65, 0.70, 0.75, 0.80, 0.85, 0.90, 0.95, 1.0],
    "reg_alpha" : [0.0, 0.1, 0.2, 0.3, 0.4, 0.5],
    "reg_lambda" : [0.0, 0.1, 0.2, 0.3, 0.4, 0.5]
}

In [31]:
random_search = RandomizedSearchCV(estimator = model, param_distributions = params, n_iter = 10, scoring = 'roc_auc', cv = 5, verbose=3, n_jobs = -1)

In [32]:
random_search.fit(x_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


RandomizedSearchCV(cv=5,
                   estimator=XGBClassifier(base_score=None, booster=None,
                                           callbacks=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None,
                                           early_stopping_rounds=None,
                                           enable_categorical=False,
                                           eval_metric=None, gamma=None,
                                           gpu_id=None, grow_policy=None,
                                           importance_type=None,
                                           interaction_constraints=None,
                                           learning_rate=None, max_bin=None,...
                   param_distributions={'colsample_bytree': [0.6, 0.65, 0.7,
                                                             

In [35]:
random_search.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.95,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0.3, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.05, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=12, max_leaves=0, min_child_weight=5,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0.5, reg_lambda=0.5, ...)

In [33]:
random_search.best_params_

{'subsample': 0.65,
 'reg_lambda': 0.5,
 'reg_alpha': 0.5,
 'min_child_weight': 5,
 'max_depth': 12,
 'learning_rate': 0.05,
 'gamma': 0.3,
 'colsample_bytree': 0.95}

In [36]:
model = XGBClassifier(subsample= 0.65, base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.95,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0.3, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.05, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=12, max_leaves=0, min_child_weight=5,
              monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0.5, reg_lambda=0.5)

In [37]:
model.fit(x_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.95,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0.3, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.05, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=12, max_leaves=0, min_child_weight=5,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0.5, reg_lambda=0.5, ...)

In [38]:
model_pred = model.predict(df_test)

In [39]:
pred = pd.DataFrame(model_pred)
sub_df = pd.read_csv('../../Datasets/titanic/gender_submission.csv')
datasets = pd.concat([sub_df['PassengerId'],pred],axis=1)

In [40]:
datasets.columns = ['PassengerId', 'Survived']
datasets.to_csv('submission2.csv',index=False)