In [1]:
#loading all the required packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
#reading the train and test files; an untouched copy of each frame is kept
#(train_raw / test_raw) because train and test are modified by the cells below
train = pd.read_csv('../input/train.csv')
train_raw = train.copy()
test = pd.read_csv('../input/test.csv')
test_raw = test.copy()
#(rows, columns) of each dataset
print("Train : {}".format(train.shape))
print("Test : {}".format(test.shape))

Train : (891, 12)
Test : (418, 11)


In [3]:
#first five rows of the training set
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
#first five rows of the test set
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [5]:
#some useful information about the data:
#column names, non-null counts, dtypes and memory usage of the training set
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


* There are 7 numeric columns (5 int64 and 2 float64, including the target 'Survived') and 5 object (string) columns.
* There are missing values in 'Age','Cabin' and 'Embarked' columns.


In [6]:
#column names, non-null counts, dtypes and memory usage of the test set
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


* There are 6 numeric columns (the test set has no 'Survived' column) and 5 object (string) columns.
* There are missing values in 'Age','Cabin' and 'Fare' columns.

**PassengerId**

In [7]:
#dropping the PassengerId column from both the train and test datasets
#(the ids remain available in train_raw / test_raw)
for dataset in (train,test):
    dataset.drop(columns = ['PassengerId'],inplace = True)

**Pclass**

These values are not really numbers; they are labels for the Lower, Middle and Upper classes. Hence we convert this feature to the object (string) type.

In [8]:
#store Pclass as strings so that it is treated as a categorical feature
for dataset in (train,test):
    dataset['Pclass'] = dataset['Pclass'].astype(str)

**Name** 

The name of the passenger obviously won't help in predicting the survival but we can extract some features that do help in predicting the survival such as the *Title*

Hence extracting the Title and dropping the Name column

In [9]:
#the Title is the first run of letters that is immediately followed by a '.' in the Name
for dataset in (train,test) :
    #raw string: '\.' must reach the regex engine as an escaped dot; in a normal string
    #literal it is an invalid escape sequence that newer Python versions warn about
    dataset['Title'] = dataset['Name'].str.extract(r'([A-Za-z]+)\.',expand = False)
    #dropping Name column
    dataset.drop(columns = ['Name'],inplace = True)

In [10]:
#number of training passengers per extracted Title
train.groupby('Title')['Title'].count()

Title
Capt          1
Col           2
Countess      1
Don           1
Dr            7
Jonkheer      1
Lady          1
Major         2
Master       40
Miss        182
Mlle          2
Mme           1
Mr          517
Mrs         125
Ms            1
Rev           6
Sir           1
Name: Title, dtype: int64

In [11]:
#number of test passengers per extracted Title
test.groupby('Title')['Title'].count()

Title
Col         2
Dona        1
Dr          1
Master     21
Miss       78
Mr        240
Mrs        72
Ms          1
Rev         2
Name: Title, dtype: int64

In [12]:
#aggregating titles into six groups: Mr, Mrs, Miss, Master, Officer and Royalty
Title_Dict = {
                    "Capt":       "Officer",
                    "Col":        "Officer",
                    "Major":      "Officer",
                    "Jonkheer":   "Royalty",
                    "Don":        "Royalty",
                    "Sir" :       "Royalty",
                    "Dr":         "Officer",
                    "Rev":        "Officer",
                    "Countess":   "Royalty",
                    "Dona":       "Royalty",
                    "Mme":        "Mrs",
                    "Mlle":       "Miss",
                    "Ms":         "Mrs",
                    "Mr" :        "Mr",
                    "Mrs" :       "Mrs",
                    "Miss" :      "Miss",
                    "Master" :    "Master",
                    "Lady" :      "Royalty",
                    #identity entries for the two group names that are not raw titles:
                    #they make the mapping idempotent, so running this cell a second time
                    #no longer turns the already aggregated 'Officer'/'Royalty' values into NaN
                    "Officer" :   "Officer",
                    "Royalty" :   "Royalty"
                    }
#a title that is not a key of Title_Dict still becomes NaN, exactly as before
for dataset in (train,test):
    dataset['Title'] = dataset['Title'].map(Title_Dict)

**Age**

* Filling in the missing values of Age with the mean Age of the passenger's Title group (computed on the training set and truncated to an integer)
* Then creating Age bins

In [13]:
#mean Age of every Title group in the training set, truncated to an integer
mean_age_by_title = train.groupby('Title')['Age'].mean()
Age_dict = mean_age_by_title.astype(int).to_dict()
Age_dict

{'Master': 4, 'Miss': 21, 'Mr': 32, 'Mrs': 35, 'Officer': 46, 'Royalty': 41}

In [14]:
#filling the missing ages of both datasets with the Title-group value from Age_dict
for dataset in (train,test):
    age_missing = dataset['Age'].isnull()
    dataset.loc[age_missing,'Age'] = dataset.loc[age_missing,'Title'].map(Age_dict)

In [15]:
#Creating Age bins
#Taking a look at the categories
#quantile based discretization: q = 5 asks for five equal-frequency intervals of the
#training ages; head() shows the first rows and the full list of intervals below them
pd.qcut(train['Age'],q = 5).head()

0    (20.0, 26.0]
1    (32.0, 38.0]
2    (20.0, 26.0]
3    (32.0, 38.0]
4    (32.0, 38.0]
Name: Age, dtype: category
Categories (5, interval[float64]): [(0.419, 20.0] < (20.0, 26.0] < (26.0, 32.0] < (32.0, 38.0] < (38.0, 80.0]]

In [16]:
#Let's create bins based on the above categories
#the last bin is open-ended (np.inf) instead of stopping at the training maximum of 80,
#so an Age above 80 in another dataset is put into 'Age_5' rather than turned into NaN
bins = [0,20,26,32,38,np.inf]
#one label per bin: Age_1 ... Age_5, shared by train and test
age_labels = ['Age_{}'.format(x) for x in range(1,len(bins))]
train['Age'] = pd.cut(train['Age'],bins = bins,labels = age_labels)
test['Age'] = pd.cut(test['Age'],bins = bins,labels = age_labels)

**SibSp and Parch**

Let's extract new features from SibSp and Parch and drop these two features.

In [17]:
for dataset in (train,test):
    #FamilySize : the passenger (the + 1) plus the SibSp and Parch counts
    dataset['FamilySize'] = 1 + dataset['SibSp'] + dataset['Parch']
    #IsAlone : 1 when FamilySize is exactly 1, otherwise 0
    dataset['IsAlone'] = dataset['FamilySize'].eq(1).astype('int64')
    #SibSp and Parch are dropped once the new features are built
    dataset.drop(columns = ['SibSp','Parch'],inplace = True)


**Ticket**

In [18]:
#the Ticket feature is not used for predicting survival, so it is dropped from both datasets
for dataset in (train,test):
    dataset.drop(columns = ['Ticket'],inplace = True)

**Fare**

* Filling a missing value of Fare in the test set by the mean value of the respective Pclass
* Creating Fare bins

In [19]:
#rows of the test set whose Fare is missing
test.loc[test['Fare'].isnull()]

Unnamed: 0,Pclass,Sex,Age,Fare,Cabin,Embarked,Title,FamilySize,IsAlone
152,3,male,Age_5,,,S,Mr,1,1


In [20]:
#fill every missing Fare with the mean Fare of the passenger's own Pclass in the test set
#(for the single missing row, a Pclass 3 passenger, this is the Pclass 3 mean as before).
#The result is assigned back to the column: calling fillna(inplace = True) on a column
#selection is chained assignment, which pandas warns about and which does not update
#the DataFrame when Copy-on-Write is enabled.
test['Fare'] = test['Fare'].fillna(test.groupby('Pclass')['Fare'].transform('mean'))

In [21]:
#Creating fare bins
#Quantile cut: q = 4 asks for four equal-frequency intervals (quartiles) of the training
#fares; head() shows the first rows and the full list of intervals below them
pd.qcut(train['Fare'],q = 4).head()

0     (-0.001, 7.91]
1    (31.0, 512.329]
2     (7.91, 14.454]
3    (31.0, 512.329]
4     (7.91, 14.454]
Name: Fare, dtype: category
Categories (4, interval[float64]): [(-0.001, 7.91] < (7.91, 14.454] < (14.454, 31.0] < (31.0, 512.329]]

In [22]:
#Creating fare bins based on the above categories
#the lowest edge is slightly below 0 so that a Fare of exactly 0 falls into 'Fare_1';
#the last bin is open-ended (np.inf) instead of stopping at 513, so a Fare above the
#training maximum is put into 'Fare_4' rather than turned into NaN
fare_bins = [-0.001,7.91,14.454,31,np.inf]
#one label per bin: Fare_1 ... Fare_4, shared by train and test
fare_labels = ['Fare_{}'.format(x) for x in range(1,len(fare_bins))]
train['Fare'] = pd.cut(train['Fare'],bins = fare_bins,labels = fare_labels)
test['Fare'] = pd.cut(test['Fare'],bins = fare_bins,labels = fare_labels)

**Cabin**

* Filling the missing values of Cabin with 'U' - Unknown
* Then extracting the first letter in the Cabin number which might be helpful in predicting the survival.

In [23]:
for dataset in (train,test):
    #missing cabins become 'U' and every value is reduced to its first letter.
    #The result is assigned back to the column: fillna(inplace = True) on a column
    #selection is chained assignment, which pandas warns about and which does not
    #update the DataFrame when Copy-on-Write is enabled.
    dataset['Cabin'] = dataset['Cabin'].fillna('U').str[0]

**Embarked**

Filling the missing values of the Embarked column with the mode of that column

In [24]:
#fill the missing Embarked values with the most frequent value of the column.
#The result is assigned back: fillna(inplace = True) on a column selection is chained
#assignment, which pandas warns about and which does not update the DataFrame when
#Copy-on-Write is enabled.
train['Embarked'] = train['Embarked'].fillna(train['Embarked'].mode()[0])

In [25]:
#column dtypes and non-null counts of the test set after the preprocessing above
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 9 columns):
Pclass        418 non-null object
Sex           418 non-null object
Age           418 non-null category
Fare          418 non-null category
Cabin         418 non-null object
Embarked      418 non-null object
Title         418 non-null object
FamilySize    418 non-null int64
IsAlone       418 non-null int64
dtypes: category(2), int64(2), object(5)
memory usage: 24.1+ KB


In [26]:
#column dtypes and non-null counts of the training set after the preprocessing above
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
Survived      891 non-null int64
Pclass        891 non-null object
Sex           891 non-null object
Age           891 non-null category
Fare          891 non-null category
Cabin         891 non-null object
Embarked      891 non-null object
Title         891 non-null object
FamilySize    891 non-null int64
IsAlone       891 non-null int64
dtypes: category(2), int64(3), object(5)
memory usage: 57.9+ KB


In [27]:
#replace the object/category columns of each frame by indicator (dummy) columns;
#the two frames are encoded independently of each other
train,test = (pd.get_dummies(frame) for frame in (train,test))

In [28]:
#separating out target variables and predictor variables
y_train = train['Survived']
#keep exactly the columns of the test matrix, in the same order, instead of dropping a
#hard-coded name: this removes dummy columns that exist only in train (here Cabin_T)
#and would add an all-zero column for any dummy that exists only in test
train = train.drop(columns = ['Survived']).reindex(columns = test.columns,fill_value = 0)

Now we have our dataset ready for training. Let's train different classifiers on the training set and analyze the results.

# **Modelling**

# Logistic Regression

In [29]:
#defining our evaluation metric
from sklearn.model_selection import StratifiedKFold,cross_val_score
def accuracy(model):
    """Return the mean cross-validated accuracy of `model`.

    The model is scored on the notebook-level `train` features and `y_train`
    target with 5 stratified, shuffled folds (fixed random_state = 66).
    """
    folds = StratifiedKFold(n_splits = 5,shuffle = True,random_state = 66)
    fold_scores = cross_val_score(model,X = train,y = y_train,scoring = 'accuracy',cv = folds)
    return fold_scores.mean()

In [30]:
#finding C in LogisticRegression model
#from sklearn.linear_model import LogisticRegression
#for c in [0.001,0.01,0.1,1,10,100]:
    #lr = LogisticRegression(penalty='l2',solver = 'lbfgs',C = c,random_state = 6,max_iter = 500)
    #print("{} : {:.4f}".format(c,accuracy(lr)))

In [31]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(penalty='l2',solver = 'lbfgs',C = 0.1,random_state = 6,max_iter = 100)
#cross-validated accuracy on the training set
cv_accuracy_lr = accuracy(lr)
print("Logistic Regression score : {:.4f}".format(cv_accuracy_lr))
#fit on the whole training set, then predict the test set
pred_lr = lr.fit(train,y_train).predict(test)
#submission file: the original PassengerId next to the predicted Survived value
submission_lr = pd.DataFrame({"PassengerId" : test_raw["PassengerId"], "Survived" : pred_lr })
submission_lr.to_csv("logistic_regression",index = False)

Logistic Regression score : 0.8283


The above submission scored **0.77033** on the public leaderboard.

# Ensembling

# Random Forest Classifier

In [32]:
#cross validation to tune the parameters of random forest
#from sklearn.model_selection import GridSearchCV
#from sklearn.ensemble import RandomForestClassifier
#parameter_grid = {'n_estimators' : [10,50,100,200,500],
#                 'criterion' : ['entropy','gini'],
#                 'max_features' : ['log2', 'sqrt','auto'],
#                  'min_samples_leaf': [1,5,8],
#                 'max_depth': [50,80,90,100,110],
 #                 'min_samples_split':[2,3,5]
  #               }
#skfold = StratifiedKFold(n_splits = 5,shuffle = True,random_state = 66)
#grid_search = GridSearchCV(RandomForestClassifier(random_state = 666),param_grid = parameter_grid,
 #                          scoring = 'accuracy',n_jobs = -1,iid = False,cv = skfold,verbose = 2 )
#grid_search.fit(train,y_train)
#print(grid_search.best_params_)
#print(grid_search.best_score_)

In [33]:
#Let's fit a random forest model
from sklearn.ensemble import RandomForestClassifier
#parameters estimated using cross validation
rf = RandomForestClassifier(n_estimators = 50,max_depth = 90,criterion = 'entropy',
                            max_features = 'log2',min_samples_leaf = 5,random_state = 55,
                            min_samples_split = 2)
#cross-validated accuracy on the training set
cv_accuracy_rf = accuracy(rf)
print("Random Forest score : {:.4f}".format(cv_accuracy_rf))
#fit on the whole training set, then predict the test set
pred_rf = rf.fit(train,y_train).predict(test)
#submission file: the original PassengerId next to the predicted Survived value
submission_rf = pd.DataFrame({"PassengerId" : test_raw["PassengerId"], "Survived" : pred_rf })
submission_rf.to_csv("random_forest",index = False)

Random Forest score : 0.8350


In [34]:
#cross validation
# from sklearn.model_selection import GridSearchCV
# import xgboost as xgb
# param_grid = {#'n_estimators' : [10,100,200,300,400,500,700,1000],
#               #'max_depth' : [2,3,4,5],
#                # 'min_child_weight' : [1,2,3,4]
#                 #'gamma' : [0.0,0.1,0.2,0.3,0.4,0.5],
#                 #'colsample_bytree' : [0.6,0.7,0.8,0.9,1.0],
#                 #'subsample' : [0.6,0.7,0.8,0.9,1.0],
#                 #'reg_alpha' : [0.01,0.03,0.1,0.3,1,3,10,30],
#                 #'reg_lambda' :[0.01,0.03,0.1,0.3,1,3,10,30]
#              } 
# skfold = StratifiedKFold(n_splits = 5,shuffle = True,random_state = 66)
# XGB = xgb.XGBClassifier(learning_rate = 0.05,n_jobs = -1,max_depth = 2,n_estimators = 200,
#                        subsample = 0.9,colsample_bytree = 0.9,min_child_weight = 1,
#                         gamma = 0.0,reg_alpha = 0.01,reg_lambda = 1,random_state = 66)
# grid_search = GridSearchCV(XGB,param_grid = param_grid,scoring = 'accuracy',
#                            n_jobs = -1,iid = False,cv = skfold,verbose = 2)
# grid_search.fit(train,y_train)
# print(grid_search.best_params_)
# print(grid_search.best_score_)



In [35]:
import xgboost as xgb
#parameters found by cross validation
XGB = xgb.XGBClassifier(learning_rate = 0.05,n_jobs = -1,max_depth = 2,n_estimators = 200,
                        subsample = 0.9,colsample_bytree = 0.9,min_child_weight = 1,
                        gamma = 0.0,reg_alpha = 0.01,reg_lambda = 1,
                        random_state = 66)
#cross-validated accuracy on the training set
cv_accuracy_xgb = accuracy(XGB)
print("XGB score : {:.4f}".format(cv_accuracy_xgb))
#fit on the whole training set, then predict the test set
pred_xgb = XGB.fit(train,y_train).predict(test)
#submission file: the original PassengerId next to the predicted Survived value
submission_xgb = pd.DataFrame({"PassengerId" : test_raw["PassengerId"], "Survived" : pred_xgb })
submission_xgb.to_csv("XGBoost",index = False)

XGB score : 0.8429
