# Load datasets 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn

In [100]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [101]:
train.shape, test.shape

((891, 12), (418, 11))

In [102]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [103]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Data preprocessing-1

* Removing Name feature from train and test datasets,  assuming that it does not contain any useful information

In [104]:
#Removing Name features from train and test datasets, assuming that it does not contain any useful information
train.drop('Name', axis=1, inplace=True)
test.drop('Name', axis=1, inplace=True)

In [105]:
catagorical_features = list()
for column in train.columns:
    if train[column].dtypes == 'object':
        catagorical_features.append(column)
numerical_features = [set(train.columns) - set(catagorical_features)]

In [106]:
#train.Ticket.value_counts()

* feature 'Age' has some missing values so replacing missing values with median of Age
* feature 'Cabin' also has some missing values so for this model, we ignore this feature

** featres 'Ticket' is a catagorical feature and it has so many different values(a big set of different values) so we ignore it in this model because while converting this catagorical feature inot numerical resulting in so many dummy columns. So ignoring this feature

** In test data, feature 'Fare' has some missing values. In train data, Fare does not have any missing values. So we simply replace missing values with the median on Fare in train data and then try to predict.

In [107]:
train['Age'].fillna(train.Age.median(), inplace=True)
test['Age'].fillna(train.Age.median(), inplace=True)

test.Fare.fillna( train.Fare.median(), inplace=True)


train.drop(['Cabin', 'Ticket'], axis=1, inplace=True)
test.drop(['Cabin', 'Ticket'], axis=1, inplace=True)

In [108]:
train.shape, test.shape

((891, 9), (418, 8))

* Now we have features with values, we have either removed or replaced missing values of features

In [109]:
catagorical_features.remove('Cabin')
catagorical_features.remove('Ticket')
catagorical_features

['Sex', 'Embarked']

In [110]:
train.Embarked.value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

###### Now convert catagorical features into numerical features

In [111]:
train = pd.get_dummies(data=train, 
                      columns=catagorical_features,
                      drop_first=True)

In [112]:
test = pd.get_dummies(data=test,
                     columns=catagorical_features,
                     drop_first=True)

In [113]:
train.shape,test.shape

((891, 10), (418, 9))

In [114]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
0,1,0,3,22.0,1,0,7.25,1,0,1
1,2,1,1,38.0,1,0,71.2833,0,0,0
2,3,1,3,26.0,0,0,7.925,0,0,1
3,4,1,1,35.0,1,0,53.1,0,0,1
4,5,0,3,35.0,0,0,8.05,1,0,1


In [115]:
test.head()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
0,892,3,34.5,0,0,7.8292,1,1,0
1,893,3,47.0,1,0,7.0,0,0,1
2,894,2,62.0,0,0,9.6875,1,1,0
3,895,3,27.0,0,0,8.6625,1,0,1
4,896,3,22.0,1,1,12.2875,0,0,1


* check after making dummy variables does train and test data have same columns name or not?
* Ensure that both should have same features(features name) after coverting catagorical into numerical

In [116]:
set(train.columns) - set(test.columns)

{'Survived'}

# First round of analysis-1

In [117]:
x, y = train.drop('Survived', axis=1), train['Survived']

In [118]:
from sklearn.model_selection import train_test_split

In [119]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30, random_state=17)

In [120]:
from sklearn.tree import DecisionTreeClassifier

In [121]:
dt1 = DecisionTreeClassifier()

In [122]:
dt1.fit(x_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [123]:
try:
    y_pred = dt1.predict(x_test)
except Exception as e:
    msg = e.msg if hasattr(e, 'msg') else str(e)
    print(msg)

#### Accuracy measure of model-1

In [124]:
from sklearn.metrics import accuracy_score

In [125]:
accuracy_score(y_test, y_pred)

0.7238805970149254

## Now build a new model by tunning the hyper parameter - say model-12

In [126]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold

In [127]:
param_grid = {'max_depth': np.arange(2,11), 'min_samples_leaf': np.arange(2,11)}

In [128]:
dt = DecisionTreeClassifier(random_state=17)

In [129]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=17)

In [130]:
optimal_dt1 = GridSearchCV(estimator=dt, param_grid=param_grid, n_jobs=-1, cv=skf)

In [131]:
optimal_dt1.fit(x_train, y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=17, shuffle=True),
       error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=17,
            splitter='best'),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'max_depth': array([ 2,  3,  4,  5,  6,  7,  8,  9, 10]), 'min_samples_leaf': array([ 2,  3,  4,  5,  6,  7,  8,  9, 10])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [135]:
optimal_dt1.best_estimator_

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=4,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=9, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=17,
            splitter='best')

In [136]:
optimal_dt1.best_score_

0.8426966292134831

In [137]:
y_pred_optimal_dt1 = optimal_dt1.predict(x_test)

In [138]:
accuracy_score(y_test, y_pred_optimal_dt1)

0.7723880597014925

# Now predict on final test set using the model-12 and upload the results on kaggle 

In [140]:
submission = pd.DataFrame(data=test['PassengerId'])


In [142]:
submission['Survived'] = optimal_dt1.predict(test)

In [150]:
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


In [151]:
submission.to_csv('Submission2.csv', index=False)


In [152]:
!ls

Submission2.csv
Untitled.ipynb
gender_submission.csv
test.csv
train.csv


# Second round of analysis-2

#### Load datasets

In [289]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

###### Pre-processing of data

In [290]:
train.shape, test.shape

((891, 12), (418, 11))

In [291]:
train.info(), test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float

(None, None)

In [292]:
train.columns, test.columns

(Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
        'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
       dtype='object'),
 Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
        'Ticket', 'Fare', 'Cabin', 'Embarked'],
       dtype='object'))

In [293]:
# check class distribution is balanced or not
train['Survived'].value_counts(normalize=True)

0    0.616162
1    0.383838
Name: Survived, dtype: float64

In [294]:
# check features have NaN or not
nan_features_train = train.isna().any()
nan_features_test = test.isna().any()

In [295]:
nan_featureas = set()
for i, v in nan_features_test.items():
    if v:
        nan_featureas.add(i)

for i, v in nan_features_train.items():
    if v:
        nan_featureas.add(i)

In [296]:
# seperate out the catagorical and numerical features
catagorical_features = []
for col in train.columns:
    if train[col].dtypes == 'object':
        catagorical_features.append(col)
catagorical_features = set(catagorical_features)
numerical_features = set(train.columns) - catagorical_features


In [297]:
catagorical_features, numerical_features

({'Cabin', 'Embarked', 'Name', 'Sex', 'Ticket'},
 {'Age', 'Fare', 'Parch', 'PassengerId', 'Pclass', 'SibSp', 'Survived'})

In [298]:
nan_catagorical_features = nan_featureas.intersection(catagorical_features)
nan_numerical_features = nan_featureas.intersection(numerical_features)

In [299]:
nan_catagorical_features

{'Cabin', 'Embarked'}

In [300]:
nan_numerical_features

{'Age', 'Fare'}

* find out the percentage of missing values for all the features

In [301]:
missing_percentage_in_train = train[train.columns].isnull().sum()/len(train)
missing_percentage_in_test = test[test.columns].isnull().sum()/len(test)

In [302]:
missing_percentage_in_train, missing_percentage_in_test

(PassengerId    0.000000
 Survived       0.000000
 Pclass         0.000000
 Name           0.000000
 Sex            0.000000
 Age            0.198653
 SibSp          0.000000
 Parch          0.000000
 Ticket         0.000000
 Fare           0.000000
 Cabin          0.771044
 Embarked       0.002245
 dtype: float64, PassengerId    0.000000
 Pclass         0.000000
 Name           0.000000
 Sex            0.000000
 Age            0.205742
 SibSp          0.000000
 Parch          0.000000
 Ticket         0.000000
 Fare           0.002392
 Cabin          0.782297
 Embarked       0.000000
 dtype: float64)

* features having more than 20% missing values should be removed, because they do not contain usefull information so better to remove them.
* And features having less than 20% missing values should be imputed

In [303]:
remove_features, impute_features = set(), set()
for i,v in missing_percentage_in_train.items():
    if v <= 0.25 and v > 0.0:
        impute_features.add(i)
    if v > 0.25:
        remove_features.add(i)
        
for i,v in missing_percentage_in_test.items():
    if v <= 0.25 and v > 0.0:
        impute_features.add(i)
    if v > 0.25:
        remove_features.add(i)

In [304]:
remove_features, impute_features

({'Cabin'}, {'Age', 'Embarked', 'Fare'})

###### Impute and remove selective features

In [305]:
# add 'Name' feature in the list of remove_features because there is no useful information that needs to be learned
remove_features.add('Name')
remove_features.add('Ticket')

In [306]:
train.drop(columns=remove_features, axis=1, inplace=True)

test.drop(columns=remove_features, axis=1, inplace=True)

In [307]:
train.columns,test.columns
len(train.Age)

891

###### Impute selective features #######

In [308]:
from sklearn.preprocessing import Imputer

In [309]:
num_impute_features = impute_features.intersection(numerical_features)
imputer = Imputer(strategy='median')
for feat in num_impute_features:
    print('Before impute train:', train[feat].isnull().any(), 'test', test[feat].isnull().any())
    imputer.fit( np.array(train[feat]).
                reshape( len( np.array( train[feat])), 1))
    
    train[feat] = imputer.transform( np.array(train[feat]).
                                   reshape( len(train[feat]), 1))
    
    test[feat] = imputer.transform( np.array(test[feat]).
                                  reshape( len(test[feat]), 1))
    print('After impute train:', train[feat].isnull().any(), 'test', test[feat].isnull().any())
    
cat_impute_features = impute_features.intersection(catagorical_features)
imputer = Imputer(strategy='most_frequent')
for feat in cat_impute_features:
    print('Before impute train:', train[feat].isnull().any(), 'test', test[feat].isnull().any())
    train[feat].fillna(train[feat].mode()[0], inplace=True)
    test[feat].fillna(train[feat].mode()[0], inplace=True)
    print('After impute train:', train[feat].isnull().any(), 'test', test[feat].isnull().any())

Before impute train: False test True
After impute train: False test False
Before impute train: True test True
After impute train: False test False
Before impute train: True test False
After impute train: False test False


###### Now replace catagorical data into numerical using one hot encoding

In [310]:
cat_covertable_feats = catagorical_features - remove_features

In [311]:
train = pd.get_dummies(data=train, columns=list(cat_covertable_feats), drop_first=True )
test = pd.get_dummies(data=test, columns=list(cat_covertable_feats), drop_first=True)

In [312]:
train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare',
       'Embarked_Q', 'Embarked_S', 'Sex_male'],
      dtype='object')

###### Now devide data into train and test sets 

In [313]:
x, y = train.drop(columns='Survived', axis=0), train['Survived']

In [314]:
from sklearn.model_selection import train_test_split

In [315]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30, random_state=17)

###### Build decision tree model on newly pre-processed data

In [316]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold

In [317]:
dt2 = DecisionTreeClassifier(random_state=17)

In [318]:
skf2 = StratifiedKFold(n_splits=5, shuffle=True, random_state=17)

In [319]:
grid_param = {'max_depth': np.arange(2,15), 'min_samples_leaf': np.arange(2, 11)}
best_dt2 = GridSearchCV(estimator=dt2, param_grid=grid_param, cv=skf2, n_jobs=-1)

In [320]:
best_dt2.fit(x_train, y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=17, shuffle=True),
       error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=17,
            splitter='best'),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'max_depth': array([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14]), 'min_samples_leaf': array([ 2,  3,  4,  5,  6,  7,  8,  9, 10])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [324]:
y_pred = best_dt2.predict(x_test)

In [325]:
from sklearn.metrics import accuracy_score

In [326]:
accuracy_score(y_test, y_pred)

0.7686567164179104

###### finally predict on original test set and submit the answer for evaluation

In [332]:
submission = pd.DataFrame(data=test['PassengerId'])

In [335]:
submission['Survived'] = best_dt2.predict(test)