##Importing Libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re

##Importing Data

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

##Data Engineering

In [3]:
def get_title(x):
    name = x['Name']
    title_search = re.search(' ([A-Za-z]+)\.', name)
    if title_search:
        title = title_search.group(1)
    if title in ['Don', 'Major', 'Capt', 'Jonkheer', 'Rev', 'Col','Sir']:
        return 'Mr'
    elif title in ['Countess', 'Mme','Dona','Lady']:
        return 'Mrs'
    elif title in ['Mlle', 'Ms']:
        return 'Miss'
    elif title =='Dr':
        if x['Sex']=='Male':
            return 'Mr'
        else:
            return 'Mrs'
    else:
        return title

In [4]:
train['Cabin']=train['Cabin'].fillna(0)
test['Cabin']=test['Cabin'].fillna(0)
train['hasCabin']=train['Cabin'].apply(lambda x: 0 if x==0 else 1)
test['hasCabin']=test['Cabin'].apply(lambda x: 0 if x==0 else 1)

In [5]:
train['FamilyMem']=train.apply(lambda x: x['SibSp']+x['Parch'], axis=1)
test['FamilyMem']=test.apply(lambda x: x['SibSp']+x['Parch'], axis=1)

In [6]:
train['title']=train.apply(get_title, axis=1)
test['title']=test.apply(get_title, axis=1)

In [7]:
fcol=['Pclass','Sex','Embarked','hasCabin','title','Age','FamilyMem','Fare']
train_df=train[fcol]
test_df=test[fcol]

In [8]:
X = train_df.iloc[:, 0:].values
y = train['Survived'].values

##Imputing Missing data

In [9]:
def impute_missing_data(X):
  from sklearn.impute import SimpleImputer
  imputer = SimpleImputer(missing_values=np.nan,strategy='mean')
  imputer.fit(X[:, 5:])
  X[:, 5:] = imputer.transform(X[:, 5:])
  imputer2 = SimpleImputer(missing_values=np.nan,strategy='most_frequent')
  imputer2.fit(X[:, 0:4])
  X[:, 0:4] = imputer2.transform(X[:, 0:4])
  return X

##Encoding categorical data

In [10]:
def encode_categorical_data(X):
  from sklearn.compose import ColumnTransformer
  from sklearn.preprocessing import OneHotEncoder
  ct = ColumnTransformer(transformers=[('encoder',OneHotEncoder(), [0,1,2,3,4])],remainder='passthrough')
  X = np.array(ct.fit_transform(X))
  return X

In [11]:
X = impute_missing_data(X)
X = encode_categorical_data(X)

##Splitting the data between Training and Test dataset

In [12]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2, random_state=32)

##Feature Scaling

In [13]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

##Training the classifiers

In [14]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 400, criterion = 'entropy', random_state = 32, max_depth = 5)
classifier.fit(X_train, y_train)

RandomForestClassifier(criterion='entropy', max_depth=5, n_estimators=400,
                       random_state=32)

## Predicting the Test set results

In [15]:
y_pred = classifier.predict(X_test)
print(y_pred)

[0 1 1 0 0 1 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 1 0 0 0 0 0 0 1 1 0 1 0 1 1 0 1
 0 0 1 1 0 1 0 0 0 0 1 0 0 1 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 1
 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 1 0 0 0 1 1 0 1 1 1 0 0 0 1 1 1 1 0 0 0 0
 0 0 0 0 0 1 0 0 1 0 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 1 1 1 0 0 1
 0 1 1 0 1 0 1 0 0 1 1 0 0 1 0 0 1 1 0 0 0 0 0 1 0 1 0 1 0 1 0]


## Making the Confusion Matrix

In [16]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
print(accuracy_score(y_test, y_pred))

[[96 12]
 [22 49]]
0.8100558659217877


## Applying k-Fold Cross Validation

In [17]:
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = classifier, X = X_train, y = y_train, cv = 10)
print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

Accuracy: 83.99 %
Standard Deviation: 2.97 %


## Applying Grid Search to find the best model and the best parameters

In [18]:
# from sklearn.model_selection import GridSearchCV
# parameters = { 
#     'n_estimators': [100,200,300,400],
#     'max_features': ['auto', 'sqrt', 'log2'],
#     'max_depth' : [4,5,6,7,8],
#     'criterion' :['gini', 'entropy']
# }
# grid_search = GridSearchCV(estimator = classifier,
#                            param_grid = parameters,
#                            scoring = 'accuracy',
#                            cv = 10,
#                            n_jobs = -1)
# grid_search = grid_search.fit(X_train, y_train)
# best_accuracy = grid_search.best_score_
# best_parameters = grid_search.best_params_
# print("Best Accuracy: {:.2f} %".format(best_accuracy*100))
# print("Best Parameters:", best_parameters)

KeyboardInterrupt: 

In [None]:
X_testset = test_df.to_numpy()
X_testset = impute_missing_data(X_testset)
X_testset = encode_categorical_data(X_testset)
X_testset = sc.transform(X_testset)
y_pred_testset = classifier.predict(X_testset)
print(y_pred_testset)