# Data Pre-processing

In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

Import the dataset

In [6]:
# Read the training CSV; the relative path is resolved against the working directory.
csv_name = "train_formatted.csv"
dataset = pd.read_csv(csv_name)
# Bare expression on the last line so the notebook renders the frame.
dataset

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,unknown,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,C
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,unknown,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C,S
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,unknown,S
...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,unknown,S
887,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B,S
888,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,unknown,S
889,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C,C


Extracting matrix of features and dependent variable

In [0]:
# Columns are selected by position: 1 and 3-10 become the feature matrix,
# position 0 alone is the dependent variable.
# NOTE(review): judging by the frame displayed above, position 0 looks like
# Survived and the skipped position 2 like Name - confirm against the CSV header.
feature_positions = [1, 3, 4, 5, 6, 7, 8, 9, 10]
target_position = 0
X = dataset.iloc[:, feature_positions].values
y = dataset.iloc[:, target_position].values

In [8]:
# Display the feature matrix as extracted, before any cleaning or encoding.
X

array([[3, 'male', 22.0, ..., 7.25, 'unknown', 'S'],
       [1, 'female', 38.0, ..., 71.2833, 'C', 'C'],
       [3, 'female', 26.0, ..., 7.925, 'unknown', 'S'],
       ...,
       [3, 'female', nan, ..., 23.45, 'unknown', 'S'],
       [1, 'male', 26.0, ..., 30.0, 'C', 'C'],
       [3, 'male', 32.0, ..., 7.75, 'unknown', 'Q']], dtype=object)

Dealing with Missing Values

In [0]:
# Fill the missing (NaN) entries of feature column 2 with that column's mean.
# NOTE(review): the mean is computed over all rows, i.e. before the
# train/test split performed further down.
from sklearn.impute import SimpleImputer

# A 2:3 slice (instead of the plain index 2) keeps the selection two-dimensional.
col_block = slice(2, 3)
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X[:, col_block] = imputer.fit_transform(X[:, col_block])

In [10]:
# Display X again to check that the NaN entries in column 2 were replaced.
X

array([[3, 'male', 22.0, ..., 7.25, 'unknown', 'S'],
       [1, 'female', 38.0, ..., 71.2833, 'C', 'C'],
       [3, 'female', 26.0, ..., 7.925, 'unknown', 'S'],
       ...,
       [3, 'female', 29.69911764705882, ..., 23.45, 'unknown', 'S'],
       [1, 'male', 26.0, ..., 30.0, 'C', 'C'],
       [3, 'male', 32.0, ..., 7.75, 'unknown', 'Q']], dtype=object)

Encoding categorical variables

In [0]:
# Replace the values in columns 0, 1, 7 and 8 with integer codes.
# OneHotEncoder is imported here because the dummy-coding cells below rely on it.
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

labelencoder_X = LabelEncoder()
# The one encoder object is re-fitted for each column, so after the loop it
# only holds the mapping learned for the last column (8).
for col_idx in (0, 1, 7, 8):
    X[:, col_idx] = labelencoder_X.fit_transform(X[:, col_idx])

In [0]:
# Removing ticket column
# X has nine columns at this point (see the extraction cell); position 5 is
# dropped and the other eight keep their relative order.
X = np.delete(X, 5, axis=1)

Dummy coding categorical variables with more than 2 categories

In [0]:
# Dummy coding of column 0 using OneHotEncoder; all other columns pass through unchanged.
# NOTE(review): column 0 appears to hold the label-encoded passenger class - confirm.
from sklearn.compose import ColumnTransformer

transformer = ColumnTransformer(
    transformers=[("OneHot", OneHotEncoder(), [0])],  # (name, transformer, column(s) it is applied to)
    remainder='passthrough',  # do not apply anything to the remaining columns
)
encoded = transformer.fit_transform(X.tolist())

# To avoid the dummy variable trap, one of the dummy variables is removed:
# the cast to float64 is followed by dropping the first column of the result.
X = encoded.astype('float64')[:, 1:]

In [0]:
# Dummy coding of column 7 using OneHotEncoder; all other columns pass through unchanged.
# NOTE(review): the previous dummy-coding step changed the column layout; index 7
# appears to be the label-encoded Cabin column (assuming the first encoded
# column had three categories) - confirm.
from sklearn.compose import ColumnTransformer

transformer = ColumnTransformer(
    transformers=[("OneHot", OneHotEncoder(), [7])],  # (name, transformer, column(s) it is applied to)
    remainder='passthrough',  # do not apply anything to the remaining columns
)
encoded = transformer.fit_transform(X.tolist())

# To avoid the dummy variable trap, one of the dummy variables is removed:
# the cast to float64 is followed by dropping the first column of the result.
X = encoded.astype('float64')[:, 1:]

In [0]:
# Dummy coding of column 15 using OneHotEncoder; all other columns pass through unchanged.
# NOTE(review): after the two earlier dummy-coding steps, index 15 appears to be
# the label-encoded Embarked column (the last column at this stage, assuming
# nine Cabin categories) - confirm.
from sklearn.compose import ColumnTransformer

transformer = ColumnTransformer(
    transformers=[("OneHot", OneHotEncoder(), [15])],  # (name, transformer, column(s) it is applied to)
    remainder='passthrough',  # do not apply anything to the remaining columns
)
encoded = transformer.fit_transform(X.tolist())

# To avoid the dummy variable trap, one of the dummy variables is removed:
# the cast to float64 is followed by dropping the first column of the result.
X = encoded.astype('float64')[:, 1:]

In [16]:
# Spot-check one fully encoded row (row index 1) of the feature matrix.
X[1,:]

array([ 0.    ,  0.    ,  0.    ,  0.    ,  1.    ,  0.    ,  0.    ,
        0.    ,  0.    ,  0.    ,  0.    ,  0.    ,  0.    ,  0.    ,
       38.    ,  1.    ,  0.    , 71.2833])

Splitting the dataset into training set and test set

In [0]:
# Hold out 20% of the rows as the test set; the fixed random_state makes the
# split reproducible across runs.
from sklearn.model_selection import train_test_split

split_parts = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = split_parts

Applying feature scaling

In [0]:
from sklearn.preprocessing import StandardScaler

sc_X2 = StandardScaler()
# The scaler is fitted on the training rows only and then applied to both sets.
# Every column is standardised, the 0/1 dummy columns included.
sc_X2.fit(X_train)
X_train = sc_X2.transform(X_train)
X_test = sc_X2.transform(X_test)

In [19]:
# Spot-check row index 1 of the standardised training matrix.
X_train[1,:]

array([-0.31426968,  0.61414657, -0.05307449, -0.23076923, -0.25664813,
       -0.19468147, -0.20973381, -0.10660036, -0.06504853, -0.03750293,
        0.54488848,  1.96893685, -1.11944833,  0.72882288,  0.09662937,
       -0.46445234, -0.47741019, -0.42640542])

In [37]:
# Dimensions of the training matrix as (rows, columns).
X_train.shape

(712, 18)

# Applying Ensemble Learning

In [20]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# Three base learners: k-nearest neighbours, an RBF-kernel SVM and a random forest.
clf1 = KNeighborsClassifier(n_neighbors=5, p=2, metric="minkowski")
clf2 = SVC(kernel='rbf', random_state=0)
clf3 = RandomForestClassifier(n_estimators=200, criterion="entropy", random_state=0)

# voting='hard': the ensemble predicts the majority class label of the base learners.
base_estimators = [('knn', clf1), ('svc', clf2), ('rf', clf3)]
eclf1 = VotingClassifier(estimators=base_estimators, voting='hard')
# Kept as the last expression so the fitted estimator is shown as the cell output.
eclf1.fit(X_train, y_train)



VotingClassifier(estimators=[('knn',
                              KNeighborsClassifier(algorithm='auto',
                                                   leaf_size=30,
                                                   metric='minkowski',
                                                   metric_params=None,
                                                   n_jobs=None, n_neighbors=5,
                                                   p=2, weights='uniform')),
                             ('svc',
                              SVC(C=1.0, break_ties=False, cache_size=200,
                                  class_weight=None, coef0=0.0,
                                  decision_function_shape='ovr', degree=3,
                                  gamma='scale', kernel='rbf', max_iter=-1,
                                  probability=False, ra...
                                                     criterion='entropy',
                                                     max_depth=None,
  

# Making Predictions and checking accuracy

Making predictions using classifier

In [0]:
#predicting results
# Class labels predicted by the voting ensemble for the held-out test rows.
y_pred=eclf1.predict(X_test)


Building confusion matrix

In [22]:
# Count correct and incorrect test-set predictions from the confusion matrix.
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_pred)
# Correct predictions lie on the diagonal; every off-diagonal entry is an error.
# Trace/sum replace the hard-coded [0,0]/[1,1]/[0,1]/[1,0] lookups so the counts
# stay right when the matrix is not 2x2 (e.g. only one class occurs in the
# test rows, which would otherwise raise an IndexError).
pred_true = np.trace(cm)
pred_false = cm.sum() - pred_true
print("correct predictions:"+str(pred_true))
print("incorrect predictions:"+str(pred_false))

correct predictions:146
incorrect predictions:33


Applying k-fold cross validation

In [0]:
from sklearn.model_selection import cross_val_score

# Returns the k accuracies of the k experiments (k = 10 folds here).
# scoring is spelled out so the name `accuracies` is guaranteed to match the metric.
# NOTE(review): X_train was standardised on all training rows before this call,
# so each validation fold has already influenced the scaler; wrapping scaler and
# classifier in a Pipeline would give a cleaner estimate.
accuracies = cross_val_score(estimator=eclf1, X=X_train, y=y_train, cv=10, scoring="accuracy")

In [42]:
# Standard deviation of the per-fold scores (how much they vary between folds).
accuracies.std()

0.022307881150360073

In [43]:
# Mean of the per-fold scores.
accuracies.mean()

0.8076291079812206