In [154]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [155]:
data = pd.read_csv('Titanic dataset.csv')

In [156]:
# reading data
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [157]:
#checking dimensions of the dataset
data.shape

(418, 12)

In [158]:
# checking unique values
data.value_counts()

PassengerId  Survived  Pclass  Name                                           Sex     Age   SibSp  Parch  Ticket    Fare      Cabin    Embarked
904          1         1       Snyder, Mrs. John Pillsbury (Nelle Stevenson)  female  23.0  1      0      21228     82.2667   B45      S           1
1164         1         1       Clark, Mrs. Walter Miller (Virginia McDowell)  female  26.0  1      0      13508     136.7792  C89      C           1
1213         0         3       Krekorian, Mr. Neshan                          male    25.0  0      0      2654      7.2292    F E57    C           1
1208         0         1       Spencer, Mr. William Augustus                  male    57.0  1      0      PC 17569  146.5208  B78      C           1
1206         1         1       White, Mrs. John Stuart (Ella Holmes)          female  55.0  0      0      PC 17760  135.6333  C32      C           1
                                                                                                               

In [159]:
# checking missiing data
data.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [160]:
data.fillna({'Age':data.Age.median()} , inplace = True)

In [161]:
data.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [162]:
data.drop('Cabin',axis=1,inplace=True)

In [163]:
data.Embarked.value_counts()

S    270
C    102
Q     46
Name: Embarked, dtype: int64

In [164]:
data.fillna({'Embarked':data.Embarked.mode()[0]},inplace = True)

In [165]:
data.dropna(subset=['Fare'],inplace=True)

In [166]:
data.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

In [167]:
from sklearn.preprocessing import LabelEncoder

In [168]:
# Encode categorical variables
label_encoder = LabelEncoder()
data['Sex'] = label_encoder.fit_transform(data['Sex'])
data['Embarked'] = label_encoder.fit_transform(data['Embarked'])

In [169]:
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
X = data[features]   # feature
y = data['Survived']  # target variable

In [170]:
# Split Data Set In To Train Test
from sklearn.model_selection import train_test_split
Xtrain,Xtest,ytrain,ytest = train_test_split(X,y,test_size=.20,random_state=12)

In [171]:
#feature scaling
from sklearn.preprocessing import StandardScaler

In [172]:
std = StandardScaler()
std_train = std.fit_transform(Xtrain)
std_test = std.fit_transform(Xtest)

# USING LOGISTIC REGRESSION 

In [173]:
from sklearn.linear_model import LogisticRegression

In [174]:
# Create a LogisticRegression Classifier
log = LogisticRegression(max_iter=2000)
# Train the model
log.fit(std_train,ytrain)

LogisticRegression(max_iter=2000)

In [175]:
print('Train_Score')
log.score(std_train,ytrain)

Train_Score


1.0

In [176]:
print('Test_Score')
log.score(std_test,ytest)

Test_Score


1.0

# USING RANDOM FOREST

In [177]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [178]:
# Create a Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
# Train the model
rf_classifier.fit(Xtrain, ytrain)

RandomForestClassifier(random_state=42)

In [179]:
rf_classifier.predict(Xtest)

array([0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1,
       0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0], dtype=int64)

In [180]:
print('Accuracy_score')
accuracy_score(ytest,rf_classifier.predict(Xtest))

Accuracy_score


1.0

In [181]:
print('confusion_matrix')
confusion_matrix(ytest,rf_classifier.predict(Xtest))

confusion_matrix


array([[53,  0],
       [ 0, 31]], dtype=int64)

In [182]:
print('Classification report')
classification_report(ytest,rf_classifier.predict(Xtest))

Classification report


'              precision    recall  f1-score   support\n\n           0       1.00      1.00      1.00        53\n           1       1.00      1.00      1.00        31\n\n    accuracy                           1.00        84\n   macro avg       1.00      1.00      1.00        84\nweighted avg       1.00      1.00      1.00        84\n'