In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder,StandardScaler,MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression

In [9]:

df = pd.read_csv('dataset/titanic.csv')
df


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,,3,"Heikkinen, Miss. Laina",female,,0,0,STON/O2. 3101282,,,S
3,4,,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0.0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1.0,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0.0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1.0,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [10]:

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     887 non-null    float64
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          713 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         888 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(3), int64(4), object(5)
memory usage: 83.7+ KB


In [11]:
df.isnull().sum()

PassengerId      0
Survived         4
Pclass           0
Name             0
Sex              0
Age            178
SibSp            0
Parch            0
Ticket           0
Fare             3
Cabin          687
Embarked         2
dtype: int64

In [12]:
df.drop(['PassengerId','Ticket','Fare','Cabin','Name'] , inplace=True , axis=1)

In [13]:

df.fillna(method='ffill' , inplace=True)

In [14]:

df.isnull().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Embarked    0
dtype: int64

In [15]:
x = df.iloc[:,1:]

In [16]:
x

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Embarked
0,3,male,22.0,1,0,S
1,1,female,38.0,1,0,C
2,3,female,38.0,0,0,S
3,1,female,35.0,1,0,S
4,3,male,35.0,0,0,S
...,...,...,...,...,...,...
886,2,male,27.0,0,0,S
887,1,female,19.0,0,0,S
888,3,female,19.0,1,2,S
889,1,male,26.0,0,0,C


In [17]:

y = df.iloc[:,0]

In [18]:
y

0      0.0
1      0.0
2      0.0
3      0.0
4      0.0
      ... 
886    0.0
887    1.0
888    0.0
889    1.0
890    0.0
Name: Survived, Length: 891, dtype: float64

In [19]:
x_train,x_test,y_train,y_test = train_test_split(x , y , test_size=0.2 ,random_state=2)


In [20]:
x_train

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Embarked
30,1,male,40.0,0,0,C
10,3,female,4.0,1,1,S
873,3,male,47.0,0,0,S
182,3,male,9.0,4,2,S
876,3,male,20.0,0,0,S
...,...,...,...,...,...,...
534,3,female,30.0,0,0,S
584,3,male,36.0,0,0,C
493,1,male,71.0,0,0,C
527,1,male,50.0,0,0,S


In [21]:
x_test

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Embarked
707,1,male,42.0,0,0,S
37,3,male,21.0,0,0,S
615,2,female,24.0,1,2,S
169,3,male,28.0,0,0,S
68,3,female,17.0,4,2,S
...,...,...,...,...,...,...
89,3,male,24.0,0,0,S
80,3,male,22.0,0,0,S
846,3,male,42.0,8,2,S
870,3,male,26.0,0,0,S


In [22]:
y_train

30     0.0
10     1.0
873    0.0
182    0.0
876    0.0
      ... 
534    0.0
584    0.0
493    0.0
527    0.0
168    0.0
Name: Survived, Length: 712, dtype: float64

In [23]:
y_test

707    1.0
37     0.0
615    1.0
169    0.0
68     1.0
      ... 
89     0.0
80     0.0
846    0.0
870    0.0
251    0.0
Name: Survived, Length: 179, dtype: float64

In [24]:
tr = ColumnTransformer(transformers=[
    ('t2',OneHotEncoder(drop='first',sparse=False),['Sex','Embarked'])
] , remainder='passthrough')


In [25]:
a = tr.fit_transform(x_train)
a


array([[ 1.,  0.,  0., ..., 40.,  0.,  0.],
       [ 0.,  0.,  1., ...,  4.,  1.,  1.],
       [ 1.,  0.,  1., ..., 47.,  0.,  0.],
       ...,
       [ 1.,  0.,  0., ..., 71.,  0.,  0.],
       [ 1.,  0.,  1., ..., 50.,  0.,  0.],
       [ 1.,  0.,  1., ..., 45.,  0.,  0.]])

In [26]:

b = tr.fit_transform(x_test)
b


array([[ 1.,  0.,  1., ..., 42.,  0.,  0.],
       [ 1.,  0.,  1., ..., 21.,  0.,  0.],
       [ 0.,  0.,  1., ..., 24.,  1.,  2.],
       ...,
       [ 1.,  0.,  1., ..., 42.,  8.,  2.],
       [ 1.,  0.,  1., ..., 26.,  0.,  0.],
       [ 0.,  0.,  1., ..., 29.,  1.,  1.]])

In [27]:
log = LogisticRegression()

In [29]:
log.fit(a,y_train)

LogisticRegression()

In [30]:
y_pred = log.predict(b)
y_pred

array([0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 1., 0.,
       0., 1., 0., 1., 1., 0., 1., 0., 1., 1., 0., 0., 0., 0., 0., 0., 1.,
       0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 1., 0.,
       0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 1., 0., 1., 0.,
       0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 1., 0., 1., 0., 0., 0., 1.,
       1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 1., 0.,
       1., 1., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 1., 0.,
       0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 1., 0., 0.,
       0., 0., 0., 0., 1., 1., 1., 1., 0., 1., 0., 0., 0., 1., 0., 0., 0.,
       0., 1., 0., 0., 1., 1., 0., 1., 0., 0., 0., 1., 1., 0., 0., 1., 0.,
       0., 1., 1., 1., 0., 0., 0., 0., 0.])

In [31]:
from sklearn.metrics import accuracy_score , confusion_matrix


In [32]:
print("Accuracy matrix =",accuracy_score(y_test,y_pred))


Accuracy matrix = 0.7821229050279329


In [33]:
pd.DataFrame(confusion_matrix(y_test , y_pred)) # if all the values are comes along the diagonal then accurac

Unnamed: 0,0,1
0,92,8
1,31,48


In [34]:

result = pd.DataFrame()
result['Actual Label'] = y_test
result['Decision Tree Prediction'] = y_pred

In [35]:
result

Unnamed: 0,Actual Label,Decision Tree Prediction
707,1.0,0.0
37,0.0,0.0
615,1.0,1.0
169,0.0,0.0
68,1.0,0.0
...,...,...
89,0.0,0.0
80,0.0,0.0
846,0.0,0.0
870,0.0,0.0
