In [93]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler,OrdinalEncoder,OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [94]:
# Load the Titanic passenger table from a CSV path relative to the working directory.
df= pd.read_csv('dataset/titanic.csv')

In [95]:
# Column dtypes and non-null counts: a quick way to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     887 non-null    float64
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          713 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         888 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(3), int64(4), object(5)
memory usage: 83.7+ KB


In [96]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,887.0,891.0,713.0,891.0,891.0,888.0
mean,446.0,0.382187,2.308642,29.704306,0.523008,0.381594,32.28253
std,257.353842,0.486196,0.836071,14.536033,1.102743,0.806057,49.758986
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.0,0.0,0.0,7.9177
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.45625
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.06875
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [97]:
# Number of missing entries in each column.
df.isna().sum()

PassengerId      0
Survived         4
Pclass           0
Name             0
Sex              0
Age            178
SibSp            0
Parch            0
Ticket           0
Fare             3
Cabin          687
Embarked         2
dtype: int64

In [98]:
# Discard the columns that are not used as model inputs: the identifier and
# free-text fields, the mostly-empty Cabin column, and Fare.
df.drop(
    columns=['Ticket', 'PassengerId', 'Cabin', 'Name', 'Fare'],
    inplace=True,
)

In [99]:
# Forward-fill every remaining gap with the previous row's value.
# `DataFrame.ffill` replaces `fillna(method='ffill')`, which is deprecated in
# pandas 2.x; the result is the same.
# NOTE(review): this also fills the 'Survived' target (4 missing labels per
# df.isnull().sum()) and runs before the train/test split. Consider dropping
# unlabeled rows and imputing features from the training split only.
df.ffill(inplace=True)

In [100]:
# Preview the first rows after the column drop and the forward fill.
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked
0,0.0,3,male,22.0,1,0,S
1,0.0,1,female,38.0,1,0,C
2,0.0,3,female,38.0,0,0,S
3,0.0,1,female,35.0,1,0,S
4,0.0,3,male,35.0,0,0,S


In [101]:
# Feature matrix: every column except the first one (the target).
x = df.loc[:, df.columns[1:]]

In [102]:
# Target vector: the first column of the frame ('Survived').
y = df[df.columns[0]]
y

0      0.0
1      0.0
2      0.0
3      0.0
4      0.0
      ... 
886    0.0
887    1.0
888    0.0
889    1.0
890    0.0
Name: Survived, Length: 891, dtype: float64

In [103]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=2,
)

In [104]:
# Inspect the training features (still un-encoded at this point).
x_train

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Embarked
30,1,male,40.0,0,0,C
10,3,female,4.0,1,1,S
873,3,male,47.0,0,0,S
182,3,male,9.0,4,2,S
876,3,male,20.0,0,0,S
...,...,...,...,...,...,...
534,3,female,30.0,0,0,S
584,3,male,36.0,0,0,C
493,1,male,71.0,0,0,C
527,1,male,50.0,0,0,S


In [105]:
# Inspect the held-out test features (still un-encoded at this point).
x_test

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Embarked
707,1,male,42.0,0,0,S
37,3,male,21.0,0,0,S
615,2,female,24.0,1,2,S
169,3,male,28.0,0,0,S
68,3,female,17.0,4,2,S
...,...,...,...,...,...,...
89,3,male,24.0,0,0,S
80,3,male,22.0,0,0,S
846,3,male,42.0,8,2,S
870,3,male,26.0,0,0,S


In [106]:
# Inspect the training labels.
y_train

30     0.0
10     1.0
873    0.0
182    0.0
876    0.0
      ... 
534    0.0
584    0.0
493    0.0
527    0.0
168    0.0
Name: Survived, Length: 712, dtype: float64

In [107]:
# Inspect the held-out test labels.
y_test

707    1.0
37     0.0
615    1.0
169    0.0
68     1.0
      ... 
89     0.0
80     0.0
846    0.0
870    0.0
251    0.0
Name: Survived, Length: 179, dtype: float64

In [108]:
# One-hot encode the two categorical columns (dropping the first level of each
# to avoid redundant indicator columns) and pass the other columns through
# unchanged.
# Dense output is forced with `sparse_threshold=0` on the ColumnTransformer
# rather than the encoder's `sparse=False` argument: that argument was
# deprecated in scikit-learn 1.2 and removed in 1.4, where it raises TypeError.
# `sparse_threshold=0` works on both older and current releases.
tr = ColumnTransformer(
    transformers=[
        ('t2', OneHotEncoder(drop='first'), ['Sex', 'Embarked']),
    ],
    remainder='passthrough',
    sparse_threshold=0,
)

In [109]:
# Fit the column transformer on the training features and encode them in one step.
x1 = tr.fit_transform(x_train)

In [110]:
# Encode the test features with the transformer already fitted on the training
# data. Re-fitting on the test set (`fit_transform`) would learn the categories
# from held-out data and could produce columns that do not line up with what
# the model was trained on.
x2 = tr.transform(x_test)

In [111]:
# Logistic-regression classifier with a fixed random_state; no other arguments are passed.
# NOTE(review): StandardScaler is imported but never applied in the visible cells —
# confirm whether feature scaling was intended before fitting.
log = LogisticRegression(random_state=0)

In [112]:
# Train the classifier on the encoded training features and their labels.
log.fit(x1,y_train)

LogisticRegression(random_state=0)

In [113]:
# Predict labels for the encoded test features.
y_pred = log.predict(x2)

In [114]:
from sklearn.metrics import accuracy_score,confusion_matrix

In [115]:
# Share of test rows whose predicted label matches the actual label.
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy Score", test_accuracy)

Accuracy Score 0.7821229050279329


In [116]:
# Tabulate the confusion matrix for the test predictions, labelling the columns 0 and 1.
cm_counts = confusion_matrix(y_test, y_pred)
pd.DataFrame(cm_counts, columns=[0, 1])

Unnamed: 0,0,1
0,92,8
1,31,48


In [117]:
# Side-by-side view of actual vs. predicted labels for the test rows.
# The frame takes its index from `y_test`; `y_pred` is an array in the same row
# order, so it is attached positionally.
# The prediction column is named after the model that produced it
# (LogisticRegression); it was previously mislabelled as a decision tree.
result = pd.DataFrame({
    'Actual Label': y_test,
    'Logistic Regression Prediction': y_pred,
})
result

Unnamed: 0,Actual Label,Decision Tree Prediction
707,1.0,0.0
37,0.0,0.0
615,1.0,1.0
169,0.0,0.0
68,1.0,0.0
...,...,...
89,0.0,0.0
80,0.0,0.0
846,0.0,0.0
870,0.0,0.0
