<a href="https://colab.research.google.com/github/mlvika/Machine_learining/blob/main/Titanic_dataset_analysis_(logistic_regression).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### ML model to predict survival rate in titanic dataset
 

In [1]:
# importing libraries
import os
import io
import warnings

import numpy as np
import scipy as sp
import pandas as pd
import sklearn as sk
import numpy.ma as ma

import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()


# evaluation metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

# logistic regression
from sklearn.linear_model import LogisticRegression

warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
# load data
titanic = pd.read_csv('https://raw.githubusercontent.com/zariable/data/master/titanic_train.csv')
display(titanic.head())

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Identify and replace nulls

In [4]:
display(titanic.isna().sum())

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

Update :

*   the missing values of column age with the mean of that column.

*   the missing values of column Cabin with a constant value 'other'.

*   the missing values of column Embarked with the mode of that column.








In [3]:
titanic.Age = round(titanic.Age.fillna(titanic.Age.mean()),1)
titanic.Cabin = titanic.Cabin.fillna('Other')
mode = titanic.Embarked.mode()
titanic.Embarked = titanic.Embarked.fillna(mode[0])
display(titanic.head())

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,Other,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,Other,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,Other,S


Convert to categorical variables

In [5]:
titanic.Pclass = titanic.Pclass.astype('category')
titanic.Sex = titanic.Sex.astype('category')
titanic.Parch = titanic.Parch.astype('category')
titanic.SibSp = titanic.SibSp.astype('category')
titanic.Cabin = titanic.Cabin.astype('category')
titanic.Embarked = titanic.Embarked.astype('category')
titanic.Survived = titanic.Survived.astype('category')


Convert categories into numeric codes

In [6]:
titanic.Pclass = titanic.Pclass.cat.codes
titanic.Sex = titanic.Sex.cat.codes
titanic.Parch = titanic.Parch.cat.codes
titanic.SibSp = titanic.SibSp.cat.codes
titanic.Cabin = titanic.Cabin.cat.codes
titanic.Embarked = titanic.Embarked.cat.codes
titanic.Survived = titanic.Survived.cat.codes

titanic.info()
titanic.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int8   
 2   Pclass       891 non-null    int8   
 3   Name         891 non-null    object 
 4   Sex          891 non-null    int8   
 5   Age          891 non-null    float64
 6   SibSp        891 non-null    int8   
 7   Parch        891 non-null    int8   
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        891 non-null    int16  
 11  Embarked     891 non-null    int8   
dtypes: float64(2), int16(1), int64(1), int8(6), object(2)
memory usage: 41.9+ KB


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,2,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,146,2
1,2,1,0,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,81,0
2,3,1,2,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,146,2
3,4,1,0,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,55,2
4,5,0,2,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,146,2


Selecting factors and target variable

In [7]:
train_x = titanic.loc[:,['Pclass', 'Sex', 'Age', 'Fare', 'Parch', 'Cabin', 'Embarked']]
train_y = titanic.loc[:,['Survived']]

Logistic regression

In [8]:
model = LogisticRegression()
model.fit(train_x,train_y)

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

Checking accuracy with train dataset

In [9]:
predicted_values = model.predict(train_x)
accuracy_score(train_y, predicted_values)

0.7957351290684624

Confusion matix and classification report

In [10]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

confusion_matrix = confusion_matrix(train_y, predicted_values)
print("Confusion Matrix: \n{}\n".format(confusion_matrix))
print("Classification Report: \n{}".format(classification_report(train_y, predicted_values)))

Confusion Matrix: 
[[469  80]
 [102 240]]

Classification Report: 
              precision    recall  f1-score   support

           0       0.82      0.85      0.84       549
           1       0.75      0.70      0.73       342

    accuracy                           0.80       891
   macro avg       0.79      0.78      0.78       891
weighted avg       0.79      0.80      0.79       891



Importing test dataset

In [11]:
test = pd.read_csv("https://raw.githubusercontent.com/zariable/data/master/titanic_test.csv")
display(test.shape)
display(test.head())

(418, 11)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


Handle null values

In [12]:
test.Age = round(test.Age.fillna(test.Age.mean()),1)
test.Fare = round(test.Fare.fillna(test.Fare.mean()),1)
test.Cabin = test.Cabin.fillna('Other')
mode = test.Embarked.mode()
test.Embarked = test.Embarked.fillna(mode[0])
display(test.head())

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8,Other,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,Other,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.7,Other,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.7,Other,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.3,Other,S


Convert to categorical variables

In [13]:
test.Pclass = test.Pclass.astype('category')
test.Sex = test.Sex.astype('category')
test.Parch = test.Parch.astype('category')
test.Cabin = test.Cabin.astype('category')
test.Embarked = test.Embarked.astype('category')

#Convert categories into numeric codes
test.Pclass = test.Pclass.cat.codes
test.Sex = test.Sex.cat.codes
test.Parch = test.Parch.cat.codes
test.Cabin = test.Cabin.cat.codes
test.Embarked = test.Embarked.cat.codes

Fit model and prepare result dataframe with columns "PassengerId" and "Survived"

In [14]:
test_x = test.loc[:,['Pclass', 'Sex', 'Age', 'Fare', 'Parch', 'Cabin', 'Embarked']]
test_prediction = model.predict(test_x)
prediction_df =  pd.DataFrame(test_prediction)
prediction_df['PassengerId'] = test['PassengerId']
prediction_df = prediction_df.rename(columns={0: "Survived"})
display(prediction_df)

Unnamed: 0,Survived,PassengerId
0,0,892
1,1,893
2,0,894
3,0,895
4,1,896
...,...,...
413,0,1305
414,1,1306
415,0,1307
416,0,1308


Save dataframe to csv for submission to Kaggle

In [15]:
from google.colab import drive
drive.mount('/content/drive')
prediction_df.to_csv('drive/My Drive/titanic_prediction.csv', index = False)

Mounted at /content/drive


**Score on Kaggle: 0.76794**