<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Importing-Dependencies" data-toc-modified-id="Importing-Dependencies-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Importing Dependencies</a></span></li><li><span><a href="#Data-Loading-&amp;-Processing" data-toc-modified-id="Data-Loading-&amp;-Processing-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Data Loading &amp; Processing</a></span><ul class="toc-item"><li><span><a href="#Handling-the-Missing-values" data-toc-modified-id="Handling-the-Missing-values-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Handling the Missing values</a></span></li><li><span><a href="#Data-Analysis" data-toc-modified-id="Data-Analysis-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>Data Analysis</a></span></li><li><span><a href="#Data-Visualization" data-toc-modified-id="Data-Visualization-2.3"><span class="toc-item-num">2.3&nbsp;&nbsp;</span>Data Visualization</a></span></li><li><span><a href="#Encoding-the-Categorical-Columns" data-toc-modified-id="Encoding-the-Categorical-Columns-2.4"><span class="toc-item-num">2.4&nbsp;&nbsp;</span>Encoding the Categorical Columns</a></span></li></ul></li><li><span><a href="#Separating-features-&amp;-Target" data-toc-modified-id="Separating-features-&amp;-Target-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Separating features &amp; Target</a></span></li><li><span><a href="#Splitting-the-data-into-training-data-&amp;-Test-data" data-toc-modified-id="Splitting-the-data-into-training-data-&amp;-Test-data-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Splitting the data into training data &amp; Test data</a></span></li><li><span><a href="#Model-Training" data-toc-modified-id="Model-Training-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Model Training</a></span></li><li><span><a href="#Model-Evaluation" data-toc-modified-id="Model-Evaluation-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Model Evaluation</a></span></li><li><span><a 
href="#Save-the-trained-model" data-toc-modified-id="Save-the-trained-model-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Save the trained model</a></span></li></ul></div>

### Importing Dependencies

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import pickle

### Data Loading & Processing

In [3]:
# Read the Titanic training set from a CSV located in the working directory.
titanic_data = pd.read_csv(filepath_or_buffer='titanic_train.csv')

In [4]:
# Preview the first five rows of the loaded frame.
titanic_data.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
# (number of rows, number of columns) of the loaded frame
titanic_data.shape

(891, 12)

In [6]:
# Print a per-column summary of the data: non-null counts and dtypes
titanic_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [7]:
# Count the missing values in each column.
titanic_data.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

#### Handling the Missing values

In [None]:
# Remove the "Cabin" column from the frame.
# `columns=` already selects the axis, so no separate `axis` argument is needed.
titanic_data = titanic_data.drop(columns=['Cabin'])

In [None]:
# Replace the missing values in the "Age" column with the column mean.
# The filled Series is assigned back to the column instead of calling
# fillna(inplace=True) on the `titanic_data['Age']` selection: the in-place form
# mutates an intermediate object, which pandas flags as chained assignment and
# which leaves the frame unchanged when Copy-on-Write is enabled.
mean_age = titanic_data['Age'].mean()
titanic_data['Age'] = titanic_data['Age'].fillna(mean_age)

In [None]:
# Most frequent value(s) of the "Embarked" column; mode() returns a Series.
embarked_modes = titanic_data['Embarked'].mode()
print(embarked_modes)

In [None]:
# First entry of the mode Series, i.e. the single value used for imputation.
print(titanic_data['Embarked'].mode().iloc[0])

In [None]:
# Replace the missing values in the "Embarked" column with the most frequent value.
# The filled Series is assigned back to the column instead of calling
# fillna(inplace=True) on the `titanic_data['Embarked']` selection: the in-place
# form mutates an intermediate object, which pandas flags as chained assignment
# and which leaves the frame unchanged when Copy-on-Write is enabled.
embarked_mode = titanic_data['Embarked'].mode()[0]
titanic_data['Embarked'] = titanic_data['Embarked'].fillna(embarked_mode)

In [None]:
# Re-count the missing values per column after the imputation steps.
titanic_data.isna().sum()

#### Data Analysis

In [None]:
# Summary statistics of the data, kept in a variable and then displayed.
summary_stats = titanic_data.describe()
summary_stats

In [None]:
# Number of rows for each value of the "Survived" column.
survived_counts = titanic_data['Survived'].value_counts()
survived_counts

#### Data Visualization

In [None]:
# Apply seaborn's default styling to the plots drawn below
sns.set()

In [None]:
# Count plot for the "Survived" column.
# The frame is passed as `data=` by keyword: seaborn releases before 0.12 do not
# accept it positionally, whereas the keyword form works on old and new releases.
sns.countplot(data=titanic_data, x='Survived')

In [None]:
# Count plot for the "Pclass" column.
# The frame is passed as `data=` by keyword: seaborn releases before 0.12 do not
# accept it positionally, whereas the keyword form works on old and new releases.
sns.countplot(data=titanic_data, x='Pclass')

In [None]:
# Count plot of "Pclass" with bars split by the "Survived" value.
# The frame is passed as `data=` by keyword: seaborn releases before 0.12 do not
# accept it positionally, whereas the keyword form works on old and new releases.
sns.countplot(data=titanic_data, x='Pclass', hue='Survived')

#### Encoding the Categorical Columns

In [None]:
# Number of rows for each category of the "Sex" column.
sex_counts = titanic_data['Sex'].value_counts()
sex_counts

In [None]:
# Number of rows for each category of the "Embarked" column.
embarked_counts = titanic_data['Embarked'].value_counts()
embarked_counts

In [None]:
# Convert the categorical columns to integer codes.
# The outer keys are column names; each inner dict maps a label to its code.
category_codes = {
    'Sex': {'male': 0, 'female': 1},
    'Embarked': {'S': 0, 'C': 1, 'Q': 2},
}
titanic_data.replace(category_codes, inplace=True)

In [None]:
# Preview the first five rows to inspect the encoded columns.
titanic_data.head(n=5)

### Separating features & Target

In [None]:
# Features: every column except the identifier/text columns and the target itself.
non_feature_columns = ['PassengerId', 'Name', 'Ticket', 'Survived']
X = titanic_data.drop(columns=non_feature_columns)

# Target: the "Survived" column.
Y = titanic_data['Survived']

In [None]:
# Show the feature frame X
print(X)

In [None]:
# Show the target Series Y (the "Survived" column)
print(Y)

### Splitting the data into training data & Test data

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=2,
)

In [None]:
# Shapes of the full, training and test feature frames, in that order.
print(*(frame.shape for frame in (X, X_train, X_test)))

### Model Training

In [None]:
# Logistic regression classifier.
# max_iter is raised above the default of 100: the features are not scaled here,
# and the default iteration budget can end the solver before it has converged
# (scikit-learn reports this as a ConvergenceWarning).
model = LogisticRegression(max_iter=1000)

In [None]:
# Fit the logistic regression model on the training features and labels.
model.fit(X=X_train, y=Y_train)

### Model Evaluation

In [None]:
# Predicted labels for the training rows (used for the training accuracy).
X_train_prediction = model.predict(X=X_train)

In [None]:
# Show the labels predicted for the training rows
print(X_train_prediction)

In [None]:
# Compare the true training labels with the predicted ones.
training_data_accuracy = accuracy_score(
    y_true=Y_train,
    y_pred=X_train_prediction,
)
print('Accuracy score of training data : ', training_data_accuracy)

In [None]:
# Predicted labels for the held-out test rows (used for the test accuracy).
X_test_prediction = model.predict(X=X_test)

In [None]:
# Display the test feature rows that the model was asked to predict
X_test

In [None]:
# Show the labels predicted for the test rows
print(X_test_prediction)

In [None]:
# Compare the true test labels with the predicted ones; the printed value is
# rounded to two decimals.
test_data_accuracy = accuracy_score(
    y_true=Y_test,
    y_pred=X_test_prediction,
)
print('Accuracy score of test data : ', round(test_data_accuracy, 2))

### Save the trained model

In [None]:
# Save the trained model to a file.
# The `with` block closes the file handle as soon as the dump finishes, so the
# pickle is fully flushed to disk and no open handle is left behind in the kernel.
filename = 'models/titanic_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
# Load the saved model back from disk; the `with` block closes the file handle
# once unpickling is done.
# NOTE: pickle.load can execute arbitrary code, so only load files that were
# written by a trusted source such as this notebook.
with open("models/titanic_model.sav", 'rb') as model_file:
    loaded_model = pickle.load(model_file)