### ML EXP-2: Implementation of an End-to-End Machine Learning Data Pipeline.
### Name : Manav Lakhani
### Roll No. : 35
### Date : 29/01/2026

In [66]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

import seaborn as sns

In [67]:
#Load Dataset
titanic_data = sns.load_dataset('titanic')
print(titanic_data.shape)
print(titanic_data.columns)

(891, 15)
Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town',
       'alive', 'alone'],
      dtype='object')


In [68]:
print(titanic_data.head())
print(titanic_data.tail())

   survived  pclass     sex   age  ...  deck  embark_town  alive  alone
0         0       3    male  22.0  ...   NaN  Southampton     no  False
1         1       1  female  38.0  ...     C    Cherbourg    yes  False
2         1       3  female  26.0  ...   NaN  Southampton    yes   True
3         1       1  female  35.0  ...     C  Southampton    yes  False
4         0       3    male  35.0  ...   NaN  Southampton     no   True

[5 rows x 15 columns]
     survived  pclass     sex   age  ...  deck  embark_town  alive  alone
886         0       2    male  27.0  ...   NaN  Southampton     no   True
887         1       1  female  19.0  ...     B  Southampton    yes   True
888         0       3  female   NaN  ...   NaN  Southampton     no  False
889         1       1    male  26.0  ...     C    Cherbourg    yes   True
890         0       3    male  32.0  ...   NaN   Queenstown     no   True

[5 rows x 15 columns]


In [69]:
print(titanic_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB
None


In [70]:
print(titanic_data.describe())

         survived      pclass         age       sibsp       parch        fare
count  891.000000  891.000000  714.000000  891.000000  891.000000  891.000000
mean     0.383838    2.308642   29.699118    0.523008    0.381594   32.204208
std      0.486592    0.836071   14.526497    1.102743    0.806057   49.693429
min      0.000000    1.000000    0.420000    0.000000    0.000000    0.000000
25%      0.000000    2.000000   20.125000    0.000000    0.000000    7.910400
50%      0.000000    3.000000   28.000000    0.000000    0.000000   14.454200
75%      1.000000    3.000000   38.000000    1.000000    0.000000   31.000000
max      1.000000    3.000000   80.000000    8.000000    6.000000  512.329200


In [71]:
missing_vaslues = titanic_data.isnull().sum()
print(missing_vaslues)

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64


In [72]:
#Drop Deck Columns
new_titanic_data = titanic_data.drop(columns = ['deck'])
print(new_titanic_data.head())

   survived  pclass     sex   age  ...  adult_male  embark_town  alive  alone
0         0       3    male  22.0  ...        True  Southampton     no  False
1         1       1  female  38.0  ...       False    Cherbourg    yes  False
2         1       3  female  26.0  ...       False  Southampton    yes   True
3         1       1  female  35.0  ...       False  Southampton    yes  False
4         0       3    male  35.0  ...        True  Southampton     no   True

[5 rows x 14 columns]


In [73]:
# Filling median for age
age_median = new_titanic_data['age'].median()
new_titanic_data['age'].fillna(age_median, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  new_titanic_data['age'].fillna(age_median, inplace=True)


In [74]:
print(new_titanic_data.isnull().sum())

survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       2
class          0
who            0
adult_male     0
embark_town    2
alive          0
alone          0
dtype: int64


In [75]:
new_titanic_data['embark_town'].dtype
new_titanic_data['embark_town'].unique()

array(['Southampton', 'Cherbourg', 'Queenstown', nan], dtype=object)

In [76]:
new_titanic_data['embark_town'].fillna(new_titanic_data['embark_town'].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  new_titanic_data['embark_town'].fillna(new_titanic_data['embark_town'].mode()[0], inplace=True)


In [77]:
print(new_titanic_data.isnull().sum())

survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       2
class          0
who            0
adult_male     0
embark_town    0
alive          0
alone          0
dtype: int64


In [80]:
new_titanic_data['embarked'].fillna(new_titanic_data['embarked'].mode()[0], inplace=True)


In [81]:
print(new_titanic_data.isnull().sum())

survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       0
class          0
who            0
adult_male     0
embark_town    0
alive          0
alone          0
dtype: int64


In [82]:
# Enocde categorical variables
le = LabelEncoder()
new_titanic_data['sex'] = le.fit_transform(new_titanic_data['sex'])
new_titanic_data['embarked'] = le.fit_transform(new_titanic_data['embarked'])

In [83]:
new_titanic_data.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,embark_town,alive,alone
0,0,3,1,22.0,1,0,7.25,2,Third,man,True,Southampton,no,False
1,1,1,0,38.0,1,0,71.2833,0,First,woman,False,Cherbourg,yes,False
2,1,3,0,26.0,0,0,7.925,2,Third,woman,False,Southampton,yes,True
3,1,1,0,35.0,1,0,53.1,2,First,woman,False,Southampton,yes,False
4,0,3,1,35.0,0,0,8.05,2,Third,man,True,Southampton,no,True


In [84]:
new_titanic_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    int64   
 3   age          891 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     891 non-null    int64   
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  embark_town  891 non-null    object  
 12  alive        891 non-null    object  
 13  alone        891 non-null    bool    
dtypes: bool(2), category(1), float64(2), int64(6), object(3)
memory usage: 79.4+ KB


In [85]:
data = new_titanic_data[['pclass', 'sex', 'age','fare', 'embarked', 'survived']]

X = data[['pclass', 'sex', 'age', 'fare', 'embarked']]
Y = data['survived']

In [146]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [147]:
model = LogisticRegression(max_iter = 1000)
model.fit(X_train,y_train)

In [148]:
y_predict = model.predict(X_test)

In [149]:
accuracy = accuracy_score(y_test,y_predict)
print("Accuracy : ", accuracy)

Accuracy :  0.7988826815642458


In [156]:
new_passenger = pd.DataFrame({
    'pclass' : [3],
    'sex' : ['male'],
    'age' : [28],
    'fare' : [7.25],
    'embarked' : ['S']
})

In [157]:
new_passenger_encoded = pd.get_dummies(new_passenger)
new_passenger_encoded = new_passenger_encoded.reindex(columns=X.columns, fill_value=0)


In [158]:
prediction = model.predict(new_passenger_encoded)
print("Survived" if prediction[0] == 1 else "Not Survived")


Survived


In [178]:
new_passengers = pd.DataFrame({
    'pclass' : [1, 3, 2],
    'sex' : ['female', 'male', 'female'],
    'age' : [38, 100, 21],
    'fare' : [80.05, 8.05, 20.0],
    'embarked' : ['C', 'S', 'Q']
})

In [179]:
new_passengers_encoded = pd.get_dummies(new_passengers)
new_passengers_encoded = new_passengers_encoded.reindex(columns=X.columns, fill_value=0)

In [181]:
prediction = model.predict(new_passengers_encoded)

for i, pred in enumerate(prediction):
    print(f"Passenger {i+1}: {'Survived' if pred == 1 else 'Not Survived'}")

Passenger 1: Survived
Passenger 2: Not Survived
Passenger 3: Survived
