#Titanic Survival Prediction 🛳️
📌 Overview

This notebook uses the Titanic dataset to predict passenger survival (Survived: 0 = died, 1 = survived) from demographic and travel features.

🤖 Models

Logistic Regression → interpretable baseline.

Random Forest → ensemble of decision trees, captures nonlinear patterns.

Naive Bayes  → simple probabilistic model.

In [40]:
import pandas as pd

In [66]:
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
data=pd.read_csv(url)

In [67]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# EDA

In [43]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [44]:
class Missing:
  def __init__ (self,data):
    self.data=data

  def missing_values(self):

    Missing_values=self.data.isnull().sum()
    percentage=Missing_values/len(data)*100


    Missing_df=pd.DataFrame({
    'Missing_values_count':Missing_values,
    'Percentage':percentage
    })
    return Missing_df


In [45]:
data_missing= Missing(data)
data_missing.missing_values()

Unnamed: 0,Missing_values_count,Percentage
PassengerId,0,0.0
Survived,0,0.0
Pclass,0,0.0
Name,0,0.0
Sex,0,0.0
Age,177,19.86532
SibSp,0,0.0
Parch,0,0.0
Ticket,0,0.0
Fare,0,0.0


In [46]:
data.drop(['Age','Cabin','Embarked'],axis=1,inplace=True)

In [47]:
data.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'SibSp', 'Parch',
       'Ticket', 'Fare'],
      dtype='object')

In [48]:
data.dtypes

Unnamed: 0,0
PassengerId,int64
Survived,int64
Pclass,int64
Name,object
Sex,object
SibSp,int64
Parch,int64
Ticket,object
Fare,float64


In [49]:
data.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'SibSp', 'Parch',
       'Ticket', 'Fare'],
      dtype='object')

In [50]:
X_data=data.drop(['Survived'],inplace=False,axis=1)
X_data.head()


Unnamed: 0,PassengerId,Pclass,Name,Sex,SibSp,Parch,Ticket,Fare
0,1,3,"Braund, Mr. Owen Harris",male,1,0,A/5 21171,7.25
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,1,0,PC 17599,71.2833
2,3,3,"Heikkinen, Miss. Laina",female,0,0,STON/O2. 3101282,7.925
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,1,0,113803,53.1
4,5,3,"Allen, Mr. William Henry",male,0,0,373450,8.05


In [51]:
y_data=data['Survived']
y_data.head()

Unnamed: 0,Survived
0,0
1,1
2,1
3,1
4,0


In [52]:
Not_int_columns=data.select_dtypes(include=['category','object']).columns
Not_int_columns

Index(['Name', 'Sex', 'Ticket'], dtype='object')

In [53]:
data[Not_int_columns]

Unnamed: 0,Name,Sex,Ticket
0,"Braund, Mr. Owen Harris",male,A/5 21171
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,PC 17599
2,"Heikkinen, Miss. Laina",female,STON/O2. 3101282
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,113803
4,"Allen, Mr. William Henry",male,373450
...,...,...,...
886,"Montvila, Rev. Juozas",male,211536
887,"Graham, Miss. Margaret Edith",female,112053
888,"Johnston, Miss. Catherine Helen ""Carrie""",female,W./C. 6607
889,"Behr, Mr. Karl Howell",male,111369


In [54]:
X_data.dtypes

Unnamed: 0,0
PassengerId,int64
Pclass,int64
Name,object
Sex,object
SibSp,int64
Parch,int64
Ticket,object
Fare,float64


In [55]:
X_data_enc=pd.get_dummies(X_data[Not_int_columns],drop_first=False,dtype=bool)
X_data_enc

Unnamed: 0,"Name_Abbing, Mr. Anthony","Name_Abbott, Mr. Rossmore Edward","Name_Abbott, Mrs. Stanton (Rosa Hunt)","Name_Abelson, Mr. Samuel","Name_Abelson, Mrs. Samuel (Hannah Wizosky)","Name_Adahl, Mr. Mauritz Nils Martin","Name_Adams, Mr. John","Name_Ahlin, Mrs. Johan (Johanna Persdotter Larsson)","Name_Aks, Mrs. Sam (Leah Rosen)","Name_Albimona, Mr. Nassef Cassem",...,Ticket_STON/O2. 3101290,Ticket_SW/PP 751,Ticket_W./C. 14258,Ticket_W./C. 14263,Ticket_W./C. 6607,Ticket_W./C. 6608,Ticket_W./C. 6609,Ticket_W.E.P. 5734,Ticket_W/C 14208,Ticket_WE/P 5735
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
887,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
888,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
889,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


# Split Dataset

In [56]:
X_train, X_test, y_train, y_test = train_test_split(
    X_data_enc, y_data, test_size=0.2, random_state=42
)

#Logistic Regression

In [57]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

In [58]:
clf=LogisticRegression(max_iter=1000)
clf.fit(X_train,y_train)

y_pred=clf.predict(X_test)

In [59]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.7877094972067039
              precision    recall  f1-score   support

           0       0.80      0.85      0.82       105
           1       0.76      0.70      0.73        74

    accuracy                           0.79       179
   macro avg       0.78      0.78      0.78       179
weighted avg       0.79      0.79      0.79       179



# Random Forest

In [60]:
from sklearn.ensemble import RandomForestClassifier

In [61]:
clf=RandomForestClassifier()
clf.fit(X_train,y_train)

y_pred=clf.predict(X_test)

In [62]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.7988826815642458
              precision    recall  f1-score   support

           0       0.81      0.87      0.83       105
           1       0.79      0.70      0.74        74

    accuracy                           0.80       179
   macro avg       0.80      0.78      0.79       179
weighted avg       0.80      0.80      0.80       179



#Naive Bayes

In [63]:
from sklearn.naive_bayes import BernoulliNB

In [64]:
clf= BernoulliNB()
clf.fit(X_train,y_train)

y_pred=clf.predict(X_test)

In [65]:
print("Accuracy",accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

Accuracy 0.7988826815642458
              precision    recall  f1-score   support

           0       0.81      0.87      0.83       105
           1       0.79      0.70      0.74        74

    accuracy                           0.80       179
   macro avg       0.80      0.78      0.79       179
weighted avg       0.80      0.80      0.80       179

