In [16]:
import numpy as np
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [17]:
# Read the Titanic passenger records from disk.
titanic_csv = "C:/Users/itspr/OneDrive/Documents/Jupyter notebook/titanic.csv"
df = pd.read_csv(titanic_csv)

# Keep only the model inputs plus the target column, then discard every row
# that has a missing value in any of the kept columns.
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
selected_columns = features + ['Survived']
df = df[selected_columns].dropna()

# Preview the first five rows of the cleaned frame.
print(df.head())

   Pclass     Sex   Age  SibSp  Parch     Fare Embarked  Survived
0       3    male  22.0      1      0   7.2500        S         0
1       1  female  38.0      1      0  71.2833        C         1
2       3  female  26.0      0      0   7.9250        S         1
3       1  female  35.0      1      0  53.1000        S         1
4       3    male  35.0      0      0   8.0500        S         0


In [18]:
# Preprocessing: decide which columns are scaled and which are one-hot encoded.
cat_features = ['Pclass', 'Sex', 'Embarked']
num_features = ['Age', 'SibSp', 'Parch', 'Fare']

# Numeric columns are standardized; categorical columns are one-hot encoded.
# handle_unknown='ignore' means a category not seen during fit does not raise
# at transform time.
cat_transformer = OneHotEncoder(handle_unknown='ignore')
num_transformer = StandardScaler()

# Route each column group to its transformer: numeric group first, then the
# categorical group.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_features),
        ('cat', cat_transformer, cat_features),
    ]
)

In [19]:
# Separate the label to predict from the model inputs.
y = df['Survived']
X = df[features]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the (rows, columns) of each partition.
print(f"Training set shape: {X_train.shape}")
print(f"Testing set shape: {X_test.shape}")

Training set shape: (569, 7)
Testing set shape: (143, 7)


In [12]:
# Build and train the model.
# Relies on `preprocessor`, `X_train` and `y_train` created by earlier cells,
# so those cells must have been run first.

# Random forest with 100 trees; the seed is fixed so refits are repeatable.
forest = RandomForestClassifier(n_estimators=100, random_state=42)

# Chain preprocessing and the classifier so both are fitted as one estimator.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', forest),
    ]
)

# Fit the whole pipeline on the training partition.
pipeline.fit(X_train, y_train)
print("Model Training Complete!")

Model Training Complete!


In [20]:
# Evaluate the fitted pipeline on the held-out rows.
y_pred = pipeline.predict(X_test)

# Accuracy: share of test rows whose predicted label equals the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

Model Accuracy: 0.76


In [21]:
# Save and reload the model.
# Persist the fitted pipeline (preprocessing + classifier) to a file in the
# current working directory.
joblib.dump(pipeline, 'ml_pipeline.pkl')

# Reload the pipeline from that same file.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load artifacts that come from a trusted source.
loaded_pipeline = joblib.load('ml_pipeline.pkl')

# Predict for a single hand-written passenger using the reloaded pipeline.
# 'Embarked' must be a port code as it appears in the data (e.g. 'S').
# An unknown value such as '5' would not raise, because the encoder was built
# with handle_unknown='ignore'; it would silently be encoded as all zeros.
sample_data = pd.DataFrame([{
    'Pclass': 3,
    'Sex': 'male',
    'Age': 25,
    'SibSp': 0,
    'Parch': 0,
    'Fare': 7.5,
    'Embarked': 'S',
}])
prediction = loaded_pipeline.predict(sample_data)
print(f"Prediction: {'Survived' if prediction[0] == 1 else  'Did not Survived'}")

Prediction: Did not Survived
