In [1]:
import pandas as pd 
import numpy as np 
import seaborn as sns 
import matplotlib.pyplot as plt 

In [2]:
#path to the dataset, relative to the notebooks folder the kernel runs in,
#so the notebook does not depend on one machine's drive layout
DATA_PATH = '../data/Titanic-Dataset.csv'

#import the dataset and gain some info on it
data = pd.read_csv(DATA_PATH)

#check the column dtypes and non-null counts of the data
data.info()

#print the shape as (rows, columns)
print(data.shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
(891, 12)


In [3]:
#count the missing values in each column
data.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [12]:
#clean the data: drop unused columns, fill in missing values and encode the categorical columns
#then split the result into train and test sets
from sklearn.model_selection import train_test_split

def data_preprocess(data, test_size=0.2, random_state=42):
    """Clean the Titanic frame and split it into train and test sets.

    Steps:
      1. drop the 'PassengerId', 'Cabin', 'Ticket' and 'Name' columns
      2. fill missing 'Embarked' values with 'S'
      3. map 'Sex' to 0 (male) / 1 (female)
      4. one-hot encode 'Embarked' (first level dropped)
      5. add 'FamilySize' = 'SibSp' + 'Parch' + 1
      6. split into train and test sets
      7. fill missing 'Age' values with the median age of the TRAINING rows

    The age median is taken after the split, from the training rows only, so
    that no information from the test rows leaks into the training features.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw frame holding the columns named above plus 'Survived', 'Age',
        'SibSp' and 'Parch'. It is not modified.
    test_size : float, default 0.2
        Passed to ``train_test_split``.
    random_state : int, default 42
        Passed to ``train_test_split`` so the split is repeatable.

    Returns
    -------
    X_train, X_test, y_train, y_test
        Feature frames and 'Survived' target series for both splits.
    """
    #drop the columns that are not used as features and fill the missing ports;
    #both calls return new frames, so the caller's frame is left untouched
    df = (
        data
        .drop(columns=['PassengerId', 'Cabin', 'Ticket', 'Name'])
        .fillna({'Embarked': 'S'})
    )

    #map the sex to 0 and 1 (any other value becomes NaN)
    df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})

    #encode the embarked port as dummy columns
    df = pd.get_dummies(df, columns=['Embarked'], drop_first=True)

    #calculate the family size: siblings/spouses + parents/children + the passenger
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1

    #split into features and target variable
    X = df.drop(columns='Survived')
    y = df['Survived']

    #split into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    #fill the missing ages with the median learned from the training rows only;
    #assign returns new frames and keeps 'Age' at its original column position
    age_median = X_train['Age'].median()
    X_train = X_train.assign(Age=X_train['Age'].fillna(age_median))
    X_test = X_test.assign(Age=X_test['Age'].fillna(age_median))

    return X_train, X_test, y_train, y_test






In [14]:
#run the preprocessing to obtain the train/test splits
X_train, X_test, y_train, y_test = data_preprocess(data)
#show the shape of the training features and of the training target, one per line
print(X_train.shape, y_train.shape, sep='\n')

(712, 9)
(712,)


In [20]:
#build several baseline models with near-default settings and compare them to choose the best one

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report


#candidate classifiers keyed by the label used in the printed reports;
#the tree-based models are seeded so that repeated runs give the same scores
models = {
    'logistic_regression': LogisticRegression(max_iter=1000),
    'Decision_tree': DecisionTreeClassifier(random_state=42),
    'Random_forest': RandomForestClassifier(random_state=42),
    'SVC': SVC(),
    'K_nearest_neighbors': KNeighborsClassifier()
}

def evaluation_models(name, model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and print its test-set report.

    Parameters
    ----------
    name : str
        Label used in the printed messages.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    X_train, y_train
        Training features and target.
    X_test, y_test
        Test features and target used for the classification report.

    Returns
    -------
    None
        The report is only printed.
    """
    print(f'Training and evaluating {name}')

    #train on the training split
    model.fit(X_train, y_train)

    #score the predictions made on the test split
    report_text = classification_report(y_test, model.predict(X_test))

    print(f'the classification report of {name} :\n{report_text}')




In [21]:
#train and evaluate every candidate model on the same train/test split
for name, model in models.items():
    evaluation_models(name, model, X_train, X_test, y_train, y_test)
    


Training and evaluating logistic_regression
the classification report of logistic_regression :
              precision    recall  f1-score   support

           0       0.83      0.86      0.84       105
           1       0.79      0.74      0.76        74

    accuracy                           0.81       179
   macro avg       0.81      0.80      0.80       179
weighted avg       0.81      0.81      0.81       179

Training and evaluating Decision_tree
the classification report of Decision_tree :
              precision    recall  f1-score   support

           0       0.81      0.81      0.81       105
           1       0.73      0.73      0.73        74

    accuracy                           0.78       179
   macro avg       0.77      0.77      0.77       179
weighted avg       0.78      0.78      0.78       179

Training and evaluating Random_forest
the classification report of Random_forest :
              precision    recall  f1-score   support

           0       0.83      0

In [31]:
#lets find the best model and save it locally  
from sklearn.model_selection import GridSearchCV
import joblib


#define a function that tries different hyperparameters with grid search and returns the best model

def train_model(X_train, y_train, model_path='model.pkl', param_grid=None, cv=5):
    """Tune a random forest with grid search and return the best estimator.

    Parameters
    ----------
    X_train, y_train
        Training features and target passed to ``GridSearchCV.fit``.
    model_path : str or None, default 'model.pkl'
        Where the best estimator is saved with ``joblib.dump``. Pass ``None``
        to skip saving.
    param_grid : dict or None, default None
        Grid of hyperparameters to search. When ``None`` the grid below
        ('n_estimators' and 'max_depth') is used.
    cv : int, default 5
        Number of cross-validation folds passed to ``GridSearchCV``.

    Returns
    -------
    The ``best_estimator_`` found by the grid search.
    """
    if param_grid is None:
        param_grid = {
            'n_estimators': [10, 20, 50, 100, 1000],
            'max_depth': [4, 6, 8]
        }

    #fixed seed so the same grid always selects the same forest
    search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=cv)
    search.fit(X_train, y_train)
    best_model = search.best_estimator_

    #save the winner locally unless the caller opted out
    if model_path is not None:
        joblib.dump(best_model, model_path)

    return best_model

In [23]:
#NOTE: despite its name, `grid` holds the best estimator returned by train_model,
#not the GridSearchCV object itself
grid = train_model(X_train, y_train)

#display the best estimator
grid

In [24]:
#perform model evaluation 

def evaluate_model(model, X_test, y_test):
    """Print the classification report of ``model`` on the test split.

    Parameters
    ----------
    model : estimator
        Already-fitted object exposing ``predict(X)``.
    X_test, y_test
        Test features and the true target values.
    """
    report_text = classification_report(y_test, model.predict(X_test))
    print(report_text)
    

In [25]:
#evaluate the estimator returned by train_model on the held-out test split
evaluate_model(grid, X_test, y_test)

              precision    recall  f1-score   support

           0       0.81      0.91      0.86       105
           1       0.85      0.70      0.77        74

    accuracy                           0.83       179
   macro avg       0.83      0.81      0.82       179
weighted avg       0.83      0.83      0.82       179



In [27]:
#import os
import os

#get the current working directory
current_directory = os.getcwd()
#subfolder name
sub_folder = 'data'

#construct the path to the subfolder
subfolder_path = os.path.join(current_directory, sub_folder)

#create the subfolder if it doesn't exist; exist_ok replaces the separate existence check
os.makedirs(subfolder_path, exist_ok=True)

#define the output csv file name within the subfolder
output_csv_file = os.path.join(subfolder_path, 'X_train.csv')

#save the training features as comma-separated text
#NOTE: np.savetxt writes the values only, so the column names and the index of X_train are not stored
np.savetxt(output_csv_file, X_train, delimiter=',')

print(f'X_train saved as csv to : {output_csv_file}')

X_train saved as csv to : f:\Data science\projects\data_analysis\titanic_project\notebooks\data\X_train.csv


In [28]:
#define the output csv file name within the subfolder
output_csv_file_1 = os.path.join(subfolder_path, 'X_test.csv')

#save the test features as comma-separated text (values only, no column names)
np.savetxt(output_csv_file_1, X_test, delimiter=',')

#the message names the split that was actually written
print(f'X_test saved as csv to : {output_csv_file_1}')

X_train saved as csv to : f:\Data science\projects\data_analysis\titanic_project\notebooks\data\X_test.csv


In [29]:
#define the output csv file name within the subfolder
output_csv_file_2 = os.path.join(subfolder_path, 'y_train.csv')

#save the training target as text, one value per line
np.savetxt(output_csv_file_2, y_train, delimiter=',')

#the message names the split that was actually written
print(f'y_train saved as csv to : {output_csv_file_2}')

X_train saved as csv to : f:\Data science\projects\data_analysis\titanic_project\notebooks\data\y_train.csv


In [30]:
#define the output csv file name within the subfolder
output_csv_file_3 = os.path.join(subfolder_path, 'y_test.csv')

#save the test target as text, one value per line
np.savetxt(output_csv_file_3, y_test, delimiter=',')

#the message names the split that was actually written
print(f'y_test saved as csv to : {output_csv_file_3}')

X_train saved as csv to : f:\Data science\projects\data_analysis\titanic_project\notebooks\data\y_test.csv
