In [None]:
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder , StandardScaler
from sklearn.model_selection import train_test_split , GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier , GradientBoostingClassifier
from sklearn.metrics import accuracy_score , confusion_matrix , classification_report


In [None]:
# Folder containing the Titanic CSV files. The location is written out only
# here, so running the notebook against another folder or machine means
# editing this single line.
DATA_DIR = Path(r"C:\Users\OMID\Desktop\Titanic")

# Labelled passengers (includes the "Survived" column) and the second CSV
# stored in the same folder.
train_data = pd.read_csv(DATA_DIR / "train (2).csv")
test_data = pd.read_csv(DATA_DIR / "test (1).csv")

In [41]:
# Preview the first five rows to sanity-check the parsed columns.
train_data.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [42]:
# Column dtypes and non-null counts; Age, Cabin and Embarked are the columns
# with fewer than 891 non-null values.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [43]:
# Number of missing values in each column.
train_data.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [44]:
# --- Fill missing values ----------------------------------------------------
# Age is filled with the column mean, Embarked with its most frequent value.
imputer_age = SimpleImputer(strategy="mean")
imputer_embarked = SimpleImputer(strategy="most_frequent")

# NOTE(review): both imputers are fitted on the whole training frame, i.e.
# before the train/test split performed further down, so rows that end up in
# the hold-out set also contribute to the imputed values.
train_data["Age"] = imputer_age.fit_transform(train_data[["Age"]]).flatten()
train_data["Embarked"] = imputer_embarked.fit_transform(train_data[["Embarked"]]).flatten()

# --- Encode categorical columns ----------------------------------------------
# One encoder is fitted on both columns in a single call. OrdinalEncoder keeps
# a separate category list per column, so the resulting codes are the same as
# encoding each column on its own, while the fitted `encoder` retains BOTH
# mappings and can be re-applied to other data with
# `encoder.transform(frame[categorical_columns])`.
categorical_columns = ["Sex", "Embarked"]
encoder = OrdinalEncoder()
encoded = encoder.fit_transform(train_data[categorical_columns])
for position, column in enumerate(categorical_columns):
    train_data[column] = encoded[:, position]

In [None]:
# Columns fed to the models, in the order they appear in the feature matrix.
feature_columns = [
    "Pclass",
    "Sex",
    "Age",
    "SibSp",
    "Parch",
    "Fare",
    "Embarked",
]

# X is a plain NumPy matrix of the selected features; Y stays a pandas Series
# holding the "Survived" labels.
X = train_data[feature_columns].to_numpy()
Y = train_data["Survived"]

In [46]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [47]:
# Candidate classifiers. Every pipeline standardises the features first; the
# name of the final step ("rf", "lr", "gb") is the prefix used by the
# hyper-parameter grids below.
models = {
    "random_forest": Pipeline(steps=[
        ("scaler", StandardScaler()),
        ("rf", RandomForestClassifier(random_state=42)),
    ]),
    "logistic_regression": Pipeline(steps=[
        ("scaler", StandardScaler()),
        ("lr", LogisticRegression(random_state=42)),
    ]),
    "gradient_boosting": Pipeline(steps=[
        ("scaler", StandardScaler()),
        ("gb", GradientBoostingClassifier(random_state=42)),
    ]),
}

# Hyper-parameter grids, one per pipeline.
param_grid_rf = {
    "rf__n_estimators": [200, 300, 400],
    "rf__max_depth": [None, 10, 20],
    "rf__min_samples_split": [2, 5, 10],
}
param_grid_lr = {
    "lr__C": [0.1, 1, 10],
    "lr__penalty": ["l2"],
    "lr__solver": ["lbfgs"],
}
param_grid_gb = {
    "gb__n_estimators": [500, 100, 200],
    "gb__learning_rate": [0.01, 0.1, 0.2],
    "gb__max_depth": [3, 4, 5],
}

# Lookup from model name to its grid. Any name without an entry here is tuned
# with the gradient-boosting grid.
grid_for_model = {
    "random_forest": param_grid_rf,
    "logistic_regression": param_grid_lr,
}

# Best pipeline found for each model name.
best_models = {}

for model_name, model_pipeline in models.items():
    print(f"running GridSearchCV for {model_name}...")

    param_grid = grid_for_model.get(model_name, param_grid_gb)

    # 5-fold cross-validated search scored on accuracy, using all CPU cores.
    grid_search = GridSearchCV(
        estimator=model_pipeline,
        param_grid=param_grid,
        cv=5,
        scoring="accuracy",
        verbose=2,
        n_jobs=-1,
    )
    grid_search.fit(X_train, Y_train)

    # Keep the best estimator of this search and report how it was chosen.
    best_models[model_name] = grid_search.best_estimator_
    print(f"best_params for {model_name} : \n {grid_search.best_params_}")
    print(f"best_score for {model_name} : \n {grid_search.best_score_:.4f}")

running GridSearchCV for random_forest...
Fitting 5 folds for each of 27 candidates, totalling 135 fits
best_params for random_forest : 
 {'rf__max_depth': None, 'rf__min_samples_split': 10, 'rf__n_estimators': 400}
best_score for random_forest : 
 0.8244
running GridSearchCV for logistic_regression...
Fitting 5 folds for each of 3 candidates, totalling 15 fits
best_params for logistic_regression : 
 {'lr__C': 0.1, 'lr__penalty': 'l2', 'lr__solver': 'lbfgs'}
best_score for logistic_regression : 
 0.7977
running GridSearchCV for gradient_boosting...
Fitting 5 folds for each of 27 candidates, totalling 135 fits




best_params for gradient_boosting : 
 {'gb__learning_rate': 0.1, 'gb__max_depth': 4, 'gb__n_estimators': 100}
best_score for gradient_boosting : 
 0.8217


In [51]:
# Score every tuned pipeline on the held-out split and print its accuracy
# followed by the per-class precision/recall report.
for model_name in best_models:
    best_model = best_models[model_name]

    y_pred = best_model.predict(X_test)
    accuracy = accuracy_score(Y_test, y_pred)
    report = classification_report(Y_test, y_pred)

    print(f"Accuracy of best {model_name} on test data: {accuracy:.4f}")
    print(f"Classification report for {model_name}:\n {report}")

Accuracy of best random_forest on test data: 0.8436
Classification report for random_forest:
               precision    recall  f1-score   support

           0       0.84      0.90      0.87       105
           1       0.85      0.76      0.80        74

    accuracy                           0.84       179
   macro avg       0.84      0.83      0.84       179
weighted avg       0.84      0.84      0.84       179

Accuracy of best logistic_regression on test data: 0.7989
Classification report for logistic_regression:
               precision    recall  f1-score   support

           0       0.81      0.86      0.83       105
           1       0.78      0.72      0.75        74

    accuracy                           0.80       179
   macro avg       0.80      0.79      0.79       179
weighted avg       0.80      0.80      0.80       179

Accuracy of best gradient_boosting on test data: 0.8212
Classification report for gradient_boosting:
               precision    recall  f1-score 