In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:

from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import train_test_split

In [None]:
# Load the training data. Forward slashes are used so the relative path
# resolves on Windows, Linux and macOS alike; a backslash-separated path is
# only understood as a directory path on Windows.
titanic = pd.read_csv("./Titanicdata/train.csv")

In [None]:
# First look at the data: a few rows, then column dtypes and non-null counts.
print(titanic.head())
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
titanic.info()

In [None]:
# Cabin, Age and Embarked have missing values


In [None]:
# Summary statistics as reported by describe(), followed by the number of
# rows per value of the 'Survived' target.
numeric_summary = titanic.describe()
print(numeric_summary)

survival_counts = titanic['Survived'].value_counts()
print(survival_counts)


In [None]:
# Target balance: number of passengers who died (0) versus survived (1).
sns.countplot(data=titanic, x="Survived")
plt.title("Survival Count (0 = Died, 1 = Survived)")
plt.show()

In [None]:
# Survival counts split by sex: one bar pair (died / survived) per gender.
sns.countplot(data=titanic, hue="Survived", x="Sex")
plt.title("Survival by Gender")
plt.show()

In [None]:
# Survival counts per age band.
# Age is bucketed into labelled bands and stored on `titanic` as a new
# 'AgeGroup' column; rows whose Age is missing stay unbinned (NaN).
age_band_edges = [0, 12, 18, 35, 60, 80]
age_band_names = ['Child', 'Teen', 'Young Adult', 'Adult', 'Senior']
titanic['AgeGroup'] = pd.cut(titanic['Age'], bins=age_band_edges, labels=age_band_names)

# One bar pair (died / survived) per age band.
sns.countplot(data=titanic, hue="Survived", x="AgeGroup")
plt.title("Survival by Age Group")
plt.show()


In [None]:
# Survival counts split by passenger class (Pclass).
sns.countplot(data=titanic, hue="Survived", x="Pclass")
plt.title("Survival by Passenger Class")
plt.show()

In [None]:
# Survival counts split by port of embarkation (Embarked).
sns.countplot(data=titanic, hue="Survived", x="Embarked")
plt.title("Survival by Port of Embarkation")
plt.show()


In [None]:
# All of these features show a clear variation in the number of survivors, so they are relevant for the analysis.

In [None]:
# Step 2: Data preprocessing
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
titanic.info()
# Number of missing values per column.
print(titanic.isna().sum())

In [None]:
# Preview the effect of simple imputation on a COPY of the data.
# `titanic` itself is deliberately left untouched: filling Age with the median
# of the full dataset before the train/test split would let held-out rows
# influence the fill value (data leakage). The modelling pipeline fills missing
# Age values itself (see bin_age), and Cabin is not used by the preprocessor.
titanic_preview = titanic.copy()

titanic_preview['Age'] = titanic_preview['Age'].fillna(titanic_preview['Age'].median())
print(titanic_preview['Age'])

titanic_preview['Cabin'] = titanic_preview['Cabin'].fillna('Unknown')
print(titanic_preview.head())

In [None]:

# Encoding for the 'Sex' column: one-hot with drop='if_binary', which keeps a
# single indicator column when the feature has exactly two categories.
Sex_features = ["Sex"]
Sex_transformer = Pipeline(steps=[("onehot", OneHotEncoder(drop='if_binary'))])


In [None]:
# Encoding for 'Embarked': missing ports are replaced by the literal category
# "missing", then the column is one-hot encoded. handle_unknown="ignore" lets
# categories that were not seen during fit pass through transform without error.
Embarked_features = ['Embarked']
Embarked_transformer = Pipeline([
    ("imputer", SimpleImputer(fill_value="missing", strategy="constant")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

In [None]:
# Extract Title from Name
def extract_title(X):
    """Return a one-column frame holding the title found in each 'Name'.

    The title is the first run of ASCII letters that is immediately followed
    by a period (e.g. "Mr" in "Braund, Mr. Owen Harris"). Names without such a
    match yield NaN.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame with a string 'Name' column. It is copied, never modified.

    Returns
    -------
    pandas.DataFrame
        Frame with the single column 'Title' and the same index as ``X``.
    """
    title_pattern = r'([A-Za-z]+)\.'
    frame = X.copy()
    frame['Title'] = frame['Name'].str.extract(title_pattern, expand=False)
    return frame[['Title']]


In [None]:
# Bin Age into groups
def bin_age(X):
    """Map the 'Age' column of ``X`` to a single categorical 'AgeGroup' column.

    Missing ages are filled with the median of the 'Age' values of ``X``
    itself, so the fill value is recomputed from whichever frame is passed in
    on each call (it is not remembered between calls).

    Bands (upper edge inclusive): Child [0, 12], Teen (12, 20], Adult (20, 40],
    Middle-Age (40, 60], Senior (60, 100]. Ages outside 0-100 stay NaN.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame with a numeric 'Age' column. It is copied, never modified.

    Returns
    -------
    pandas.DataFrame
        Frame with the single column 'AgeGroup' and the same index as ``X``.
    """
    X = X.copy()
    X['Age'] = X['Age'].fillna(X['Age'].median())
    bins = [0, 12, 20, 40, 60, 100]
    labels = ['Child', 'Teen', 'Adult', 'Middle-Age', 'Senior']
    # include_lowest=True closes the first band on the left, so an age of
    # exactly 0 is binned as 'Child' instead of falling outside (0, 12] as NaN.
    X['AgeGroup'] = pd.cut(X['Age'], bins=bins, labels=labels, include_lowest=True)
    return X[['AgeGroup']]

In [None]:

# 'Name' -> title -> one-hot.
# NOTE: the first step is registered under the name "imputer" although it
# performs title extraction (extract_title), not imputation.
name_features = ['Name']
name_transformer = Pipeline([
    ("imputer", FunctionTransformer(extract_title, validate=False)),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

In [None]:
# 'Age' -> age band -> one-hot.
# NOTE: the first step is registered under the name "imputer"; it runs bin_age,
# which both fills missing ages and bins them.
age_features = ['Age']
age_transformer = Pipeline([
    ("imputer", FunctionTransformer(bin_age, validate=False)),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])


In [None]:
# Route each feature list to its own transformer. Only the columns named in
# these lists (Sex, Embarked, Name, Age) are handed to a transformer.
preprocessor = ColumnTransformer(
    [
        ("sex", Sex_transformer, Sex_features),
        ("embarked", Embarked_transformer, Embarked_features),
        ("name", name_transformer, name_features),
        ("age", age_transformer, age_features),
    ]
)

print(titanic.head())

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier  

In [None]:

# Separate the target ('Survived') from the remaining columns.
titanic_survive_y = titanic["Survived"]
titanic_survive_X = titanic.drop(columns='Survived')

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
(titanic_X_train, titanic_X_test,
 titanic_y_train, titanic_y_test) = train_test_split(
    titanic_survive_X,
    titanic_survive_y,
    random_state=42,
    test_size=0.2,
)


In [None]:
# Candidate classifiers compared by cross-validation.
# Seeds are pinned (same value as the train/test split) so that re-running the
# notebook reproduces the scores, and LogisticRegression gets the same
# max_iter=1000 as the tuned pipeline built later, so both are configured alike.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(random_state=42),
    "XGBoost": XGBClassifier(random_state=42)
}

In [None]:
# Compare the candidate models with 5-fold cross-validation.
# Only the training split is used: titanic_X_test / titanic_y_test are kept for
# the final evaluation, so they must not influence which model gets picked.
classification_results = {}

for model_name, model in models.items():
    print(f"Training {model_name}...")
    pipeline = Pipeline([
        ("preprocessor", preprocessor),
        ("model", model)
    ])

    # One accuracy score per fold; cross_val_score performs the fold splitting itself.
    cv_scores = cross_val_score(pipeline, titanic_X_train, titanic_y_train, cv=5, scoring="accuracy")
    print(cv_scores)
    classification_results[model_name] = cv_scores.mean()


In [None]:
# Results: one row per model, holding its mean cross-validated accuracy.
result_df = pd.DataFrame(
    data=list(classification_results.values()),
    index=list(classification_results),
    columns=["Accuracy"],
)
print("\nResults:\n", result_df)

# Recorded observation: Logistic Regression scored 0.792361, the highest mean accuracy.


In [None]:
from sklearn.metrics import classification_report,precision_score,recall_score,f1_score,confusion_matrix
from sklearn.metrics import confusion_matrix
from sklearn.metrics import RocCurveDisplay
from sklearn.model_selection import GridSearchCV


In [None]:
# Pipeline to be tuned: the shared preprocessor followed by logistic regression
# with max_iter set to 1000.
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", LogisticRegression(max_iter=1000)),
])


In [None]:
# Hyperparameter grid. The "model__" prefix addresses the pipeline step named
# "model": five values of C are tried, always with the liblinear solver.
param_grid = {
    "model__C": [0.01, 0.1, 1, 10, 100],
    "model__solver": ["liblinear"],
}

# Exhaustive search over the grid, scored by accuracy with 5-fold CV;
# n_jobs=-1 requests all available cores.
grid_search = GridSearchCV(pipeline, param_grid, scoring="accuracy", cv=5, n_jobs=-1)


In [None]:
# Fit the search on the training split only, so that titanic_X_test stays
# unseen and the metrics computed from it afterwards measure generalisation
# rather than performance on rows the model was already trained on.
grid_search.fit(titanic_X_train, titanic_y_train)


In [None]:
# Report the winning parameter combination and its mean cross-validated accuracy.
for caption, value in (
    ("Best parameters:", grid_search.best_params_),
    ("Best mean CV accuracy:", grid_search.best_score_),
):
    print(caption, value)



In [None]:
# Take the best pipeline found by the grid search and predict the labels of
# the test split produced by train_test_split.
clf = grid_search.best_estimator_
y_pred = clf.predict(X=titanic_X_test)

In [None]:
import seaborn as sns
def plot_conf_mat(y_test, y_pred, title="Confusion Matrix"):
    """Show the confusion matrix of ``y_pred`` against ``y_test`` as an annotated heatmap.

    Parameters
    ----------
    y_test : array-like
        True labels (plotted on the y axis).
    y_pred : array-like
        Predicted labels (plotted on the x axis).
    title : str, optional
        Title drawn above the heatmap.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    fig, ax = plt.subplots(figsize=(4,4))
    sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt='d', cbar=False, ax=ax)
    ax.set_xlabel("Predicted Label")
    ax.set_ylabel("True Label")
    ax.set_title(title)
    # The y-limits are intentionally left as seaborn sets them. Widening them by
    # 0.5 on each side was a workaround for matplotlib 3.1.1 clipping the first
    # and last heatmap rows; on fixed versions it only adds empty half-rows.
    plt.show()

# Confusion matrix of the predictions against the test-split labels.
plot_conf_mat(y_test=titanic_y_test, y_pred=y_pred)

In [None]:
# Text report of the classification metrics for the test-split predictions.
report = classification_report(titanic_y_test, y_pred)
print(report)

# Precision and recall of the same predictions, kept in `x` and `t` because
# the following cell prints them.
x, t = precision_score(titanic_y_test, y_pred), recall_score(titanic_y_test, y_pred)


In [None]:

# Precision (x) first, recall (t) second, one value per line.
print(x, t, sep="\n")

In [None]:
from joblib import dump, load
# Persist the fitted pipeline, read it back, and print the test-split score of
# both the in-memory and the reloaded model so they can be compared.
# NOTE(review): joblib.load unpickles the file, so only load files you trust.
model_path = "clf_modek.joblib"
dump(clf, model_path)

Loaded_model = load(model_path)

original_score = clf.score(titanic_X_test,titanic_y_test)
reloaded_score = Loaded_model.score(titanic_X_test,titanic_y_test)
print(f"Model score:{original_score}")
print(f"loaded model score:{reloaded_score}")
