# In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score,GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import f1_score
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

# Load the competition files from the Kaggle input directory.
train = pd.read_csv('/kaggle/input/soai-lab-titanic-survival-prediction-challenge/train_titanic.csv')
test = pd.read_csv('/kaggle/input/soai-lab-titanic-survival-prediction-challenge/test_titanic.csv')
sample_submission = pd.read_csv('/kaggle/input/soai-lab-titanic-survival-prediction-challenge/sample_submission.csv')

# Brief data description.
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() only appended a stray "None" line.
train.info()
print(train.describe())

# missing values
def handle_missing_values(data):
    """Fill gaps in ``fare``, ``age`` and ``embarked`` and drop ``cabin``.

    ``fare`` and ``age`` are filled with that column's median and ``embarked``
    with its most frequent value, each computed from ``data`` itself. The
    frame is modified in place and also returned.

    Args:
        data: DataFrame containing ``fare``, ``age``, ``embarked`` and
            ``cabin`` columns.

    Returns:
        The same DataFrame object, without the ``cabin`` column.
    """
    # Assign the filled column back instead of calling
    # ``data[col].fillna(..., inplace=True)``: the in-place form acts on a
    # temporary Series, which pandas deprecates and which leaves ``data``
    # untouched once Copy-on-Write is active.
    data['fare'] = data['fare'].fillna(data['fare'].median())
    data['age'] = data['age'].fillna(data['age'].median())

    # mode() is empty when every value is missing; skip the fill in that case
    # rather than failing on the [0] lookup.
    embarked_modes = data['embarked'].mode()
    if not embarked_modes.empty:
        data['embarked'] = data['embarked'].fillna(embarked_modes[0])

    data.drop(columns=['cabin'], inplace=True)
    return data

# Clean the training frame first, then the test frame, with the same routine.
train, test = map(handle_missing_values, (train, test))

# feature engineering
def featureng(data):
    """Add ``Title``, ``FamilySize`` and ``IsAlone`` and drop the raw columns.

    ``Title`` is the word that precedes the first period in ``name`` (after a
    space), mapped to a numeric code; names without a recognised title get 0.
    ``FamilySize`` is ``sibsp + parch + 1`` and ``IsAlone`` is 1 when
    ``FamilySize`` equals 1. The frame is modified in place and also returned.

    Args:
        data: DataFrame containing ``name``, ``ticket``, ``sibsp`` and
            ``parch`` columns.

    Returns:
        The same DataFrame object with the new columns added and ``name``,
        ``ticket``, ``sibsp`` and ``parch`` removed.
    """
    # Raw string: in a normal literal ``\.`` is an invalid escape sequence and
    # triggers a SyntaxWarning on recent Python versions.
    data['Title'] = data['name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

    # Several distinct titles deliberately share one code.
    title_mapping = {
        'Mr': 1, 'Miss': 2, 'Mrs': 3, 'Master': 4, 'Dr': 5, 'Rev': 6,
        'Col': 7, 'Major': 7, 'Mlle': 2, 'Countess': 3, 'Ms': 2, 'Lady': 3, 'Jonkheer': 1,
        'Don': 1, 'Dona': 3, 'Mme': 3, 'Capt': 7, 'Sir': 1,
    }
    # Titles missing from the mapping (or names with no title) become 0.
    data['Title'] = data['Title'].map(title_mapping).fillna(0)

    # The passenger counts as a family member, hence the + 1.
    data['FamilySize'] = data['sibsp'] + data['parch'] + 1
    data['IsAlone'] = (data['FamilySize'] == 1).astype(int)

    data.drop(columns=['name', 'ticket', 'sibsp', 'parch'], inplace=True)
    return data

# Derive the engineered features for the training frame, then the test frame.
train, test = map(featureng, (train, test))

# Separate the target from the predictors; PassengerId is left out of the
# model inputs for both the training and the test frame.
X = train.drop(columns=['PassengerId', 'survived'])
y = train['survived']
X_test = test.drop(columns=['PassengerId'])

# Column lists that decide which preprocessing branch each feature goes
# through (numeric vs. categorical).
numeric_columns = ['pclass', 'age', 'fare', 'body', 'FamilySize', 'IsAlone']
categorical_columns = ['sex', 'embarked', 'boat', 'home.dest', 'Title']


# Numeric branch: fill remaining gaps with the column mean, then standardize.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: replace gaps with the literal 'missing' category, then
# one-hot encode; categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(fill_value='missing', strategy='constant')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Send each column list through its own branch and concatenate the results.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_columns),
    ('cat', categorical_transformer, categorical_columns),
])

# Data split.
# Split the raw feature frame BEFORE fitting the preprocessor so that the
# imputation values, scaling statistics and one-hot categories are learned
# from the training rows only; fitting on all rows first leaks validation
# information into the features and inflates the validation score.
X_train_raw, X_val_raw, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing: fit on the training part, then apply the fitted transform to
# every other set.
X_train = preprocessor.fit_transform(X_train_raw)
X_val = preprocessor.transform(X_val_raw)
X = preprocessor.transform(X)            # full labelled set, used for cross-validation
X_test = preprocessor.transform(X_test)  # competition test set


# Hyper-parameter grids searched for each candidate; the keys must match the
# keys of `models` because the grid is looked up by model name.
param_grids = {
    'logistic regression': {'C': [0.1, 1, 10], 'solver': ['liblinear', 'saga']},
    'random forest': {'n_estimators': [100, 200], 'max_depth': [10, 20, None]},
    'gradient boosting': {'n_estimators': [100, 200], 'learning_rate': [0.05, 0.1, 0.2], 'max_depth': [3, 5, 7]},
    'svm': {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf'], 'gamma': ['scale', 'auto']},
    'knn': {'n_neighbors': [3, 5, 7], 'weights': ['uniform', 'distance'], 'metric': ['euclidean', 'manhattan']},
    'xgbboost': {'n_estimators': [100, 200], 'learning_rate': [0.05, 0.1, 0.2], 'max_depth': [3, 5, 7]}
}

# Candidate estimators. Every estimator that accepts a seed gets the same
# random_state so repeated runs select the same model: the liblinear/saga
# solvers of LogisticRegression shuffle the data, and XGBClassifier is seeded
# for consistency with the other tree ensembles.
models = {
    'logistic regression': LogisticRegression(max_iter=1000, random_state=42),
    'random forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'gradient boosting': GradientBoostingClassifier(random_state=42),
    'svm': SVC(),
    'knn': KNeighborsClassifier(),
    'xgbboost': XGBClassifier(eval_metric='logloss', random_state=42)
}

# Best validation F1 seen so far and the estimator that achieved it.
# `best_model` starts as None (not an empty string) so that "nothing selected
# yet" cannot be mistaken for a usable estimator further down.
best_score = 0
best_model = None

for name, modele in models.items():
    # Tune the candidate with 5-fold CV on the training split, optimising F1.
    grid_search = GridSearchCV(modele, param_grids[name], cv=5, scoring='f1', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    # Score the tuned estimator on the held-out validation split.
    pred = grid_search.best_estimator_.predict(X_val)
    score = f1_score(y_val, pred)
    print(f'{name} score: {score:.5f}')

    # Always accept the first candidate; afterwards require a strict
    # improvement. This guarantees a model is selected even when every
    # candidate scores an F1 of 0.
    if best_model is None or score > best_score:
        best_model = grid_search.best_estimator_
        best_score = score

# 5-fold cross-validated F1 of the selected estimator on the preprocessed
# training data (cross_val_score fits clones; best_model itself is unchanged).
cvscore = cross_val_score(best_model, X, y, scoring='f1', cv=5, n_jobs=-1)
print(f'best modele: {best_model.__class__.__name__}')
print(f'cross validation score is: {cvscore.mean():.5f}')

# Submission.
# NOTE(review): best_model was fitted on X_train only (the 80% split), not on
# the full training set; consider refitting on X, y before predicting.
# NOTE(review): the training target column is lowercase 'survived' while the
# header written here is 'Survived'; confirm against sample_submission.
final_pred = best_model.predict(X_test)
submission = test[['PassengerId']].assign(Survived=final_pred)
submission.to_csv('submission.csv', index=False)
print('Submission file saved as submission.csv')



# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session