In [None]:
import pandas as pd 
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score,precision_score, confusion_matrix, ConfusionMatrixDisplay, classification_report, roc_curve, auc
from sklearn.model_selection import GridSearchCV


## 1) Data previews

In [None]:
# Load the labelled training set and the unlabelled competition test set.
train = pd.read_csv('/kaggle/input/titanic/train.csv')
test = pd.read_csv('/kaggle/input/titanic/test.csv')

In [None]:
# Preview the first five training rows.
train.head(n=5)

In [None]:
# Preview the first five test rows.
test.head(n=5)

## 2) Data processing

### 2.1 Data Cleaning

**Handling the missing values in both the train and test dataframes**

In [None]:
# Count the missing values in each column of the train dataframe.
train.isna().sum()

In [None]:
# Count the missing values in each column of the test dataframe.
test.isna().sum()

In [None]:
# Handle missing values in the Age column for both train and test.
# The mean is computed on the training set only and reused for test, so both
# frames are imputed with the same statistic.
# Assign the result back instead of calling fillna(inplace=True) on a column
# selection: the chained in-place form is deprecated and does not modify the
# dataframe when pandas Copy-on-Write is active.
age_mean = train['Age'].mean()
train['Age'] = train['Age'].fillna(age_mean)
test['Age'] = test['Age'].fillna(age_mean)

In [None]:
# Handle missing values in the Embarked column of the train dataframe by
# filling them with the most frequent port (first mode).
# Assign the result back: fillna(inplace=True) on a column selection is a
# chained in-place call that does not update the dataframe under Copy-on-Write.
train['Embarked'] = train['Embarked'].fillna(train['Embarked'].mode()[0])

In [None]:
# Handle missing values in the Fare column of the test dataframe.
# The fill value is the training-set mean, so test is imputed with a statistic
# learned from train only.
# Assign the result back: fillna(inplace=True) on a column selection is a
# chained in-place call that does not update the dataframe under Copy-on-Write.
test['Fare'] = test['Fare'].fillna(train['Fare'].mean())

In [None]:
# Derive a Deck column from the first character of Cabin for both dataframes,
# label rows without a cabin as 'Unknown', then drop the original Cabin column.
for frame in (train, test):
    # Assign the filled Series back rather than calling fillna(inplace=True) on
    # a column selection, which does not update the dataframe under Copy-on-Write.
    frame['Deck'] = frame['Cabin'].str[0].fillna('Unknown')
    # In-place drop is intentional: `frame` is only a loop alias, so rebinding
    # it would not change `train` / `test`.
    frame.drop('Cabin', axis=1, inplace=True)

### 2.2) Data Transforming

**Encode the columns that hold categorical values in both dataframes**

In [None]:
# Cast the categorical columns to plain strings so their values can be matched
# against the explicit category lists below.
for frame in (train, test):
    for col in ('Sex', 'Embarked', 'Deck'):
        frame[col] = frame[col].astype(str)

# Explicit category orders: each value is encoded as its position in the list
# (e.g. 'female' -> 0.0, 'male' -> 1.0; 'Unknown' deck -> 0.0).
categories1 = [['female', 'male']]
categories2 = [['C', 'Q', 'S']]
categories3 = [['Unknown', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'T']]
encoder1 = OrdinalEncoder(categories=categories1, dtype=float)
encoder2 = OrdinalEncoder(categories=categories2, dtype=float)
encoder3 = OrdinalEncoder(categories=categories3, dtype=float)

# Fit each encoder once, on the training data.
train['Sex'] = encoder1.fit_transform(train[['Sex']]).ravel()
train['Embarked'] = encoder2.fit_transform(train[['Embarked']]).ravel()
train['Deck'] = encoder3.fit_transform(train[['Deck']]).ravel()

# Reuse the fitted encoders on the test data (transform only, no re-fit) so
# train and test share exactly the same mapping.
test['Sex'] = encoder1.transform(test[['Sex']]).ravel()
test['Embarked'] = encoder2.transform(test[['Embarked']]).ravel()
test['Deck'] = encoder3.transform(test[['Deck']]).ravel()

**Scaling the Age and Fare columns**

In [None]:
# Scale Age and Fare with the min/max learned from the training data.
scaler = MinMaxScaler()
train[['Age', 'Fare']] = scaler.fit_transform(train[['Age', 'Fare']])
# Only transform the test data: re-fitting the scaler on test would map the
# same raw Age/Fare to different scaled values than the model saw in training.
# Test values outside the training range may fall slightly outside [0, 1].
test[['Age', 'Fare']] = scaler.transform(test[['Age', 'Fare']])


### 2.3) Feature Engineering

**Creating a new Family feature from the SibSp and Parch columns**

In [None]:
# Family = parents/children + siblings/spouses + the passenger themselves.
for frame in (train, test):
    frame['Family'] = frame['Parch'] + frame['SibSp'] + 1

## 3) Data Analysis

### 3.1) Data Distribution

**Distribution of some features**

In [None]:
# Sex is ordinal-encoded at this point (0.0 = female, 1.0 = male).
# discrete=True draws one unit-width bar centred on each code instead of
# auto-binning the codes as if they were a continuous variable.
sns.histplot(train['Sex'], discrete=True)
plt.title('number of females and male')

In [None]:
# Pclass is treated as a discrete class label here (assumed integer-coded —
# confirm against the data preview); discrete=True draws one bar per value
# instead of auto-binning it as a continuous variable.
sns.histplot(train['Pclass'], discrete=True)
plt.title('number of passenger in each class')

In [None]:
# Embarked is ordinal-encoded at this point (0.0 = C, 1.0 = Q, 2.0 = S).
# discrete=True draws one unit-width bar centred on each code instead of
# auto-binning the codes as if they were a continuous variable.
sns.histplot(train['Embarked'], discrete=True)
plt.title('number of passenger in each Embarked')

In [None]:
# Deck is ordinal-encoded at this point (0.0 = Unknown, then A..G, T in order).
# discrete=True draws one unit-width bar centred on each code instead of
# auto-binning the codes as if they were a continuous variable.
sns.histplot(train['Deck'], discrete=True)
plt.title('number of passenger in each Deck')

In [None]:
# Family is a count (Parch + SibSp + 1), so draw one unit-width bar per family
# size rather than letting histplot auto-bin it as a continuous variable.
sns.histplot(train['Family'], discrete=True)
plt.title('number of passenger in each Family number')

**Relation between the features and the target**

In [None]:
# Passenger counts per Sex code, split by survival outcome.
ax = sns.countplot(data=train, x='Sex', hue='Survived')
ax.set_title('Relation between number of passenger in Sex and Survived')

In [None]:
# Passenger counts per Pclass value, split by survival outcome.
ax = sns.countplot(data=train, x='Pclass', hue='Survived')
ax.set_title('Relation between number of passenger in Pclass and Survived')

In [None]:
# Passenger counts per Embarked code, split by survival outcome.
ax = sns.countplot(data=train, x='Embarked', hue='Survived')
ax.set_title('Relation between number of passenger in Embarked and Survived')

In [None]:
# Passenger counts per Deck code, split by survival outcome.
ax = sns.countplot(data=train, x='Deck', hue='Survived')
ax.set_title('Relation between number of passenger in Deck and Survived')

In [None]:
# Passenger counts per Family size, split by survival outcome.
ax = sns.countplot(data=train, x='Family', hue='Survived')
ax.set_title('Relation between number of passenger in Family and Survived')

In [None]:
# Scaled Age against scaled Fare, coloured by survival outcome.
sns.scatterplot(data=train, x='Age', y='Fare', hue='Survived')

In [None]:
# Correlation of every numeric column with the target, highest value first.
corr_matrix = train.corr(numeric_only=True)
corr_matrix['Survived'].sort_values(ascending=False)

## 4) Data Modeling

In [None]:
# Data splitting: hold out 20% of the labelled rows for validation, with a
# fixed seed so the split is repeatable.
feature = ['Sex', 'Pclass', 'Deck', 'Fare', 'Embarked']
x, y = train[feature], train['Survived']
x_train, x_val, y_train, y_val = train_test_split(
    x,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)


In [None]:
# Compare candidate models by train and validation accuracy to pick one.
models = {
    'LogisticRegression': LogisticRegression(max_iter=200),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'KNN': KNeighborsClassifier(n_neighbors=5),
}
for name, model in models.items():
    model.fit(x_train, y_train)

    acc_train = accuracy_score(y_train, model.predict(x_train))
    # Scored on the held-out validation split, not on the competition test set.
    acc_val = accuracy_score(y_val, model.predict(x_val))

    print(f'{name} validation accuracy: {acc_val}')
    print(f'{name} train accuracy: {acc_train}')
    # A large positive gap (train minus validation) indicates over-fitting.
    print(f'{name} gap between train accuracy and validation accuracy: {acc_train - acc_val}')
    print()

In [None]:
# Selected model: a random forest with a fixed seed, used as the base
# estimator for the hyper-parameter search.
model = RandomForestClassifier(
    random_state=42,
)

In [None]:
# Hyper-parameter values to search exhaustively for the random forest.
param_grid = dict(
    n_estimators=[100, 200, 500],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2'],
)

In [None]:
# Exhaustive grid search scored by accuracy with 5-fold cross-validation,
# using all available CPU cores.
grid_search = GridSearchCV(
    model,
    param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)

In [None]:
# Run the search on the training split only; the validation split stays unseen.
grid_search.fit(X=x_train, y=y_train)

In [None]:
# Take the estimator with the best parameter combination found by the search
# and predict the held-out validation split with it.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X=x_val)

In [None]:
# Validation accuracy of the tuned model.
accuracy_score(y_true=y_val, y_pred=y_pred)

In [None]:
# Predict survival for the competition test set using the same feature columns
# the model was trained on.
test_features = test.loc[:, feature]
prediction = best_model.predict(test_features)
prediction

In [None]:
# Build the submission table (PassengerId, Survived) and write it without the
# index column.
output = test[['PassengerId']].assign(Survived=prediction)
output.to_csv('submission.csv', index=False)