# Importing data and libraries

In [None]:
#importing required libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
cd /kaggle/input/titanic

In [None]:
df=pd.read_csv("train.csv",na_values=['.',':','?','!','@','#','$','%','^','&','*',','])

In [None]:
df

In [None]:
df.info()

In [None]:
#checking for cardinality

df.nunique()

In [None]:
#Dropping unnecessary variables and saving into a dataframe

df = df.drop(["Name","Ticket","Fare","Cabin","PassengerId"], axis=1)

In [None]:
#checking for null values

df.isnull().sum()

In [None]:
df.columns

In [None]:
df.describe()

In [None]:
# Changing data types into categorical

for col in ['Pclass','Sex','Survived','Embarked']:
    df[col] = df[col].astype('category')

In [None]:
df.dtypes

# Splitting columns into categorical and numerical groups

In [None]:
cat_cols=['Pclass', 'Sex','Embarked']

In [None]:
num_cols=['Age','SibSp', 'Parch']

In [None]:
cat_cols

In [None]:
num_cols

In [None]:
df[num_cols].describe()

# Splitting data into Train and Validation

In [None]:
import statsmodels.api as sm
from sklearn.model_selection import train_test_split

In [None]:
X = df.drop('Survived',axis=1)
y= df['Survived']

In [None]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=123, stratify=y)

In [None]:
print(X_train.shape)
print(X_val.shape)
print(y_train.shape)
print(y_val.shape)

In [None]:
df["Survived"].value_counts(normalize=True)

In [None]:
y_train.value_counts(normalize=True)

In [None]:
y_val.value_counts(normalize=True)

# Imputation

In [None]:
from sklearn.impute import SimpleImputer
imputer_num = SimpleImputer(strategy='median')
imputer_num = imputer_num.fit(X_train[num_cols])
X_train_num_imp = pd.DataFrame(imputer_num.transform(X_train[num_cols]), columns=(X_train[num_cols]).columns)
X_val_num_imp = pd.DataFrame(imputer_num.transform(X_val[num_cols]), columns=(X_val[num_cols]).columns)

In [None]:
X_train_num_imp.isnull().sum()

In [None]:
X_val_num_imp.isnull().sum()

In [None]:
imputer_cat = SimpleImputer(strategy='most_frequent')
imputer_cat = imputer_cat.fit(X_train[cat_cols])
X_train_cat_imp = pd.DataFrame(imputer_cat.transform(X_train[cat_cols]), columns=(X_train[cat_cols]).columns)
X_val_cat_imp = pd.DataFrame(imputer_cat.transform(X_val[cat_cols]), columns=(X_val[cat_cols]).columns)

In [None]:
X_train_cat_imp.isnull().sum()

In [None]:
X_val_cat_imp.isnull().sum()

# Standardization

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the *imputed* training data, i.e. on exactly the values it
# is going to transform. Fitting on the raw X_train[num_cols] would compute the
# mean/variance from the non-missing entries only (StandardScaler skips NaN
# while fitting), so the statistics would not match the median-imputed frames
# that are scaled below.
scaler = StandardScaler().fit(X_train_num_imp)
X_train_num_std = pd.DataFrame(scaler.transform(X_train_num_imp), columns=X_train_num_imp.columns)
X_val_num_std = pd.DataFrame(scaler.transform(X_val_num_imp), columns=X_val_num_imp.columns)

# Dummification

In [None]:
X_train_cat_dum = pd.get_dummies((X_train_cat_imp),drop_first=True)
X_val_cat_dum = pd.get_dummies((X_val_cat_imp),drop_first=True)

In [None]:
X_train_cat_dum

In [None]:
X_val_cat_dum

In [None]:
X_train_num_std = X_train_num_std.reset_index()
X_train_num_std

In [None]:
X_train_num_std = X_train_num_std.drop(["index"], axis=1)
X_train_num_std

In [None]:
X_val_num_std = X_val_num_std.reset_index()
X_val_num_std

In [None]:
X_val_num_std = X_val_num_std.drop(["index"], axis=1)
X_val_num_std

In [None]:
X_train_cat_dum = X_train_cat_dum.reset_index()
X_train_cat_dum

In [None]:
X_train_cat_dum = X_train_cat_dum.drop(["index"], axis=1)
X_train_cat_dum

In [None]:
X_val_cat_dum = X_val_cat_dum.reset_index()
X_val_cat_dum

In [None]:
X_val_cat_dum = X_val_cat_dum.drop(["index"], axis=1)
X_val_cat_dum

# Concatenation

In [None]:
final_X_train = pd.concat([X_train_num_std, X_train_cat_dum], axis=1)

In [None]:
final_X_val = pd.concat([X_val_num_std, X_val_cat_dum], axis=1)

In [None]:
final_X_train.isnull().sum()

In [None]:
final_X_val.isnull().sum()

In [None]:
final_X_train

In [None]:
final_X_val

# Model Building

## Decision Tree

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn import tree

In [None]:
dt = tree.DecisionTreeClassifier()

In [None]:
# Define hyperparameter space
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [3, 5, 7, 10],
    'min_samples_split': [10, 20, 30, 40, 50],
    'min_samples_leaf': [1, 2, 4, 6, 10],
    'max_features': ['sqrt', 'log2', None]
}

# Perform grid search CV
grid_search = GridSearchCV(dt, param_grid=param_grid, cv=5)
grid_search.fit(final_X_train, y_train)

In [None]:
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

In [None]:
dt = tree.DecisionTreeClassifier(criterion = 'gini', max_depth = 3, max_features = None, min_samples_leaf = 1, min_samples_split = 10)

In [None]:
dt = dt.fit(final_X_train, y_train)

In [None]:
y_train_preds = dt.predict(final_X_train)
y_train_preds[0:10]

In [None]:
y_val_preds = dt.predict(final_X_val)
y_val_preds[0:10]

In [None]:
from sklearn.metrics import confusion_matrix
confusionmatrix = confusion_matrix(y_train,y_train_preds)
confusionmatrix

In [None]:
confusionmatrix = confusion_matrix(y_val,y_val_preds)
confusionmatrix

In [None]:
from sklearn import metrics
print(metrics.classification_report(y_train,y_train_preds))

In [None]:
print(metrics.classification_report(y_val,y_val_preds))

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
rf = RandomForestClassifier()

In [None]:
# Define the parameter grid to search
param_grid = {
    'n_estimators': [100, 500, 1000],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10, 20, 50],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1)

# Fit the grid search to the data
grid_search.fit(final_X_train, y_train)

# Print the best parameters and score
print("Best Parameters: ", grid_search.best_params_)
print("Best Score: ", grid_search.best_score_)

In [None]:
rfc = RandomForestClassifier(max_depth = 5, min_samples_leaf = 1, min_samples_split = 10, n_estimators = 100)
rfc = rfc.fit(final_X_train, y_train)

In [None]:
y_train_preds = rfc.predict(final_X_train)
y_train_preds[0:10]

In [None]:
y_val_preds = rfc.predict(final_X_val)
y_val_preds[0:10]

In [None]:
from sklearn.metrics import confusion_matrix
confusionma2trix = confusion_matrix(y_train,y_train_preds)
confusionmatrix

In [None]:
confusionmatrix = confusion_matrix(y_val,y_val_preds)
confusionmatrix

In [None]:
from sklearn import metrics
print(metrics.classification_report(y_train,y_train_preds))

In [None]:
print(metrics.classification_report(y_val,y_val_preds))

## XGBoost

In [None]:
from xgboost import XGBClassifier

In [None]:
xgb = XGBClassifier()

In [None]:
param_grid = {
    'learning_rate': [0.01, 0.1, 0.5],
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'min_child_weight': [1, 3, 5],
    'gamma': [0, 0.1, 0.5]
}

In [None]:
grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid, cv=5, n_jobs=-1)

In [None]:
grid_search.fit(final_X_train, y_train)
print("Best parameters: ", grid_search.best_params_)
print("Best score: ", grid_search.best_score_)

In [None]:
xgb = XGBClassifier(gamma=0, learning_rate=0.1, max_depth=3, min_child_weight=3, n_estimators=50)
xgb = xgb.fit(final_X_train, y_train)

In [None]:
y_train_preds = xgb.predict(final_X_train)
y_train_preds[0:10]

In [None]:
y_val_preds = xgb.predict(final_X_val)
y_val_preds[0:10]

In [None]:
from sklearn.metrics import confusion_matrix
confusionmatrix = confusion_matrix(y_train,y_train_preds)
confusionmatrix

In [None]:
confusionmatrix = confusion_matrix(y_val,y_val_preds)
confusionmatrix

In [None]:
from sklearn import metrics
print(metrics.classification_report(y_train,y_train_preds))

In [None]:
print(metrics.classification_report(y_val,y_val_preds))

# Importing the test data


In [None]:
test_df=pd.read_csv("test.csv",na_values=['.',':','?','!','@','#','$','%','^','&','*',','])

In [None]:
test_df

In [None]:
test_df.describe()

In [None]:
test_df.dtypes

In [None]:
test_df.nunique()

In [None]:
label = test_df.filter(["PassengerId"], axis=1)
label

In [None]:
test_df = test_df.drop(["Name","Ticket","Fare","Cabin","PassengerId"], axis=1)

In [None]:
test_df.isnull().sum()

In [None]:
test_df.columns

In [None]:
for col in ['Pclass','Sex','Embarked']:
    test_df[col] = test_df[col].astype('category')

In [None]:
test_df.dtypes

## Imputation

In [None]:
test_num_imp = pd.DataFrame(imputer_num.transform(test_df[num_cols]), columns=(test_df[num_cols]).columns)

In [None]:
test_num_imp.isnull().sum()

In [None]:
test_cat_imp = pd.DataFrame(imputer_cat.transform(test_df[cat_cols]), columns=(test_df[cat_cols]).columns)

In [None]:
test_cat_imp.isnull().sum()

## Standardization

In [None]:
# Scale the imputed test columns with the already-fitted scaler
# (only transform is called here, no refit).
test_num_std = pd.DataFrame(
    scaler.transform(test_num_imp),
    columns=test_num_imp.columns,
)
test_num_std

## Dummification

In [None]:
# One-hot encode the test categoricals using the category levels observed in
# the training split, so the dummy columns match the ones the models were
# trained on (same names, same order, same dropped first level) even if a
# level is missing from the test rows. A value never seen in training becomes
# NaN in the categorical and therefore an all-zero dummy row.
train_levels = {col: pd.CategoricalDtype(sorted(X_train_cat_imp[col].unique())) for col in cat_cols}
test_cat_dum = pd.get_dummies(test_cat_imp.astype(train_levels), drop_first=True)
test_cat_dum.head()

In [None]:
test_num_std = test_num_std.reset_index()
test_num_std

In [None]:
test_num_std = test_num_std.drop(["index"], axis=1)
test_num_std

In [None]:
test_cat_dum = test_cat_dum.reset_index()
test_cat_dum

In [None]:
test_cat_dum = test_cat_dum.drop(["index"], axis=1)
test_cat_dum

## Concatenation

In [None]:
Final_test_df = pd.concat([test_num_std,test_cat_dum], axis=1)
Final_test_df

In [None]:
Final_test_df.isnull().sum()

# Model prediction on test data
## XGBoost

In [None]:
xgb_preds = xgb.predict(Final_test_df)
xgb_preds[0:10]

In [None]:
len(xgb_preds)

In [None]:
label["Survived"] = xgb_preds
label["Survived"][0:10]

In [None]:
label

In [None]:
label = label.set_index(["PassengerId"])
label

In [None]:
label.to_csv('/kaggle/working/submission.csv')