In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and echo the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for path in (os.path.join(dirname, name) for name in filenames):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the Titanic train and test CSVs into DataFrames, then preview the
# first rows of the training frame.
train_csv = "/kaggle/input/titanic/train.csv"
test_csv = "/kaggle/input/titanic/test.csv"
df_train, df_test = pd.read_csv(train_csv), pd.read_csv(test_csv)
df_train.head()

In [None]:
# Column dtypes and non-null counts of the raw training frame.
df_train.info()

In [None]:
# Summary statistics for the training frame.
df_train.describe()

In [None]:
# Pairwise correlations between the numeric columns of the training frame.
numerical_features = df_train.select_dtypes(include='number')
correlation_matrix = numerical_features.corr()

# Annotated heatmap of the full correlation matrix.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title('Correlation Matrix of Numerical Features')
plt.show()

# Correlation of every numeric column with the 'Survived' target.
print(correlation_matrix['Survived'])


In [None]:
# Partition the training rows by the raw 'Sex' label.
df_train_female = df_train.loc[df_train['Sex'].eq('female')]
df_train_male = df_train.loc[df_train['Sex'].eq('male')]

In [None]:
# Counts of each 'Survived' value among the female rows.
df_train_female['Survived'].value_counts()

In [None]:
# Counts of each 'Survived' value among the male rows.
df_train_male['Survived'].value_counts()

In [None]:
# Columns that are not used as model features; removed from df_train in place.
unused_columns = ['Embarked', 'Cabin', 'Ticket', 'Parch', 'SibSp', 'Name', 'PassengerId']
df_train.drop(columns=unused_columns, inplace=True)
df_train

In [None]:
# Re-inspect dtypes and non-null counts after the column drop.
df_train.info()

In [None]:
# Encode 'Sex' as an integer code: female -> 0, male -> 1.
sex_codes = {'female': 0, 'male': 1}
df_train['Sex'] = df_train['Sex'].map(sex_codes)

In [None]:
# Replace missing ages with the mean age of the training frame.
# Assign the result back instead of calling fillna(inplace=True) on the column
# selection: that chained in-place form emits a FutureWarning in pandas 2.x and,
# under Copy-on-Write (the default from pandas 3.0), modifies only a temporary
# object, leaving the NaNs in df_train untouched.
df_train['Age'] = df_train['Age'].fillna(df_train['Age'].mean())

In [None]:
# Bar chart of how many training rows fall into each 'Survived' class.
survived_counts = df_train['Survived'].value_counts().sort_index()
fig, ax = plt.subplots(figsize=(8, 6))
survived_counts.plot(kind='bar', ax=ax)
ax.set_title('Survived')
ax.set_xlabel('score')
ax.set_ylabel('count')
plt.show()

In [None]:
from imblearn.over_sampling import SMOTE

X = df_train.drop('Survived', axis=1)
y = df_train['Survived']


smote = SMOTE(random_state=42)


X_resampled, y_resampled = smote.fit_resample(X, y)

# Create a new balanced DataFrame
df_train = pd.DataFrame(X_resampled, columns=X.columns)
df_train['Survived'] = y_resampled

# Now df_train_balanced has a balanced 'Survived' column
plt.figure(figsize=(8, 6))
df_train['Survived'].value_counts().sort_index().plot(kind='bar')
plt.title('Balanced Survived')
plt.xlabel('score')
plt.ylabel('count')
plt.show()

In [None]:
X = df_train.drop('Survived', axis=1)
y = df_train['Survived']

In [None]:
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the XGBoost classifier
model = XGBClassifier(random_state=42)  # You can tune hyperparameters here
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

In [None]:
# Hyperparameter tuning for the XGBoost classifier with an exhaustive grid search.
from sklearn.model_selection import GridSearchCV

# Candidate values to search over (3 x 3 x 3 = 27 combinations).
param_grid = {
    'n_estimators': [50, 100, 200],  # Number of boosting rounds
    'max_depth': [3, 5, 7],          # Maximum depth of each tree
    'learning_rate': [0.01, 0.1, 0.3] # Step size shrinkage used in update to prevent overfitting
}

# Unfitted base estimator that GridSearchCV clones for every candidate.
model = XGBClassifier(random_state=42)

# 5-fold cross-validation on the training partition, scored by accuracy.
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Best parameter combination and its mean cross-validation accuracy.
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print(f"Best parameters: {best_params}")
print(f"Best cross-validation score: {best_score}")

# GridSearchCV refits the winning configuration on all of X_train by default
# (refit=True), so reuse that fitted estimator instead of constructing and
# training an identical model a second time.
best_model = grid_search.best_estimator_

# Evaluate the tuned model on the hold-out set.
y_pred = best_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of the best model: {accuracy}")

In [None]:
# Preprocess the test data the same way as the training data.
df_test['Sex'] = df_test['Sex'].map({'female': 0, 'male': 1})
# Assign the filled columns back rather than calling fillna(inplace=True) on a
# column selection: the chained in-place form emits a FutureWarning in pandas 2.x
# and is a no-op under Copy-on-Write (default from pandas 3.0), which would leave
# the missing 'Age' / 'Fare' values in place.
df_test['Age'] = df_test['Age'].fillna(df_test['Age'].mean())
df_test['Fare'] = df_test['Fare'].fillna(df_test['Fare'].mean())  # Fill missing 'Fare' values
# 'PassengerId' is kept because the submission file needs it.
df_test = df_test.drop(columns=['Embarked', 'Cabin', 'Ticket', 'Parch', 'SibSp', 'Name'])

# Select the feature columns in the same order the model was trained on.
X_test_final = df_test[X_train.columns]

# Predict using the tuned model.
predictions = best_model.predict(X_test_final)

# One row per test passenger: id plus predicted survival label.
submission = pd.DataFrame({'PassengerId': df_test['PassengerId'], 'Survived': predictions})

# Write the submission file into the working directory.
submission.to_csv('submission.csv', index=False)