In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and echo the full path of
# every file found, one path per line.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Load the data

In [None]:
# Read the training CSV into a DataFrame, then preview its first five rows
# as the cell output.
train_data = pd.read_csv(filepath_or_buffer="/kaggle/input/titanic/train.csv")
train_data.head(n=5)

In [None]:
# Read the test CSV (the rows predicted for the submission) into a DataFrame,
# then preview its first five rows as the cell output.
test_data = pd.read_csv(filepath_or_buffer="/kaggle/input/titanic/test.csv")
test_data.head(n=5)

# Explore the data 

Compute the survival rate separately for female passengers and for male passengers.

In [None]:
# Survival outcomes of the female passengers. A single .loc[rows, column]
# lookup replaces the chained .loc[...][...] indexing.
women = train_data.loc[train_data.Sex == 'female', "Survived"]

# Fraction of women who survived (kept on a 0-1 scale for any later use).
# Assumes "Survived" is coded 0/1, so its mean is the survival rate.
rate_women = women.mean()

# Scale to 0-100 when printing so the number agrees with the "%" label;
# previously the raw fraction (e.g. 0.74) was shown next to a "%" sign.
print(f"% of women who survived: {100 * rate_women:.1f}")

In [None]:
# Survival outcomes of the male passengers. A single .loc[rows, column]
# lookup replaces the chained .loc[...][...] indexing.
men = train_data.loc[train_data.Sex == 'male', "Survived"]

# Fraction of men who survived (kept on a 0-1 scale for any later use).
# Assumes "Survived" is coded 0/1, so its mean is the survival rate.
rate_men = men.mean()

# Scale to 0-100 when printing so the number agrees with the "%" label;
# previously the raw fraction (e.g. 0.19) was shown next to a "%" sign.
print(f"% of men who survived: {100 * rate_men:.1f}")

Almost 75% of the women on board survived, whereas only 19% of the men lived to tell about it. Since gender seems to be such a strong indicator of survival, it should be considered as one of the features for the model.

# Build the model and predict

Choose the features and the target, then split the training dataset into training and validation subsets for evaluation.


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

# Target column the models learn to predict.
y = train_data["Survived"]

# Predictor columns. get_dummies one-hot encodes the non-numeric ones
# (here "Sex") and passes numeric columns through unchanged.
features = ["Pclass", "Sex", "SibSp", "Parch"]
X = pd.get_dummies(train_data[features])

# The test set is encoded independently, so a category that is present in
# only one of the two files would yield a different set (or order) of
# columns and break predict(). Reindexing onto the training columns
# guarantees an identical layout: indicator columns missing from the test
# set are added as all-zero, and columns unseen in training are dropped.
X_test = pd.get_dummies(test_data[features]).reindex(columns=X.columns, fill_value=0)

# Hold out 20% of the labelled rows for validation; the fixed seed makes the
# split reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=1)

Testing the **random forest model**.  This model is constructed of several "trees" that will individually consider each passenger's data and vote on whether the individual survived.  Then, the random forest model makes a democratic decision: the outcome with the most votes wins!

The code cell below looks for patterns in four different columns (**"Pclass"**, **"Sex"**, **"SibSp"**, and **"Parch"**) of the data.  It constructs the trees in the random forest model based on patterns in the **train.csv** file.

In [None]:
# Base estimator; the fixed seed keeps tree construction reproducible.
model = RandomForestClassifier(random_state=1)

# Hyperparameter candidates explored by the grid search.
param_grid = {
    "n_estimators": [100, 150, 200],
    "max_depth": [3, 5, 7]
}

# Tune with 5-fold cross-validation on the TRAINING split only. Fitting the
# search on the full X, y would let the validation rows influence which
# hyperparameters are selected, making the validation scores below
# optimistically biased.
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Print the best parameters found
print("Best Parameters:", grid_search.best_params_)

# With GridSearchCV's default refit=True, best_estimator_ has already been
# retrained on everything passed to fit() (here X_train, y_train) using the
# best parameters, so no additional .fit() call is needed.
best_model_rf = grid_search.best_estimator_

# Evaluate on the held-out validation split, which the search never saw.
val_predictions = best_model_rf.predict(X_val)

# Calculate and print the accuracy
accuracy = accuracy_score(y_val, val_predictions)
print("Accuracy:", accuracy)

# Calculate and print the F1-score
f1 = f1_score(y_val, val_predictions)
print("F1-score:", f1)

Testing logistic regression.

In [None]:
# Logistic regression classifier with C=1.0 and a fixed seed.
logistic_model = LogisticRegression(C=1.0, random_state=1)

# 5-fold cross-validated accuracy of the (unfitted) model on X, y.
cv_scores = cross_val_score(logistic_model, X, y, scoring='accuracy', cv=5)
print("Cross-Validation Scores:", cv_scores)
print("Mean CV Accuracy:", np.mean(cv_scores))

# Fit on the training split, then predict the held-out validation split.
logistic_val_predictions = logistic_model.fit(X_train, y_train).predict(X_val)

# Score the validation predictions and report both metrics.
accuracy = accuracy_score(y_val, logistic_val_predictions)
f1 = f1_score(y_val, logistic_val_predictions)
print("Accuracy:", accuracy)
print("F1-score:", f1)

The Random Forest model performs better than Logistic Regression once suitable hyperparameters have been found.

## Predict on test dataset and submit result

In [None]:
# Predict survival for every row of the encoded test features.
predictions = best_model_rf.predict(X_test)

# Two-column submission frame: the passenger ids from the test file plus the
# predicted labels, written to the working directory without the index.
output = test_data[["PassengerId"]].assign(Survived=predictions)
output.to_csv(path_or_buf='submission.csv', index=False)
print("Submission saved!")