Read the CSV file into a DataFrame.

In [None]:
import pandas as pd

# Load the dataset from the working directory into the notebook-wide frame.
df = pd.read_csv(filepath_or_buffer='data.csv')

Check and print the dataset shape.

In [None]:
# Build the (rows, columns) tuple explicitly; this is the same pair df.shape reports.
shape = (len(df.index), len(df.columns))
print('Dataset Shape:', shape)

Get a statistical overview of the dataset.

In [None]:
# Compute the summary table with describe()'s defaults, then emit its text rendering.
description = df.describe()
print(str(description))

Overview of the columns in the dataset.

In [None]:
# Materialise the column index as a plain Python list so it prints compactly.
columns = list(df.columns)
print('Columns:', columns)

Inspect data types and non-null values in the dataset.

In [None]:
# Per-column dtype and non-null counts; buf=None keeps the default of writing to stdout.
df.info(buf=None)

Plot the age distribution of the dataset.

In [None]:
import matplotlib.pyplot as plt

# Draw on an explicit Axes instead of the implicit pyplot "current figure".
fig, ax = plt.subplots()
ax.hist(df['Age'].dropna(), bins=30)  # missing ages are excluded from the histogram
ax.set_title('Age Distribution')
ax.set_xlabel('Age')
ax.set_ylabel('Frequency')
plt.show()

Fill missing values in the 'Age' column with the median.

In [None]:
# Assign the filled column back instead of calling fillna(inplace=True) on the
# df['Age'] selection: the in-place form is chained assignment, which emits a
# FutureWarning in pandas 2.x and leaves df unchanged under Copy-on-Write.
age_median = df['Age'].median()
df['Age'] = df['Age'].fillna(age_median)

Drop the 'Cabin' column from the dataset.

In [None]:
# Rebind rather than mutate in place, and tolerate an already-removed column so
# that re-running this cell does not raise KeyError.
df = df.drop(columns=['Cabin'], errors='ignore')

Fill missing values in 'Embarked' column with 'S'.

In [None]:
# Assign the filled column back instead of fillna(inplace=True) on the
# df['Embarked'] selection: the in-place form is chained assignment, which
# warns in pandas 2.x and has no effect on df under Copy-on-Write.
df['Embarked'] = df['Embarked'].fillna('S')

Encode categorical features into numerical format.

In [None]:
# One-hot encode the listed columns; drop_first=True omits the first level of
# each so the resulting indicators are not redundant.
categorical_cols = ['Sex', 'Embarked']
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

Plot the correlation heatmap of the features.

In [None]:
import seaborn as sns

# Correlate numeric/bool columns only: since pandas 2.0, DataFrame.corr() no
# longer drops non-numeric columns silently and raises when it meets them.
corr_matrix = df.corr(numeric_only=True)

plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()

Plot the count of survival outcomes.

In [None]:
# Create the figure and Axes together and hand the Axes to seaborn explicitly.
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(x='Survived', data=df, ax=ax)
ax.set_title('Survival Counts')
plt.show()

Convert ages into categorical age groups.

In [None]:
# Bucket ages into labelled groups. pd.cut uses right-closed intervals, so
# without include_lowest=True the first bin is (0, 12] and an age recorded as
# exactly 0 would fall outside every bin and become NaN.
bins = [0, 12, 18, 30, 50, 100]
age_labels = ['Child', 'Teen', 'YoungAdult', 'Adult', 'Senior']
df['AgeGroup'] = pd.cut(df['Age'], bins=bins, labels=age_labels, include_lowest=True)

Convert fares into categorical fare groups.

In [None]:
# Bucket fares into labelled groups. Two edge cases previously produced NaN:
#   * a fare of exactly 0 (intervals are right-closed, so the first bin was (0, 10]);
#   * any fare above the old hard-coded upper edge of 500.
# include_lowest=True closes the first bin on the left, and an open-ended top
# bin captures every fare above 100.
fare_bins = [0, 10, 30, 100, float('inf')]
fare_labels = ['Low', 'Medium', 'High', 'Very High']
df['FareGroup'] = pd.cut(df['Fare'], bins=fare_bins, labels=fare_labels, include_lowest=True)

Combine the `SibSp` and `Parch` counts, plus one for the passenger, to create a family size feature.

In [None]:
# Family size = SibSp + Parch + 1, written with the method form of the operators.
df['FamilySize'] = df['SibSp'].add(df['Parch']).add(1)

Create family groups based on family size.

In [None]:
# Right-closed bins: (0, 1] -> Single, (1, 3] -> Small, (3, 5] -> Medium, (5, 100] -> Large.
family_bins = [0, 1, 3, 5, 100]
family_labels = ['Single', 'Small', 'Medium', 'Large']
df['FamilyGroup'] = pd.cut(df['FamilySize'], bins=family_bins, labels=family_labels)

Select features and target variable for modeling.

In [None]:
# Build a purely numeric feature matrix.
# The AgeGroup / FareGroup / FamilyGroup columns hold string labels, which the
# estimators fitted below cannot consume, so they are one-hot encoded here.
# Any other non-numeric column still present in df is then excluded by the
# dtype filter (NOTE(review): confirm no free-text column should be engineered
# into a feature instead of being dropped).
group_cols = ['AgeGroup', 'FareGroup', 'FamilyGroup']
features = df.drop(columns=['Survived'])
present_group_cols = [col for col in group_cols if col in features.columns]
features = pd.get_dummies(features, columns=present_group_cols)

X = features.select_dtypes(include=['number', 'bool'])
Y = df['Survived']

Train a Random Forest model.

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Fix the seed so the bootstrap samples and feature sub-sampling, and therefore
# the fitted forest and its predictions, are reproducible across runs.
model_rf = RandomForestClassifier(random_state=42)
model_rf.fit(X, Y)

Train an XGBoost model.

In [None]:
from xgboost import XGBClassifier
# Gradient-boosted trees with library defaults, fitted on the same X / Y.
model_xgb = XGBClassifier()
model_xgb.fit(X=X, y=Y)

Train a Logistic Regression model.

In [None]:
from sklearn.linear_model import LogisticRegression
# The features are not scaled, so the default lbfgs solver is likely to stop at
# its default cap of 100 iterations with a ConvergenceWarning; give it more room.
# NOTE(review): standardising the features first would be the more robust fix.
model_lr = LogisticRegression(max_iter=1000)
model_lr.fit(X, Y)

Make predictions on the training features using the trained models.

In [None]:
# Score the same feature matrix X with each fitted model, in RF / XGB / LR order.
predictions_rf, predictions_xgb, predictions_lr = (
    fitted.predict(X) for fitted in (model_rf, model_xgb, model_lr)
)

Save the Random Forest predictions to a CSV file.

In [None]:
# Single-column frame of the Random Forest predictions, written without the index.
output_df = pd.DataFrame(predictions_rf, columns=['Predictions'])
output_df.to_csv('predictions.csv', index=False)