In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
# Read the CSV into a DataFrame; every later cell works on `data`.
# NOTE(review): later cells read a 'Survived' column from this frame —
# confirm that this file actually contains it.
csv_path = "test.csv"
data = pd.read_csv(csv_path)

In [None]:
# Peek at the leading rows to sanity-check the load
leading_rows = data.head()
print(leading_rows)


In [None]:
# Report the (rows, columns) dimensions of the loaded frame
dataset_shape = data.shape
print("Shape of the dataset:", dataset_shape)

In [None]:
# Summarize the columns, their dtypes and non-null counts.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appends a stray "None" line.
data.info()

In [None]:
# Tally the null entries in each column
missing_per_column = data.isna().sum()
print("Missing values:\n", missing_per_column)

In [None]:
# Discard 'Cabin' instead of imputing it: the column is treated here as
# having too many gaps to be worth filling.
data = data.drop(columns=['Cabin'])

In [None]:
# Fill missing values in 'Age' with the column mean.
# The filled Series is assigned back to the column rather than calling
# fillna(inplace=True) on the data['Age'] selection: that form is chained
# assignment, which emits a warning on recent pandas and leaves `data`
# unchanged once Copy-on-Write is in effect.
data['Age'] = data['Age'].fillna(data['Age'].mean())

In [None]:
# Fill missing values in 'Embarked' with the most frequent value.
# mode() returns a Series (there can be ties); [0] takes its first entry.
# The filled Series is assigned back to the column rather than calling
# fillna(inplace=True) on the data['Embarked'] selection: that form is chained
# assignment, which emits a warning on recent pandas and leaves `data`
# unchanged once Copy-on-Write is in effect.
data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode()[0])

In [None]:
# Fill missing values in 'Fare' with the most frequent value.
# mode() returns a Series (there can be ties); [0] takes its first entry.
# NOTE(review): 'Fare' is fed to the model as a numeric feature; a median may
# be a more representative fill than the mode — confirm the intent.
# The filled Series is assigned back to the column rather than calling
# fillna(inplace=True) on the data['Fare'] selection: that form is chained
# assignment, which emits a warning on recent pandas and leaves `data`
# unchanged once Copy-on-Write is in effect.
data['Fare'] = data['Fare'].fillna(data['Fare'].mode()[0])

In [None]:
# Confirm the imputation: sum the per-column null counts into a single total
remaining_missing = data.isna().sum().sum()
print("Total missing values after imputation:", remaining_missing)

In [None]:
# How many rows carry each distinct 'Survived' value
survival_counts = data['Survived'].value_counts()
print("Survived count:\n", survival_counts)

In [None]:
# Descriptive statistics as produced by DataFrame.describe()
summary_stats = data.describe()
print("Statistical summary:\n", summary_stats)

In [None]:
# Switch on seaborn's default styling (it stays active for the later plots),
# then draw one bar per distinct 'Survived' value.
sns.set()
sns.countplot(data=data, x='Survived')
plt.show()

In [None]:
# Bar chart of row counts for each 'Sex' value
sns.countplot(data=data, x='Sex')
plt.show()

In [None]:
# Row counts per 'Sex', with each bar split by 'Survived' value
sns.countplot(data=data, x='Sex', hue='Survived')
plt.show()

In [None]:
# Bar chart of row counts for each 'Pclass' value
sns.countplot(data=data, x='Pclass')
plt.show()

In [None]:
# Row counts per 'Pclass', with each bar split by 'Survived' value
sns.countplot(data=data, x='Pclass', hue='Survived')
plt.show()

In [None]:
# Encode the two text columns as integer codes so the model can consume them.
# The outer keys are column names; each inner dict maps an original value to
# its numeric code. Values not listed are left as they are.
category_codes = {
    'Sex': {'male': 0, 'female': 1},
    'Embarked': {'S': 0, 'C': 1, 'Q': 2},
}
data.replace(category_codes, inplace=True)

In [None]:
# Prepare features (X) and target variable (Y).
# 'Survived' is the label being predicted, so it must be excluded from X
# together with the identifier/text columns; leaving it in would let the
# model read the answer directly from its inputs and make the reported
# accuracies meaningless.
excluded_columns = ['PassengerId', 'Name', 'Ticket', 'Survived']
X = data.drop(columns=excluded_columns)
Y = data['Survived']

In [None]:
# Hold out 20% of the rows for evaluation; random_state pins the shuffle so
# the same split is produced on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=2,
)

In [None]:
# Initialize and train the logistic regression model.
# max_iter is raised from scikit-learn's default of 100: the features are fed
# in unscaled, and the default iteration budget can stop the solver before it
# converges (reported as a ConvergenceWarning).
model = LogisticRegression(max_iter=1000)
model.fit(X_train, Y_train)

In [None]:
# Score the fitted model on the same rows it was trained on
train_predictions = model.predict(X_train)
train_accuracy = accuracy_score(y_true=Y_train, y_pred=train_predictions)
print("Training Accuracy:", train_accuracy)

In [None]:
# Score the fitted model on the held-out rows
test_predictions = model.predict(X_test)
test_accuracy = accuracy_score(y_true=Y_test, y_pred=test_predictions)
print("Test Accuracy:", test_accuracy)
