<a href="https://colab.research.google.com/github/payallgupta/CODSOFT/blob/main/Titanic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install kagglehub, which the following cell imports to download the dataset.
!pip install kagglehub

In [None]:
# =======================================================
# PROJECT: TITANIC SURVIVAL PREDICTION (Colab Corrected)
# =======================================================
import kagglehub
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os


# --- Load the data -------------------------------------------------------
# Download the Kaggle dataset via kagglehub, then read its CSV into `df`,
# the DataFrame every later step of this script works on.
DATASET_HANDLE = "yasserh/titanic-dataset"
CSV_FILENAME = 'Titanic-Dataset.csv'

print("Accessing dataset...")
dataset_dir = kagglehub.dataset_download(DATASET_HANDLE)
print(f"Dataset is available in directory: {dataset_dir}")

# The CSV is looked up directly inside the directory kagglehub returned.
csv_file_path = os.path.join(dataset_dir, CSV_FILENAME)
df = pd.read_csv(csv_file_path)

print("\n--- Step 1: Let's explore the data ---")

# Textual overview: sample rows, column summary, and descriptive statistics.
print("First 5 rows of the dataset:")
print(df.head())

print("\nDataset information:")
df.info()  # called without print(): info() emits its report directly

print("\nStatistical summary:")
print(df.describe())

print("\nGenerating plots...")
# Each entry is (hue column or None, chart title). All three charts count
# rows per 'Survived' value and differ only in the optional grouping column.
survival_charts = [
    (None, 'Survival Count (0 = Died, 1 = Survived)'),
    ('Sex', 'Survival Count by Gender'),
    ('Pclass', 'Survival Count by Passenger Class'),
]
for hue_column, chart_title in survival_charts:
    plt.figure(figsize=(8, 6))
    sns.countplot(x='Survived', hue=hue_column, data=df)
    plt.title(chart_title)
    plt.show()

print("\n--- Step 2: Cleaning the data ---")

# Fill missing ages with the column median.
# The filled column is assigned back to `df` rather than calling
# `df['Age'].fillna(..., inplace=True)`: that form mutates an intermediate
# Series, raises a FutureWarning on pandas 2.x, and leaves `df` untouched
# once Copy-on-Write is the default (pandas 3.0), so the NaNs would survive.
median_age = df['Age'].median()
df['Age'] = df['Age'].fillna(median_age)
print(f"Filled missing 'Age' values with the median: {median_age}")

# 'Cabin' is removed entirely instead of being imputed.
df.drop('Cabin', axis=1, inplace=True)
print("Dropped the 'Cabin' column because it had too many missing values.")

# Fill missing embarkation values with the most frequent one; same
# assign-back pattern as for 'Age' above.
mode_embarked = df['Embarked'].mode()[0]
df['Embarked'] = df['Embarked'].fillna(mode_embarked)
print(f"Filled missing 'Embarked' values with the mode: '{mode_embarked}'")

# Per-column count of remaining missing values, as a sanity check.
print("\nMissing values after cleaning:")
print(df.isnull().sum())
print("\n--- Step 3: Feature Engineering ---")

# Columns removed before modelling, and the categorical columns that get
# expanded into indicator (dummy) columns.
unused_columns = ['PassengerId', 'Name', 'Ticket']
categorical_columns = ['Sex', 'Embarked', 'Pclass']

df.drop(columns=unused_columns, inplace=True)
print("Dropped 'PassengerId', 'Name', and 'Ticket' columns.")

# drop_first=True omits the first level of each encoded column.
df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

print("\nData after converting categorical features (first 5 rows):")
print(df.head())

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

print("\n--- Step 4: Building and Training the Model ---")

# Target is the 'Survived' column; every remaining column is a feature.
y = df['Survived']
X = df.drop(columns='Survived')

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"Training data has {X_train.shape[0]} rows.")
print(f"Testing data has {X_test.shape[0]} rows.")

# fit() returns the estimator itself, so construction and training chain.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

print("\nModel training is complete!")


from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

print("\n--- Step 5: Evaluating the Model's Performance ---")

# Score the fitted model on the held-out test rows.
y_predictions = model.predict(X_test)

accuracy = accuracy_score(y_test, y_predictions)
print(f"\nModel Accuracy: {accuracy * 100:.2f}%")

print("\nClassification Report:")
print(classification_report(y_test, y_predictions))

print("\nConfusion Matrix:")
cm = confusion_matrix(y_test, y_predictions)
print(cm)

# Draw the confusion matrix as an annotated heatmap; the same class labels
# are used on both axes (y axis = true label, x axis = predicted label).
class_labels = ['Did not Survive', 'Survived']
_, heatmap_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=heatmap_ax,
)
heatmap_ax.set_xlabel('Predicted Label')
heatmap_ax.set_ylabel('True Label')
heatmap_ax.set_title('Confusion Matrix')
plt.show()

print("\nProject complete.")