<a href="https://colab.research.google.com/github/nitinrajg/ML-Projects/blob/main/Titanic_Survival_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
from google.colab import files

# --- Kaggle API credential setup ---
# Asks for the Kaggle API token through Colab's upload widget and installs it
# at ~/.kaggle/kaggle.json with owner-only permissions.
print("Please upload your 'kaggle.json' file (from Kaggle Account -> Create New API Token):")
uploaded = files.upload()

# Exactly one file is expected. Previously every uploaded file was moved onto
# the same destination (last one silently won) and an empty upload still
# reported success; fail loudly in both cases instead.
if len(uploaded) != 1:
    raise ValueError(
        f"Expected exactly one uploaded file (your kaggle.json), got {len(uploaded)}."
    )
uploaded_name, token_bytes = next(iter(uploaded.items()))

kaggle_dir = os.path.expanduser("~/.kaggle")
os.makedirs(kaggle_dir, exist_ok=True)
token_path = os.path.join(kaggle_dir, "kaggle.json")

# Write the token from Python rather than interpolating the browser-supplied
# file name into a shell command (`mv "{fn}" ...`), which breaks on -- or can
# be abused through -- names containing quotes or shell metacharacters.
with open(token_path, "wb") as token_file:
    token_file.write(token_bytes)
os.chmod(token_path, 0o600)  # owner read/write only, same as the former `chmod 600`

# The upload widget also saves a copy in the working directory; remove it so
# the secret does not linger there (this mirrors the old `mv` behaviour).
if os.path.exists(uploaded_name):
    os.remove(uploaded_name)

print("\nKaggle API key uploaded and configured successfully.")
print("You can now download datasets directly from Kaggle.")
print("-" * 50)

Please upload your 'kaggle.json' file (from Kaggle Account -> Create New API Token):


Saving kaggle.json to kaggle.json

Kaggle API key uploaded and configured successfully.
You can now download datasets directly from Kaggle.
--------------------------------------------------


In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
import kaggle
import os
import zipfile

In [3]:
# --- Download & Extract Dataset ---
# Downloads the competition archive into `extract_path`, unpacks it there and
# deletes the archive afterwards.
dataset_name = "titanic"
extract_path = "."
zip_path = os.path.join(extract_path, f"{dataset_name}.zip")

print(f"Downloading '{dataset_name}' dataset from Kaggle...")
try:
    kaggle.api.competition_download_files(dataset_name, path=extract_path, quiet=True)
    print(f"'{dataset_name}.zip' downloaded.")
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_path)
    print("Files extracted.")
    os.remove(zip_path)  # Clean up zip
    print("Zip file removed.")
except Exception as e:
    # Raise instead of calling exit(): in a notebook, exit() ends the session
    # rather than just stopping this cell, and it hides the traceback. Raising
    # still halts "Run All" here and keeps the original error chained.
    raise RuntimeError(
        f"Error downloading/extracting: {e}. Check Kaggle API key setup."
    ) from e

Downloading 'titanic' dataset from Kaggle...
'titanic.zip' downloaded.
Files extracted.
Zip file removed.


In [10]:
# --- 1. Load Data & Initial Exploration ---
# Reads the training CSV and prints a quick overview: head, dtypes,
# per-column null counts and the two categorical distributions.
df = pd.read_csv('train.csv')

print("\n--- Initial Dataset Overview ---")
print("First 5 rows of the dataset:")
display(df.head())  # rich table rendering in Colab/Jupyter

print("\nDataset Info (data types, non-null counts):")
df.info()

print("\nMissing values before preprocessing:")
missing_counts = df.isnull().sum()  # nulls per column, computed once
print(missing_counts[missing_counts > 0])  # only columns that have gaps

# Category frequencies for the two columns inspected here.
for heading, column in (
    ("\nDistribution of 'Sex' column:", 'Sex'),
    ("\nDistribution of 'Embarked' column:", 'Embarked'),
):
    print(heading)
    print(df[column].value_counts())


--- Initial Dataset Overview ---
First 5 rows of the dataset:


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S



Dataset Info (data types, non-null counts):
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB

Missing values before preprocessing:
Age         177
Cabin       687
Embarked      2
dtype: int64

Distribution of 'Sex' column:
Sex
male      577
female    314
Name: count, dtype: int64

Distributi

In [12]:
# --- 2. Define Target and Features ---
# `target` is the label column; `features` are the predictor columns
# selected from `df` for modelling.
target = 'Survived'
features = [
    'Pclass',
    'Sex',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
    'Embarked',
]
X, y = df[features], df[target]

In [13]:
# --- 3. Preprocessing Setup ---
# Column groups and the per-group transformation chains.
numerical_features = ['Age', 'SibSp', 'Parch', 'Fare']
categorical_features = ['Pclass', 'Sex', 'Embarked']

# Numeric columns: fill gaps with the column mean, then standardise.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own chain.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])

In [14]:
# --- 4. Build Model Pipeline ---
# Preprocessing followed by a logistic-regression classifier, bundled so that
# fitting and predicting apply the same transformations.
classifier = LogisticRegression(solver='liblinear', random_state=42)
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', classifier),
])


In [15]:
# --- 5. Split Data ---
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [16]:
# --- 6. Train Model ---
# Fits the preprocessing steps and the classifier on the training split.
print("\nTraining model...")
model_pipeline.fit(X=X_train, y=y_train)
print("Model training complete.")


Training model...
Model training complete.


In [17]:
# --- 7. Make Predictions ---
# Predicted labels for the held-out rows, produced by the fitted pipeline.
y_pred = model_pipeline.predict(X=X_test)

In [18]:
# --- 8. Evaluate Model ---
# Compares the held-out labels with the predictions: overall accuracy plus
# the per-class precision/recall/F1 table.
print("\n--- Model Evaluation ---")
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

print("\nClassification Report:")
report = classification_report(y_test, y_pred)
print(report)


--- Model Evaluation ---
Accuracy: 0.7989

Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.86      0.83       105
           1       0.78      0.72      0.75        74

    accuracy                           0.80       179
   macro avg       0.80      0.79      0.79       179
weighted avg       0.80      0.80      0.80       179

