In [7]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score 
from sklearn.pipeline import Pipeline 
from sklearn.preprocessing import StandardScaler 
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# ------------------------- 
# Load data 
# -------------------------

# Read the training CSV into the notebook-level frame used by the report helpers below.
df = pd.read_csv(filepath_or_buffer="Titanic_train.csv")

def inspect_df(frame=None):
    """Print shape, column names, dtypes and deep memory usage of a DataFrame.

    Parameters
    ----------
    frame : pd.DataFrame, optional
        Frame to describe. When omitted, the notebook-level ``df`` is used,
        so existing ``inspect_df()`` calls keep working unchanged.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    if frame is None:
        # Backward-compatible default: fall back to the global frame.
        frame = df
    print(f"shape:{frame.shape}\n")
    print(f"columns: {frame.columns.tolist()}\n")
    print(frame.dtypes)
    print("\nMemory Usage (bytes):")
    # deep=True measures the actual payload of object/string columns.
    print(frame.memory_usage(deep=True))


def df_quality_report(frame=None):
    """Print a basic data-quality report for a DataFrame.

    The report lists missing values per column, the number of fully
    duplicated rows, and the names of text columns in which some, but not
    all, non-missing values parse as numbers.

    Parameters
    ----------
    frame : pd.DataFrame, optional
        Frame to analyse. When omitted, the notebook-level ``df`` is used,
        so existing ``df_quality_report()`` calls keep working unchanged.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    if frame is None:
        # Backward-compatible default: fall back to the global frame.
        frame = df

    print("\n=== Missing values per column ===")
    print(frame.isna().sum())
    print("\n=== Duplicate row count ===")
    print(frame.duplicated().sum())
    print("\n=== Columns that look numeric but need checking ===")
    for col in frame.columns:
        series = frame[col]
        # Text may be stored as object dtype or as a dedicated string dtype;
        # comparing against "object" alone would skip the latter.
        is_text = (
            pd.api.types.is_object_dtype(series)
            or pd.api.types.is_string_dtype(series)
        )
        if not is_text:
            continue
        converted = pd.to_numeric(series, errors="coerce")
        parsed_count = converted.notna().sum()
        # Values that were present but could not be parsed as numbers.
        unparseable_count = converted.isna().sum() - series.isna().sum()
        if parsed_count > 0 and unparseable_count > 0:
            print(col)


# Overall structure and quality summary of the loaded frame.
inspect_df()
df_quality_report()

# Per-column drill-down: dtype, missing count, cardinality and a few example values.
for column_name, column in df.items():
    print("\n" + "="*60)
    print("COLUMN:", column_name)
    print("dtype:", column.dtype)
    print("missing:", column.isna().sum())
    print("unique:", column.nunique(dropna=False))
    example_values = column.dropna().astype(str).head(5)
    print("sample values:", example_values.tolist())

    



shape:(891, 12)

columns: ['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

Memory Usage (bytes):
Index            132
PassengerId     7128
Survived        7128
Pclass          7128
Name           67685
Sex            47851
Age             7128
SibSp           7128
Parch           7128
Ticket         49674
Fare            7128
Cabin          32712
Embarked       44514
dtype: int64

=== Missing values per column ===
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked     

In [8]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score 
from sklearn.pipeline import Pipeline 
from sklearn.preprocessing import StandardScaler 
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# ------------------------- 
# Load data 
# -------------------------

# Read the test CSV into the notebook-level frame used by the report helpers below.
df = pd.read_csv(filepath_or_buffer="Titanic_test.csv")

def inspect_df(frame=None):
    """Print shape, column names, dtypes and deep memory usage of a DataFrame.

    Parameters
    ----------
    frame : pd.DataFrame, optional
        Frame to describe. When omitted, the notebook-level ``df`` is used,
        so existing ``inspect_df()`` calls keep working unchanged.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    if frame is None:
        # Backward-compatible default: fall back to the global frame.
        frame = df
    print(f"shape:{frame.shape}\n")
    print(f"columns: {frame.columns.tolist()}\n")
    print(frame.dtypes)
    print("\nMemory Usage (bytes):")
    # deep=True measures the actual payload of object/string columns.
    print(frame.memory_usage(deep=True))


def df_quality_report(frame=None):
    """Print a basic data-quality report for a DataFrame.

    The report lists missing values per column, the number of fully
    duplicated rows, and the names of text columns in which some, but not
    all, non-missing values parse as numbers.

    Parameters
    ----------
    frame : pd.DataFrame, optional
        Frame to analyse. When omitted, the notebook-level ``df`` is used,
        so existing ``df_quality_report()`` calls keep working unchanged.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    if frame is None:
        # Backward-compatible default: fall back to the global frame.
        frame = df

    print("\n=== Missing values per column ===")
    print(frame.isna().sum())
    print("\n=== Duplicate row count ===")
    print(frame.duplicated().sum())
    print("\n=== Columns that look numeric but need checking ===")
    for col in frame.columns:
        series = frame[col]
        # Text may be stored as object dtype or as a dedicated string dtype;
        # comparing against "object" alone would skip the latter.
        is_text = (
            pd.api.types.is_object_dtype(series)
            or pd.api.types.is_string_dtype(series)
        )
        if not is_text:
            continue
        converted = pd.to_numeric(series, errors="coerce")
        parsed_count = converted.notna().sum()
        # Values that were present but could not be parsed as numbers.
        unparseable_count = converted.isna().sum() - series.isna().sum()
        if parsed_count > 0 and unparseable_count > 0:
            print(col)

# Overall structure and quality summary of the loaded frame.
inspect_df()
df_quality_report()

# Per-column drill-down: dtype, missing count, cardinality and a few example values.
for column_name, column in df.items():
    print("\n" + "="*60)
    print("COLUMN:", column_name)
    print("dtype:", column.dtype)
    print("missing:", column.isna().sum())
    print("unique:", column.nunique(dropna=False))
    example_values = column.dropna().astype(str).head(5)
    print("sample values:", example_values.tolist())



shape:(418, 11)

columns: ['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']

PassengerId      int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

Memory Usage (bytes):
Index            132
PassengerId     3344
Pclass          3344
Name           31970
Sex            22458
Age             3344
SibSp           3344
Parch           3344
Ticket         23356
Fare            3344
Cabin          15294
Embarked       20900
dtype: int64

=== Missing values per column ===
PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

=== Duplicate row count ===
0

=== Columns that look nu

In [19]:
# train and test data cleaned

import pandas as pd
import numpy as np

train_path, test_path = "Titanic_train.csv", "Titanic_test.csv"

# Load both CSV files in one pass.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

# The label lives only in the training file; split it off from the features.
y = train_df.loc[:, "Survived"]
X_train = train_df.drop("Survived", axis=1)
X_test = test_df.copy()

def preprocess_titanic(X_train: pd.DataFrame, X_test: pd.DataFrame):
    """Impute missing values in the train/test feature frames.

    Every statistic used for imputation is computed from the training frame
    only and then applied to both frames. The inputs are not modified.

    Steps:
      1. ``Embarked``: fill with the training mode.
      2. ``Cabin``: replace with ``Deck`` (first character of the cabin
         code, ``"Unknown"`` when the cabin is missing).
      3. ``Age``: fill with the training median of the row's
         ``(Pclass, Sex)`` group, falling back to the overall training
         median when the group has no usable median.
      4. ``Fare``: fill with the training median (when the column exists).

    Parameters
    ----------
    X_train, X_test : pd.DataFrame
        Feature frames containing at least ``Embarked``, ``Cabin``,
        ``Pclass``, ``Sex`` and ``Age``.

    Returns
    -------
    tuple[pd.DataFrame, pd.DataFrame]
        Cleaned copies of ``X_train`` and ``X_test``.
    """
    Xtr = X_train.copy()
    Xte = X_test.copy()

    # -------------------------
    # 1) Embarked: fill with mode (learned from train)
    # -------------------------
    embarked_modes = Xtr["Embarked"].mode(dropna=True)
    # mode() is empty when every training value is missing; leave the column
    # untouched in that case instead of raising on [0].
    if not embarked_modes.empty:
        embarked_mode = embarked_modes.iloc[0]
        Xtr["Embarked"] = Xtr["Embarked"].fillna(embarked_mode)
        Xte["Embarked"] = Xte["Embarked"].fillna(embarked_mode)

    # -------------------------
    # 2) Cabin: extract Deck, fill missing as "Unknown", drop Cabin
    # -------------------------
    def deck_from_cabin(cabin):
        # Use the real missing-value mask rather than the "nan" -> "n"
        # string sentinel, so a cabin code starting with "n" is preserved.
        first_char = cabin.astype(str).str[0]
        return first_char.where(cabin.notna()).fillna("Unknown")

    Xtr["Deck"] = deck_from_cabin(Xtr["Cabin"])
    Xte["Deck"] = deck_from_cabin(Xte["Cabin"])

    Xtr = Xtr.drop(columns=["Cabin"])
    Xte = Xte.drop(columns=["Cabin"])

    # -------------------------
    # 3) Age: group-based median by (Pclass, Sex) learned from train
    # -------------------------
    age_median_by_group = Xtr.groupby(["Pclass", "Sex"])["Age"].median()
    # Computed once, BEFORE any imputation. Reading Xtr["Age"] inside the
    # helper would see the already-imputed train frame on the second call,
    # because Xtr is rebound between the two calls.
    overall_age_median = Xtr["Age"].median()

    def fill_age(df):
        df = df.copy()
        # Look up each row's group median; unseen groups yield NaN.
        group_keys = pd.MultiIndex.from_frame(df[["Pclass", "Sex"]])
        group_median = pd.Series(
            age_median_by_group.reindex(group_keys).to_numpy(), index=df.index
        )
        df["Age"] = df["Age"].fillna(group_median)
        # fallback (in case any group median is missing)
        df["Age"] = df["Age"].fillna(overall_age_median)
        return df

    Xtr = fill_age(Xtr)
    Xte = fill_age(Xte)

    # -------------------------
    # 4) Fare: fill with median (learned from train)
    # -------------------------
    if "Fare" in Xtr.columns:
        fare_median = Xtr["Fare"].median()
        Xtr["Fare"] = Xtr["Fare"].fillna(fare_median)
        if "Fare" in Xte.columns:
            Xte["Fare"] = Xte["Fare"].fillna(fare_median)

    return Xtr, Xte



X_train_clean, X_test_clean = preprocess_titanic(X_train=X_train, X_test=X_test)

# The test file has a missing Fare; impute it with the training median.
fare_median = X_train_clean.Fare.median()
X_test_clean = X_test_clean.fillna({"Fare": fare_median})

# Quick sanity checks: the ten columns with the most remaining NaNs per frame.
train_missing = X_train_clean.isna().sum().sort_values(ascending=False).head(10)
test_missing = X_test_clean.isna().sum().sort_values(ascending=False).head(10)

print("Missing values (train cleaned):", train_missing, sep="\n")
print("\nMissing values (test cleaned):", test_missing, sep="\n")
print("\nColumns now:", X_train_clean.columns.tolist(), sep="\n")


Missing values (train cleaned):
PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

Missing values (test cleaned):
PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

Columns now:
['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Embarked', 'Deck']


In [18]:
# extract title

import pandas as pd
import numpy as np

train_path, test_path = "Titanic_train.csv", "Titanic_test.csv"

# Load both CSV files in one pass.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

# The label lives only in the training file; split it off from the features.
y = train_df.loc[:, "Survived"]
X_train = train_df.drop("Survived", axis=1)
X_test = test_df.copy()

def preprocess_titanic(X_train: pd.DataFrame, X_test: pd.DataFrame):
    """Impute missing values in the train/test feature frames.

    Every statistic used for imputation is computed from the training frame
    only and then applied to both frames. The inputs are not modified.

    Steps:
      1. ``Embarked``: fill with the training mode.
      2. ``Cabin``: replace with ``Deck`` (first character of the cabin
         code, ``"Unknown"`` when the cabin is missing).
      3. ``Age``: fill with the training median of the row's
         ``(Pclass, Sex)`` group, falling back to the overall training
         median when the group has no usable median.
      4. ``Fare``: fill with the training median (when the column exists).

    Parameters
    ----------
    X_train, X_test : pd.DataFrame
        Feature frames containing at least ``Embarked``, ``Cabin``,
        ``Pclass``, ``Sex`` and ``Age``.

    Returns
    -------
    tuple[pd.DataFrame, pd.DataFrame]
        Cleaned copies of ``X_train`` and ``X_test``.
    """
    Xtr = X_train.copy()
    Xte = X_test.copy()

    # -------------------------
    # 1) Embarked: fill with mode (learned from train)
    # -------------------------
    embarked_modes = Xtr["Embarked"].mode(dropna=True)
    # mode() is empty when every training value is missing; leave the column
    # untouched in that case instead of raising on [0].
    if not embarked_modes.empty:
        embarked_mode = embarked_modes.iloc[0]
        Xtr["Embarked"] = Xtr["Embarked"].fillna(embarked_mode)
        Xte["Embarked"] = Xte["Embarked"].fillna(embarked_mode)

    # -------------------------
    # 2) Cabin: extract Deck, fill missing as "Unknown", drop Cabin
    # -------------------------
    def deck_from_cabin(cabin):
        # Use the real missing-value mask rather than the "nan" -> "n"
        # string sentinel, so a cabin code starting with "n" is preserved.
        first_char = cabin.astype(str).str[0]
        return first_char.where(cabin.notna()).fillna("Unknown")

    Xtr["Deck"] = deck_from_cabin(Xtr["Cabin"])
    Xte["Deck"] = deck_from_cabin(Xte["Cabin"])

    Xtr = Xtr.drop(columns=["Cabin"])
    Xte = Xte.drop(columns=["Cabin"])

    # -------------------------
    # 3) Age: group-based median by (Pclass, Sex) learned from train
    # -------------------------
    age_median_by_group = Xtr.groupby(["Pclass", "Sex"])["Age"].median()
    # Computed once, BEFORE any imputation. Reading Xtr["Age"] inside the
    # helper would see the already-imputed train frame on the second call,
    # because Xtr is rebound between the two calls.
    overall_age_median = Xtr["Age"].median()

    def fill_age(df):
        df = df.copy()
        # Look up each row's group median; unseen groups yield NaN.
        group_keys = pd.MultiIndex.from_frame(df[["Pclass", "Sex"]])
        group_median = pd.Series(
            age_median_by_group.reindex(group_keys).to_numpy(), index=df.index
        )
        df["Age"] = df["Age"].fillna(group_median)
        # fallback (in case any group median is missing)
        df["Age"] = df["Age"].fillna(overall_age_median)
        return df

    Xtr = fill_age(Xtr)
    Xte = fill_age(Xte)

    # -------------------------
    # 4) Fare: fill with median (learned from train)
    # -------------------------
    if "Fare" in Xtr.columns:
        fare_median = Xtr["Fare"].median()
        Xtr["Fare"] = Xtr["Fare"].fillna(fare_median)
        if "Fare" in Xte.columns:
            Xte["Fare"] = Xte["Fare"].fillna(fare_median)

    return Xtr, Xte



X_train_clean, X_test_clean = preprocess_titanic(X_train=X_train, X_test=X_test)

# The test file has a missing Fare; impute it with the training median.
fare_median = X_train_clean.Fare.median()
X_test_clean = X_test_clean.fillna({"Fare": fare_median})

def extract_title(df):
    """Return a copy of ``df`` with a ``Title`` column parsed from ``Name``.

    The title is the run of ASCII letters that follows a space and is
    immediately followed by a period (e.g. ``"Mr"`` in ``"Braund, Mr. Owen"``).
    Names with no such pattern get NaN. The input frame is left untouched.
    """
    title_pattern = r" ([A-Za-z]+)\."
    titles = df["Name"].str.extract(title_pattern, expand=False)
    return df.assign(Title=titles)

# Add the Title column to both cleaned frames.
X_train_clean, X_test_clean = (
    extract_title(frame) for frame in (X_train_clean, X_test_clean)
)

# Inspect how often each title occurs in the training data.
print(X_train_clean.loc[:, "Title"].value_counts())



# Quick sanity checks
#print("Missing values (train cleaned):")
#print(X_train_clean.isna().sum().sort_values(ascending=False).head(10))

#print("\nMissing values (test cleaned):")
#print(X_test_clean.isna().sum().sort_values(ascending=False).head(10))

#print("\nColumns now:")
#print(X_train_clean.columns.tolist())


Title
Mr          517
Miss        182
Mrs         125
Master       40
Dr            7
Rev           6
Col           2
Mlle          2
Major         2
Ms            1
Mme           1
Don           1
Lady          1
Sir           1
Capt          1
Countess      1
Jonkheer      1
Name: count, dtype: int64
