In [None]:
import pandas as pd

In [None]:
# Load the raw train/test splits from the shared data directory.
train_path = "../data/train.csv"
test_path = "../data/test.csv"

try:
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
except FileNotFoundError:
    print("❌ Error: The dataset file was not found. Make sure train.csv and test.csv are in the 'data/' folder.")
    # Re-raise so execution stops here. Swallowing the error would leave
    # train_df/test_df undefined (NameError in the next cell) or, on a live
    # kernel, silently keep stale frames from a previous run.
    raise
else:
    # Only report success once both files have actually been read.
    print("✅ Data loaded successfully!")
    print("Train shape:", train_df.shape)
    print("Test shape:", test_df.shape)

In [None]:
# Impute missing values in both splits.
categorical_cols = ["workclass", "marital.status", "occupation", "relationship", "sex"]
numerical_cols = ["age", "education.num", "capital.gain", "capital.loss", "hours.per.week"]

# Text columns: fill gaps with a placeholder, then lower-case and trim so
# that labels compare consistently between the two splits.
for frame in (train_df, test_df):
    for name in categorical_cols:
        filled = frame[name].fillna("Unknown")
        frame[name] = filled.str.lower().str.strip()

# Numeric columns: the median is taken from the training split (before it is
# filled) and applied to both splits.
for name in numerical_cols:
    train_median = train_df[name].median()
    for frame in (train_df, test_df):
        frame[name] = frame[name].fillna(train_median)

In [None]:
# Write the cleaned splits next to the raw data, without the index column.
cleaned_outputs = (
    (train_df, "../data/train_cleaned.csv"),
    (test_df, "../data/test_cleaned.csv"),
)
for frame, destination in cleaned_outputs:
    frame.to_csv(destination, index=False)

print("✅ Data cleaning complete. Cleaned files saved!")

In [None]:
import pandas as pd

# Read the cleaned splits back from disk (train first, then test).
train_df, test_df = (
    pd.read_csv(path)
    for path in ("../data/train_cleaned.csv", "../data/test_cleaned.csv")
)

# Debug: show the raw 'income' label distribution (NaN included) before encoding.
print("Before mapping:")
income_counts = train_df["income"].value_counts(dropna=False)
print(income_counts)

In [None]:
# 1️⃣ Convert binary categorical variables
# Name and contents are kept as-is in case other cells reference it; only the
# "male"/"female" keys can match values of the 'sex' column.
binary_mappings = {"male": 1, "female": 0, "<=50K": 0, ">50K": 1}

# Normalise and encode the target first, and validate it BEFORE touching the
# frames, so a failure leaves train_df/test_df unmodified and the cell can be
# re-run after the data is fixed.
income_labels = train_df["income"].str.strip().str.upper()
income_encoded = income_labels.map({">50K": 1, "<=50K": 0})

# Series.map yields NaN for any label that is not a key of the dict. A NaN
# target would otherwise be written silently into the processed file.
unmapped = income_encoded.isna()
if unmapped.any():
    unexpected = train_df.loc[unmapped, "income"].unique().tolist()
    raise ValueError(
        f"Unexpected 'income' labels in training data: {unexpected!r}; "
        "expected only '<=50K' or '>50K'."
    )

# NOTE: 'sex' values other than "male"/"female" (e.g. the "unknown"
# placeholder produced by the cleaning step) become NaN here.
train_df["sex"] = train_df["sex"].map(binary_mappings)
test_df["sex"] = test_df["sex"].map(binary_mappings)

# Ensure 'income' is mapped correctly
train_df["income"] = income_encoded

# Debug: Check 'income' after mapping
print("After mapping:")
print(train_df["income"].value_counts(dropna=False))

In [None]:
# 2️⃣ One-Hot Encoding for multi-class categorical columns
categorical_cols = ["workclass", "marital.status",
                    "occupation", "relationship"]

# The training split defines the feature space: drop_first removes the first
# level of each column, which then acts as the implicit baseline.
train_df = pd.get_dummies(train_df, columns=categorical_cols, drop_first=True)

# Encode the test split WITHOUT drop_first. Dropping independently would remove
# whichever level happens to sort first in the test data; if that differs from
# the training baseline, a real category's indicator would be lost and later
# back-filled with zeros, mislabelling those rows as the baseline.
test_df = pd.get_dummies(test_df, columns=categorical_cols)

# Align test to the training feature columns (everything except the target):
# - indicator columns absent from test are created and filled with 0,
# - columns unknown to training (including the training baseline levels)
#   are discarded,
# - column order matches the training frame.
feature_cols = train_df.columns.drop("income")
test_df = test_df.reindex(columns=feature_cols, fill_value=0)

# Debug: Ensure 'income' is still present
print("Final columns before saving:", train_df.columns)

In [None]:
# 3️⃣ Save processed datasets
processed_train_path = "../data/train_processed.csv"
train_df.to_csv(processed_train_path, index=False)
test_df.to_csv("../data/test_processed.csv", index=False)

print("✅ Feature Engineering complete! Processed files saved.")

# Verify saved data: re-read the training file that was just written
# (this rebinds train_df to the on-disk version) and inspect the target.
train_df = pd.read_csv(processed_train_path)
income_col = train_df["income"]
print("Unique values in 'income' after encoding:", income_col.unique())
print("Missing values in 'income':", income_col.isnull().sum())