# Preprocessing Notebook
### Feature Engineering and Data Preparation


In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
import numpy as np


# Load data
# Read the raw car-purchasing CSV from the sibling data directory.
# ISO-8859-1 decoding is requested explicitly -- presumably the file contains
# bytes that are not valid UTF-8; TODO confirm.
df = pd.read_csv(
    '../data/car_purchasing.csv',
    encoding='ISO-8859-1',
)

In [3]:
# Feature engineering
# Row-wise ratio of credit-card debt to annual salary.
df['debt_to_income'] = df['credit card debt'].div(df['annual Salary'])

# Bucket net worth into three labelled tiers, cutting at the 30th and 70th
# percentiles.
# NOTE(review): the quantile edges are computed on the full frame, i.e. before
# the train/test split performed in a later cell -- confirm this is acceptable.
tier_quantiles = [0, 0.3, 0.7, 1]
tier_labels = ['Low', 'Medium', 'High']
df['net_worth_tier'] = pd.qcut(df['net worth'], q=tier_quantiles, labels=tier_labels)


# Encode categorical features
# Numeric columns are standardised; categorical columns are one-hot encoded.
# handle_unknown='ignore' lets transform() accept categories not seen in fit().
numeric_columns = ['age', 'annual Salary', 'credit card debt', 'net worth', 'debt_to_income']
categorical_columns = ['country', 'gender', 'net_worth_tier']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_columns),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns),
    ]
)

In [4]:
# Prepare data
# Separate the regression target from the predictors. Columns of X that are
# not listed in the ColumnTransformer are not passed through (no `remainder`
# is configured, and the library default is to drop them).
X = df.drop('car purchase amount', axis=1)
y = df['car purchase amount']


# Split data
# 80/20 hold-out split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)


# Apply transformations
# Fit on the training split only; the test split reuses the fitted
# scaler statistics and one-hot categories.
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)


# Get feature names
# Read the numeric column list back from the fitted transformer instead of
# repeating it by hand, so the names cannot drift from the ColumnTransformer
# configuration. Output columns follow the order of the transformers list
# ('num' first, then 'cat').
fitted_columns = {name: list(cols) for name, _, cols in preprocessor.transformers_}
num_features = fitted_columns['num']
cat_features = preprocessor.named_transformers_['cat'].get_feature_names_out()
all_features = np.concatenate([num_features, cat_features])

# Sanity check: one name per transformed column.
assert X_train_transformed.shape[1] == len(all_features), (
    f"expected {X_train_transformed.shape[1]} feature names, got {len(all_features)}"
)

In [6]:
# Save processed data (including target)
# ColumnTransformer returns a SciPy sparse matrix only when the stacked
# output's density is below its `sparse_threshold`; otherwise the result is
# already a dense ndarray, on which `.toarray()` would raise AttributeError.
# Handle both cases.
if hasattr(X_train_transformed, 'toarray'):
    train_dense = X_train_transformed.toarray()
else:
    train_dense = np.asarray(X_train_transformed)

full_processed = pd.DataFrame(train_dense, columns=all_features)
# `.values` attaches the target positionally, ignoring y_train's shuffled index.
full_processed['car purchase amount'] = y_train.values

# Write the processed training split (features + target) to a single CSV.
# NOTE(review): X_test_transformed / y_test are not written out in the cells
# shown here -- confirm whether downstream steps need the held-out split.
full_processed.to_csv('../data/processed_train.csv', index=False)