# Data Preprocessing

Cleaning and preparing Chinook data for modeling

## 1. Setup & Load Data

In [1]:
%pip install scikit-learn

import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Database connection
engine = create_engine('postgresql://postgres:pass1234@localhost:5432/chinook')

# Load customer data with purchases
query = """
SELECT 
    c.customer_id,
    c.first_name || ' ' || c.last_name as customer_name,
    c.country,
    COUNT(i.invoice_id) as total_orders,
    SUM(i.total) as total_spent,
    AVG(i.total) as avg_order_value
FROM customer c
LEFT JOIN invoice i ON c.customer_id = i.customer_id
GROUP BY c.customer_id, c.first_name, c.last_name, c.country
"""

df = pd.read_sql(query, engine)
print(f"Data loaded: {len(df)} customers")
df.head()

Note: you may need to restart the kernel to use updated packages.
Data loaded: 59 customers


Unnamed: 0,customer_id,customer_name,country,total_orders,total_spent,avg_order_value
0,29,Robert Brown,Canada,7,37.62,5.374286
1,54,Steve Murray,United Kingdom,7,37.62,5.374286
2,4,Bjørn Hansen,Norway,7,39.62,5.66
3,34,João Fernandes,Portugal,7,39.62,5.66
4,51,Joakim Johansson,Sweden,7,38.62,5.517143


## 2. Basic Data Cleaning

In [2]:
# Check for issues
print("Missing values:")
print(df.isnull().sum())
print(f"\nDuplicates: {df.duplicated().sum()}")

# Simple fixes
df['total_orders'] = df['total_orders'].fillna(0)
df['total_spent'] = df['total_spent'].fillna(0)
df['avg_order_value'] = df['avg_order_value'].fillna(0)

# Drop duplicates if any
df = df.drop_duplicates()

print(f"\nCleaned data: {len(df)} customers")

Missing values:
customer_id        0
customer_name      0
country            0
total_orders       0
total_spent        0
avg_order_value    0
dtype: int64

Duplicates: 0

Cleaned data: 59 customers


## 3. Create Target Variable

In [3]:
# Define high spenders as top 25% by total spent
spending_threshold = df['total_spent'].quantile(0.75)
df['is_high_spender'] = (df['total_spent'] > spending_threshold).astype(int)

print(f"Spending threshold: ${spending_threshold:.2f}")
print(f"High spenders: {df['is_high_spender'].sum()} ({df['is_high_spender'].mean()*100:.1f}%)")

Spending threshold: $39.62
High spenders: 14 (23.7%)


## 4. Feature Engineering

In [4]:
# Create simple features
df['orders_per_dollar'] = df['total_orders'] / (df['total_spent'] + 1)  # +1 to avoid division by zero
df['high_avg_order'] = (df['avg_order_value'] > df['avg_order_value'].median()).astype(int)

# One-hot encode country (keep only top 5 countries)
top_countries = df['country'].value_counts().head(5).index
df['country_simplified'] = df['country'].where(df['country'].isin(top_countries), 'Other')
country_dummies = pd.get_dummies(df['country_simplified'], prefix='country')
df = pd.concat([df, country_dummies], axis=1)

print("Features created:")
print(f"- orders_per_dollar")
print(f"- high_avg_order")
print(f"- country dummies: {list(country_dummies.columns)}")

Features created:
- orders_per_dollar
- high_avg_order
- country dummies: ['country_Brazil', 'country_Canada', 'country_France', 'country_Germany', 'country_Other', 'country_USA']


## 5. Prepare Final Dataset

In [5]:
# Select features for modeling
feature_columns = ['total_orders', 'total_spent', 'avg_order_value', 'orders_per_dollar', 'high_avg_order'] + list(country_dummies.columns)

X = df[feature_columns]
y = df['is_high_spender']

print(f"Final dataset shape: {X.shape}")
print(f"Features: {list(X.columns)}")

Final dataset shape: (59, 11)
Features: ['total_orders', 'total_spent', 'avg_order_value', 'orders_per_dollar', 'high_avg_order', 'country_Brazil', 'country_Canada', 'country_France', 'country_Germany', 'country_Other', 'country_USA']


## 6. Train/Test Split & Save

In [6]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Save to CSV files
train_data = X_train.copy()
train_data['target'] = y_train
train_data.to_csv('train_data.csv', index=False)

test_data = X_test.copy()
test_data['target'] = y_test
test_data.to_csv('test_data.csv', index=False)

print(f"Training data saved: {len(train_data)} rows")
print(f"Test data saved: {len(test_data)} rows")
print(f"Target distribution in training: {y_train.value_counts().values}")

Training data saved: 47 rows
Test data saved: 12 rows
Target distribution in training: [36 11]


## Summary

1. Loaded customer data with purchase history
2. Cleaned missing values and duplicates
3. Created target variable (high spender = top 25%)
4. Engineered simple features
5. Scaled numerical features
6. Split and saved data for modeling

Files `train_data.csv` and `test_data.csv` are ready for model training.