In [1]:
import pandas as pd
import numpy as np 
import os 
import sys 
import pickle

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# The project-local modules are imported from '../src' (a path relative to the
# current working directory). Register that directory on the import path only
# once, so re-running this cell does not add duplicate entries.
src_dir = '../src'
if src_dir not in sys.path:
    sys.path.append(src_dir)

# Project-local modules: `config` supplies the paths/constants used below and
# `fe` supplies the feature-engineering helpers.
import config
import feature_engineering as fe

In [2]:
# Read the raw dataset from the location configured in `config.RAW_DATA_PATH`.
raw_data_path = config.RAW_DATA_PATH
df = pd.read_csv(raw_data_path)

# Quick sanity check: overall dimensions, then the first few rows.
print(f"Initial data shape: {df.shape}")
display(df.head())


Initial data shape: (50000, 33)


Unnamed: 0,user_id,age,country,city,reg_days,marketing_source,sessions_30d,sessions_90d,avg_session_duration_90d,median_pages_viewed_30d,...,support_tickets_2024,avg_csat_2024,emails_open_rate_90d,emails_click_rate_90d,review_count_2024,avg_review_stars_2024,rfm_recency,rfm_frequency,rfm_monetary,churn_label
0,U00001,20,Thailand,Bangkok,262,ads_fb,2,4,728.93,4.41,...,1,4.3,0.252,0.029,0,4.46,55,4,80.58,0
1,U00002,34,Indonesia,Jakarta,908,organic,2,6,671.11,7.75,...,0,4.27,0.388,0.023,0,4.79,59,2,49.11,0
2,U00003,31,Indonesia,Surabaya,406,referral,0,3,493.29,2.58,...,0,4.35,0.343,0.014,0,4.59,73,1,11.95,1
3,U00004,23,Malaysia,Johor Bahru,698,ads_fb,0,4,305.83,4.4,...,0,4.54,0.27,0.027,0,4.52,65,1,14.63,1
4,U00005,28,Vietnam,Ho Chi Minh City,650,influencer,1,7,946.16,6.04,...,0,4.04,0.212,0.073,1,4.79,68,5,116.32,1


In [3]:
# Remove the columns listed in `config.INITIAL_COLS_TO_DROP` before any
# feature engineering; the helper receives the frame and the column list.
print(f"Dropping initial columns: {config.INITIAL_COLS_TO_DROP}")
df = df.pipe(fe.drop_weak_features, config.INITIAL_COLS_TO_DROP)
print(f"Shape after dropping columns: {df.shape}")

Dropping initial columns: ['user_id', 'marketing_source', 'app_version_major']
Dropped ['user_id', 'marketing_source', 'app_version_major']
Shape after dropping columns: (50000, 30)


In [4]:
# Add the interaction features via the project helper; the helper takes the
# current frame and its return value replaces `df`.
print("Creating interaction features from Screening Round...")
df = df.pipe(fe.create_interaction_features)
print("Screening Round interaction features created.")

Creating interaction features from Screening Round...
Created interact features.
Screening Round interaction features created.


In [5]:
# Add the "advanced" engineered features via the project helper; the helper
# takes the current frame and its return value replaces `df`.
print("Creating advanced features...")
df = df.pipe(fe.create_advanced_features)
print("Advanced features created.")

Creating advanced features...
Advanced features created.


In [6]:
# Inspect the result of the feature-engineering steps: the full list of column
# names, followed by a preview of the first rows.
print(list(df.columns))
display(df.head())

['age', 'country', 'city', 'reg_days', 'sessions_30d', 'sessions_90d', 'avg_session_duration_90d', 'median_pages_viewed_30d', 'search_queries_30d', 'device_mix_ratio', 'orders_30d', 'orders_90d', 'orders_2024', 'aov_2024', 'gmv_2024', 'category_diversity_2024', 'days_since_last_order', 'discount_rate_2024', 'refunds_count_2024', 'refund_rate_2024', 'support_tickets_2024', 'avg_csat_2024', 'emails_open_rate_90d', 'emails_click_rate_90d', 'review_count_2024', 'avg_review_stars_2024', 'rfm_recency', 'rfm_frequency', 'rfm_monetary', 'churn_label', 'satisfaction_x_recency', 'gmv_per_session_90d', 'session_decay_ratio', 'order_decay_ratio', 'session_to_order_conversion', 'gmv_per_reg_day']


Unnamed: 0,age,country,city,reg_days,sessions_30d,sessions_90d,avg_session_duration_90d,median_pages_viewed_30d,search_queries_30d,device_mix_ratio,...,rfm_recency,rfm_frequency,rfm_monetary,churn_label,satisfaction_x_recency,gmv_per_session_90d,session_decay_ratio,order_decay_ratio,session_to_order_conversion,gmv_per_reg_day
0,20,Thailand,Bangkok,262,2,4,728.93,4.41,1,0.861,...,55,4,80.58,0,236.5,16.116,0.6,0.5,0.2,0.306388
1,34,Indonesia,Jakarta,908,2,6,671.11,7.75,8,0.897,...,59,2,49.11,0,251.93,7.015714,0.428571,1.0,0.0,0.054026
2,31,Indonesia,Surabaya,406,0,3,493.29,2.58,1,0.917,...,73,1,11.95,1,317.55,2.9875,0.25,1.0,0.0,0.029361
3,23,Malaysia,Johor Bahru,698,0,4,305.83,4.4,4,0.84,...,65,1,14.63,1,295.1,2.926,0.2,1.0,0.0,0.02093
4,28,Vietnam,Ho Chi Minh City,650,1,7,946.16,6.04,8,0.511,...,68,5,116.32,1,274.72,14.54,0.25,0.333333,0.25,0.178679


### Train-Test Split

In [7]:
# Separate the target column (named by `config.TARGET_VARIABLE`) from the
# feature matrix.
target_col = config.TARGET_VARIABLE
y = df[target_col]
X = df.drop(columns=[target_col])

# Hold out the test set first. Stratifying on `y` keeps the class balance the
# same in both parts; the seed makes the split reproducible.
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X,
    y,
    test_size=config.TEST_SET_SIZE,
    random_state=config.RANDOM_STATE,
    stratify=y,
)

# Report the sizes and the mean of the target in each part.
print(f"X_train_full shape: {X_train_full.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"Churn rate in y_train_full: {y_train_full.mean():.4f}")
print(f"Churn rate in y_test: {y_test.mean():.4f}")

X_train_full shape: (40000, 35)
X_test shape: (10000, 35)
Churn rate in y_train_full: 0.2500
Churn rate in y_test: 0.2500


In [8]:
# Carve the validation set out of the remaining training data.
# `config.VALIDATION_SET_SIZE` is applied to `X_train_full` (not to the whole
# dataset). The split is stratified on the target and seeded for
# reproducibility.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full,
    test_size=config.VALIDATION_SET_SIZE,
    random_state=config.RANDOM_STATE,
    stratify=y_train_full
)

# Derive each split's share of all rows from the actual split sizes instead of
# hard-coding the percentages, so the labels stay correct if the configured
# split sizes change.
total_rows = len(X_train) + len(X_val) + len(X_test)
print(f"X_train shape: {X_train.shape} (~{len(X_train) / total_rows:.0%})")
print(f"X_val shape: {X_val.shape} (~{len(X_val) / total_rows:.0%})")
print(f"X_test shape: {X_test.shape} (~{len(X_test) / total_rows:.0%})")
print(f"\nChurn rate in y_train: {y_train.mean():.4f}")
print(f"Churn rate in y_val: {y_val.mean():.4f}")
print(f"Churn rate in y_test: {y_test.mean():.4f}")

X_train shape: (32500, 35) (~65%)
X_val shape: (7500, 35) (~15%)
X_test shape: (10000, 35) (~20%)

Churn rate in y_train: 0.2500
Churn rate in y_val: 0.2500
Churn rate in y_test: 0.2500


In [9]:
# Run the skewness helper on each split with the columns listed in
# `config.COLS_TO_LOG_TRANSFORM`. Each split is copied first, so the helper
# never receives the object produced by the split step itself.
print(f"Applying log transformation to: {config.COLS_TO_LOG_TRANSFORM}")
X_train, X_val, X_test = [
    fe.handle_skewness(split.copy(), config.COLS_TO_LOG_TRANSFORM)
    for split in (X_train, X_val, X_test)
]
print("Skewness handled.")

Applying log transformation to: ['gmv_2024', 'sessions_90d']
Skewness handled.


### Handle Outliers

In [10]:
# Keep only the configured outlier columns that are actually present in the
# training frame.
numerical_cols = [
    name
    for name in config.NUMERICAL_COLS_FOR_OUTLIERS
    if name in X_train.columns
]
print(f"\nHandling outliers for {len(numerical_cols)} numerical columns...")

# NOTE(review): the helper is invoked separately on each split. If it derives
# its clipping bounds from the frame it is given, validation/test rows are
# clipped with their own statistics rather than the training ones -- confirm
# against `fe.handle_outliers`.
X_train, X_val, X_test = [
    fe.handle_outliers(split.copy(), numerical_cols)
    for split in (X_train, X_val, X_test)
]
print("Outliers handled using Winsorizing.")


Handling outliers for 20 numerical columns...


Outliers handled using Winsorizing.


### Encode Categorical Features

In [11]:
# Encode the columns listed in `config.HIGH_CARDINALITY_COLS`. All three
# splits are handed to the helper in a single call (as copies) and the three
# encoded frames come back in the same order.
print(f"\nApplying Frequency Encoding to: {config.HIGH_CARDINALITY_COLS}")
encoded_splits = fe.encode_categorical_features(
    X_train.copy(),
    X_val.copy(),
    X_test.copy(),
    config.HIGH_CARDINALITY_COLS,
)
X_train, X_val, X_test = encoded_splits
print("Categorical features encoded.")
print("Final columns after encoding:", X_train.columns.tolist())


Applying Frequency Encoding to: ['country', 'city']
Categorical features encoded.
Final columns after encoding: ['age', 'reg_days', 'sessions_30d', 'sessions_90d', 'avg_session_duration_90d', 'median_pages_viewed_30d', 'search_queries_30d', 'device_mix_ratio', 'orders_30d', 'orders_90d', 'orders_2024', 'aov_2024', 'gmv_2024', 'category_diversity_2024', 'days_since_last_order', 'discount_rate_2024', 'refunds_count_2024', 'refund_rate_2024', 'support_tickets_2024', 'avg_csat_2024', 'emails_open_rate_90d', 'emails_click_rate_90d', 'review_count_2024', 'avg_review_stars_2024', 'rfm_recency', 'rfm_frequency', 'rfm_monetary', 'satisfaction_x_recency', 'gmv_per_session_90d', 'session_decay_ratio', 'order_decay_ratio', 'session_to_order_conversion', 'gmv_per_reg_day', 'country_freq', 'city_freq']


### Scale Numerical Features

In [12]:
scaler = StandardScaler()

# Scale every numeric column of the training frame.
numerical_cols = list(X_train.select_dtypes(include=np.number).columns)

# The scaler is fitted on the training split only; the validation and test
# splits are transformed with those already-fitted statistics.
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
for held_out in (X_val, X_test):
    held_out[numerical_cols] = scaler.transform(held_out[numerical_cols])
print("Numerical features scaled successfully.")


Numerical features scaled successfully.


In [13]:
# Persist the fitted scaler next to the processed data.
# Create the output directory first: this is the first write into
# `config.PROCESSED_DATA_PATH`, so on a fresh checkout the directory may not
# exist yet and `open(..., 'wb')` would raise FileNotFoundError.
os.makedirs(config.PROCESSED_DATA_PATH, exist_ok=True)

scaler_path = os.path.join(config.PROCESSED_DATA_PATH, 'scaler.pkl')
with open(scaler_path, 'wb') as f:
    pickle.dump(scaler, f)
print(f"Scaler saved to: {scaler_path}")

Scaler saved to: ../data/processed/scaler.pkl


In [14]:
# Make sure the output directory exists (no error if it already does).
os.makedirs(config.PROCESSED_DATA_PATH, exist_ok=True)

# Feature matrices: written without the index column.
feature_files = {
    'X_train.csv': X_train,
    'X_val.csv': X_val,
    'X_test.csv': X_test,
}
for filename, features in feature_files.items():
    features.to_csv(os.path.join(config.PROCESSED_DATA_PATH, filename), index=False)

# Targets: written without the index but with a header row.
target_files = {
    'y_train.csv': y_train,
    'y_val.csv': y_val,
    'y_test.csv': y_test,
}
for filename, target in target_files.items():
    target.to_csv(os.path.join(config.PROCESSED_DATA_PATH, filename), index=False, header=True)

print("All processed data artifacts have been saved to:", config.PROCESSED_DATA_PATH)
print("Files in processed data directory:", os.listdir(config.PROCESSED_DATA_PATH))

All processed data artifacts have been saved to: ../data/processed/
Files in processed data directory: ['scaler.pkl', 'X_test.csv', 'X_train.csv', 'X_val.csv', 'y_test.csv', 'y_train.csv', 'y_val.csv']
