In [1]:
import pandas as pd
import numpy as np 
import os 
import sys 
import pickle

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Put the sibling `../src` directory on the import path so project modules
# can be imported from this notebook. The membership check stops the entry
# from being appended again when the cell is re-run.
# NOTE(review): the path is relative, so it resolves against the kernel's
# working directory — presumably the notebook's own folder; confirm.
if '../src' not in sys.path: 
    sys.path.append('../src')
import config 

In [2]:
import feature_engineering as fe

In [3]:
# Read the raw dataset from the location configured in config.RAW_DATA_PATH
# and print its shape as a quick sanity check.
df = pd.read_csv(config.RAW_DATA_PATH)
print(f"Initial data shape: {df.shape}")

Initial data shape: (50000, 33)


In [4]:
# Columns excluded from the feature set. The list is handed unchanged to
# fe.drop_weak_features, so the original ordering is kept.
COLS_TO_DROP = [
    'user_id', 'marketing_source', 'app_version_major', 'age',
    'aov_2024', 'rfm_monetary',
    'orders_90d', 'sessions_30d', 'orders_30d', 'orders_2024',
    'rfm_recency', 'rfm_frequency',
]

# The raw frame `df` stays bound to its original name; the reduced frame
# gets a new one.
df_selected = fe.drop_weak_features(df, COLS_TO_DROP)
print(f"Shape after feature selection: {df_selected.shape}")

Dropped ['user_id', 'marketing_source', 'app_version_major', 'age', 'aov_2024', 'rfm_monetary', 'orders_90d', 'sessions_30d', 'orders_30d', 'orders_2024', 'rfm_recency', 'rfm_frequency']
Shape after feature selection: (50000, 21)


### Train-Test Split

In [5]:
# Separate the target column from the predictor columns.
target_col = config.TARGET_VARIABLE
y = df_selected[target_col]
X = df_selected.drop(columns=[target_col])

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions aligned between the two partitions; random_state is taken
# from the project config.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=config.RANDOM_STATE,
    test_size=0.2,
)

# Report partition sizes and the mean of the target in each partition.
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"Train churn rate: {y_train.mean():.2f}")
print(f"Test churn rate:  {y_test.mean():.2f}")

X_train shape: (40000, 20)
X_test shape: (10000, 20)
Train churn rate: 0.25
Test churn rate:  0.25


In [6]:
# Columns handed to fe.handle_skewness.
COLS_TO_LOG = ['gmv_2024', 'sessions_90d']

# Run the same transform over a copy of each partition, train first and
# then test, and rebind both names to the results.
X_train, X_test = [
    fe.handle_skewness(frame.copy(), COLS_TO_LOG)
    for frame in (X_train, X_test)
]

Applied log to transform to: ['gmv_2024', 'sessions_90d']
Applied log to transform to: ['gmv_2024', 'sessions_90d']


In [7]:
# Add the interaction features to a copy of each partition (train first,
# then test) and rebind both names to the results.
X_train, X_test = [
    fe.create_interaction_features(frame.copy())
    for frame in (X_train, X_test)
]

Created interact features.
Created interact features.


### Handle Outliers

In [8]:
# The numeric column list is derived from the training frame only and then
# reused for the test frame, so both are processed over the same columns.
numerical_cols = X_train.select_dtypes(include=np.number).columns.tolist()

# NOTE(review): each partition is passed to fe.handle_outliers on its own.
# If that helper derives its bounds from the frame it receives, the test
# set is being treated with test-set statistics rather than the training
# ones — confirm against the helper's implementation.
X_train, X_test = [
    fe.handle_outliers(frame.copy(), numerical_cols)
    for frame in (X_train, X_test)
]

Handled outliers for 20 columns.
Handled outliers for 20 columns.


### Encode Categorical Features

In [9]:
# Categorical columns handed to fe.encode_categorical_features.
HIGH_CARDINALITY_COLS = ['country', 'city']

# Both partitions go through a single call. Copies are passed in, and the
# returned pair replaces the current frames.
X_train, X_test = fe.encode_categorical_features(
    X_train.copy(),
    X_test.copy(),
    HIGH_CARDINALITY_COLS,
)

Applied Frequency Encoding to: ['country', 'city']


### Scale Numerical Features

In [10]:
# Every numeric column present in the training frame at this point is
# standardised.
numerical_cols_final = X_train.select_dtypes(include=np.number).columns.tolist()

# Fit the scaler on the training rows only, then apply that same fitted
# scaler to both partitions.
scaler = StandardScaler().fit(X_train[numerical_cols_final])
X_train[numerical_cols_final] = scaler.transform(X_train[numerical_cols_final])
X_test[numerical_cols_final] = scaler.transform(X_test[numerical_cols_final])

print("Numerical features scaled successfully.")
print("\nFinal X_train columns:")
print(X_train.columns)

Numerical features scaled successfully.

Final X_train columns:
Index(['reg_days', 'sessions_90d', 'avg_session_duration_90d',
       'median_pages_viewed_30d', 'search_queries_30d', 'device_mix_ratio',
       'gmv_2024', 'category_diversity_2024', 'days_since_last_order',
       'discount_rate_2024', 'refunds_count_2024', 'refund_rate_2024',
       'support_tickets_2024', 'avg_csat_2024', 'emails_open_rate_90d',
       'emails_click_rate_90d', 'review_count_2024', 'avg_review_stars_2024',
       'satisfaction_x_recency', 'gmv_per_session_90d', 'country_freq',
       'city_freq'],
      dtype='object')


In [11]:
# Make sure the output directory exists before anything is written to it.
out_dir = config.PROCESSED_DATA_PATH
os.makedirs(out_dir, exist_ok=True)

# Each partition goes to its own CSV, written without the index column.
csv_outputs = {
    'X_train.csv': X_train,
    'X_test.csv': X_test,
    'y_train.csv': y_train,
    'y_test.csv': y_test,
}
for csv_name, split in csv_outputs.items():
    split.to_csv(os.path.join(out_dir, csv_name), index=False)

# Persist the fitted scaler next to the data files.
with open(os.path.join(out_dir, 'scaler.pkl'), 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

print("All processed artifacts have been saved.")

All processed artifacts have been saved.


In [12]:
# Sanity check: show what is now in the processed-data directory.
# os.listdir returns entries in arbitrary order, so the order of the
# displayed list may differ between runs or machines.
print("Files in processed data directory:")
os.listdir(config.PROCESSED_DATA_PATH)

Files in processed data directory:


['scaler.pkl', 'X_test.csv', 'X_train.csv', 'y_test.csv', 'y_train.csv']