In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Read the processed client table from disk into `df`.
client_data_path = 'data/data_processed/client_data.csv'
df = pd.read_csv(client_data_path)

In [None]:

# Columns handled as categorical features.
# NOTE: 'disrict' is kept exactly as spelled because it is used to index `df`
# and therefore has to match the column name in the loaded frame.
cat_features = ['disrict', 'client_catg', 'region']

# Cast each of those columns to the pandas 'category' dtype, one at a time.
for cat_col in cat_features:
    df[cat_col] = df[cat_col].astype('category')

# Print the dtype / non-null summary so the cast can be checked by eye.
df.info()

# Numerical features: every int64/float64 column except the label 'target'.
num_col = df.select_dtypes(include=['int64', 'float64']).columns
num_features = num_col[num_col != 'target'].tolist()

In [None]:
# Count the distinct levels of every categorical feature
# (original author's note: 'region' has a lot of them).
n_unique = df.loc[:, cat_features].nunique()
print(n_unique)

## Address imbalance
Best Approach for SunSafe Sentinel?
- Start with scale_pos_weight in XGBoost (since tree models handle imbalance better).
- If imbalance is extreme (>99% non-fraud), use SMOTE + weighted XGBoost.
- If dataset is massive, try undersampling + anomaly pre-filtering.



In [5]:
# Categorical branch: a single one-hot encoding step. handle_unknown='ignore'
# is passed so categories not seen during fit do not raise at transform time.
onehot_step = ('onehot', OneHotEncoder(handle_unknown='ignore'))
cat_transformer = Pipeline([onehot_step])

# Numerical branch: a single scaling step (swap the scaler here if needed).
scaler_step = ('scaler', RobustScaler())
num_transformer = Pipeline([scaler_step])

In [6]:
# columnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_features),
        ('cat', cat_transformer, cat_features)
    ]
)

In [7]:
# Separate the label column from the feature frame.
X = df.drop(columns='target')
y = df['target']

# Hold out 20% of the rows for testing with a fixed seed for reproducibility.
# `stratify=y` keeps the class proportions of `target` the same in the train
# and test partitions; the notebook is concerned with class imbalance, and an
# unstratified split can leave the minority class under-represented in one
# partition and distort the evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [8]:
# Seed shared by the estimator and the randomized search.
RSEED = 42
from sklearn.model_selection import RandomizedSearchCV

# Wider search space, currently disabled.
# NOTE(review): 'auto' for max_features may not be accepted by recent
# scikit-learn releases — confirm before re-enabling this grid.
# param_grid = {
#     'n_estimators': np.arange(10, 201).astype(int),
#     'max_depth': [None] + list(np.arange(3, 21).astype(int)),
#     'max_features': ['auto', 'sqrt', None] + list(np.arange(0.5, 1, 0.1)),
#     'max_leaf_nodes': [None] + list(np.arange(10, 51).astype(int)),
#     'min_samples_split': [2, 5, 10],
#     'bootstrap': [True, False]
# }

# Active (small) search space: only min_samples_split and bootstrap vary,
# giving 3 * 2 = 6 candidate combinations.
param_grid = dict(
    # number of trees
    n_estimators=[10],
    # maximum depth of the tree
    max_depth=[5],
    # number of features to consider when looking for the best split
    max_features=['sqrt'],
    # maximum number of leaf nodes in base trees
    max_leaf_nodes=[None],
    # minimum number of samples required to split an internal node
    min_samples_split=[2, 5, 10],
    # whether bootstrap samples are used when building trees
    bootstrap=[True, False],
)

In [9]:
# Base classifier whose hyperparameters are searched.
estimator = RandomForestClassifier(random_state=RSEED)

# Settings of the randomized search: ROC-AUC scoring, 3-fold CV, up to 10
# sampled candidates, all available cores, verbose progress output.
# NOTE(review): the active param_grid spans only 6 combinations, fewer than
# n_iter=10 — confirm how RandomizedSearchCV handles that before relying on it.
search_settings = dict(
    n_jobs=-1,
    scoring='roc_auc',
    cv=3,
    n_iter=10,
    verbose=5,
    random_state=RSEED,
)
rs = RandomizedSearchCV(estimator, param_grid, **search_settings)

In [10]:
# End-to-end model: preprocessing followed by the randomized hyperparameter
# search, which acts as the final 'classifier' step.
stoneage_steps = [
    ('preprocessor', preprocessor),
    ('classifier', rs),
]
model_stoneage = Pipeline(stoneage_steps)

In [None]:
from joblib import dump

# Train the full pipeline (preprocessing + hyperparameter search) on the
# training partition; the fitted pipeline is the cell's displayed value.
model_stoneage.fit(X=X_train, y=y_train)

In [None]:
# Persist the fitted pipeline together with the train/test partitions in a
# single pickle file. The search object lives inside the pipeline as its
# 'classifier' step, so it is stored along with the model.
MODEL_PATH = 'models/model_stoneage.pkl'

# Create the output directory first: otherwise `dump` raises
# FileNotFoundError on a fresh checkout, after the training has already run.
Path(MODEL_PATH).parent.mkdir(parents=True, exist_ok=True)

dump({'model': model_stoneage,
      'X_train': X_train,
      'X_test': X_test,
      'y_train': y_train,
      'y_test': y_test},
     MODEL_PATH)

# # %%
# import joblib

# # Save the model and train-test split data in the same pkl file
# with open('model_and_data.pkl', 'wb') as f:
#     joblib.dump({'model': grid_search, 'X_train': X_train, 'X_test': X_test, 'y_train': y_train, 'y_test': y_test}, f)