In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report


# Generate synthetic data
np.random.seed(42)
n_records = 1000
fraud_rate = 0.02

categories = ['A', 'B', 'C']
merchant_names = ['X', 'Y', 'Z']

category_data = np.random.choice(categories, size=n_records)
merchant_name_data = np.random.choice(merchant_names, size=n_records)

amount = np.random.normal(loc=100, scale=50, size=n_records)
amount_last_30_days = np.random.normal(loc=50, scale=25, size=n_records)
count_last_30_days = np.random.poisson(lam=3, size=n_records)
address_change_last_30_days = np.random.choice([0, 1], size=n_records)
phone_change_last_60_days = np.random.choice([0, 1], size=n_records)

# Generate fraud labels
is_fraud = np.random.choice([0, 1], size=n_records, p=[1 - fraud_rate, fraud_rate])

# Create DataFrame
data = {
    'category': category_data,
    'merchant_name': merchant_name_data,
    'amount': amount,
    'amount_last_30_days': amount_last_30_days,
    'count_last_30_days': count_last_30_days,
    'address_change_last_30_days': address_change_last_30_days,
    'phone_change_last_60_days': phone_change_last_60_days,
    'target': is_fraud
}

df = pd.DataFrame(data)

In [2]:
df.head()

Unnamed: 0,category,merchant_name,amount,amount_last_30_days,count_last_30_days,address_change_last_30_days,phone_change_last_60_days,target
0,C,Z,183.576012,87.030964,3,1,0,0
1,A,Z,89.798471,38.035564,3,1,1,0
2,C,Z,90.697175,64.951442,2,0,1,0
3,C,Z,152.138259,28.765327,2,0,0,0
4,A,X,125.299555,46.580418,3,0,1,0


In [3]:
df['target'].value_counts()

0    980
1     20
Name: target, dtype: int64

In [4]:

def calculate_woe(df, cat_var, target_var):
    # Calculate the percentage of events and non-events for each category
    event_perc = df.groupby(cat_var)[target_var].mean()
    non_event_perc = 1 - event_perc

    # Calculate the WoE for each category
    woe = np.log((non_event_perc + 0.5) / (event_perc + 0.5))

    return woe

def calculate_adj_woe(df, cat_var, target_var):
    woe = calculate_woe(df, cat_var, target_var)
    n_event = df[target_var].sum()
    n_non_event = df.shape[0] - n_event
    adj_woe = woe * (np.log(n_non_event / n_event))

    return adj_woe

def calculate_smoothed_woe(df, cat_var, target_var, smooth_factor=10):
    # Calculate the percentage of events and non-events for each category
    event_perc = df.groupby(cat_var)[target_var].mean()
    non_event_perc = 1 - event_perc

    # Calculate smoothed WoE for each category
    smoothed_woe = np.log(((non_event_perc * (1 - smooth_factor)) + (event_perc * smooth_factor)) / \
                          ((event_perc * (1 - smooth_factor)) + (non_event_perc * smooth_factor)))

    return smoothed_woe

def calculate_iv(df, cat_var, target_var):
    event_perc = df.groupby(cat_var)[target_var].mean()
    non_event_perc = 1 - event_perc
    event_rate = df[target_var].mean()
    iv = ((non_event_perc - event_perc) * np.log(non_event_perc / event_perc)).sum()
    return iv

In [5]:


# Calculate WoE for 'category' and 'merchant_name'
df['category_woe'] = calculate_woe(df, 'category', 'target')
df['merchant_name_woe'] = calculate_woe(df, 'merchant_name', 'target')

# Calculate Adj-WoE for 'category' and 'merchant_name'
df['category_adj_woe'] = calculate_adj_woe(df, 'category', 'target')
df['merchant_name_adj_woe'] = calculate_adj_woe(df, 'merchant_name', 'target')

# Calculate Smoothed-WoE for 'category' and 'merchant_name'
df['category_smoothed_woe'] = calculate_smoothed_woe(df, 'category', 'target')
df['merchant_name_smoothed_woe'] = calculate_smoothed_woe(df, 'merchant_name', 'target')

# Calculate IV for 'category' and 'merchant_name'
iv_category = calculate_iv(df, 'category', 'target')
iv_merchant_name = calculate_iv(df, 'merchant_name', 'target')

print("Information Value (IV) for 'category':", iv_category)
print("Information Value (IV) for 'merchant_name':", iv_merchant_name)

# Define features and target variable
features_with_woe = ['amount', 'amount_last_30_days', 'count_last_30_days',
                     'address_change_last_30_days', 'phone_change_last_60_days',
                     'category_woe', 'merchant_name_woe']

features_without_woe = ['amount', 'amount_last_30_days', 'count_last_30_days',
                        'address_change_last_30_days', 'phone_change_last_60_days']

target = 'target'

# Split data into train and test sets
X_with_woe = df[features_with_woe]
y = df[target]
X_train_with_woe, X_test_with_woe, y_train, y_test = train_test_split(X_with_woe, y, test_size=0.2, random_state=42)

X_without_woe = df[features_without_woe]
X_train_without_woe, X_test_without_woe = train_test_split(X_without_woe, test_size=0.2, random_state=42)

# Train a random forest model with WoE variables
rf_model_with_woe = RandomForestClassifier(random_state=42)
rf_model_with_woe.fit(X_train_with_woe, y_train)

# Make predictions
y_pred_rf_with_woe = rf_model_with_woe.predict(X_test_with_woe)

# Evaluate the model with WoE variables
accuracy_rf_with_woe = accuracy_score(y_test, y_pred_rf_with_woe)
print("\nAccuracy with WoE variables (Random Forest):", accuracy_rf_with_woe)
print("Classification Report with WoE variables (Random Forest):")
print(classification_report(y_test, y_pred_rf_with_woe))

# Get feature importance for the model with WoE variables
importance_rf_with_woe = rf_model_with_woe.feature_importances_
feature_importance_rf_with_woe = pd.DataFrame({'Feature': features_with_woe, 'Importance': importance_rf_with_woe})
feature_importance_rf_with_woe_sorted = feature_importance_rf_with_woe.sort_values(by='Importance', ascending=False)
print("\nFeature Importance with WoE variables (Random Forest, sorted):")
print(feature_importance_rf_with_woe_sorted)

# Train a random forest model without WoE variables
rf_model_without_woe = RandomForestClassifier(random_state=42)
rf_model_without_woe.fit(X_train_without_woe, y_train)

# Make predictions
y_pred_rf_without_woe = rf_model_without_woe.predict(X_test_without_woe)

# Evaluate the model without WoE variables
accuracy_rf_without_woe = accuracy_score(y_test, y_pred_rf_without_woe)
print("\nAccuracy without WoE variables (Random Forest):", accuracy_rf_without_woe)
print("Classification Report without WoE variables (Random Forest):")
print(classification_report(y_test, y_pred_rf_without_woe))

# Get feature importance for the model without WoE variables
importance_rf_without_woe = rf_model_without_woe.feature_importances_
feature_importance_rf_without_woe = pd.DataFrame({'Feature': features_without_woe, 'Importance': importance_rf_without_woe})
feature_importance_rf_without_woe_sorted = feature_importance_rf_without_woe.sort_values(by='Importance', ascending=False)
print("\nFeature Importance without WoE variables (Random Forest, sorted):")
print(feature_importance_rf_without_woe_sorted)


  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


Information Value (IV) for 'category': 11.533866431176268
Information Value (IV) for 'merchant_name': 11.626707238843771


ValueError: Input X contains NaN.
RandomForestClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

def calculate_woe(df, cat_var, target_var):
    # Calculate the percentage of events and non-events for each category
    event_perc = df.groupby(cat_var)[target_var].mean()
    non_event_perc = 1 - event_perc

    # Calculate the WoE for each category
    woe = np.log((non_event_perc + 0.5) / (event_perc + 0.5))

    return woe

# Generate synthetic data
np.random.seed(42)
n_records = 1000
fraud_rate = 0.02

categories = ['A', 'B', 'C']
merchant_names = ['X', 'Y', 'Z']

category_data = np.random.choice(categories, size=n_records)
merchant_name_data = np.random.choice(merchant_names, size=n_records)

amount = np.random.normal(loc=100, scale=50, size=n_records)
amount_last_30_days = np.random.normal(loc=50, scale=25, size=n_records)
count_last_30_days = np.random.poisson(lam=3, size=n_records)
address_change_last_30_days = np.random.choice([0, 1], size=n_records)
phone_change_last_60_days = np.random.choice([0, 1], size=n_records)

# Generate fraud labels
is_fraud = np.random.choice([0, 1], size=n_records, p=[1 - fraud_rate, fraud_rate])

# Create DataFrame
data = {
    'category': category_data,
    'merchant_name': merchant_name_data,
    'amount': amount,
    'amount_last_30_days': amount_last_30_days,
    'count_last_30_days': count_last_30_days,
    'address_change_last_30_days': address_change_last_30_days,
    'phone_change_last_60_days': phone_change_last_60_days,
    'target': is_fraud
}

df = pd.DataFrame(data)

# Calculate WoE for 'category' and 'merchant_name'
df['category_woe'] = calculate_woe(df, 'category', 'target')
df['merchant_name_woe'] = calculate_woe(df, 'merchant_name', 'target')

# Define features and target variable
features_with_woe = ['amount', 'amount_last_30_days', 'count_last_30_days',
                     'address_change_last_30_days', 'phone_change_last_60_days',
                     'category_woe', 'merchant_name_woe']

features_without_woe = ['amount', 'amount_last_30_days', 'count_last_30_days',
                        'address_change_last_30_days', 'phone_change_last_60_days']

target = 'target'

# Split data into train and test sets
X_with_woe = df[features_with_woe]
y = df[target]
X_train_with_woe, X_test_with_woe, y_train, y_test = train_test_split(X_with_woe, y, test_size=0.2, random_state=42)

X_without_woe = df[features_without_woe]
X_train_without_woe, X_test_without_woe = train_test_split(X_without_woe, test_size=0.2, random_state=42)

# Handle missing values
X_train_with_woe.fillna(X_train_with_woe.mean(), inplace=True)
X_test_with_woe.fillna(X_test_with_woe.mean(), inplace=True)

from sklearn.impute import SimpleImputer

# Handle missing values
imputer = SimpleImputer(strategy='mean')
X_train_with_woe_imputed = imputer.fit_transform(X_train_with_woe)
X_test_with_woe_imputed = imputer.transform(X_test_with_woe)

# Train a random forest model with WoE variables
rf_model_with_woe = RandomForestClassifier(random_state=42)
rf_model_with_woe.fit(X_train_with_woe_imputed, y_train)

# Make predictions
y_pred_rf_with_woe = rf_model_with_woe.predict(X_test_with_woe_imputed)

# Evaluate the model with WoE variables
accuracy_rf_with_woe = accuracy_score(y_test, y_pred_rf_with_woe)
print("\nAccuracy with WoE variables (Random Forest):", accuracy_rf_with_woe)
print("Classification Report with WoE variables (Random Forest):")
print(classification_report(y_test, y_pred_rf_with_woe))

# Get feature importance for the model with WoE variables
importance_rf_with_woe = rf_model_with_woe.feature_importances_

# Ensure the lengths of feature names and importances are the same
if len(features_with_woe) == len(importance_rf_with_woe):
    feature_importance_rf_with_woe = pd.DataFrame({'Feature': features_with_woe, 'Importance': importance_rf_with_woe})
    feature_importance_rf_with_woe_sorted = feature_importance_rf_with_woe.sort_values(by='Importance', ascending=False)
    print("\nFeature Importance with WoE variables (Random Forest, sorted):")
    print(feature_importance_rf_with_woe_sorted)
else:
    print("Error: The lengths of feature names and importances are not the same.")



Accuracy with WoE variables (Random Forest): 0.99
Classification Report with WoE variables (Random Forest):
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       198
           1       0.00      0.00      0.00         2

    accuracy                           0.99       200
   macro avg       0.49      0.50      0.50       200
weighted avg       0.98      0.99      0.99       200

Error: The lengths of feature names and importances are not the same.


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [13]:
len(features_with_woe)

7

In [15]:
importance_rf_with_woe

array([0.41333626, 0.45805978, 0.07557519, 0.02548112, 0.02754765])

In [None]:
# Train a random forest model without WoE variables
rf_model_without_woe = RandomForestClassifier(random_state=42)
rf_model_without_woe.fit(X_train_without_woe, y_train)

# Make predictions
y_pred_rf_without_woe = rf_model_without_woe.predict(X_test_without_woe)

# Evaluate the model without WoE variables
accuracy_rf_without_woe = accuracy_score(y_test, y_pred_rf_without_woe)
print("\nAccuracy without WoE variables (Random Forest):", accuracy_rf_without_woe)
print("Classification Report without WoE variables (Random Forest):")
print(classification_report(y_test, y_pred_rf_without_woe))

# Get feature importance for the model without WoE variables
importance_rf_without_woe = rf_model_without_woe.feature_importances_
feature_importance_rf_without_woe = pd.DataFrame({'Feature': features_without_woe, 'Importance': importance_rf_without_woe})
feature_importance_rf_without_woe_sorted = feature_importance_rf_without_woe.sort_values(by='Importance', ascending=False)
print("\nFeature Importance without WoE variables (Random Forest, sorted):")
print(feature_importance_rf_without_woe_sorted)
