In [4]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import seaborn as sns

In [6]:
# Load the data
data = pd.read_csv('../data/processed/processed_data.csv')
# Display the first few rows
print(data.head())

   TransactionId  BatchId  AccountId  SubscriptionId  CustomerId CurrencyCode  \
0          78150    46980       2490            3535        2584          UGX   
1          75821    31755       3219            2366        2584          UGX   
2          39888    60272       2713             996        2806          UGX   
3          48738     1797       3351             974        3733          UGX   
4          41364    48941       3219            2366        3733          UGX   

   CountryCode    ProviderId     ProductId     ProductCategory  ...  \
0          0.0  ProviderId_6  ProductId_10             airtime  ...   
1          0.0  ProviderId_4   ProductId_6  financial_services  ...   
2          0.0  ProviderId_6   ProductId_1             airtime  ...   
3          0.0  ProviderId_1  ProductId_21        utility_bill  ...   
4          0.0  ProviderId_4   ProductId_6  financial_services  ...   

  PricingStrategy  FraudResult  TotalTransactionAmount  \
0       -0.349252    -0.0449

In [7]:
# Dropping unique identifier columns and other non-numeric columns
data.drop(columns=['TransactionId', 'BatchId', 'AccountId', 'SubscriptionId'], inplace=True)

In [8]:
# Encoding categorical variables
data = pd.get_dummies(data, columns=['CurrencyCode', 'ProviderId', 'ProductId', 'ProductCategory', 'ChannelId'])

In [9]:
# Convert boolean columns to integers
bool_columns = data.select_dtypes(include=['bool']).columns
data[bool_columns] = data[bool_columns].astype(int)

In [10]:
# Drop the 'TransactionStartTime' column since it is non-numeric
data.drop(columns=['TransactionStartTime'], inplace=True)

In [11]:
# Check the columns and their types to ensure all are numeric
print("Data Types After Encoding and Dropping Columns:\n", data.dtypes)

Data Types After Encoding and Dropping Columns:
 CustomerId                              int64
CountryCode                           float64
Amount                                float64
Value                                 float64
PricingStrategy                       float64
FraudResult                           float64
TotalTransactionAmount                float64
AverageTransactionAmount              float64
TransactionCount                      float64
TransactionStdDev                     float64
TransactionHour                         int64
TransactionDay                          int64
TransactionMonth                        int64
TransactionYear                         int64
CurrencyCode_UGX                        int32
ProviderId_ProviderId_1                 int32
ProviderId_ProviderId_2                 int32
ProviderId_ProviderId_3                 int32
ProviderId_ProviderId_4                 int32
ProviderId_ProviderId_5                 int32
ProviderId_ProviderId_6        

In [12]:
# Investigate unique values in 'FraudResult' and their counts
print("Unique values in FraudResult before correction:", data['FraudResult'].unique())
print("Counts of unique values in FraudResult before correction:\n", data['FraudResult'].value_counts())

Unique values in FraudResult before correction: [-0.04496219 22.24090895]
Counts of unique values in FraudResult before correction:
 FraudResult
-0.044962     95469
 22.240909      193
Name: count, dtype: int64


In [13]:
# Ensure the target variable is binary
# Assuming values other than 0 should be treated as 1 (fraudulent)
# Check the minimum and maximum values to understand the range
print("Min value in FraudResult:", data['FraudResult'].min())
print("Max value in FraudResult:", data['FraudResult'].max())

Min value in FraudResult: -0.0449621911622598
Max value in FraudResult: 22.24090895372945


In [14]:
# Correcting the transformation
data['FraudResult'] = data['FraudResult'].apply(lambda x: 1 if x > 0 else 0)

In [15]:
# Verify unique values in target variable after correction
print("Unique values in FraudResult after correction:", data['FraudResult'].unique())
print("Counts of unique values in FraudResult after correction:\n", data['FraudResult'].value_counts())

Unique values in FraudResult after correction: [0 1]
Counts of unique values in FraudResult after correction:
 FraudResult
0    95469
1      193
Name: count, dtype: int64


In [16]:
# Define features and target
X = data.drop('FraudResult', axis=1)
y = data['FraudResult']

In [17]:
# Verify unique values in target variable before split
print("Unique values in target variable before split:", y.unique())

Unique values in target variable before split: [0 1]


In [18]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [19]:
# Verify unique values in y_test
print("Unique values in y_test after split:", y_test.unique())

Unique values in y_test after split: [0 1]


In [20]:
# Check the balance of the classes in y_train and y_test
print("Counts of unique values in y_train after split:\n", y_train.value_counts())
print("Counts of unique values in y_test after split:\n", y_test.value_counts())

Counts of unique values in y_train after split:
 FraudResult
0    66826
1      137
Name: count, dtype: int64
Counts of unique values in y_test after split:
 FraudResult
0    28643
1       56
Name: count, dtype: int64


In [21]:
# Scale numeric features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [22]:
# Model selection and initial training
log_reg = LogisticRegression(max_iter=1000)
dec_tree = DecisionTreeClassifier()
rand_forest = RandomForestClassifier()
gbm = GradientBoostingClassifier()

In [23]:
# Hyperparameter tuning for Logistic Regression
param_grid_log_reg = {'C': [0.01, 0.1, 1, 10, 100]}
grid_search_log_reg = GridSearchCV(log_reg, param_grid_log_reg, cv=5, scoring='roc_auc')
grid_search_log_reg.fit(X_train, y_train)

In [24]:
# Hyperparameter tuning for Decision Tree
param_grid_dec_tree = {'max_depth': [None, 10, 20, 30, 40, 50], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 5, 10]}
grid_search_dec_tree = GridSearchCV(dec_tree, param_grid_dec_tree, cv=5, scoring='roc_auc')
grid_search_dec_tree.fit(X_train, y_train)

In [25]:
# Hyperparameter tuning for Random Forest
param_grid_rand_forest = {'n_estimators': [100, 200, 300], 'max_depth': [None, 10, 20, 30], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 5]}
random_search_rand_forest = RandomizedSearchCV(rand_forest, param_grid_rand_forest, n_iter=10, cv=5, scoring='roc_auc', random_state=42)
random_search_rand_forest.fit(X_train, y_train)

In [26]:
# Hyperparameter tuning for GBM
param_grid_gbm = {'n_estimators': [100, 200, 300], 'learning_rate': [0.01, 0.1, 0.2], 'max_depth': [3, 4, 5], 'subsample': [0.8, 0.9, 1.0]}
random_search_gbm = RandomizedSearchCV(gbm, param_grid_gbm, n_iter=10, cv=5, scoring='roc_auc', random_state=42)
random_search_gbm.fit(X_train, y_train)

In [None]:
# Function to extract GridSearchCV results into a DataFrame
def extract_grid_search_results(grid_search):
    results = pd.DataFrame(grid_search.cv_results_)
    return results

##### Extract results

In [None]:
# Extract results
results_log_reg = extract_grid_search_results(grid_search_log_reg)
results_dec_tree = extract_grid_search_results(grid_search_dec_tree)

In [None]:
# Display the results DataFrame
print("Logistic Regression GridSearchCV Results:\n", results_log_reg.head())
print("Decision Tree GridSearchCV Results:\n", results_dec_tree.head())

In [None]:
# Model evaluation
models = {
    'Logistic Regression': grid_search_log_reg.best_estimator_,
    'Decision Tree': grid_search_dec_tree.best_estimator_,
    'Random Forest': random_search_rand_forest.best_estimator_,
    'Gradient Boosting': random_search_gbm.best_estimator_
}

for model_name, model in models.items():
    y_pred = model.predict(X_test)
    y_pred_prob = model.predict_proba(X_test)[:, 1]
    print(f"Model: {model_name}")
    print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
    print(f"Precision: {precision_score(y_test, y_pred, pos_label=1)}")
    print(f"Recall: {recall_score(y_test, y_pred, pos_label=1)}")
    print(f"F1 Score: {f1_score(y_test, y_pred, pos_label=1)}")
    print(f"ROC-AUC: {roc_auc_score(y_test, y_pred_prob)}")
    print("-" * 30)

    # Plot ROC curve
    fpr, tpr, _ = roc_curve(y_test, y_pred_prob)
    plt.plot(fpr, tpr, label=f'{model_name} (AUC = {roc_auc_score(y_test, y_pred_prob):.2f})')

# Finalize the ROC curve plot
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc='lower right')
plt.grid()
plt.show()

In [None]:
# Function to assign risk probability
def assign_risk_probability(model, customer_data):
    risk_probability = model.predict_proba(customer_data)[:, 1]
    return risk_probability

In [None]:
# Function to map risk probability to credit score
def risk_probability_to_credit_score(risk_probability):
    # Example mapping function - can be adjusted
    credit_score = 850 - (risk_probability * 550)
    return credit_score

In [None]:
# Train loan prediction models
def train_loan_prediction_models(data):
    # Assuming 'loan_amount' and 'loan_duration' are available in your dataset
    # Features for prediction
    features = data.drop(columns=['loan_amount', 'loan_duration'])
    X = features

    # Predicting loan amount
    y_amount = data['loan_amount']
    amount_model = LinearRegression()
    amount_model.fit(X, y_amount)

    # Predicting loan duration
    y_duration = data['loan_duration']
    duration_model = LinearRegression()
    duration_model.fit(X, y_duration)

    return amount_model, duration_model

In [None]:
def predict_loan_amount_duration(amount_model, duration_model, customer_data):
    optimal_amount = amount_model.predict(customer_data)
    optimal_duration = duration_model.predict(customer_data)
    return optimal_amount, optimal_duration

#### Model to Assign Risk Probability for a New Customer

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
import matplotlib.pyplot as plt

# Load the data
data = pd.read_csv('processed_data.csv')

# Dropping unique identifier columns and other non-numeric columns
data.drop(columns=['TransactionId', 'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId'], inplace=True)

# Encoding categorical variables
data = pd.get_dummies(data, columns=['CurrencyCode', 'CountryCode', 'ProviderId', 'ProductId', 'ProductCategory', 'ChannelId'])

# Convert boolean columns to integers
bool_columns = data.select_dtypes(include=['bool']).columns
data[bool_columns] = data[bool_columns].astype(int)

# Drop the 'TransactionStartTime' column since it is non-numeric
data.drop(columns=['TransactionStartTime'], inplace=True)

# Ensure the target variable is categorical
data['FraudResult'] = data['FraudResult'].astype(int)

# Define features and target
X = data.drop('FraudResult', axis=1)
y = data['FraudResult']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Scale numeric features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize models
rand_forest = RandomForestClassifier(random_state=42)
gbm = GradientBoostingClassifier(random_state=42)

# Hyperparameter tuning for Random Forest
param_grid_rand_forest = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5]
}

random_search_rand_forest = RandomizedSearchCV(rand_forest, param_grid_rand_forest, n_iter=10, cv=5, scoring='roc_auc', random_state=42)
random_search_rand_forest.fit(X_train, y_train)

# Hyperparameter tuning for GBM
param_grid_gbm = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5],
    'subsample': [0.8, 0.9, 1.0]
}

random_search_gbm = RandomizedSearchCV(gbm, param_grid_gbm, n_iter=10, cv=5, scoring='roc_auc', random_state=42)
random_search_gbm.fit(X_train, y_train)

# Model evaluation
models = {
    'Random Forest': random_search_rand_forest.best_estimator_,
    'Gradient Boosting': random_search_gbm.best_estimator_
}

for model_name, model in models.items():
    y_pred = model.predict(X_test)
    y_pred_prob = model.predict_proba(X_test)[:, 1]
    print(f"Model: {model_name}")
    print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
    print(f"Precision: {precision_score(y_test, y_pred, pos_label=1)}")
    print(f"Recall: {recall_score(y_test, y_pred, pos_label=1)}")
    print(f"F1 Score: {f1_score(y_test, y_pred, pos_label=1)}")
    print(f"ROC-AUC: {roc_auc_score(y_test, y_pred_prob)}")
    print("-" * 30)

    # Plot ROC curve
    fpr, tpr, _ = roc_curve(y_test, y_pred_prob)
    plt.plot(fpr, tpr, label=f'{model_name} (AUC = {roc_auc_score(y_test, y_pred_prob):.2f})')

# Finalize the ROC curve plot
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc='lower right')
plt.grid()
plt.show()

# Model to assign risk probability for a new customer
def assign_risk_probability(new_customer_data):
    new_customer_data = scaler.transform(new_customer_data)
    risk_prob = random_search_rand_forest.best_estimator_.predict_proba(new_customer_data)[:, 1]
    return risk_prob

FileNotFoundError: [Errno 2] No such file or directory: 'processed_data.csv'

#### Model to Assign Credit Score from Risk Probability Estimates

In [None]:
# Model to assign credit score from risk probability estimates
def assign_credit_score(risk_prob):
    credit_score = np.where(risk_prob <= 0.2, 'Excellent',
                            np.where(risk_prob <= 0.4, 'Good',
                                     np.where(risk_prob <= 0.6, 'Fair',
                                              np.where(risk_prob <= 0.8, 'Poor', 'Very Poor'))))
    return credit_score

#### Model to Predict the Optimal Amount and Duration of the Loan

In [None]:
# Model to predict the optimal amount and duration of the loan
def predict_optimal_loan_amount_and_duration(risk_prob):
    max_loan_amount = np.where(risk_prob <= 0.2, 50000,
                               np.where(risk_prob <= 0.4, 25000,
                                        np.where(risk_prob <= 0.6, 10000,
                                                 np.where(risk_prob <= 0.8, 5000, 1000))))
    loan_duration_months = np.where(risk_prob <= 0.2, 60,
                                    np.where(risk_prob <= 0.4, 48,
                                             np.where(risk_prob <= 0.6, 36,
                                                      np.where(risk_prob <= 0.8, 24, 12))))
    return max_loan_amount, loan_duration_months

#### Example Usage

In [None]:
# Assuming new_customer_data is a DataFrame containing the new customer data
# new_customer_data = pd.DataFrame({...})

# Example new customer data (replace with actual data)
example_data = {
    'Amount': [5000],
    'Value': [5000],
    'PricingStrategy': [2],
    'CurrencyCode_UGX': [1],
    'CountryCode_256': [1],
    'ProviderId_1': [0],
    'ProviderId_4': [0],
    'ProviderId_6': [1],
    'ProductId_1': [0],
    'ProductId_3': [1],
    'ProductId_6': [0],
    'ProductId_10': [0],
    'ProductId_21': [0],
    'ProductCategory_airtime': [1],
    'ProductCategory_financial_services': [0],
    'ProductCategory_utility_bill': [0],
    'ChannelId_2': [0],
    'ChannelId_3': [1]
}
new_customer_data = pd.DataFrame(example_data)

# Assign risk probability
risk_prob = assign_risk_probability(new_customer_data)
print("Risk Probability:", risk_prob)

# Assign credit score
credit_score = assign_credit_score(risk_prob)
print("Credit Score:", credit_score)

# Predict the optimal loan amount and duration
max_loan_amount, loan_duration_months = predict_optimal_loan_amount_and_duration(risk_prob)
print("Max Loan Amount:", max_loan_amount)
print("Loan Duration (months):", loan_duration_months)
