In [2]:
import pandas as pd #importing essential library 
import numpy as np 

from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import RandomizedSearchCV
import scipy.stats as stats
from sklearn.metrics import mean_absolute_error


In [3]:
# Read each raw CSV extract into its own pandas DataFrame for later wrangling.
# Descriptive variable names are used in place of the raw file names.
# DATA_DIR is the one place to edit if the extracts are stored elsewhere.
DATA_DIR = '../csvs_datathon'

cardholder_acc = pd.read_csv(f'{DATA_DIR}/accy_dim.csv')
statement_data = pd.read_csv(f'{DATA_DIR}/statement_fact.csv')
transaction_data = pd.read_csv(f'{DATA_DIR}/transaction_fact.csv')
wrld_transaction_data = pd.read_csv(f'{DATA_DIR}/wrld_stor_tran_fact.csv')
customer_id = pd.read_csv(f'{DATA_DIR}/syf_id.csv')
acc_lvl_features = pd.read_csv(f'{DATA_DIR}/rams_batch_cur.csv')
fraud_claim_case = pd.read_csv(f'{DATA_DIR}/fraud_claim_case.csv')
fraud_claim_tran = pd.read_csv(f'{DATA_DIR}/fraud_claim_tran.csv')

In [None]:
# Drop PAYMENT rows from both transaction tables; every other transaction
# type is kept and treated as spend.
# .copy() gives each filtered frame its own data, so the datetime conversion
# below is a plain column assignment instead of a write into a slice of the
# raw frame (which triggers SettingWithCopyWarning).
transaction_data_sales = transaction_data[transaction_data['transaction_type'] != 'PAYMENT'].copy()
wrld_transaction_data_sales = wrld_transaction_data[wrld_transaction_data['transaction_type'] != "PAYMENT"].copy()

# Parse the dates so the window filter compares timestamps, not strings.
transaction_data_sales['transaction_date'] = pd.to_datetime(transaction_data_sales['transaction_date'])
wrld_transaction_data_sales['transaction_date'] = pd.to_datetime(wrld_transaction_data_sales['transaction_date'])

# Rows dated 2024-08-01 through 2025-03-31, both bounds inclusive.
# NOTE(review): if transaction_date carries a time of day, rows later than
# midnight on 2025-03-31 fall outside the upper bound -- confirm.
window_start, window_end = "2024-08-01", "2025-03-31"

transaction_data_salesf = transaction_data_sales[
    (transaction_data_sales['transaction_date'] >= window_start)
    & (transaction_data_sales['transaction_date'] <= window_end)
]
wrld_transaction_data_salesf = wrld_transaction_data_sales[
    (wrld_transaction_data_sales['transaction_date'] >= window_start)
    & (wrld_transaction_data_sales['transaction_date'] <= window_end)
]


# Per-account spend for each source, summed over every non-payment row
# (no date filter). The amount column keeps its original name
# 'transaction_amt': the code that combines the two sources reads it under
# that name. The previous rename(...) calls discarded their result, so they
# never changed anything and have been removed.
total_sales_customer1 = transaction_data_sales.groupby('current_account_nbr')['transaction_amt'].sum().reset_index()
total_sales_customer2 = wrld_transaction_data_sales.groupby('current_account_nbr')['transaction_amt'].sum().reset_index()

# Same per-account sums, restricted to the date-windowed frames.
f_sales1 = transaction_data_salesf.groupby('current_account_nbr')['transaction_amt'].sum().reset_index()
f_sales2 = wrld_transaction_data_salesf.groupby('current_account_nbr')['transaction_amt'].sum().reset_index()

# Build one row per account with its total spend across BOTH sources.
# Stacking the two per-source tables and re-grouping sums the amounts of an
# account that appears in both, instead of leaving it as two separate rows.
# It also avoids the empty, misspelled 'current account nbr' column that the
# previous pre-declared frame carried along.
all_transactions = (
    pd.concat([total_sales_customer1, total_sales_customer2], ignore_index=True)
    .groupby('current_account_nbr')['transaction_amt']
    .sum()
    .reset_index()
    .rename(columns={'transaction_amt': 'total spent'})
)

# Averages derived from the total.
# Assumes the extract covers roughly one year (implied by the /12 monthly
# average) -- TODO confirm against the data's actual date range.
MONTHS_PER_YEAR = 12
QUARTERS_PER_YEAR = 4  # was 3, which is months per quarter, not quarters per year

all_transactions['total spent per month'] = all_transactions['total spent'] / MONTHS_PER_YEAR
all_transactions['total spent per quarter'] = all_transactions['total spent'] / QUARTERS_PER_YEAR

# Attach each account's spend inside the date window (both sources combined).
# rename() is assigned back rather than run in place, and the two sources are
# summed into a single lookup table before one left merge.
f_sales1 = f_sales1.rename(columns={'transaction_amt': 'total spent last 8 months'})
f_sales2 = f_sales2.rename(columns={'transaction_amt': 'total spent last 8 months'})

recent_sales = (
    pd.concat([f_sales1, f_sales2], ignore_index=True)
    .groupby('current_account_nbr')['total spent last 8 months']
    .sum()
    .reset_index()
)
all_transactions = pd.merge(all_transactions, recent_sales, on='current_account_nbr', how='left')

# Accounts with no transaction in the window get 0. Assigning the result back
# is required: calling fillna(inplace=True) on a selected column is chained
# assignment and does not update the frame under pandas Copy-on-Write.
all_transactions['total spent last 8 months'] = all_transactions['total spent last 8 months'].fillna(0)

# Align the account key name with the transaction tables so the frames can be merged.
acc_lvl_features = acc_lvl_features.rename(columns={'cu_account_nbr': 'current_account_nbr'})

# Row-level difference between the two behaviour-score columns.
acc_lvl_features['difference_behav_score'] = acc_lvl_features['rb_new_bhv_scr'] - acc_lvl_features['cu_bhv_scr']

# Average each feature per account (an account can have several rows here).
avg_util_3_months = acc_lvl_features.groupby('current_account_nbr')['ca_avg_utilz_lst_3_mnths'].mean().reset_index()
diff_behav_scr = acc_lvl_features.groupby('current_account_nbr')['difference_behav_score'].mean().reset_index()

all_transactions = pd.merge(all_transactions, avg_util_3_months, on='current_account_nbr', how='left')
all_transactions = pd.merge(all_transactions, diff_behav_scr, on='current_account_nbr', how='left')

# Accounts without a match (or with only missing values) get 0.
# The result is assigned back: column.fillna(..., inplace=True) is chained
# assignment and leaves the NaNs in place under pandas Copy-on-Write.
all_transactions['ca_avg_utilz_lst_3_mnths'] = all_transactions['ca_avg_utilz_lst_3_mnths'].fillna(0)
all_transactions['difference_behav_score'] = all_transactions['difference_behav_score'].fillna(0)

all_transactions = all_transactions.rename(columns={'total spent': 'total spent year'})

In [None]:
# Show the assembled per-account table; as the cell's only expression it is
# rendered as the cell output.
all_transactions

In [6]:
# XGBRegressor API reference:
# https://xgboost.readthedocs.io/en/latest/python/python_api.html#xgboost.XGBRegressor
#
# 'total spent year' and 'total spent per month' are deliberately NOT used as
# features: the target 'total spent per quarter' is computed as a constant
# fraction of the yearly total, so either column reveals the target exactly
# (target leakage) and would make every score below meaningless.
x = all_transactions[['total spent last 8 months', 'ca_avg_utilz_lst_3_mnths', 'difference_behav_score']]
y = all_transactions[['total spent per quarter']]

# 80/20 hold-out split; the fixed seed keeps the split identical across runs.
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size = 0.20, random_state = 50)

# Baseline model before hyperparameter tuning.
xgboost_model = XGBRegressor(n_estimators = 200, learning_rate = 0.1)
xgboost_model.fit(X_train, Y_train)
predictions = xgboost_model.predict(X_test)

In [None]:
# Coefficient of determination of the baseline model on the hold-out set.
r_squared = r2_score(y_true=Y_test, y_pred=predictions)
r_squared

In [None]:
# Root-mean-squared error of the baseline model, in the target's own units.
mse = mean_squared_error(y_true=Y_test, y_pred=predictions)
rmse = np.sqrt(mse)
rmse

In [None]:
# Tuning approach adapted from:
# https://medium.com/@rithpansanga/optimizing-xgboost-a-guide-to-hyperparameter-tuning-77b6e48e289d
#
# scipy conventions for the distributions below:
#   stats.randint(low, high)   -> integers in [low, high)
#   stats.uniform(loc, scale)  -> floats in [loc, loc + scale]
param_dist = {
    'max_depth': stats.randint(3, 10),           # 3 .. 9
    'learning_rate': stats.uniform(0.01, 0.1),   # 0.01 .. 0.11
    'subsample': stats.uniform(0.5, 0.5),        # 0.5 .. 1.0
    'n_estimators': stats.randint(50, 200)       # 50 .. 199
}

# 30 sampled configurations, each scored with 5-fold cross-validation.
# The scorer must be a regression metric: 'accuracy' is a classification
# metric and cannot be computed for a continuous target. scikit-learn
# maximises the score, hence the negated RMSE.
# random_state makes the sampled configurations repeatable across runs.
random_search = RandomizedSearchCV(
    xgboost_model,
    param_distributions = param_dist,
    n_iter = 30,
    cv = 5,
    scoring = 'neg_root_mean_squared_error',
    random_state = 50,
)
random_search.fit(X_train, Y_train)

In [None]:
# Hyperparameter combination with the best mean cross-validated score.
random_search.best_params_

In [11]:
# Refit XGBoost with a fixed set of hyperparameters.
# NOTE(review): the values look copied from an earlier run of the randomized
# search -- confirm they still match random_search.best_params_.
tuned_params = {
    'n_estimators': 50,
    'learning_rate': 0.08866319931044156,
    'max_depth': 9,
    'subsample': 0.8036166601891328,
}

# Same test size and seed as the baseline split.
X_train2, X_test2, Y_train2, Y_test2 = train_test_split(
    x,
    y,
    test_size = 0.20,
    random_state = 50,
)

xgboost_model = XGBRegressor(**tuned_params)
xgboost_model.fit(X_train2, Y_train2)
predictions2 = xgboost_model.predict(X_test2)

In [None]:
# Coefficient of determination of the refitted model on the hold-out set.
r_squared2 = r2_score(y_true=Y_test2, y_pred=predictions2)
r_squared2

In [None]:
# Root-mean-squared error of the refitted model, in the target's own units.
mse2 = mean_squared_error(y_true=Y_test2, y_pred=predictions2)
rmse2 = np.sqrt(mse2)
rmse2

In [None]:
# Normalised RMSE: the error expressed as a fraction of the spread
# (max - min) of the hold-out target values.
y_max, y_min = Y_test2.max(), Y_test2.min()
range_y2 = y_max - y_min
n_rmse2 = rmse2 / range_y2
n_rmse2

In [None]:
# Mean absolute error of the refitted model on the hold-out set.
mae = mean_absolute_error(y_true=Y_test2, y_pred=predictions2)
mae