In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import csv
import datetime as dt

# Data Viz 
import seaborn as sns
import matplotlib.pyplot as plt

# Data Manipulation
from sklearn.preprocessing import LabelEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Similarity calculation
from sklearn.metrics.pairwise import cosine_similarity

# Import ML libraries
# import lightgbm as lgb
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds

# settings
pd.options.display.max_rows = 100
pd.options.display.max_columns = None

# Math
import math

# Remove warnings
import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv('synthetic_data_latest.csv')
df.head()

Unnamed: 0,Company Name,CIF Number,Sector Nm,Operations Start Date,Facility product Name,Match as per rating,Rating Description,Match as per Operational status,Match as per DPD Status
0,Abbott and Sons,6262535811,Tourism,04/26/2022,EYADA,1,Very low risk,Permanently closed,Defaulted
1,Abbott and Sons,6262535811,Tourism,08/08/2020,AL Dhameen,2,Low risk,Setup,Nil
2,Abbott and Sons,6262535811,Tourism,12/22/2020,MINHA Costing Analysis & Expense Reduction,4,High risk,Temporarily Closed,Defaulted
3,Abbott Group,9332362264,Food Processing,07/07/2020,Direct Lending,1,Very low risk,Setup,Nil
4,Abbott LLC,8730879310,Services,10/22/2020,Marketing & PR,5,Very high risk,Operational,Defaulted


In [3]:
df.rename(columns={'Facility product Name':'Facility_product_Name'}, inplace=True)

In [4]:
product_list = list(df.Facility_product_Name.unique())
feature_list = ['Company_Name', 'CIF_Number', 'Sector_Nm', 'Operations_Start_Date',
                'Match_as_per_rating', 'Rating_Description', 'Match_as_per_Operational_status', 
                'Match_as_per_DPD_Status']
col_list = feature_list + product_list

In [5]:
df_new = pd.DataFrame(columns = col_list)
df_new

Unnamed: 0,Company_Name,CIF_Number,Sector_Nm,Operations_Start_Date,Match_as_per_rating,Rating_Description,Match_as_per_Operational_status,Match_as_per_DPD_Status,EYADA,AL Dhameen,MINHA Costing Analysis & Expense Reduction,Direct Lending,Marketing & PR,Business Consultancy and Mentoring,OQOOD legal services,Export Finance,ISTITHMAR (SME EQUITY)


In [6]:
df_new['Company_Name'] = df['Company Name']
df_new['CIF_Number'] = df['CIF Number']
df_new['Sector_Nm'] = df['Sector Nm']
df_new['Operations_Start_Date'] = df['Operations Start Date']
df_new['Match_as_per_rating'] = df['Match as per rating']
df_new['Rating_Description'] = df['Rating Description']
df_new['Match_as_per_Operational_status'] = df['Match as per Operational status']
df_new['Match_as_per_DPD_Status'] = df['Match as per DPD Status']

In [7]:
unique_cif = list(df_new.CIF_Number.unique())
len(unique_cif)

24350

In [8]:
df_new.fillna(0, inplace=True)
# df_new

In [9]:
%%time
for cif_num in unique_cif:
    df_sub = df[df['CIF Number']==cif_num]
    products = list(df_sub.Facility_product_Name.unique())
    index = list(df_sub[df_sub['CIF Number']==cif_num].index.values.astype(int))
    for ind, name in zip(index, products):
        df_new.at[ind, name] = 1

CPU times: user 20.2 s, sys: 1.51 ms, total: 20.2 s
Wall time: 20.2 s


In [10]:
# df1 = df[df['CIF Number']==7820159106]
# prod_list = list(df1.Facility_product_Name.unique())
# prod_list

In [11]:
# index = list(df[df['CIF Number']==7820159106].index.values.astype(int))
# index

In [12]:
# for ind, name in zip(index, prod_list):
#     df_new.at[ind, name] = 1

In [13]:
df_new = df_new[0:10000]
df_new['Company_Name'] = df_new['Company_Name'].astype('string')
df_new['CIF_Number'] = df_new['CIF_Number'].astype('string')
df_new['Sector_Nm'] = df_new['Sector_Nm'].astype('string')
df_new['Operations_Start_Date'] = df_new['Operations_Start_Date'].astype('string')
df_new['Match_as_per_rating'] = df_new['Match_as_per_rating'].astype('string')
df_new['Rating_Description'] = df_new['Rating_Description'].astype('string')
df_new['Match_as_per_Operational_status'] = df_new['Match_as_per_Operational_status'].astype('string')
df_new['Match_as_per_DPD_Status'] = df_new['Match_as_per_DPD_Status'].astype('string')

GBDT + LR
The GBDT + LR (Gradient Boosting Decision Tree + Logistic Regression) recommender is a hybrid recommendation model that combines the strengths of gradient boosting decision trees and logistic regression. It leverages the powerful feature learning capabilities of gradient boosting decision trees and the interpretability and modeling flexibility of logistic regression.

This method combines the advantages of both models. The GBDT component captures complex patterns and interactions, while the LR component provides interpretability and handles feature engineering effectively. This hybrid approach often results in improved recommendation performance and can handle large-scale datasets with high-dimensional features.

When applied in the recommendation system field, once the GBDT + LR model is trained, it can be used to generate product recommendations for targeted customers. Given a customer's profile, item features and user-item interactions, the model predicts the probability of user-item preferences. Then the items with the highest predicted probabilities are recommended to the customer.

In [14]:
# Given the product list will be used as response variable to be predicted, the train_dummy will be 
# used as customer features to the training process
train_dummy = df_new.select_dtypes(include = 'string')
train_dummy = pd.get_dummies(train_dummy)

In [15]:
X = train_dummy

In [16]:
pred = {}

In [17]:
for i in product_list:
    print(i)
    # Use the target product column as the target variable
    if df_new[i].nunique() == 2:
        
        y = df_new[i] 
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
        
        # Train the GBDT model
        gbdt_model = GradientBoostingClassifier(n_estimators=300,  # Number of boosting stages
                                           learning_rate=0.1,  # Learning rate
                                           max_depth=3,  # Maximum depth of each tree
                                           random_state=123) 
        gbdt_model.fit(X_train, y_train)
    
        # Generate GBDT features
        gbdt_features = gbdt_model.apply(X_train)[:, :, 0]
    
        # Train the LR model using the GBDT generated features
        lr_model = LogisticRegression(solver='lbfgs', C=1.0, random_state=123)

        lr_model.fit(gbdt_features, y_train)
    
        # Generate LR features
        gbdt_features_test = gbdt_model.apply(X_test)[:, :, 0]
        lr_features = lr_model.predict_proba(gbdt_features_test)[:, 1]
    
        # Store the predicted score for the product
        pred[i] = lr_features
    
    else: 
        pass

EYADA


KeyboardInterrupt: 

In [57]:
pred = pd.DataFrame(pred)
pred.head()

Unnamed: 0,Export Finance,ISTITHMAR (SME EQUITY),Company_Name,CIF_Number,EYADA,AL Dhameen,MINHA Costing Analysis & Expense Reduction,Direct Lending,Marketing & PR,Business Consultancy and Mentoring,OQOOD legal services
0,0.175103,0.076004,"Bauer, Thornton and Blake",6685864841,0.023908,0.309585,0.029609,0.182009,0.020353,0.035813,0.029517
1,0.187818,0.059247,"Ali, Mullins and Salazar",1133813567,0.028225,0.306685,0.020923,0.173729,0.011574,0.083028,0.027189
2,0.121785,0.085262,"Cruz, Cox and Payne",8056131489,0.021613,0.350492,0.01409,0.193179,0.01598,0.020906,0.360859
3,0.187818,0.059247,Aguirre-Thomas,1975959467,0.028225,0.306685,0.018647,0.173729,0.011574,0.083028,0.027189
4,0.488486,0.000201,Bradley-Bennett,3237624298,0.034403,0.003157,0.047687,0.001448,0.019523,0.048015,0.031068


In [58]:
index_list = list(X_test.index.values)

In [59]:
company_name = []
cif_number = []
for i in index_list:
    df_trunc = pd.DataFrame(df_new.iloc[i]).T
    company_name.append(df_trunc.at[i, 'Company_Name'])
    cif_number .append(df_trunc.at[i, 'CIF_Number'])

In [60]:
# df1 = pd.DataFrame(df_new.iloc[2656]).T
# com = df1.at[2656, 'Company_Name']
# cif_no = df1.at[2656, 'CIF_Number']
# com, cif_no

In [61]:
pred['Company_Name'] = company_name
pred['CIF_Number'] = cif_number

In [62]:
pred

Unnamed: 0,Export Finance,ISTITHMAR (SME EQUITY),Company_Name,CIF_Number,EYADA,AL Dhameen,MINHA Costing Analysis & Expense Reduction,Direct Lending,Marketing & PR,Business Consultancy and Mentoring,OQOOD legal services
0,0.175103,0.076004,"Bauer, Thornton and Blake",6685864841,0.023908,3.095849e-01,0.029609,0.182009,0.020353,3.581337e-02,0.029517
1,0.187818,0.059247,"Ali, Mullins and Salazar",1133813567,0.028225,3.066847e-01,0.020923,0.173729,0.011574,8.302826e-02,0.027189
2,0.121785,0.085262,"Cruz, Cox and Payne",8056131489,0.021613,3.504925e-01,0.014090,0.193179,0.015980,2.090567e-02,0.360859
3,0.187818,0.059247,Aguirre-Thomas,1975959467,0.028225,3.066847e-01,0.018647,0.173729,0.011574,8.302826e-02,0.027189
4,0.488486,0.000201,Bradley-Bennett,3237624298,0.034403,3.156600e-03,0.047687,0.001448,0.019523,4.801545e-02,0.031068
...,...,...,...,...,...,...,...,...,...,...,...
1995,0.000479,0.004841,"Cline, Turner and Franco",4924570915,0.204505,8.088428e-08,0.316580,0.000003,0.001191,2.005158e-01,0.295184
1996,0.121785,0.085262,Carey Inc,2714649531,0.021613,3.504925e-01,0.020297,0.193179,0.015980,2.090567e-02,0.024685
1997,0.175103,0.076004,Bell Ltd,9419413771,0.023908,3.095849e-01,0.028761,0.182009,0.020353,2.401491e-02,0.029517
1998,0.187818,0.059247,Clarke-Miller,9791243105,0.028225,6.869148e-01,0.020923,0.173729,0.011574,8.302826e-02,0.027189


In [53]:
pred = pred.iloc[:,[-2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8]]
pred

In [54]:
pred.to_csv('Results for 10000 trained.csv', index=False)

In [63]:
pred = pd.read_csv('Results for 10000 trained.csv')
pred

Unnamed: 0,Company_Name,CIF_Number,EYADA,AL Dhameen,MINHA Costing Analysis & Expense Reduction,Direct Lending,Marketing & PR,Business Consultancy and Mentoring,OQOOD legal services,Export Finance,ISTITHMAR (SME EQUITY)
0,"Bauer, Thornton and Blake",6685864841,0.023908,3.095849e-01,0.029609,0.182009,0.020353,3.581337e-02,0.029517,0.175103,0.076004
1,"Ali, Mullins and Salazar",1133813567,0.028225,3.066847e-01,0.020923,0.173729,0.011574,8.302826e-02,0.027189,0.187818,0.059247
2,"Cruz, Cox and Payne",8056131489,0.021613,3.504925e-01,0.014090,0.193179,0.015980,2.090567e-02,0.360859,0.121785,0.085262
3,Aguirre-Thomas,1975959467,0.028225,3.066847e-01,0.018647,0.173729,0.011574,8.302826e-02,0.027189,0.187818,0.059247
4,Bradley-Bennett,3237624298,0.034403,3.156600e-03,0.047687,0.001448,0.019523,4.801545e-02,0.031068,0.488486,0.000201
...,...,...,...,...,...,...,...,...,...,...,...
1995,"Cline, Turner and Franco",4924570915,0.204505,8.088428e-08,0.316580,0.000003,0.001191,2.005158e-01,0.295184,0.000479,0.004841
1996,Carey Inc,2714649531,0.021613,3.504925e-01,0.020297,0.193179,0.015980,2.090567e-02,0.024685,0.121785,0.085262
1997,Bell Ltd,9419413771,0.023908,3.095849e-01,0.028761,0.182009,0.020353,2.401491e-02,0.029517,0.175103,0.076004
1998,Clarke-Miller,9791243105,0.028225,6.869148e-01,0.020923,0.173729,0.011574,8.302826e-02,0.027189,0.187818,0.059247


In [24]:
# # Append the predicted results of the 2000 testing customers into the feature table and generate sample recommendation list
# # Could also use the testing dataset as input
# pred = pd.DataFrame(pred)
# test_df = pd.concat([X_test.reset_index(names = 'CIF_Number'), pred],axis = 1, ignore_index = True)

# col_list = ['CIF_Number'] + list(X_test.columns) + list(pred.columns)
# test_df.columns = col_list
# test_df['CIF_Number'] = test_df['CIF_Number'].astype('string')

In [26]:
# # Testing the Product Recommendation with three sample customers:

# def gbdt_product_recommender(df, cif_num, top_n):
    
#     cif_num = str(cif_num)
#     prod_list = df.loc[df['CIF_Number'] == cif_num, pred.columns].T
#     prod_list.columns = ['pred_score']
#     prod_list = prod_list.sort_values(by = 'pred_score', ascending = False)
    
#     # Ouput the top N recommended products based on the customer's features. If the probability is lower than 0.5 do not output (the customer would not want this one)
#     prod_list = prod_list[prod_list['pred_score'] >= 0.5]
#     recommend_list = prod_list[0:top_n]
    
#     while len(recommend_list) == 0:
#         print("Based on the customer's info, there is no bank product recommended for now")
#         break
    
#     return recommend_list

In [2]:
# Test Case 1: Customer ID "12718116", Top 10 products
gbdt_product_recommender(df = test_df, cif_num = "8018", top_n = 5)

NameError: name 'gbdt_product_recommender' is not defined