# Use Credit Risk Analytics Notebook Template

In [1]:
from fosforio import snowflake
import pandas as pd
import datetime as dt

Connection manager service url initialised to http://fdc-project-manager:80/project-manager
If you need to update its value then update the variable CONNECTION_MANAGER_BASE_URL in os env.


In [2]:
import seaborn as sns
import subprocess

# create_temp_table warning suppresion
import warnings; warnings.simplefilter('ignore')

Matplotlib created a temporary config/cache directory at /tmp/matplotlib-6wlic7cv because the default path (/home/mosaic-ai/.cache/matplotlib) is not a writable directory; it is highly recommended to set the MPLCONFIGDIR environment variable to a writable directory, in particular to speed up the import of Matplotlib and to better support multiprocessing.


In [3]:
from fosforml import *
from fosforml.constants import MLModelFlavours
import requests

In [4]:
# Import label encoder 
from sklearn import preprocessing 
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from math import sqrt
import numpy as np

In [5]:
# To get snowflake connection object with a default snowflake connection created by the user, if available.
#snowflake.get_connection()

# To get snowflake connection object with a specific connection name
snowflake.get_connection(connection_name="FDC_Banking_FS_SNOWFLAKE")

Exception occurred in getting snowflake connection: 'connectionSources'


In [6]:
# To read a specific dataset published from a snowflake connection
df_original = snowflake.get_dataframe("ATM_TRANSACTION_MASTER_DATA")
df = snowflake.get_dataframe("ATM_TRANSACTION_MASTER_DATA")

In [7]:
df.shape

(328860, 11)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 328860 entries, 0 to 328859
Data columns (total 11 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   DATE                     328860 non-null  object 
 1   WEEKDAY_FLAG             328860 non-null  object 
 2   HOLIDAY_FLAG             328860 non-null  object 
 3   BANK_ID                  328860 non-null  int8   
 4   STATE                    328860 non-null  object 
 5   BANK_NAME                328860 non-null  object 
 6   ATM_ID                   328860 non-null  object 
 7   DISPENSED_AMOUNT         328860 non-null  float64
 8   TOTAL_TRANSACTION_COUNT  328860 non-null  float64
 9   DOWNTIME_IN_MINS         328860 non-null  float64
 10  ATM_MAX_CAPACITY         328860 non-null  float64
dtypes: float64(4), int8(1), object(6)
memory usage: 25.4+ MB


In [9]:
df.head()

Unnamed: 0,DATE,WEEKDAY_FLAG,HOLIDAY_FLAG,BANK_ID,STATE,BANK_NAME,ATM_ID,DISPENSED_AMOUNT,TOTAL_TRANSACTION_COUNT,DOWNTIME_IN_MINS,ATM_MAX_CAPACITY
0,2023-01-29,Y,N,3,Maine,Citibank,TBH000274025,225180.0,90.0,0.0,2860000.0
1,2023-01-30,Y,N,3,Maine,Citibank,TBH000274025,262440.0,117.0,0.0,2860000.0
2,2023-01-31,Y,N,3,Maine,Citibank,TBH000274025,561150.0,160.0,0.0,2860000.0
3,2023-02-01,Y,N,3,Maine,Citibank,TBH000274025,437220.0,129.0,0.0,2860000.0
4,2023-02-02,Y,N,3,Maine,Citibank,TBH000274025,360900.0,119.0,0.0,2860000.0


In [10]:
df.columns

Index(['DATE', 'WEEKDAY_FLAG', 'HOLIDAY_FLAG', 'BANK_ID', 'STATE', 'BANK_NAME',
       'ATM_ID', 'DISPENSED_AMOUNT', 'TOTAL_TRANSACTION_COUNT',
       'DOWNTIME_IN_MINS', 'ATM_MAX_CAPACITY'],
      dtype='object')

In [11]:
df.isnull().sum()

DATE                       0
WEEKDAY_FLAG               0
HOLIDAY_FLAG               0
BANK_ID                    0
STATE                      0
BANK_NAME                  0
ATM_ID                     0
DISPENSED_AMOUNT           0
TOTAL_TRANSACTION_COUNT    0
DOWNTIME_IN_MINS           0
ATM_MAX_CAPACITY           0
dtype: int64

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 328860 entries, 0 to 328859
Data columns (total 11 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   DATE                     328860 non-null  object 
 1   WEEKDAY_FLAG             328860 non-null  object 
 2   HOLIDAY_FLAG             328860 non-null  object 
 3   BANK_ID                  328860 non-null  int8   
 4   STATE                    328860 non-null  object 
 5   BANK_NAME                328860 non-null  object 
 6   ATM_ID                   328860 non-null  object 
 7   DISPENSED_AMOUNT         328860 non-null  float64
 8   TOTAL_TRANSACTION_COUNT  328860 non-null  float64
 9   DOWNTIME_IN_MINS         328860 non-null  float64
 10  ATM_MAX_CAPACITY         328860 non-null  float64
dtypes: float64(4), int8(1), object(6)
memory usage: 25.4+ MB


In [13]:
df['DATE'] = pd.to_datetime(df['DATE'])
df['HOUR'] = df['DATE'].dt.hour
df['DAYOFWEEK'] = df['DATE'].dt.dayofweek
df['QUARTER'] = df['DATE'].dt.quarter
df['MONTH'] = df['DATE'].dt.month
df['YEAR'] = df['DATE'].dt.year
df['DAYOFYEAR'] = df['DATE'].dt.dayofyear
df['DAYOFMONTH'] = df['DATE'].dt.day
df['DATE'] = pd.to_datetime(df['DATE']).dt.strftime("%Y-%m-%d %H:%M:%S.%f")

In [14]:
df['WEEKDAY_FLAG'] = df['WEEKDAY_FLAG'].map({'Y': 1, 'N': 0})
df['HOLIDAY_FLAG'] = df['HOLIDAY_FLAG'].map({'Y': 1, 'N': 0})

In [15]:
df.head()

Unnamed: 0,DATE,WEEKDAY_FLAG,HOLIDAY_FLAG,BANK_ID,STATE,BANK_NAME,ATM_ID,DISPENSED_AMOUNT,TOTAL_TRANSACTION_COUNT,DOWNTIME_IN_MINS,ATM_MAX_CAPACITY,HOUR,DAYOFWEEK,QUARTER,MONTH,YEAR,DAYOFYEAR,DAYOFMONTH
0,2023-01-29 00:00:00.000000,1,0,3,Maine,Citibank,TBH000274025,225180.0,90.0,0.0,2860000.0,0,6,1,1,2023,29,29
1,2023-01-30 00:00:00.000000,1,0,3,Maine,Citibank,TBH000274025,262440.0,117.0,0.0,2860000.0,0,0,1,1,2023,30,30
2,2023-01-31 00:00:00.000000,1,0,3,Maine,Citibank,TBH000274025,561150.0,160.0,0.0,2860000.0,0,1,1,1,2023,31,31
3,2023-02-01 00:00:00.000000,1,0,3,Maine,Citibank,TBH000274025,437220.0,129.0,0.0,2860000.0,0,2,1,2,2023,32,1
4,2023-02-02 00:00:00.000000,1,0,3,Maine,Citibank,TBH000274025,360900.0,119.0,0.0,2860000.0,0,3,1,2,2023,33,2


In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 328860 entries, 0 to 328859
Data columns (total 18 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   DATE                     328860 non-null  object 
 1   WEEKDAY_FLAG             328860 non-null  int64  
 2   HOLIDAY_FLAG             328860 non-null  int64  
 3   BANK_ID                  328860 non-null  int8   
 4   STATE                    328860 non-null  object 
 5   BANK_NAME                328860 non-null  object 
 6   ATM_ID                   328860 non-null  object 
 7   DISPENSED_AMOUNT         328860 non-null  float64
 8   TOTAL_TRANSACTION_COUNT  328860 non-null  float64
 9   DOWNTIME_IN_MINS         328860 non-null  float64
 10  ATM_MAX_CAPACITY         328860 non-null  float64
 11  HOUR                     328860 non-null  int64  
 12  DAYOFWEEK                328860 non-null  int64  
 13  QUARTER                  328860 non-null  int64  
 14  MONT

In [17]:
# how to understand word labels. 
label_encoder = preprocessing.LabelEncoder() 
  
# Encode labels in column 'species'. 
df['STATE']= label_encoder.fit_transform(df['STATE'])

In [18]:
# how to understand word labels. 
atm_encoder = preprocessing.LabelEncoder() 
  
# Encode labels in column 'species'. 
df['ATM_ID']= atm_encoder.fit_transform(df['ATM_ID'])

In [19]:
df.drop(['DATE','BANK_NAME','TOTAL_TRANSACTION_COUNT','DOWNTIME_IN_MINS','ATM_MAX_CAPACITY'], axis=1,inplace=True)

In [20]:
df.columns

Index(['WEEKDAY_FLAG', 'HOLIDAY_FLAG', 'BANK_ID', 'STATE', 'ATM_ID',
       'DISPENSED_AMOUNT', 'HOUR', 'DAYOFWEEK', 'QUARTER', 'MONTH', 'YEAR',
       'DAYOFYEAR', 'DAYOFMONTH'],
      dtype='object')

In [21]:
features = ['WEEKDAY_FLAG', 'HOLIDAY_FLAG', 'BANK_ID', 'STATE', 'ATM_ID',
    'HOUR', 'DAYOFWEEK', 'QUARTER', 'MONTH', 'YEAR',
       'DAYOFYEAR', 'DAYOFMONTH']

In [22]:
print (df.shape)
df = df[df['DISPENSED_AMOUNT'] > 0]
print (df.shape)

(328860, 13)
(271800, 13)


In [23]:
X = df[features]
y = df['DISPENSED_AMOUNT']

In [24]:
X.head()

Unnamed: 0,WEEKDAY_FLAG,HOLIDAY_FLAG,BANK_ID,STATE,ATM_ID,HOUR,DAYOFWEEK,QUARTER,MONTH,YEAR,DAYOFYEAR,DAYOFMONTH
0,1,0,3,18,315,0,6,1,1,2023,29,29
1,1,0,3,18,315,0,0,1,1,2023,30,30
2,1,0,3,18,315,0,1,1,1,2023,31,31
3,1,0,3,18,315,0,2,1,2,2023,32,1
4,1,0,3,18,315,0,3,1,2,2023,33,2


In [25]:
y

0         225180.0
1         262440.0
2         561150.0
3         437220.0
4         360900.0
            ...   
328855    195300.0
328856    245160.0
328857    222480.0
328858    198360.0
328859    314190.0
Name: DISPENSED_AMOUNT, Length: 271800, dtype: float64

In [26]:
# Split the data into training and test sets. (0.75, 0.25) split.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, test_size=0.25)

In [27]:
print(f'Total # of sample in whole dataset: {len(X)}')
print("*****"*10)
print(f'Total # of sample in train dataset: {len(X_train)}')
print(f'Shape of X_train: {X_train.shape}')
print("*****"*10)
print(f'Total # of sample in test dataset: {len(X_test)}')
print(f'Shape of X_test: {X_test.shape}')

Total # of sample in whole dataset: 271800
**************************************************
Total # of sample in train dataset: 203850
Shape of X_train: (203850, 12)
**************************************************
Total # of sample in test dataset: 67950
Shape of X_test: (67950, 12)


# Gradient Boosting

In [28]:
tree = GradientBoostingRegressor()

In [29]:
tree.fit(X_train, y_train)

In [30]:
predictions = tree.predict(X_test)

In [31]:
print('Mean Absolute Error:', mean_absolute_error(y_test,predictions))
print('Mean Squared Error:', mean_squared_error(y_test,predictions))
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_test,predictions)))
print('r2_score:', r2_score(y_test,predictions))

Mean Absolute Error: 136512.88007116865
Mean Squared Error: 35097287722.21261
Root Mean Squared Error: 187342.70127819927
r2_score: 0.5002903979950657


In [32]:
tree.feature_importances_

array([1.67115593e-03, 1.57666815e-04, 1.16560606e-02, 0.00000000e+00,
       7.66571179e-01, 0.00000000e+00, 1.05383538e-02, 2.42004025e-03,
       7.02987088e-05, 3.52936898e-02, 5.69333205e-02, 1.14688234e-01])

In [33]:
pd.Series(tree.feature_importances_,index=features).sort_values(ascending=False)

ATM_ID          0.766571
DAYOFMONTH      0.114688
DAYOFYEAR       0.056933
YEAR            0.035294
BANK_ID         0.011656
DAYOFWEEK       0.010538
QUARTER         0.002420
WEEKDAY_FLAG    0.001671
HOLIDAY_FLAG    0.000158
MONTH           0.000070
STATE           0.000000
HOUR            0.000000
dtype: float64

In [34]:
param_grid = [{"max_depth":[3,4,5, None], "max_features":[3,4,5,6,7]}]

In [35]:
gs = GridSearchCV(estimator=GradientBoostingRegressor(random_state=123),param_grid = param_grid,cv=10)

In [None]:
gs.fit(X_train, y_train)

In [None]:
gs.cv_results_['params']

In [None]:
gs.cv_results_['rank_test_score']

In [None]:
gs.best_estimator_

In [41]:
#Build Tree using Best parameters

tree1 = GradientBoostingRegressor(max_features=7, random_state=123)
tree1.fit(X_train, y_train)
predictions = tree1.predict(X_test)
y_pred = predictions

In [42]:
X_train.columns

Index(['WEEKDAY_FLAG', 'HOLIDAY_FLAG', 'BANK_ID', 'STATE', 'ATM_ID', 'HOUR',
       'DAYOFWEEK', 'QUARTER', 'MONTH', 'YEAR', 'DAYOFYEAR', 'DAYOFMONTH'],
      dtype='object')

In [43]:
print('Mean Absolute Error:', mean_absolute_error(y_test,predictions))
print('Mean Squared Error:', mean_squared_error(y_test,predictions))
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_test,predictions)))
print('r2_score:', r2_score(y_test,predictions))

Mean Absolute Error: 5537.898079470197
Mean Squared Error: 179667421.2646212
Root Mean Squared Error: 13404.007656839845
r2_score: 0.9974345654457842


In [44]:
@scoring_func
def score(model, request):
    payload_dict = request.json["payload"]
    df = pd.DataFrame(payload_dict,index=[0])
    
    df['DATE'] = pd.to_datetime(df['DATE'])
    df['HOUR'] = df['DATE'].dt.hour
    df['DAYOFWEEK'] = df['DATE'].dt.dayofweek
    df['QUARTER'] = df['DATE'].dt.quarter
    df['MONTH'] = df['DATE'].dt.month
    df['YEAR'] = df['DATE'].dt.year
    df['DAYOFYEAR'] = df['DATE'].dt.dayofyear
    df['DAYOFMONTH'] = df['DATE'].dt.day
    df['DATE'] = pd.to_datetime(df['DATE']).dt.strftime("%Y-%m-%d %H:%M:%S.%f")
    df['WEEKDAY_FLAG'] = df['WEEKDAY_FLAG'].map({'Y': 1, 'N': 0})
    df['HOLIDAY_FLAG'] = df['HOLIDAY_FLAG'].map({'Y': 1, 'N': 0})

    df['STATE']= label_encoder.transform(df['STATE'])
    df['ATM_ID']= atm_encoder.transform(df['ATM_ID'])
    
    if 'TOTAL_TRANSACTION_COUNT' in df.columns:
        df.drop(['TOTAL_TRANSACTION_COUNT'] ,axis=1, inplace=True)
    
    if 'DOWNTIME_IN_MINS' in df.columns:
        df.drop(['DOWNTIME_IN_MINS'] ,axis=1, inplace=True)
        
    if 'ATM_MAX_CAPACITY' in df.columns:
        df.drop(['ATM_MAX_CAPACITY'] ,axis=1, inplace=True)

    df.drop(['DATE','BANK_NAME'], axis=1,inplace=True)
    
    features = ['WEEKDAY_FLAG', 'HOLIDAY_FLAG', 'BANK_ID', 'STATE', 'ATM_ID','HOUR', 'DAYOFWEEK', 
                'QUARTER', 'MONTH', 'YEAR','DAYOFYEAR', 'DAYOFMONTH']

    data = df[features]

    y_pred = model.predict(data)
    return y_pred

In [45]:
df_original_1 = df_original.copy()
payload  = df_original.iloc[0].to_dict()
#payload1  = df_original.iloc[0].to_dict()
payload

{'DATE': '2023-01-29',
 'WEEKDAY_FLAG': 'Y',
 'HOLIDAY_FLAG': 'N',
 'BANK_ID': 3,
 'STATE': 'Maine',
 'BANK_NAME': 'Citibank',
 'ATM_ID': 'TBH000274025',
 'DISPENSED_AMOUNT': 225180.0,
 'TOTAL_TRANSACTION_COUNT': 90.0,
 'DOWNTIME_IN_MINS': 0.0,
 'ATM_MAX_CAPACITY': 2860000.0}

In [46]:
print ('{ "payload": ', payload, "}")

{ "payload":  {'DATE': '2023-01-29', 'WEEKDAY_FLAG': 'Y', 'HOLIDAY_FLAG': 'N', 'BANK_ID': 3, 'STATE': 'Maine', 'BANK_NAME': 'Citibank', 'ATM_ID': 'TBH000274025', 'DISPENSED_AMOUNT': 225180.0, 'TOTAL_TRANSACTION_COUNT': 90.0, 'DOWNTIME_IN_MINS': 0.0, 'ATM_MAX_CAPACITY': 2860000.0} }


In [47]:
df_original_1.drop(['TOTAL_TRANSACTION_COUNT','DOWNTIME_IN_MINS','ATM_MAX_CAPACITY'], inplace=True, axis=1)
payload1  = df_original_1.iloc[0].to_dict()

In [48]:
print ('{ "payload": ', payload1, "}")

{ "payload":  {'DATE': '2023-01-29', 'WEEKDAY_FLAG': 'Y', 'HOLIDAY_FLAG': 'N', 'BANK_ID': 3, 'STATE': 'Maine', 'BANK_NAME': 'Citibank', 'ATM_ID': 'TBH000274025', 'DISPENSED_AMOUNT': 225180.0} }


In [49]:
req = requests.Request()
req.json = {"payload":payload1}
#req.json = {"payload":payload}
y_req = req
score(tree, y_req)

array([344277.48507498])

In [50]:
req = requests.Request()
req.json = {"payload":payload}
y_req = req
score(tree, y_req)

array([344277.48507498])

In [51]:
y_prediction = pd.Series(y_pred)

In [52]:
type(X_train), type(X_test), type(y_train), type(y_test), type(y_pred), type(y_prediction)

(pandas.core.frame.DataFrame,
 pandas.core.frame.DataFrame,
 pandas.core.series.Series,
 pandas.core.series.Series,
 numpy.ndarray,
 pandas.core.series.Series)

In [53]:
## registering the model in Fosfor.
model_reg = register_model(tree1,
               score, 
               name="ATM_DispenseAMT_GradientBoost_Regression", 
               description="ATM Dispense Amount GradientBoost Regression",
               flavour=MLModelFlavours.sklearn,
               model_type="regression",
               init_script="pip install snowflake-ml-python==1.0.11",
               y_true=y_test,
               y_pred=y_prediction,
               #prob=y_prob,
               features=X_train.columns,
               input_type="json", 
               explain_ai=True,
               x_train=X_train, 
               x_test=X_test, 
               y_train=y_train.tolist(),
               y_test=y_test.tolist(),
               feature_names=X_train.columns.tolist(),
               original_features=X_train.columns.tolist(),
               feature_ids=X_train.columns,
               kyd=True, kyd_score = True)

Calculating build time metrics

Progress: ██████████████████████████████████████████████████████████████████████ 100.0%


VBox(children=(HTML(value='<style>.grad_1{background: #2468a4;} .grad_2{ color:white; background: #2468a4;}</s…