In [1]:
# Work from the training_pipeline directory so the relative paths used in later
# cells (./configs/..., ../feature_pipeline/) resolve.
# NOTE(review): assumes the repo is checked out under ~/okcredit_github — adjust per machine.
%cd "~/okcredit_github/retraining_pipeline/training_pipeline"

/home/nikhilmishra_okcredit_in/okcredit_github/retraining_pipeline/training_pipeline


In [34]:
import pandas as pd
import numpy as np
import sys
import os
from omegaconf import OmegaConf
from okcredit_ml.ml_pipeline import classifier, model, validation

from okcredit_ml.cloud_connectors import gcp
from copy import deepcopy
from src import utils

sys.path.append('../feature_pipeline/')
sys.path.append('../feature_pipeline/svc')
from OKCFeaturePipeline import OKCFeaturePipeline

from pathlib import Path
from google.cloud import storage
import joblib

# Client from okcredit_ml's GCP connectors (BQPy — presumably a BigQuery wrapper; confirm).
# Later cells use it to delete the pipeline sink table and to read query results into pandas.
gcp_bq = gcp.BQPy(project_id='okcredit-data-science')

In [35]:
# Load the YAML configs; both paths are relative to the current working directory.
# data_config supplies `create_features_for_files` and `config_path`, read when the query is built.
# feature_pipe_config is the template passed to utils.create_config_from_template.
data_config = OmegaConf.load('./configs/data_config.yaml')
feature_pipe_config = OmegaConf.load("./configs/feature_pipeline_config_template.yaml")

In [36]:
# Settings for this particular run, named once so they are easy to spot and change.
COHORT_TABLE = 'okcredit-data-science.nikhil.all_merchants_0_to_7'
RUN_DATE = '2022-08-16'
GCP_PROJECT = 'okcredit-data-science'

# Build the run config from the template for the '0_to_7' variant and point its
# cohort base table at COHORT_TABLE.
config = utils.create_config_from_template(feature_pipe_config, '0_to_7')
config.basetables.cohort = COHORT_TABLE

# Delete the old feature pipeline output table if it exists, since the feature
# pipeline always appends to the table.
# NOTE(review): confirm gcp_bq.delete_table tolerates a table that does not exist yet.
gcp_bq.delete_table(config.sink)

# Run the feature pipeline for the single run date.
pipeline = OKCFeaturePipeline(config=config)
pipeline.predict(X=[RUN_DATE], names=['run_date'])
print('Completed feature pipeline')

print('Cleaning up temporary tables')
utils.cleanup_tables(project=GCP_PROJECT, config=config)

In [37]:
%%time
# Select every row of this run's sink table whose merchant_id does not appear in the
# sink table of any per-file feature pipeline listed in data_config.
# NOTE(review): `NOT IN (subquery)` matches no rows at all if the subquery returns a NULL
# merchant_id — confirm those sink tables never contain NULL ids.
exclusion_clauses = []
for df_name in data_config.create_features_for_files:
    # Each listed file has its own feature-pipeline config; only its sink table name is needed.
    feature_config = OmegaConf.load(
        os.path.join(data_config.config_path, f'feature_pipeline_config_{df_name}.yaml')
    )
    exclusion_clauses.append(
        f'merchant_id NOT IN (SELECT merchant_id FROM `{feature_config.sink}`)\n'
    )

query = f'SELECT * FROM `{config.sink}`\n'
if exclusion_clauses:
    # Emit WHERE only when there is something to filter on: with an empty file list a
    # dangling WHERE would make the statement invalid SQL.
    query += 'WHERE\n' + 'AND \n'.join(exclusion_clauses)

fts_df = gcp_bq.bq_to_pandas(query)

CPU times: user 21.2 s, sys: 8.54 s, total: 29.8 s
Wall time: 52.1 s


In [38]:
# Parse run_date with pd.to_datetime, overwriting the column on fts_df in place.
fts_df['run_date'] = pd.to_datetime(fts_df['run_date'])

In [39]:
# The training config records where the trained model artifact was saved locally.
train_config = OmegaConf.load('./configs/train_config.yaml')
model_output_cfg = train_config.output_dir.model

# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code —
# only load artifacts produced by a trusted training run.
model_local_file_path = os.path.join(model_output_cfg.local, model_output_cfg.save_name)
classifier_obj = joblib.load(model_local_file_path)

# Score the whole feature frame and store the predictions next to the features.
# NOTE(review): the full frame (including run_date) is passed to predict — presumably the
# classifier object selects its own feature columns; confirm.
model_preds = classifier_obj.predict(fts_df)
fts_df['model_preds'] = model_preds

In [40]:
def get_approval_and_threshold(test_df, pred_col='model_preds', step=5):
    """Tabulate the score threshold that corresponds to each approval percentage.

    For every percentage p in 0, step, 2*step, ... (never exceeding 100) the
    threshold is the p-th percentile of ``test_df[pred_col]``.

    Args:
        test_df: DataFrame holding the model scores.
        pred_col: Name of the score column. Defaults to 'model_preds'.
        step: Spacing, in percentage points, between consecutive rows.
            Defaults to 5, which yields 0, 5, ..., 100.

    Returns:
        DataFrame with one row per percentage and two columns:
        '0 to 7% approval' (the percentage) and 'threshold' (the matching
        quantile of the scores).

    Raises:
        ValueError: If ``step`` is not positive.
        KeyError: If ``pred_col`` is not a column of ``test_df``.
    """
    if step <= 0:
        raise ValueError(f'step must be positive, got {step}')

    # Upper bound 101 keeps 100 itself in range without ever overshooting it,
    # even when step does not divide 100 (quantile rejects values above 1).
    pct_approval_range = np.arange(0, 101, step)

    # A single vectorised quantile call replaces one call per percentage.
    thresholds = test_df[pred_col].quantile(pct_approval_range / 100).to_numpy()

    return pd.DataFrame({
        '0 to 7% approval': pct_approval_range,
        'threshold': thresholds,
    })

In [41]:
# Build the approval-percentage / threshold table from the scored rows in fts_df.
res_df = get_approval_and_threshold(fts_df)

In [42]:
# Display the table: one row per approval percentage with its score threshold.
res_df

Unnamed: 0,0 to 7% approval,threshold
0,0,0.03024
1,5,0.035185
2,10,0.038121
3,15,0.040682
4,20,0.042973
5,25,0.045059
6,30,0.047176
7,35,0.049338
8,40,0.051637
9,45,0.054126
