In [2]:
# Switch the kernel's working directory to the training pipeline folder. The
# relative paths used in later cells ('./configs/train_config.yaml', '..',
# '../feature_pipeline/') are resolved against this directory.
%cd "~/okcredit_github/retraining_pipeline/training_pipeline/"

/home/nikhilmishra_okcredit_in/okcredit_github/retraining_pipeline/training_pipeline


In [11]:
import pandas as pd
import numpy as np
import sys
import os
from omegaconf import OmegaConf
from okcredit_ml.ml_pipeline import classifier, model, validation

from okcredit_ml.cloud_connectors import gcp
from copy import deepcopy


# Add the parent directory to sys.path so the project-local `src` package can be
# imported. The append must run before the import below.
sys.path.append('..')
from src import utils

# Add the sibling feature_pipeline directory and its svc/ subfolder to sys.path
# before importing OKCFeaturePipeline from there.
sys.path.append('../feature_pipeline/')
sys.path.append('../feature_pipeline/svc')
from OKCFeaturePipeline import OKCFeaturePipeline

from pathlib import Path
from google.cloud import storage
import joblib
import gc
from sklearn.metrics import roc_auc_score

# Client object created from the project's gcp connector for the
# 'okcredit-data-science' project (presumably a BigQuery helper — confirm).
# NOTE(review): gcp_bq is not referenced by any cell visible here — confirm it is still needed.
gcp_bq = gcp.BQPy(project_id='okcredit-data-science')

In [12]:
# Load the training configuration. Input/output paths, the model and validation
# class names with their kwargs, and the column profile used in the cells below
# are all read from this object.
train_config = OmegaConf.load('./configs/train_config.yaml')

In [13]:
# Download the contents of the configured `features` and `input_data` GCS
# folders into the local base directory. Both downloads share the same project,
# bucket and destination, so only the folder key varies between iterations.
gcs_source = train_config.input_path.gcs
for folder_key in ('features', 'input_data'):
    utils.download_files_from_gcs_folder(
        project=gcs_source.project,
        bucket_name=gcs_source.bucket_name,
        gcs_folder_path=gcs_source[folder_key],
        local_save_path=train_config.input_path.local.base,
    )

File test_bnpl_fts.csv downloaded to ./data/test_bnpl_fts.csv
File test_tl_fts.csv downloaded to ./data/test_tl_fts.csv
File train_bnpl_fts.csv downloaded to ./data/train_bnpl_fts.csv
File train_tl_fts.csv downloaded to ./data/train_tl_fts.csv
File test_bnpl.csv downloaded to ./data/test_bnpl.csv
File test_tl.csv downloaded to ./data/test_tl.csv
File train_bnpl.csv downloaded to ./data/train_bnpl.csv
File train_comb.csv downloaded to ./data/train_comb.csv
File train_tl.csv downloaded to ./data/train_tl.csv


In [6]:
# Read the training feature files listed in the config.
local_inputs = train_config.input_path.local
train_df = utils.read_multiple_dfs(local_inputs.base, local_inputs.train.features)

# When an `order` entry is configured, it drives the final row set: its rows are
# de-duplicated on the merge keys (keeping the last occurrence) and the features
# are left-joined onto them.
if 'order' in local_inputs.train:
    merge_keys = ['merchant_id', 'run_date', 'target']
    order_df = (
        utils.read_multiple_dfs(local_inputs.base, local_inputs.train.order)
        .drop_duplicates(subset=merge_keys, keep='last')
        .reset_index(drop=True)
    )
    train_df = order_df.merge(train_df, on=merge_keys, how='left')

    # Drop the intermediate frame and trigger a collection to release memory.
    del order_df
    _ = gc.collect()


In [7]:
# Estimator: look up the class named in the config on the `model` module and
# instantiate it with the optional constructor kwargs (none if unset).
model_cfg = train_config.model
param_kwargs = model_cfg.model_kwargs if model_cfg.model_kwargs is not None else {}
estimator_cls = getattr(model, model_cfg["class"])
estimator_obj = estimator_cls(**param_kwargs)

# Validation scheme: same pattern, resolved on the `validation` module.
validation_cfg = train_config.validation
validation_kwargs = validation_cfg.kwargs if validation_cfg.kwargs is not None else {}
validation_cls = getattr(validation, validation_cfg['class'])
validation_scheme = validation_cls(**validation_kwargs)

# Wire estimator and validation scheme into the project's Classifier wrapper,
# using the feature and categorical column lists from the config.
column_profile = train_config.column_profile
classifier_obj = classifier.Classifier(
    model=estimator_obj,
    validation=validation_scheme,
    modelling_columns=column_profile.features,
    categorical_columns=column_profile.categorical,
    target='target',
)

# Fit on the full training frame; the label is passed separately as well.
classifier_obj.fit(train_df, train_df['target'])

In [9]:
# Persist the fitted classifier. Nothing is written unless a local output
# directory is configured; the GCS copy additionally requires a `gcs` entry.
model_output = train_config.output_path.model
if model_output.local is not None:
    # Make sure the local output directory exists, then dump the classifier there.
    Path(model_output.local).mkdir(parents=True, exist_ok=True)
    local_file_path = os.path.join(model_output.local, model_output.save_name)
    joblib.dump(classifier_obj, local_file_path)

    gcs_target = model_output.gcs
    if gcs_target is not None:
        # Create the destination folder via the project helper, then upload the
        # dumped file under the same save name.
        utils.create_folder_in_gcs(
            project=gcs_target.project,
            bucket_name=gcs_target.bucket_name,
            folder_name=gcs_target.folder_name,
        )

        utils.copy_from_local_to_gcs(
            project=gcs_target.project,
            bucket_name=gcs_target.bucket_name,
            gcs_file_path=os.path.join(gcs_target.folder_name, model_output.save_name),
            local_file_path=local_file_path,
        )
        

Folder gs://okcredit-data-science/okc_underwriting/model_retraining_pipeline/v1_1/artifacts/ already exists in gcs
File ./artifacts/model.pkl uploaded to model_retraining_pipeline/v1_1/artifacts/model.pkl


In [10]:
# Evaluate the fitted classifier on the configured test files and store one AUC
# per source file. Runs only when a `test` entry exists in the local input config.
if 'test' in train_config.input_path.local:

    test_df = utils.read_multiple_dfs(train_config.input_path.local.base, train_config.input_path.local.test.features)
    # NOTE(review): the AUC below is computed on whatever classifier_obj.predict
    # returns — confirm these are scores/probabilities rather than hard labels.
    test_df['model_preds'] = classifier_obj.predict(test_df)

    # Select the metric columns *after* grouping so the grouping column is not
    # handed to the applied function (pandas deprecates applying over it).
    scores_df = (
        test_df.groupby('file_name')[['target', 'model_preds']]
        .apply(lambda grp: roc_auc_score(grp['target'], grp['model_preds']))
        .rename('AUC score')
        .reset_index()
    )

    output_file_name = 'test_scores.csv' if train_config.output_path.test_scores is None else train_config.output_path.test_scores.save_name

    # The local and GCS destinations are optional in the config (the model-saving
    # cell guards them the same way), so only write where a destination exists
    # instead of failing on a None path.
    model_output = train_config.output_path.model
    if model_output.local is not None:
        # Do not rely on an earlier cell having created the output directory.
        Path(model_output.local).mkdir(parents=True, exist_ok=True)
        local_file_path = os.path.join(model_output.local, output_file_name)
        scores_df.to_csv(local_file_path, index=False)

        if model_output.gcs is not None:
            utils.copy_from_local_to_gcs(
                project=model_output.gcs.project,
                bucket_name=model_output.gcs.bucket_name,
                gcs_file_path=os.path.join(model_output.gcs.folder_name, output_file_name),
                local_file_path=local_file_path
            )

File ./artifacts/test_scores.csv uploaded to model_retraining_pipeline/v1_1/artifacts/test_scores.csv
