In [10]:
# Work from the notebook directory so the relative '..' and '../configs' paths used below resolve.
%cd "~/okcredit_github/retraining_pipeline/training_pipeline/nbs"

/home/nikhilmishra_okcredit_in/okcredit_github/retraining_pipeline/training_pipeline/nbs


In [73]:
import pandas as pd
import numpy as np
import sys
import os
from omegaconf import OmegaConf
from okcredit_ml.ml_pipeline import classifier, model, validation

from okcredit_ml.cloud_connectors import gcp
from copy import deepcopy


# Put the parent directory on the import path so the local `src` package can be imported.
sys.path.append('..')
from src import utils

# Put the sibling feature_pipeline folders on the import path before importing from them;
# the import below only works after these two appends.
sys.path.append('../../feature_pipeline/')
sys.path.append('../../feature_pipeline/svc')
from OKCFeaturePipeline import OKCFeaturePipeline

from pathlib import Path
from google.cloud import storage

# Presumably a BigQuery client wrapper for the data-science project — TODO confirm.
# It is not referenced again in the cells visible here.
gcp_bq = gcp.BQPy(project_id='okcredit-data-science')

In [74]:
# Load the data-location config and the feature-pipeline template config, in that order.
data_config, feature_pipe_config = map(
    OmegaConf.load,
    ('../configs/data_config.yaml', "../configs/feature_pipeline_config_template.yaml"),
)

In [75]:
# Copy every file under the configured GCS features folder into the local data directory.
gcs_cfg = data_config.data_path.gcs
utils.download_files_from_gcs_folder(
    project=gcs_cfg.project,
    bucket_name=gcs_cfg.bucket_name,
    gcs_folder_path=gcs_cfg.features,
    local_save_path=data_config.data_path.local,
)

File test_bnpl_fts.csv downloaded to ../data/test_bnpl_fts.csv
File test_tl_fts.csv downloaded to ../data/test_tl_fts.csv
File train_bnpl_fts.csv downloaded to ../data/train_bnpl_fts.csv
File train_tl_fts.csv downloaded to ../data/train_tl_fts.csv


In [172]:
# Read the tl and bnpl feature tables and the train_comb table from the local data directory.
local_dir = data_config.data_path.local
train_tl_fts, train_bnpl_fts, train_comb = (
    pd.read_csv(os.path.join(local_dir, fname))
    for fname in ('train_tl_fts.csv', 'train_bnpl_fts.csv', 'train_comb.csv')
)
# run_date is parsed to datetime so it can be used as a join key against the feature table.
train_comb['run_date'] = pd.to_datetime(train_comb['run_date'])

In [173]:
# Stack the tl and bnpl feature rows under a fresh 0..n-1 index and parse run_date to datetime.
train_fts_all = (
    pd.concat([train_tl_fts, train_bnpl_fts], axis=0, ignore_index=True)
    .assign(run_date=lambda stacked: pd.to_datetime(stacked['run_date']))
)

In [174]:
# Shape after filling NaNs with -999 and dropping rows that are duplicated across every column.
train_fts_all.fillna(-999).drop_duplicates().shape

(5770, 115)

In [175]:

# Keep one feature row per (merchant_id, run_date, target); when a key repeats, the last row wins.
key_cols = ['merchant_id', 'run_date', 'target']
train_fts_all = train_fts_all.drop_duplicates(subset=key_cols, keep='last').reset_index(drop=True)
print(train_comb.shape, train_fts_all.shape)

# Join on the key columns only. Re-running this cell against an already-merged train_comb would
# otherwise add every feature column a second time with _x/_y suffixes and break later lookups
# such as train_comb[c].
n_rows_before = len(train_comb)
train_comb = pd.merge(train_comb[key_cols], train_fts_all, on=key_cols, how='left')
# The right-hand keys are unique after the dedupe above, so a left join must not add rows.
assert len(train_comb) == n_rows_before, "left merge changed the number of rows"
print(train_comb.shape)

(5770, 3) (5769, 115)
(5770, 115)


In [176]:
# Second feature table, compared column-by-column against train_comb below.
# The path is relative, so it is resolved against the current working directory.
train_other = pd.read_csv('merchant_ids_comb.csv')

In [192]:
# Columns passed to the classifier as modelling_columns and checked in the mismatch comparison.
fts = ['pred_labels',
 'range_tnx_hour_last_90_days',
 'range_tnx_hour_last_180_days',
 'range_tnx_hour_last_365_days',
 'range_tnx_hour_last_150_days',
 'range_tnx_hour_last_120_days',
 'range_tnx_hour_last_60_days',
 'pincode_2',
 'pincode_3',
 'state',
 'total_sessions_diff_3_to_7_days',
 'seller_min_amount_ratio_15_to_30_days',
 'total_sessions_last_90_days',
 'score',
 'std_tnx_hour_last_90_days',
 'total_sessions_diff_7_to_30_days',
 'seller_min_amount_diff_15_to_30_days',
 'total_sessions_last_3_days',
 'total_sessions_last_60_days',
 'avg_tnx_hour_last_60_days',
 'seller_unique_account_ids_ratio_3_to_7_days',
 'range_tnx_hour_last_45_days',
 'avg_tnx_hour_last_45_days',
 'std_tnx_hour_last_120_days',
 'avg_tnx_hour_last_180_days',
 'total_sessions_last_30_days',
 'avg_tnx_hour_last_30_days',
 'std_tnx_hour_last_150_days',
 'total_sessions_last_45_days',
 'buyer_total_transactions_diff_150_to_180_days',
 'avg_tnx_hour_last_150_days',
 'total_sessions_last_15_days',
 'seller_avg_amount_last_15_days',
 'avg_tnx_hour_last_365_days',
 'avg_tnx_hour_last_90_days',
 'total_sessions_last_7_days',
 'seller_avg_num_transactions_per_account_id_ratio_7_to_15_days',
 'total_sessions_diff_30_to_90_days',
 'std_tnx_hour_last_180_days',
 'avg_tnx_hour_last_7_days',
 'seller_range_amount_diff_180_to_365_days',
 'range_tnx_hour_last_7_days',
 'std_tnx_dayofweek_last_150_days',
 'total_sessions_diff_15_to_60_days',
 'std_tnx_dayofweek_last_7_days',
 'avg_tnx_hour_last_120_days',
 'seller_range_amount_ratio_180_to_365_days',
 'total_sessions_diff_7_to_90_days',
 'total_sessions_ratio_3_to_7_days',
 'seller_unique_account_ids_last_30_days']

# Columns treated as categorical; every entry of `fts` not listed here is cast to float32 below.
# NOTE(review): 'city' and 'district' are not in `fts`, while 'pincode_2' and 'pincode_3' are in
# `fts` but not listed here, so they are handled as numeric — confirm both are intended.
cat_cols=['pred_labels', 'city', 'district', 'state']

In [197]:
# Modelling columns that are not categorical are numeric; store them as float32 in both frames
# so the later comparison runs on the same dtype.
num_cols = [name for name in fts if name not in cat_cols]
for c in num_cols:
    for frame in (train_comb, train_other):
        frame[c] = frame[c].astype(np.float32)

In [198]:
# Replace missing values with the same -999 sentinel in both frames, so a value missing on both
# sides does not count as a mismatch.
train_comb, train_other = (frame.fillna(-999) for frame in (train_comb, train_other))

# Per-feature number of rows whose values differ between the two frames.
# NOTE(review): rows are paired by index label, not by merchant_id/run_date — confirm both
# frames are in the same row order.
misses = {f: np.sum(train_comb[f] != train_other[f]) for f in fts}

In [199]:
# Drill into a single feature: boolean mask of the rows where the two frames disagree on it.
c = 'avg_tnx_hour_last_365_days'
fltr = train_comb[c] != train_other[c]

In [200]:
# Values of the selected feature in train_comb at the disagreeing rows.
train_comb[c][fltr]

1       11.096450
2        7.821205
3        7.916391
4        9.620403
5       10.059968
          ...    
5763     8.038825
5764    10.607505
5766     8.961100
5767    10.762291
5769     7.076483
Name: avg_tnx_hour_last_365_days, Length: 4193, dtype: float32

In [201]:
# Values of the selected feature in train_other at the same rows, for side-by-side comparison.
train_other[c][fltr]

1       11.081493
2        7.780958
3        7.914542
4        9.633314
5       10.074841
          ...    
5763     8.029166
5764    10.604863
5766     8.941449
5767    10.760653
5769     7.060959
Name: avg_tnx_hour_last_365_days, Length: 4193, dtype: float32

In [194]:
# Mismatch counts per feature, largest first.
# NOTE(review): the saved output of this cell is from execution count 194, i.e. before the cells
# numbered 197-201 above were last run (it reports 5097 mismatches for avg_tnx_hour_last_365_days
# where the mask above selects 4193 rows) — re-run to refresh it.
pd.Series(misses).sort_values(ascending=False)

avg_tnx_hour_last_365_days                                       5097
avg_tnx_hour_last_180_days                                       3429
avg_tnx_hour_last_150_days                                       3350
avg_tnx_hour_last_120_days                                       3320
avg_tnx_hour_last_90_days                                        3190
avg_tnx_hour_last_60_days                                        3054
avg_tnx_hour_last_45_days                                        2949
avg_tnx_hour_last_30_days                                        2874
seller_avg_amount_last_15_days                                   2868
std_tnx_hour_last_180_days                                       2485
std_tnx_hour_last_120_days                                       2369
std_tnx_hour_last_150_days                                       2344
std_tnx_dayofweek_last_150_days                                  2308
std_tnx_hour_last_90_days                                        2260
avg_tnx_hour_last_7_

In [159]:
# Model settings read by the training cell: the estimator class name and optional constructor kwargs.
# NOTE(review): this cell's execution count (159) is later than the training cell's (151).
training_config = OmegaConf.load('../configs/model_config.yaml')

In [151]:
# Read the optional estimator kwargs. `.get` tolerates a config without a `model_kwargs` entry,
# and to_container converts the OmegaConf node into plain dict/list values, so the estimator
# never receives DictConfig/ListConfig objects as hyper-parameters.
model_cfg = training_config.model
raw_kwargs = model_cfg.get("model_kwargs")
param_kwargs = {} if raw_kwargs is None else OmegaConf.to_container(raw_kwargs, resolve=True)

# Look the estimator class up by name on the okcredit_ml `model` module and instantiate it.
estimator_obj = getattr(model, model_cfg["class"])(**param_kwargs)
validation_scheme = validation.BlockTimeSplit(n_splits=9)

classifier_obj = classifier.Classifier(
    model=estimator_obj,
    validation=validation_scheme,
    modelling_columns=fts,
    categorical_columns=cat_cols,
    target='target',
)

# Kept as the last expression so the cell still displays whatever fit() returns.
classifier_obj.fit(train_comb, train_comb['target'])

In [152]:
# Scores recorded by the fitted classifier; the saved output has 9 values, matching n_splits=9.
# Which metric these are is decided inside Classifier — TODO confirm.
classifier_obj.oof_scores

[0.43083735909822873,
 0.6137281910009182,
 0.6846593662176567,
 0.6831211692597832,
 0.6992722964968692,
 0.573108939558451,
 0.6422162666045211,
 0.5769039074960128,
 0.6207166516109606]