In [None]:
#Importing libraries
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
! pip install catboost
from catboost import CatBoostClassifier
from sklearn.calibration import CalibratedClassifierCV

from sklearn.metrics import roc_auc_score

from time import time
import joblib


In [None]:
# Mount Google Drive at /content/drive (Colab only); the CSV and pickle paths
# read in the later cells all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read the held-out test points from the mounted Drive folder and report
# the frame's (rows, columns).
test = pd.read_csv('/content/drive/MyDrive/CS1_bo/testpoints.csv')
print(test.shape)

(385987, 24)


In [None]:
# Separate the label column from the features; 'Unnamed: 0' (the column
# read_csv created for the CSV's unnamed first column) is discarded as well.
y = test['went_on_backorder']
X = test.drop(columns=['went_on_backorder', 'Unnamed: 0'])
print(X.shape, y.shape)

(385987, 22) (385987,)


In [None]:
# Positions of the raw features inside an *unlabelled* row (list / ndarray).
# Position 0 is never read. These positions are used only when X carries no
# column labels; labelled input (pandas Series, dict) is read by name instead.
_FEATURE_POSITIONS={'national_inv':1,
                    'lead_time':2,
                    'in_transit_qty':3,
                    'forecast_3_month':4,
                    'sales_3_month':8,
                    'min_bank':11,
                    'pieces_past_due':13,
                    'perf_6_month_avg':14,
                    'perf_12_month_avg':15,
                    'deck_risk':17,
                    'stop_auto_buy':20}

# model_dir -> (scaler, base_models, meta); filled lazily so the 17 pickles are
# read from disk once per kernel instead of once per predicted point.
_PREDICT_ARTIFACTS={}


def _load_predict_artifacts(model_dir):
  '''Return (scaler, base_models, meta) for model_dir, loading them on first use only.'''
  if model_dir not in _PREDICT_ARTIFACTS:
    # NOTE: joblib.load unpickles arbitrary objects - only use trusted model files.
    scaler=joblib.load(model_dir+'/scaler.pkl')
    base_models=[joblib.load(model_dir+'/model'+str(i)+'.pkl') for i in range(1,16)]
    meta=joblib.load(model_dir+'/metaclf_.pkl')
    _PREDICT_ARTIFACTS[model_dir]=(scaler,base_models,meta)
  return _PREDICT_ARTIFACTS[model_dir]


def predict(X,model_dir='/content/drive/MyDrive/CS1_bo',threshold=0.00815):
  '''Preprocess a single datapoint and classify it with the pretrained stacked model.

  Parameters
  ----------
  X : pandas Series / dict keyed by column name, or a plain sequence.
      Labelled input is read by feature name, so extra columns (for example a
      leading 'Unnamed: 0') cannot shift the features. Unlabelled input is read
      by the positions listed in _FEATURE_POSITIONS.
  model_dir : directory holding scaler.pkl, model1.pkl..model15.pkl and metaclf_.pkl.
  threshold : the meta-model probability at or above which 1 is returned.

  Returns
  -------
  numpy.int32 : 1 if the meta-model probability of class 1 is >= threshold, else 0.

  Side effects: prints the prediction and the elapsed time. Loaded models are
  cached per model_dir, so files changed on disk are not picked up until the
  kernel (or the cache dict) is reset.
  '''

  start=time()

  # values for missing value imputation
  impute={'deck_risk': 0.0,
          'forecast_3_month': 0.0,
          'in_transit_qty': 0.0,
          'lead_time': 8.0,
          'min_bank': 0.0,
          'national_inv': 15.0,
          'perf_12_month_avg': 0.83,
          'perf_6_month_avg': 0.722,
          'pieces_past_due': 0.0,
          'sales_3_month': 0.48,
          'stop_auto_buy': 1.0}
  # columns that get the sign-preserving log transform
  skewed=['in_transit_qty','forecast_3_month','sales_3_month',
          'min_bank','pieces_past_due','reorder_point','usable_stock']
  # columns that are squared afterwards
  skewed2=['forecast_3_month','sales_3_month','min_bank','perf_6_month_avg']
  # columns passed through the fitted scaler, in this order
  to_scale=['national_inv', 'lead_time', 'in_transit_qty',
      'forecast_3_month', 'sales_3_month',
      'min_bank', 'pieces_past_due']
  # column order of the vector handed to the base models
  all_cols=['national_inv', 'lead_time', 'in_transit_qty',
       'forecast_3_month', 'sales_3_month', 'min_bank', 'pieces_past_due',
       'perf_6_month_avg', 'perf_12_month_avg', 'deck_risk', 'stop_auto_buy',
        'reorder_point', 'usable_stock', 'neg_stock','zero_stock', 'min_stock']

  scaler,base_models,meta=_load_predict_artifacts(model_dir)

  # Read the raw values: by label when every feature name is present, otherwise by position.
  if hasattr(X,'keys') and all(name in X.keys() for name in _FEATURE_POSITIONS):
    raw={name:X[name] for name in _FEATURE_POSITIONS}
  else:
    # dtype=object keeps numbers numeric even when the row also contains 'Yes'/'No' strings
    inp=np.array(X,dtype=object)
    raw={name:inp[pos] for name,pos in _FEATURE_POSITIONS.items()}

  # Encode 'Yes'/'No', then impute missing values (None, NaN, 'NaN', 'nan').
  text_codes={'Yes':1,'No':0,'NaN':np.nan,'nan':np.nan}
  features=dict()
  for name,value in raw.items():
    if isinstance(value,str):
      value=text_codes.get(value,value)
    features[name]=impute[name] if pd.isna(value) else float(value)

  # -99 is treated as a missing-value marker in the two performance averages
  for name in ('perf_6_month_avg','perf_12_month_avg'):
    if features[name]==-99:
      features[name]=impute[name]

  #feature engg (computed on the raw, untransformed values)
  features['reorder_point']=np.round(((features['sales_3_month']/30)*features['lead_time'])+features['min_bank'],5)
  features['usable_stock']=np.round(features['national_inv']-features['reorder_point'],5)
  features['neg_stock']=int(features['usable_stock']<0)
  features['zero_stock']=int(features['usable_stock']==0)
  features['min_stock']=int(features['usable_stock']<features['min_bank'])

  #feature transformations
  for feat in skewed:
    features[feat]= np.round(np.log(abs(features[feat])+1)*np.sign(features[feat]),5)
  for feat in skewed2:
    features[feat]=np.round((features[feat])**2,5)

  features['pieces_past_due']=np.round((features['pieces_past_due'])**4,5)
  features['usable_stock']=np.round(((features['usable_stock'])**2)*np.sign(features['usable_stock']),5)

  #scaling
  scaled=scaler.transform(np.array([features[v] for v in to_scale]).reshape(1,-1))
  for i,v in enumerate(to_scale):
    features[v]=np.round(scaled[0][i],6)

  #preprocessed data as a single-row matrix
  preprocessed=np.array([features[value] for value in all_cols]).reshape(1,-1)

  #prediction by base models: class-1 probability from each of them
  base_predictions=[model.predict_proba(preprocessed)[0][1] for model in base_models]

  #meta model prediction on the stacked base probabilities
  probability=meta.predict_proba(np.array(base_predictions).reshape(1,-1))[0][1]
  prediction=np.int32(probability>=threshold)
  print('prediction:',prediction )
  print('time taken: %0.2f seconds'%(time()-start))

  return prediction

In [None]:
#predict a point
# `test` still carries the CSV index column 'Unnamed: 0' in position 0, which
# shifts every feature one place to the right compared with the feature-only
# layout of X. Drop it from the row so the values line up for predict().
predict(test.iloc[0].drop('Unnamed: 0'))

prediction: 0
time taken: 2.53 seconds


0

In [None]:
def _signed_log(values):
  '''Sign-preserving log transform of a Series: sign(v) * log(|v| + 1).'''
  return np.log(values.abs()+1)*np.sign(values)


def evaluate(X,y,model_dir='/content/drive/MyDrive/CS1_bo'):
  '''Preprocess a dataframe, score it with the pretrained stacked model and compute ROC-AUC.

  Parameters
  ----------
  X : pandas DataFrame containing at least the raw feature columns listed in
      `impute`. It is NOT modified; all work happens on a copy.
  y : labels aligned row-by-row with X. Labels of rows that get dropped for
      having fewer than 5 non-null values are dropped as well.
  model_dir : directory holding scaler.pkl, model1.pkl..model15.pkl and metaclf_.pkl.

  Returns
  -------
  (score, yhat) : ROC-AUC of the meta-model probabilities and the probabilities
      themselves (one per retained row).

  Side effects: prints the elapsed time and dumps yhat to
  <model_dir>/test_prediction_<unix time>.pkl.
  '''
  start=time()
  #columns fed to the base models, in this order
  all_cols=['national_inv', 'lead_time', 'in_transit_qty',
       'forecast_3_month', 'sales_3_month', 'min_bank', 'pieces_past_due',
       'perf_6_month_avg', 'perf_12_month_avg', 'deck_risk', 'stop_auto_buy',
        'reorder_point', 'usable_stock', 'neg_stock','zero_stock', 'min_stock']
  #values for imputation
  impute={'deck_risk': 0.0,
          'forecast_3_month': 0.0,
          'in_transit_qty': 0.0,
          'lead_time': 8.0,
          'min_bank': 0.0,
          'national_inv': 15.0,
          'perf_12_month_avg': 0.83,
          'perf_6_month_avg': 0.722,
          'pieces_past_due': 0.0,
          'sales_3_month': 0.48,
          'stop_auto_buy': 1.0,
            }
  #features to scale
  to_scale=['national_inv', 'lead_time', 'in_transit_qty',
      'forecast_3_month', 'sales_3_month',
      'min_bank', 'pieces_past_due']

  # NOTE: joblib.load unpickles arbitrary objects - only use trusted model files.
  scaler=joblib.load(model_dir+'/scaler.pkl')
  base_models=[joblib.load(model_dir+'/model'+str(i)+'.pkl') for i in range(1,16)]
  meta=joblib.load(model_dir+'/metaclf_.pkl')

  #drop rows with fewer than 5 non-null values (same rule as dropna(thresh=5)),
  #working on a copy so the caller's frame stays untouched and re-runs are idempotent
  keep=(X.notna().sum(axis=1)>=5).to_numpy()
  data=X.loc[keep].copy()
  #keep the labels aligned with the rows that survived the filter
  y_true=np.asarray(y)
  if len(y_true)==len(keep):
    y_true=y_true[keep]

  #replace 'Yes'/'No' with 1/0 & missing value imputation
  for feat,fill in impute.items():
    column=data[feat]
    if not pd.api.types.is_numeric_dtype(column):
      column=pd.to_numeric(column.astype(object).replace({'Yes': 1,'No': 0}))
    # plain assignment instead of chained inplace fillna, which is a no-op under copy-on-write
    data[feat]=column.fillna(fill)

  # -99 is treated as a missing-value marker in the two performance averages
  data['perf_6_month_avg'] = data['perf_6_month_avg'].replace([-99],impute['perf_6_month_avg'])
  data['perf_12_month_avg'] = data['perf_12_month_avg'].replace([-99],impute['perf_12_month_avg'])

  #feature engg (computed on the raw, untransformed values)
  data['reorder_point']=np.round(((data['sales_3_month']/30)*data['lead_time'])+data['min_bank'],5)
  data['usable_stock']=np.round(data['national_inv']-data['reorder_point'],5)
  data['neg_stock']=(data['usable_stock']<0).astype('int32')
  data['zero_stock']=(data['usable_stock']==0).astype('int32')
  data['min_stock']=(data['usable_stock']<data['min_bank']).astype('int32')

  #feature transformations (vectorised; same formulas as the per-element lambdas they replace)
  data['in_transit_qty']=_signed_log(data['in_transit_qty'])
  data['forecast_3_month']=_signed_log(data['forecast_3_month'])**2
  data['sales_3_month']=_signed_log(data['sales_3_month'])**2
  data['min_bank']=_signed_log(data['min_bank'])**2
  data['pieces_past_due']=_signed_log(data['pieces_past_due'])**4
  data['perf_6_month_avg']=data['perf_6_month_avg']**2
  data['reorder_point']=_signed_log(data['reorder_point'])
  logged_stock=_signed_log(data['usable_stock'])
  data['usable_stock']=(logged_stock**2)*np.sign(logged_stock)

  data=data.loc[:,all_cols].copy()
  data[to_scale]=scaler.transform(data[to_scale])

  #predictions: one column of class-1 probabilities per base model
  base_predictions=np.column_stack([model.predict_proba(data)[:,1] for model in base_models])
  yhat=meta.predict_proba(base_predictions)[:,1]
  score=roc_auc_score(y_true,yhat)
  print('time taken: %0.2f seconds'%(time()-start))
  joblib.dump(yhat,model_dir+'/test_prediction_'+str(int(time()))+'.pkl')


  return score,yhat


In [None]:
# Score the whole test set with the stacked model and report its ROC-AUC.
score, yhat = evaluate(X, y)
print('ROC AUC :', score)

time taken: 233.11 seconds
ROC AUC : 0.9639302290240149
