# Part 3: Unbiased Evaluation using a New Test Set

## Import modules as needed

In [1]:
%matplotlib inline
import itertools
import os, sys

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import Pipeline

## Load smart sample and the best pipeline from Part II

In [2]:
# Restore the artifacts saved in Part II.
# NOTE: joblib.load unpickles the file contents, so only load trusted files.
# The first file holds a pair: the anomaly detection method and the pipeline model.
part2_artifacts = joblib.load('data_sample_data_v1.pkl')
iso, model = part2_artifacts
# The second file holds the dataset used for training below.
dataset = joblib.load('datasetnew.pkl')


##  Retrain a pipeline using the full sampled training data set

Use the full sampled training data set to train the pipeline.

In [3]:
# Split the training frame into a feature matrix (the first 21 columns,
# selected by position) and the went_on_backorder label vector.
feature_frame = dataset.iloc[:, 0:21]
X = np.array(feature_frame)
y = np.array(dataset['went_on_backorder'])
for arr in (X, y):
    print(arr.shape)

(22586, 21)
(22586,)


In [4]:
# Add code below this comment  (Question #E301)
# ----------------------------------
# Reduce outliers in the training set with the anomaly detection method:
# rows for which iso.predict returns -1 are treated as outliers.
iso_labels = iso.predict(X)
iso_outliers = iso_labels == -1
train_inliers = ~iso_outliers

X_iso_train, y_iso_train = X[train_inliers], y[train_inliers]
print(X_iso_train.shape)
print(y_iso_train.shape)
# Train the pipeline model on the inlier-only training sample.
# NOTE(review): fit() is called on `model` itself; model_s presumably refers to
# that same object (scikit-learn's fit conventionally returns self) - confirm.
model_s = model.fit(X_iso_train, y_iso_train)


(21403, 21)
(21403,)


### Save the trained model with joblib (a pickle-based serializer).

In [5]:
# Add code below this comment
# -----------------------------

# Persist the retrained pipeline to disk. The call is the last expression of
# the cell, so its return value is shown as the cell output.
model_path = 'model_s.pkl'
joblib.dump(model_s, model_path)


['model_s.pkl']


## Load the Testing Data and evaluate your model

 * `/dsa/data/all_datasets/back_order/Kaggle_Test_Dataset_v2.csv`
 
* We need to preprocess this test data (**follow** steps similar to those in Part I)
* **If you have fitted any normalizer/standardizer in Part 2, then we have to transform this test data using the fitted normalizer/standardizer!**

In [6]:
DATASET = 'Kaggle_Test_Dataset_v2.csv'
# Fixed seed so the row shuffle (and the preview below) is identical on every run.
SHUFFLE_SEED = 42
assert os.path.exists(DATASET), 'test CSV not found: {}'.format(DATASET)

# NOTE: this rebinds `dataset`, which until now held the training sample
# (X and y were already extracted from it above).
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids the mixed-type DtypeWarning.
dataset = (
    pd.read_csv(DATASET, sep=',', low_memory=False)
    .sample(frac=1, random_state=SHUFFLE_SEED)
    .reset_index(drop=True)
)
dataset.head()



  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,sku,national_inv,lead_time,in_transit_qty,forecast_3_month,forecast_6_month,forecast_9_month,sales_1_month,sales_3_month,sales_6_month,...,pieces_past_due,perf_6_month_avg,perf_12_month_avg,local_bo_qty,deck_risk,oe_constraint,ppap_risk,stop_auto_buy,rev_stop,went_on_backorder
0,3518906,2.0,8.0,0.0,3.0,6.0,10.0,1.0,4.0,6.0,...,0.0,0.66,0.66,0.0,No,No,No,Yes,No,No
1,3519799,2.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.54,0.71,0.0,No,No,No,Yes,No,No
2,3454288,3.0,9.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,...,0.0,0.7,0.77,0.0,No,No,No,Yes,No,No
3,3320245,21.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,15.0,...,0.0,0.84,0.8,0.0,No,No,No,Yes,No,No
4,3438836,155.0,9.0,0.0,0.0,0.0,0.0,10.0,45.0,119.0,...,0.0,0.94,0.83,0.0,No,No,Yes,Yes,No,No


In [7]:
#Preprocessing: using same steps used in part1 except undersampling
# Drop the sku column by name rather than by position, so that re-running this
# cell cannot silently drop a further column.
dataset = dataset.drop(columns=['sku'], errors='ignore')
# All the column names of these yes/no columns (every column that is not float64)
yes_no_columns = [name for name in dataset.columns if dataset[name].dtype != np.float64]
print(yes_no_columns)

# Add code below this comment  (Question #E102)
# ----------------------------------

# Check whether each of these columns contains only yes/no values

print("potential_issue:", dataset.potential_issue.unique())
print("deck_risk:", dataset.deck_risk.unique())
print("oe_constraint", dataset.oe_constraint.unique())
print("ppap_risk", dataset.ppap_risk.unique())
print("stop_auto_buy", dataset.stop_auto_buy.unique())
print("rev_stop", dataset.rev_stop.unique())
print("went_on_backorder:", dataset.went_on_backorder.unique())

# NOTE(review): went_on_backorder (the label) is imputed here as well - confirm
# that filling a missing label with the mode is intended.
for column_name in yes_no_columns:
    # Series.mode() ignores NaN by default, so the fill value is always an
    # observed category (never the string 'nan').
    mode = dataset[column_name].mode()[0]
    print('Filling missing values of {} with {}'.format(column_name, mode))
    # Assign the result back: fillna(inplace=True) on a column selection is a
    # chained assignment and does not update the frame under copy-on-write.
    dataset[column_name] = dataset[column_name].fillna(mode)

# Encode the remaining text columns as integers: No -> 0, Yes -> 1.
cat = dataset.select_dtypes(include = ['object']).columns
for col in cat:
    # Any value other than 'No'/'Yes' maps to NaN, so astype(int) fails loudly
    # instead of letting an unexpected category through.
    dataset[col] = dataset[col].map({'No': 0, 'Yes': 1}).astype(int)

['potential_issue', 'deck_risk', 'oe_constraint', 'ppap_risk', 'stop_auto_buy', 'rev_stop', 'went_on_backorder']
potential_issue: ['No' 'Yes' nan]
deck_risk: ['No' 'Yes' nan]
oe_constraint ['No' 'Yes' nan]
ppap_risk ['No' 'Yes' nan]
stop_auto_buy ['Yes' 'No' nan]
rev_stop ['No' 'Yes' nan]
went_on_backorder: ['No' 'Yes' nan]
Filling missing values of potential_issue with No
Filling missing values of deck_risk with No
Filling missing values of oe_constraint with No
Filling missing values of ppap_risk with No
Filling missing values of stop_auto_buy with Yes
Filling missing values of rev_stop with No
Filling missing values of went_on_backorder with No


In [8]:

# Replace missing lead_time values with the column mode. The result is assigned
# back because fillna(inplace=True) on a column selection is a chained
# assignment and does not update the frame under copy-on-write.
lead_time_mode = dataset['lead_time'].mode()[0]
dataset['lead_time'] = dataset['lead_time'].fillna(lead_time_mode)
# Drop every row that still contains a missing value.
dataset = dataset.dropna()
# Features are the first 21 columns (by position); the label is went_on_backorder.
test_X = np.array(dataset.iloc[:,0:21])
test_y = np.array(dataset.went_on_backorder)
print(test_X.shape)
print(test_y.shape)

(242075, 21)
(242075,)


We can now predict and evaluate with the preprocessed test set. It would be interesting to see the performance with and without outlier removal from the test set. We can report the confusion matrix, precision, recall, f1-score, accuracy, and other measures (if any). 

In [14]:
# Add code below this comment  (Question #E303)
# ----------------------------------
# Scenario 1: evaluate model_s on the test data without any outlier reduction

predicted_y1 = model_s.predict(test_X)
report_1 = classification_report(test_y, predicted_y1)
print(report_1)
# The confusion matrix is the last expression so the notebook displays it as a table.
confusion_1 = confusion_matrix(test_y, predicted_y1)
pd.DataFrame(confusion_1)


              precision    recall  f1-score   support

           0       1.00      0.91      0.95    239387
           1       0.09      0.80      0.16      2688

    accuracy                           0.90    242075
   macro avg       0.54      0.85      0.55    242075
weighted avg       0.99      0.90      0.94    242075



Unnamed: 0,0,1
0,216851,22536
1,551,2137


In [10]:
# Remove outliers from the test dataset.

# Outlier reduction on the test set: rows for which iso.predict returns -1
# are treated as outliers and excluded.

test_iso_labels = iso.predict(test_X)
iso_outliers = test_iso_labels == -1
test_inliers = ~iso_outliers

X_iso_test, y_iso_test = test_X[test_inliers], test_y[test_inliers]
for kept in (X_iso_test, y_iso_test):
    print(kept.shape)

(229635, 21)
(229635,)


In [15]:
# Scenario 2: evaluate model_s on the test data with outliers removed

predicted_y2 = model_s.predict(X_iso_test)
report_2 = classification_report(y_iso_test, predicted_y2)
print(report_2)
# The confusion matrix is the last expression so the notebook displays it as a table.
confusion_2 = confusion_matrix(y_iso_test, predicted_y2)
pd.DataFrame(confusion_2)



              precision    recall  f1-score   support

           0       1.00      0.90      0.95    227091
           1       0.09      0.80      0.15      2544

    accuracy                           0.90    229635
   macro avg       0.54      0.85      0.55    229635
weighted avg       0.99      0.90      0.94    229635



Unnamed: 0,0,1
0,205295,21796
1,519,2025


In [16]:
#Scenario 3: No outlier reduction on train as well as test data
# Train an unfitted copy of the pipeline on the full training sample.
# Calling model.fit(X, y) directly would refit the very object that model_s
# refers to (fit returns the estimator itself), so model_s would silently stop
# being the inlier-trained model. clone() copies the parameters only.
model_s_out = clone(model).fit(X, y)

predicted_y3 = model_s_out.predict(test_X)
print(classification_report(test_y, predicted_y3))
# The confusion matrix is the last expression so the notebook displays it as a table.
pd.DataFrame(confusion_matrix(test_y, predicted_y3))

              precision    recall  f1-score   support

           0       1.00      0.91      0.95    239387
           1       0.09      0.79      0.16      2688

    accuracy                           0.90    242075
   macro avg       0.54      0.85      0.55    242075
weighted avg       0.99      0.90      0.94    242075



Unnamed: 0,0,1
0,216802,22585
1,557,2131


In [17]:
# Scenario 4: evaluate model_s_out on the test data with outliers removed

predicted_y4 = model_s_out.predict(X_iso_test)
report_4 = classification_report(y_iso_test, predicted_y4)
print(report_4)
# The confusion matrix is the last expression so the notebook displays it as a table.
confusion_4 = confusion_matrix(y_iso_test, predicted_y4)
pd.DataFrame(confusion_4)

              precision    recall  f1-score   support

           0       1.00      0.90      0.95    227091
           1       0.08      0.79      0.15      2544

    accuracy                           0.90    229635
   macro avg       0.54      0.85      0.55    229635
weighted avg       0.99      0.90      0.94    229635



Unnamed: 0,0,1
0,205240,21851
1,525,2019


In [19]:
# Check whether the test dataset has a class imbalance problem

# Count the rows labelled 1 and 0 in went_on_backorder, then report the share of 1s.
backorder_flags = dataset['went_on_backorder']
num_backorder = np.sum(backorder_flags == 1)
num_no_backorder = np.sum(backorder_flags == 0)
print(num_backorder)
print(num_no_backorder)
total_rows = len(dataset)
print('backorder ratio:', num_backorder, '/', total_rows, '=', num_backorder / total_rows)

2688
239387
backorder ratio: 2688 / 242075 = 0.01110399669523908


## Conclusion

## Reflect