# Loading the required libraries

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

from sklearn import tree

from sklearn.model_selection import GridSearchCV

import graphviz

#!conda install -c conda-forge python-graphviz
#!conda install --yes python-graphviz
#!conda install --yes graphviz

import matplotlib.pyplot as plt

# Loading the data

In [2]:
data = pd.read_csv("BackOrders.csv")

In [3]:
data.head()

Unnamed: 0,sku,national_inv,lead_time,in_transit_qty,forecast_3_month,forecast_6_month,forecast_9_month,sales_1_month,sales_3_month,sales_6_month,...,pieces_past_due,perf_6_month_avg,perf_12_month_avg,local_bo_qty,deck_risk,oe_constraint,ppap_risk,stop_auto_buy,rev_stop,went_on_backorder
0,1888279,117,,0,0,0,0,0,0,15,...,0,-99.0,-99.0,0,No,No,Yes,Yes,No,No
1,1870557,7,2.0,0,0,0,0,0,0,0,...,0,0.5,0.28,0,Yes,No,No,Yes,No,No
2,1475481,258,15.0,10,10,77,184,46,132,256,...,0,0.54,0.7,0,No,No,No,Yes,No,No
3,1758220,46,2.0,0,0,0,0,1,2,6,...,0,0.75,0.9,0,Yes,No,No,Yes,No,No
4,1360312,2,2.0,0,4,6,10,2,2,5,...,0,0.97,0.92,0,No,No,No,Yes,No,No


In [4]:
data.shape

(61589, 23)

In [5]:
data.dtypes

sku                    int64
national_inv           int64
lead_time            float64
in_transit_qty         int64
forecast_3_month       int64
forecast_6_month       int64
forecast_9_month       int64
sales_1_month          int64
sales_3_month          int64
sales_6_month          int64
sales_9_month          int64
min_bank               int64
potential_issue       object
pieces_past_due        int64
perf_6_month_avg     float64
perf_12_month_avg    float64
local_bo_qty           int64
deck_risk             object
oe_constraint         object
ppap_risk             object
stop_auto_buy         object
rev_stop              object
went_on_backorder     object
dtype: object

In [6]:
data.nunique()

sku                  61589
national_inv          2916
lead_time               28
in_transit_qty         908
forecast_3_month      1623
forecast_6_month      2195
forecast_9_month      2664
sales_1_month         1092
sales_3_month         1928
sales_6_month         2679
sales_9_month         3220
min_bank              1098
potential_issue          2
pieces_past_due        190
perf_6_month_avg       102
perf_12_month_avg      102
local_bo_qty           201
deck_risk                2
oe_constraint            2
ppap_risk                2
stop_auto_buy            2
rev_stop                 2
went_on_backorder        2
dtype: int64

In [7]:
data.isna().sum()

# Remove column name 'sku'
data.drop(['sku','lead_time'], axis = 1,inplace=True)
data.shape

(61589, 21)

In [8]:
data.drop_duplicates(inplace=True)
data.shape

(52883, 21)

In [9]:
cat_col=['potential_issue', 'deck_risk', 'oe_constraint', 'ppap_risk',
       'stop_auto_buy', 'rev_stop']
data[cat_col]=data[cat_col].astype('category')
data.dtypes

national_inv            int64
in_transit_qty          int64
forecast_3_month        int64
forecast_6_month        int64
forecast_9_month        int64
sales_1_month           int64
sales_3_month           int64
sales_6_month           int64
sales_9_month           int64
min_bank                int64
potential_issue      category
pieces_past_due         int64
perf_6_month_avg      float64
perf_12_month_avg     float64
local_bo_qty            int64
deck_risk            category
oe_constraint        category
ppap_risk            category
stop_auto_buy        category
rev_stop             category
went_on_backorder      object
dtype: object

In [10]:
cat_col
X=data.drop(['went_on_backorder'],axis=1)
y=data['went_on_backorder']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [11]:
num_col=data.drop(['potential_issue', 'deck_risk', 'oe_constraint', 'ppap_risk',
       'stop_auto_buy', 'rev_stop'],axis=1).columns
num_col


from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(X_train[num_col])
X_train[num_col]=scaler.transform(X_train[num_col])
X_test[num_col]=scaler.transform(X_test[num_col])

KeyError: "['went_on_backorder'] not in index"

In [None]:
## Convert Categorical Columns to Dummies
# Train
X_train = pd.get_dummies(X_train, columns=cat_col, drop_first=True)

# Test
X_test = pd.get_dummies(X_test, columns=cat_col, drop_first=True)


from sklearn.preprocessing import MinMaxScaler


y_train.value_counts()

y_train.replace({'No': 0, 'Yes': 1})
y_test.replace({'No': 0, 'Yes': 1})


In [None]:

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2


#apply SelectKBest class to extract top 10 best features
bestfeatures = SelectKBest(score_func=chi2, k=10)
fit = bestfeatures.fit(X_train,y_train)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X_train.columns)

#concat two dataframes for better visualization 
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Specs','Score']  #naming the dataframe columns
print(featureScores.nlargest(10,'Score'))  #print 10 best features