In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split #to create model and validation data splits
from sklearn.model_selection import cross_val_score #to perform k fold cross validation

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier,AdaBoostClassifier,GradientBoostingClassifier

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

from sklearn.metrics import accuracy_score,precision_score,recall_score, classification_report, confusion_matrix,f1_score, fbeta_score, roc_auc_score,roc_curve #all metrics related to classification

In [2]:
product_train = pd.read_csv('product_train.csv')
product_test = pd.read_csv('product_test.csv')

In [42]:
product_train.head()

Unnamed: 0,sku,national_inv,lead_time,in_transit_qty,forecast_3_month,forecast_6_month,forecast_9_month,sales_1_month,sales_3_month,sales_6_month,...,pieces_past_due,perf_6_month_avg,perf_12_month_avg,local_bo_qty,deck_risk,oe_constraint,ppap_risk,stop_auto_buy,rev_stop,went_on_backorder
0,3921548,8,12,0,0,0,0,1,1,2,...,0,0.63,0.75,0,No,No,No,Yes,No,No
1,3191009,83,2,33,157,377,603,44,98,148,...,0,0.68,0.66,0,No,No,No,Yes,No,No
2,2935810,8,4,0,0,0,0,0,0,1,...,0,0.73,0.78,0,No,No,No,Yes,No,No
3,2205847,31,4,63,70,160,223,27,90,164,...,0,0.73,0.78,0,No,No,Yes,Yes,No,No
4,4953497,3,12,0,0,0,0,0,0,0,...,0,0.81,0.74,0,No,No,No,Yes,No,No


In [86]:
product_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250078 entries, 0 to 250077
Data columns (total 23 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   sku                250078 non-null  int64  
 1   national_inv       250078 non-null  int64  
 2   lead_time          250078 non-null  int64  
 3   in_transit_qty     250078 non-null  int64  
 4   forecast_3_month   250078 non-null  int64  
 5   forecast_6_month   250078 non-null  int64  
 6   forecast_9_month   250078 non-null  int64  
 7   sales_1_month      250078 non-null  int64  
 8   sales_3_month      250078 non-null  int64  
 9   sales_6_month      250078 non-null  int64  
 10  sales_9_month      250078 non-null  int64  
 11  min_bank           250078 non-null  int64  
 12  potential_issue    250078 non-null  int64  
 13  pieces_past_due    250078 non-null  int64  
 14  perf_6_month_avg   250078 non-null  float64
 15  perf_12_month_avg  250078 non-null  float64
 16  lo

In [130]:
product_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62520 entries, 0 to 62519
Data columns (total 22 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sku                62520 non-null  int64  
 1   national_inv       62520 non-null  int64  
 2   lead_time          62520 non-null  int64  
 3   in_transit_qty     62520 non-null  int64  
 4   forecast_3_month   62520 non-null  int64  
 5   forecast_6_month   62520 non-null  int64  
 6   forecast_9_month   62520 non-null  int64  
 7   sales_1_month      62520 non-null  int64  
 8   sales_3_month      62520 non-null  int64  
 9   sales_6_month      62520 non-null  int64  
 10  sales_9_month      62520 non-null  int64  
 11  min_bank           62520 non-null  int64  
 12  potential_issue    62520 non-null  int64  
 13  pieces_past_due    62520 non-null  int64  
 14  perf_6_month_avg   62520 non-null  float64
 15  perf_12_month_avg  62520 non-null  float64
 16  local_bo_qty       625

In [128]:
product_test['stop_auto_buy'].value_counts()

stop_auto_buy
1    60984
0     1536
Name: count, dtype: int64

In [126]:
product_test['stop_auto_buy'] =  product_test['stop_auto_buy'].map({'Yes': 1, 'No': 0})

In [50]:
product_train['potential_issue']

0         0
1         0
2         0
3         0
4         0
         ..
250073    0
250074    0
250075    0
250076    0
250077    0
Name: potential_issue, Length: 250078, dtype: int64

In [132]:
product_train.head()

Unnamed: 0,sku,national_inv,lead_time,in_transit_qty,forecast_3_month,forecast_6_month,forecast_9_month,sales_1_month,sales_3_month,sales_6_month,...,pieces_past_due,perf_6_month_avg,perf_12_month_avg,local_bo_qty,deck_risk,oe_constraint,ppap_risk,stop_auto_buy,rev_stop,went_on_backorder
0,3921548,8,12,0,0,0,0,1,1,2,...,0,0.63,0.75,0,0,0,0,1,0,No
1,3191009,83,2,33,157,377,603,44,98,148,...,0,0.68,0.66,0,0,0,0,1,0,No
2,2935810,8,4,0,0,0,0,0,0,1,...,0,0.73,0.78,0,0,0,0,1,0,No
3,2205847,31,4,63,70,160,223,27,90,164,...,0,0.73,0.78,0,0,0,1,1,0,No
4,4953497,3,12,0,0,0,0,0,0,0,...,0,0.81,0.74,0,0,0,0,1,0,No


In [143]:
product_train.columns.tolist()

['sku',
 'national_inv',
 'lead_time',
 'in_transit_qty',
 'forecast_3_month',
 'forecast_6_month',
 'forecast_9_month',
 'sales_1_month',
 'sales_3_month',
 'sales_6_month',
 'sales_9_month',
 'min_bank',
 'potential_issue',
 'pieces_past_due',
 'perf_6_month_avg',
 'perf_12_month_avg',
 'local_bo_qty',
 'deck_risk',
 'oe_constraint',
 'ppap_risk',
 'stop_auto_buy',
 'rev_stop',
 'went_on_backorder']

In [149]:
from sklearn.pipeline import Pipeline,FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin


class VarSelector(BaseEstimator, TransformerMixin):

    def __init__(self,feature_names):

        self.feature_names=feature_names


    def fit(self,x,y=None):

        return self

    def transform(self,X):

        return X[self.feature_names]

    def get_feature_names(self):

        return self.feature_names
    
    def get_feature_names_out(self, feature_names_out):

        return self.feature_names


In [151]:
p1 = Pipeline(
    
[
('s1',VarSelector(['national_inv', 'lead_time', 'in_transit_qty', 'forecast_3_month', 'forecast_6_month', 'forecast_9_month', 'sales_1_month', 'sales_3_month', 'sales_6_month', 'sales_9_month', 'min_bank', 'potential_issue', 'pieces_past_due', 'perf_6_month_avg', 'perf_12_month_avg', 'local_bo_qty', 'deck_risk', 'oe_constraint', 'ppap_risk', 'stop_auto_buy', 'rev_stop',]))
]
)

In [153]:
union_pipe = FeatureUnion(
    [
    ('pipe1',p1),
    ]
)

In [155]:
union_pipe.fit(product_train)

In [159]:
x_train = pd.DataFrame(data = union_pipe.fit_transform(product_train),
                       columns = union_pipe.get_feature_names_out())

x_train.head() #this goes into model training

Unnamed: 0,pipe1__national_inv,pipe1__lead_time,pipe1__in_transit_qty,pipe1__forecast_3_month,pipe1__forecast_6_month,pipe1__forecast_9_month,pipe1__sales_1_month,pipe1__sales_3_month,pipe1__sales_6_month,pipe1__sales_9_month,...,pipe1__potential_issue,pipe1__pieces_past_due,pipe1__perf_6_month_avg,pipe1__perf_12_month_avg,pipe1__local_bo_qty,pipe1__deck_risk,pipe1__oe_constraint,pipe1__ppap_risk,pipe1__stop_auto_buy,pipe1__rev_stop
0,8.0,12.0,0.0,0.0,0.0,0.0,1.0,1.0,2.0,5.0,...,0.0,0.0,0.63,0.75,0.0,0.0,0.0,0.0,1.0,0.0
1,83.0,2.0,33.0,157.0,377.0,603.0,44.0,98.0,148.0,156.0,...,0.0,0.0,0.68,0.66,0.0,0.0,0.0,0.0,1.0,0.0
2,8.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.73,0.78,0.0,0.0,0.0,0.0,1.0,0.0
3,31.0,4.0,63.0,70.0,160.0,223.0,27.0,90.0,164.0,219.0,...,0.0,0.0,0.73,0.78,0.0,0.0,0.0,1.0,1.0,0.0
4,3.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.81,0.74,0.0,0.0,0.0,0.0,1.0,0.0


In [161]:
x_test = pd.DataFrame(data = union_pipe.fit_transform(product_test),
                       columns = union_pipe.get_feature_names_out())

x_test.head() #this goes into model training

Unnamed: 0,pipe1__national_inv,pipe1__lead_time,pipe1__in_transit_qty,pipe1__forecast_3_month,pipe1__forecast_6_month,pipe1__forecast_9_month,pipe1__sales_1_month,pipe1__sales_3_month,pipe1__sales_6_month,pipe1__sales_9_month,...,pipe1__potential_issue,pipe1__pieces_past_due,pipe1__perf_6_month_avg,pipe1__perf_12_month_avg,pipe1__local_bo_qty,pipe1__deck_risk,pipe1__oe_constraint,pipe1__ppap_risk,pipe1__stop_auto_buy,pipe1__rev_stop
0,2.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.98,0.99,0.0,0.0,0.0,0.0,1.0,0.0
1,149.0,8.0,42.0,238.0,438.0,598.0,56.0,192.0,399.0,604.0,...,0.0,0.0,0.84,0.87,0.0,0.0,0.0,0.0,1.0,0.0
2,11.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.4,0.42,0.0,0.0,0.0,0.0,1.0,0.0
3,272.0,4.0,0.0,0.0,0.0,0.0,2.0,4.0,5.0,17.0,...,0.0,0.0,0.73,0.78,0.0,0.0,0.0,0.0,1.0,0.0
4,41.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.99,0.99,0.0,0.0,0.0,0.0,1.0,0.0


In [163]:
y_train = (product_train['went_on_backorder']=='Yes').astype(int) #false as 0, true as 1
y_train

0         0
1         0
2         0
3         0
4         0
         ..
250073    0
250074    0
250075    0
250076    0
250077    0
Name: went_on_backorder, Length: 250078, dtype: int64

In [165]:
product_train['went_on_backorder'].value_counts()

went_on_backorder
No     248463
Yes      1615
Name: count, dtype: int64

In [167]:
y_train.value_counts()

went_on_backorder
0    248463
1      1615
Name: count, dtype: int64

In [169]:
x_train_model, x_validation,y_train_model, y_validation = train_test_split(x_train, y_train,train_size = 0.8)

In [171]:
model_logit = LogisticRegression()

#fit the model on train data (80% here)
model_logit.fit(x_train_model,y_train_model)

y_validation_prediction = model_logit.predict(x_validation)
#predict function by default uses a 0.5 threshold and provides the class output and not the probability

y_validation_prediction_prob = model_logit.predict_proba(x_validation)
y_validation_prediction_prob #gives probability of class 0 and class 1

print("F1 score: ",f1_score(y_validation,y_validation_prediction)) #true, predicted values - most useful metric in general

print("ROC Score: ",roc_auc_score(y_validation,y_validation_prediction_prob[:,1]))

F1 score:  0.0
ROC Score:  0.6795424960266189


In [173]:
rf_model = RandomForestClassifier() #default implementation

#fit the model on train data (80% here)
rf_model.fit(x_train_model,y_train_model)

y_validation_prediction = rf_model.predict(x_validation)
#predict function by default uses a 0.5 threshold and provides the class output and not the probability

y_validation_prediction_prob = rf_model.predict_proba(x_validation)
y_validation_prediction_prob #gives probability of class 0 and class 1

print("F1 score: ",f1_score(y_validation,y_validation_prediction)) #true, predicted values - most useful metric in general

print("ROC Score: ",roc_auc_score(y_validation,y_validation_prediction_prob[:,1]))

F1 score:  0.08900523560209424
ROC Score:  0.8851450561486971


In [175]:
et_model = ExtraTreesClassifier() #default implementation

#fit the model on train data (80% here)
et_model.fit(x_train_model,y_train_model)

y_validation_prediction = et_model.predict(x_validation)
#predict function by default uses a 0.5 threshold and provides the class output and not the probability

y_validation_prediction_prob = et_model.predict_proba(x_validation)
y_validation_prediction_prob #gives probability of class 0 and class 1

print("F1 score: ",f1_score(y_validation,y_validation_prediction)) #true, predicted values - most useful metric in general

print("ROC Score: ",roc_auc_score(y_validation,y_validation_prediction_prob[:,1]))

F1 score:  0.16161616161616163
ROC Score:  0.8487501510891625


In [177]:
model_AdaBoost = AdaBoostClassifier()

#fit the model on train data (80% here)
model_AdaBoost.fit(x_train_model,y_train_model)

y_validation_prediction = model_AdaBoost.predict(x_validation)
#predict function by default uses a 0.5 threshold and provides the class output and not the probability

y_validation_prediction_prob = model_AdaBoost.predict_proba(x_validation)
y_validation_prediction_prob #gives probability of class 0 and class 1

print("F1 score: ",f1_score(y_validation,y_validation_prediction)) #true, predicted values - most useful metric in general

print("ROC Score: ",roc_auc_score(y_validation,y_validation_prediction_prob[:,1])) #needs actuals, predicted probs

F1 score:  0.075
ROC Score:  0.9390353589425019


In [179]:
model_gb = GradientBoostingClassifier()

#fit the model on train data (80% here)
model_gb.fit(x_train_model,y_train_model)

y_validation_prediction = model_gb.predict(x_validation)
#predict function by default uses a 0.5 threshold and provides the class output and not the probability

y_validation_prediction_prob = model_gb.predict_proba(x_validation)
y_validation_prediction_prob #gives probability of class 0 and class 1

print("F1 score: ",f1_score(y_validation,y_validation_prediction)) #true, predicted values - most useful metric in general

print("ROC Score: ",roc_auc_score(y_validation,y_validation_prediction_prob[:,1]))

F1 score:  0.02168021680216802
ROC Score:  0.9301326920783188


In [183]:
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

In [185]:
model_xgb = XGBClassifier()

#fit the model on train data (80% here)
model_xgb.fit(x_train_model,y_train_model)

y_validation_prediction = model_xgb.predict(x_validation)
#predict function by default uses a 0.5 threshold and provides the class output and not the probability

y_validation_prediction_prob = model_xgb.predict_proba(x_validation)
y_validation_prediction_prob #gives probability of class 0 and class 1

print("F1 score: ",f1_score(y_validation,y_validation_prediction)) #true, predicted values - most useful metric in general

print("ROC Score: ",roc_auc_score(y_validation,y_validation_prediction_prob[:,1])) #needs actuals, predicted probs

F1 score:  0.15865384615384615
ROC Score:  0.9485349670376069


In [187]:
model_lgbm = LGBMClassifier()

#fit the model on train data (80% here)
model_lgbm.fit(x_train_model,y_train_model)

y_validation_prediction = model_lgbm.predict(x_validation)
#predict function by default uses a 0.5 threshold and provides the class output and not the probability

y_validation_prediction_prob = model_lgbm.predict_proba(x_validation)
y_validation_prediction_prob #gives probability of class 0 and class 1

print("F1 score: ",f1_score(y_validation,y_validation_prediction)) #true, predicted values - most useful metric in general

print("ROC Score: ",roc_auc_score(y_validation,y_validation_prediction_prob[:,1])) #needs actuals, predicted probs

[LightGBM] [Info] Number of positive: 1262, number of negative: 198800
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006668 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3058
[LightGBM] [Info] Number of data points in the train set: 200062, number of used features: 21
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.006308 -> initscore=-5.059602
[LightGBM] [Info] Start training from score -5.059602
F1 score:  0.15416666666666667
ROC Score:  0.9362198669457069


In [201]:
model_xgb = XGBClassifier()

#fit the model on train data (80% here)
model_xgb.fit(x_train_model,y_train_model)

y_validation_prediction = model_xgb.predict(x_validation)
#predict function by default uses a 0.5 threshold and provides the class output and not the probability

y_validation_prediction_prob = model_xgb.predict_proba(x_validation)
y_validation_prediction_prob #gives probability of class 0 and class 1

print("F1 score: ",f1_score(y_validation,y_validation_prediction)) #true, predicted values - most useful metric in general

print("ROC Score: ",roc_auc_score(y_validation,y_validation_prediction_prob[:,1])) #needs actuals, predicted probs

F1 score:  0.15865384615384615
ROC Score:  0.9485349670376069


In [191]:
from scipy.stats import ks_2samp #to get KS statistic

In [209]:
ks_2samp(y_validation_prediction_prob[:,1][y_validation==0],
        y_validation_prediction_prob[:,1][y_validation==1]) 

KstestResult(statistic=0.7570154855054512, pvalue=1.2855519785611154e-209, statistic_location=0.0027874424, statistic_sign=1)

In [195]:
1-(0.025/0.757) 

0.9669749009247027

In [203]:
y_validation_prediction_prob

array([[9.9779814e-01, 2.2018368e-03],
       [9.9996227e-01, 3.7726320e-05],
       [9.7903383e-01, 2.0966193e-02],
       ...,
       [9.9970299e-01, 2.9700127e-04],
       [9.8309499e-01, 1.6905028e-02],
       [9.9990630e-01, 9.3683433e-05]], dtype=float32)

In [205]:
y_validation_prediction

array([0, 0, 0, ..., 0, 0, 0])

In [223]:
y_test_prediction = model_xgb.predict(x_test)

In [225]:
y_test_prediction

array([0, 0, 0, ..., 0, 0, 0])

In [227]:
Final_class = np.where(y_test_prediction == 1, "Yes", "No")

In [233]:
Final_class

array(['No', 'No', 'No', ..., 'No', 'No', 'No'], dtype='<U3')

In [231]:
len(x_test)

62520

In [235]:
len(Final_class)

62520

In [237]:
len(y_test_prediction)

62520

In [243]:
from collections import Counter
Counter(y_test_prediction)

Counter({0: 62452, 1: 68})

In [245]:
Counter(Final_class)

Counter({'No': 62452, 'Yes': 68})

In [251]:
Final_class = pd.DataFrame(Final_class)



In [253]:
Final_class.to_csv("Final_class.csv", index=False, header=False)

In [5]:
product_train.head()

Unnamed: 0,sku,national_inv,lead_time,in_transit_qty,forecast_3_month,forecast_6_month,forecast_9_month,sales_1_month,sales_3_month,sales_6_month,...,pieces_past_due,perf_6_month_avg,perf_12_month_avg,local_bo_qty,deck_risk,oe_constraint,ppap_risk,stop_auto_buy,rev_stop,went_on_backorder
0,3921548,8,12,0,0,0,0,1,1,2,...,0,0.63,0.75,0,No,No,No,Yes,No,No
1,3191009,83,2,33,157,377,603,44,98,148,...,0,0.68,0.66,0,No,No,No,Yes,No,No
2,2935810,8,4,0,0,0,0,0,0,1,...,0,0.73,0.78,0,No,No,No,Yes,No,No
3,2205847,31,4,63,70,160,223,27,90,164,...,0,0.73,0.78,0,No,No,Yes,Yes,No,No
4,4953497,3,12,0,0,0,0,0,0,0,...,0,0.81,0.74,0,No,No,No,Yes,No,No


In [17]:
df1 = product_train[product_train["went_on_backorder"] == "Yes"]

In [19]:
df1['perf_6_month_avg'].median()

0.79

In [21]:
df1['perf_12_month_avg'].median()

0.79

In [25]:
from scipy.stats import chi2_contingency

In [33]:
contingency_table = pd.crosstab(product_train["deck_risk"], product_train["went_on_backorder"])

In [39]:
chi2, p, dof, expected = chi2_contingency(contingency_table)

print("Chi-Square Statistic:", chi2)
print("P-value:", p)
print("Degrees of Freedom:", dof)
print("Expected Frequencies:\n", expected)

Chi-Square Statistic: 10.466700387756221
P-value: 0.0012154561903127679
Degrees of Freedom: 1
Expected Frequencies:
 [[204962.74996201   1332.25003799]
 [ 43500.25003799    282.74996201]]


In [47]:
product_train['pieces_past_due'].value_counts()

pieces_past_due
0       247909
1          309
2          162
4          108
3           91
         ...  
135          1
767          1
497          1
137          1
4170         1
Name: count, Length: 271, dtype: int64

In [45]:
247909/250078

0.9913267060677069

In [49]:
30**0.5

5.477225575051661

In [51]:
5.47*5.47

29.920899999999996

In [53]:
5.5*5.5

30.25

In [55]:
product_train['forecast_9_month'].corr(product_train['sales_9_month'])

0.9228745416608127

In [61]:
group_0 = product_train[product_train["went_on_backorder"] == 'No'][product_train['min_bank'].mean()]
group_1 = product_train[product_train["went_on_backorder"] == 'Yes'][product_train['min_bank'].mean()]

# Perform an independent t-test
t_stat, p_value = ttest_ind(group_0, group_1)


KeyError: 51.25877526211822

In [63]:
df = product_train.groupby(['went_on_backorder']).agg(total_price = ('min_bank','mean'))

In [73]:
df['went_on_backorder'] = df.index

In [75]:
df

Unnamed: 0_level_0,total_price,went_on_backorder
went_on_backorder,Unnamed: 1_level_1,Unnamed: 2_level_1
No,51.307438,No
Yes,43.772136,Yes


In [69]:
from scipy.stats import ttest_ind

In [77]:
t_stat, p_value = ttest_ind(df[df['went_on_backorder'] == 'Yes'] , df[df['went_on_backorder'] == 'No'])

TypeError: unsupported operand type(s) for /: 'str' and 'int'