In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [3]:
from sklearn.preprocessing import RobustScaler,OneHotEncoder
from sklearn.impute import SimpleImputer
import os,sys

In [4]:
# Load the back-order sample and keep a 1000-row random subsample.
df = pd.read_parquet('sample_bo.parquet.gzip')
# A fixed seed makes the subsample identical on every run, so the outputs
# recorded further down can be reproduced after a kernel restart.
df = df.sample(1000, random_state=42)

In [5]:
# Preview the first rows of the subsampled frame.
df.head()


Unnamed: 0,national_inv,lead_time,in_transit_qty,forecast_3_month,forecast_6_month,forecast_9_month,sales_1_month,sales_3_month,sales_6_month,sales_9_month,...,pieces_past_due,perf_6_month_avg,perf_12_month_avg,local_bo_qty,deck_risk,oe_constraint,ppap_risk,stop_auto_buy,rev_stop,went_on_backorder
261941,13.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.85,0.78,0.0,Yes,No,Yes,Yes,No,No
1249838,6.0,8.0,1.0,41.0,61.0,91.0,24.0,43.0,63.0,84.0,...,0.0,0.99,0.94,0.0,No,No,No,Yes,No,No
961046,6.0,8.0,0.0,0.0,0.0,4.0,0.0,1.0,2.0,3.0,...,0.0,0.87,0.78,0.0,No,No,No,Yes,No,No
35366,8.0,9.0,17.0,13.0,17.0,23.0,5.0,15.0,28.0,37.0,...,0.0,0.85,0.83,0.0,No,No,No,Yes,No,No
885815,10.0,8.0,18.0,0.0,17.0,34.0,3.0,17.0,38.0,53.0,...,0.0,1.0,0.99,0.0,No,No,No,Yes,No,No


In [25]:
# Show the raw values of the first sampled row as a plain list.
# Selecting by position instead of by a hard-coded index label keeps the cell
# runnable when the random subsample contains different rows.
df.iloc[0,:].tolist()

[13.0,
 4.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 3.0,
 0.0,
 'No',
 0.0,
 0.85,
 0.78,
 0.0,
 'Yes',
 'No',
 'Yes',
 'Yes',
 'No',
 'No']

In [6]:
# Separate the target column from the features and hold out a test split.
# The seed is fixed so the same rows land in each split on every run.
X_train,X_test,y_train,y_test = train_test_split(
    df.drop('went_on_backorder',axis=1),
    df['went_on_backorder'],
    random_state=42,
)


In [19]:
def get_categorical_encoder_object(categorical_features,numerical_features)->ColumnTransformer:
    """Build the unfitted column-wise preprocessing transformer.

    Numerical columns are median-imputed and then robust-scaled, in that
    order, inside a single sub-pipeline. Categorical columns are one-hot
    encoded, with two-level columns collapsed to one indicator column.
    Columns named in neither argument are passed through unchanged.

    Args:
        categorical_features: column names to one-hot encode.
        numerical_features: column names to impute and scale.

    Returns:
        An unfitted ``ColumnTransformer``.

    Raises:
        Exception: if the transformer cannot be constructed; the original
            error is chained as ``__cause__``.
    """
    try:
        # Impute first, then scale the imputed values. Listing the imputer and
        # the scaler as two separate ColumnTransformer entries would run them
        # side by side on the raw columns and emit every numerical column twice
        # (once imputed but unscaled, once scaled but not imputed).
        numerical_pipeline = Pipeline(steps=[
            ('si', SimpleImputer(strategy='median')),
            ('rs', RobustScaler()),
        ])

        # `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and
        # removed in 1.4; fall back to the old keyword on older releases.
        try:
            encoder = OneHotEncoder(sparse_output=False, drop='if_binary')
        except TypeError:
            encoder = OneHotEncoder(sparse=False, drop='if_binary')

        return ColumnTransformer(
            transformers=[
                ('num', numerical_pipeline, numerical_features),
                ('oh', encoder, categorical_features),
            ],
            remainder='passthrough',
        )
    except Exception as e:
        # Exception() takes no keyword arguments, so the message is passed
        # positionally and the cause is attached with `from`.
        raise Exception(f"Could not build the preprocessing transformer: {e}") from e

In [7]:
# Split the column names by dtype: object-typed columns are treated as
# categorical, every other dtype as numerical. The lists are built from `df`,
# so they also cover the target column.
numerical_features = [column for column, dtype in df.dtypes.items() if dtype != 'O']
categorical_features = [column for column, dtype in df.dtypes.items() if dtype == 'O']

In [11]:
# Numerical branch: fill missing values with the column median, then apply
# robust scaling.
tr1 = Pipeline(
    steps=[
        ('si', SimpleImputer(strategy='median')),
        ('rs', RobustScaler()),
    ]
)

# Categorical branch: dense one-hot encoding; drop='if_binary' keeps a single
# indicator column for features that have exactly two categories.
tr2 = Pipeline(
    steps=[
        ('oh', OneHotEncoder(sparse=False, drop='if_binary')),
    ]
)

# Route each column group to its branch. The last entry of
# `categorical_features` is left out of the categorical group.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', tr1, numerical_features),
        ('cat', tr2, categorical_features[:-1]),
    ],
    remainder='passthrough',
)

In [12]:
# Wrap the column-wise preprocessor in a single-step pipeline.
pipe = Pipeline([('preprocessor', preprocessor)])

In [14]:
# Fit the pipeline on the training features and display the transformed
# values as a DataFrame.
pd.DataFrame(pipe.fit_transform(X_train))



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
0,-0.141479,-1.50,0.0,0.2,0.253968,0.261682,0.000000,0.000000,-0.038710,-0.056872,...,0.0,0.470588,0.482143,0.0,0.0,0.0,1.0,0.0,1.0,1.0
1,0.655949,0.00,0.0,0.0,19.047619,11.214953,1.565217,1.113924,1.225806,1.364929,...,0.0,-0.588235,-0.589286,0.0,0.0,0.0,1.0,0.0,1.0,1.0
2,-0.154341,-0.50,0.0,0.0,0.000000,0.000000,0.000000,-0.050633,-0.064516,-0.075829,...,0.0,0.147059,0.232143,0.0,0.0,0.0,1.0,0.0,1.0,1.0
3,-0.090032,0.00,0.0,0.0,0.000000,0.000000,0.000000,-0.050633,-0.064516,-0.075829,...,0.0,0.441176,0.553571,0.0,0.0,0.0,1.0,0.0,1.0,1.0
4,-0.180064,-1.50,0.0,2.4,2.031746,1.196262,0.000000,-0.050633,-0.064516,-0.075829,...,0.0,-0.411765,0.160714,0.0,0.0,1.0,1.0,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
745,-0.115756,0.00,0.0,0.0,0.000000,0.000000,0.000000,-0.050633,-0.064516,-0.075829,...,0.0,-0.264706,-0.160714,0.0,0.0,1.0,1.0,0.0,1.0,1.0
746,-0.115756,0.25,0.0,0.0,0.000000,0.000000,0.000000,-0.050633,-0.038710,0.189573,...,0.0,0.500000,0.589286,0.0,0.0,0.0,1.0,0.0,1.0,1.0
747,-0.051447,0.00,0.0,1.4,0.952381,0.560748,0.000000,0.101266,0.064516,0.227488,...,0.0,0.529412,0.553571,0.0,0.0,0.0,1.0,1.0,1.0,1.0
748,-0.154341,0.25,2.0,0.0,0.000000,0.000000,0.000000,-0.050633,-0.012903,-0.018957,...,0.0,-0.411765,-0.553571,0.0,0.0,0.0,1.0,0.0,1.0,1.0


In [16]:
# Apply the already-fitted pipeline to the held-out features (transform only,
# no refit) and display the result as a DataFrame.
pd.DataFrame(pipe.transform(X_test))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
0,0.025723,1.00,0.0,0.0,0.126984,0.224299,0.521739,0.354430,0.400000,0.341232,...,0.0,-1.735294,-1.982143,0.0,0.0,0.0,1.0,0.0,1.0,1.0
1,-0.154341,0.25,0.0,0.0,0.000000,0.000000,0.000000,-0.050633,-0.064516,-0.056872,...,0.0,-0.382353,-0.482143,0.0,0.0,0.0,1.0,0.0,1.0,1.0
2,0.437299,0.00,23.0,22.2,9.333333,8.186916,3.478261,3.746835,3.909677,4.170616,...,5.0,-1.735294,-1.267857,0.0,0.0,0.0,1.0,0.0,1.0,1.0
3,-0.115756,0.00,0.0,0.0,0.063492,0.112150,0.000000,0.000000,0.038710,0.018957,...,0.0,0.500000,0.625000,0.0,0.0,0.0,1.0,1.0,1.0,1.0
4,0.064309,0.25,0.0,0.0,0.000000,0.299065,0.347826,0.253165,0.270968,0.398104,...,0.0,0.500000,0.625000,0.0,0.0,0.0,1.0,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
245,-0.128617,0.25,0.0,0.0,0.000000,0.000000,0.000000,-0.050633,-0.064516,-0.075829,...,0.0,0.500000,0.589286,0.0,0.0,0.0,1.0,0.0,1.0,1.0
246,-0.141479,0.00,0.0,0.0,0.000000,0.037383,0.000000,-0.050633,-0.012903,-0.018957,...,0.0,0.058824,0.017857,0.0,0.0,0.0,1.0,0.0,1.0,1.0
247,-0.128617,-1.50,3.0,1.2,0.761905,0.672897,0.173913,0.405063,0.322581,0.322275,...,0.0,0.382353,0.517857,0.0,0.0,0.0,1.0,0.0,1.0,1.0
248,0.591640,-1.50,0.0,6.0,4.444444,4.112150,2.260870,3.189873,2.232258,1.611374,...,0.0,0.235294,0.053571,0.0,0.0,1.0,1.0,0.0,1.0,1.0


In [27]:
# Attempt to call inverse_transform on the pipeline. The list passed in holds
# raw, untransformed row values (numbers mixed with 'Yes'/'No' strings).
# The recorded run of this cell failed with
# "AttributeError: This 'Pipeline' has no attribute 'inverse_transform'".
# NOTE(review): presumably because the ColumnTransformer step does not
# implement inverse_transform — confirm before relying on this cell.
pipe.inverse_transform([13.0,
 4.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 3.0,
 0.0,
 'No',
 0.0,
 0.85,
 0.78,
 0.0,
 'Yes',
 'No',
 'Yes',
 'Yes',
 'No',
 'No'])

AttributeError: This 'Pipeline' has no attribute 'inverse_transform'

In [32]:
# Display the names of the columns classified as categorical.
categorical_features

['potential_issue',
 'deck_risk',
 'oe_constraint',
 'ppap_risk',
 'stop_auto_buy',
 'rev_stop',
 'went_on_backorder']

In [33]:
# Build the column transformer from the feature lists; the last entry of
# `categorical_features` is excluded from the categorical group.
cat_trans = get_categorical_encoder_object(categorical_features[:-1],numerical_features)

In [34]:
# Rebind `pipe` to a pipeline that chains three steps one after another.
# NOTE(review): `tr3` is not defined in any cell visible in this notebook, so
# unless it is defined elsewhere this cell raises NameError on a fresh
# kernel — confirm what the third step was meant to be.
pipe = Pipeline(steps=[
('tr1',tr1),
('tr2',tr2),
('tr3',tr3)
])

In [37]:
# Display the training feature frame.
X_train

Unnamed: 0,national_inv,lead_time,in_transit_qty,forecast_3_month,forecast_6_month,forecast_9_month,sales_1_month,sales_3_month,sales_6_month,sales_9_month,...,potential_issue,pieces_past_due,perf_6_month_avg,perf_12_month_avg,local_bo_qty,deck_risk,oe_constraint,ppap_risk,stop_auto_buy,rev_stop
641053,397.0,8.0,30.0,7946.0,13790.0,18668.0,1891.0,6399.0,13048.0,20859.0,...,No,438.0,-99.00,-99.00,0.0,No,No,No,Yes,No
617629,5.0,8.0,0.0,0.0,1.0,5.0,0.0,1.0,1.0,1.0,...,No,0.0,0.94,0.87,0.0,No,No,No,Yes,No
481158,11.0,,0.0,0.0,0.0,0.0,10.0,37.0,55.0,70.0,...,No,0.0,-99.00,-99.00,0.0,Yes,No,No,Yes,No
572557,22.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,...,No,0.0,0.66,0.78,0.0,No,No,No,Yes,No
807756,14.0,14.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,No,0.0,0.99,0.99,0.0,No,No,No,Yes,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
422611,495.0,2.0,211.0,1014.0,1950.0,3042.0,365.0,1203.0,2266.0,3344.0,...,No,0.0,0.88,0.88,0.0,No,No,No,Yes,No
1011962,10.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,No,0.0,0.58,0.58,0.0,Yes,No,No,Yes,No
57414,28.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,No,0.0,0.77,0.80,0.0,No,No,No,Yes,No
603364,5.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,No,0.0,0.89,0.90,0.0,No,No,No,Yes,No


In [36]:
# Fit the current `pipe` on the training features and display the result.
# The recorded run of this cell failed with "ValueError: Specifying the
# columns using strings is only supported for pandas DataFrames".
# NOTE(review): presumably a ColumnTransformer that selects columns by name
# received a NumPy array from an earlier step — confirm against the pipeline
# definition that was live when the cell ran.
pd.DataFrame(pipe.fit_transform(X_train))

ValueError: Specifying the columns using strings is only supported for pandas DataFrames

In [25]:
# Inspect the (name, estimator) pairs that make up the pipeline.
# NOTE(review): the recorded output shows a step named 'cat' holding a
# ColumnTransformer, which matches none of the `pipe` definitions visible in
# this notebook — likely stale kernel state; confirm.
pipe.steps

[('cat',
  ColumnTransformer(remainder='passthrough',
                    transformers=[('si', SimpleImputer(strategy='median'),
                                   ['national_inv', 'lead_time', 'in_transit_qty',
                                    'forecast_3_month', 'forecast_6_month',
                                    'forecast_9_month', 'sales_1_month',
                                    'sales_3_month', 'sales_6_month',
                                    'sales_9_month', 'min_bank',
                                    'pieces_past_due', 'perf_6_month_avg',
                                    'perf_12_month_avg', 'local_bo_qty']),
                                  ('rs...
                                   ['national_inv', 'lead_time', 'in_transit_qty',
                                    'forecast_3_month', 'forecast_6_month',
                                    'forecast_9_month', 'sales_1_month',
                                    'sales_3_month', 'sales_6_month',
          

In [28]:
import os

In [30]:
# Scratch check: dirname drops the final path component (the file name) and
# returns the containing directory.
os.path.dirname('/config/workspace/artifacts/26_01_23__13_30_42/data_validation/report.yaml')

'/config/workspace/artifacts/26_01_23__13_30_42/data_validation'

In [1]:
from backorder.utils import load_object

In [2]:
# Load the persisted transformer artifact with the project's load_object helper.
# NOTE(review): presumably this deserializes a pickle — only load artifacts
# from a trusted source; confirm against backorder.utils.
transformer = load_object('/config/workspace/artifacts/26_01_23__20_47_26/data_transformation/transformer/transformer.pkl')

In [3]:
# Show the input column names recorded on the loaded transformer.
transformer.feature_names_in_

array(['national_inv', 'lead_time', 'in_transit_qty', 'forecast_3_month',
       'forecast_6_month', 'forecast_9_month', 'sales_1_month',
       'sales_3_month', 'sales_6_month', 'sales_9_month', 'min_bank',
       'potential_issue', 'pieces_past_due', 'perf_6_month_avg',
       'perf_12_month_avg', 'local_bo_qty', 'deck_risk', 'oe_constraint',
       'ppap_risk', 'stop_auto_buy', 'rev_stop'], dtype=object)

In [4]:
# Load the persisted categorical artifact with the project's load_object helper.
# NOTE(review): presumably this deserializes a pickle — only load artifacts
# from a trusted source; confirm against backorder.utils.
cat = load_object('/config/workspace/artifacts/26_01_23__20_47_26/data_transformation/transformer/categorical.pkl')

In [5]:
# Show the input column names recorded on the loaded categorical object.
cat.feature_names_in_

array(['potential_issue', 'deck_risk', 'oe_constraint', 'ppap_risk',
       'stop_auto_buy', 'rev_stop'], dtype=object)