## Import Packages

In [117]:
import os

##
## ===> Data Management <===
##
import numpy as np
import pandas as pd
pd.set_option( 'display.max_columns', 30 )


##
## ===> Visualization <===
##
import seaborn as sns
import matplotlib.pyplot as plt


##
## ===> Modeling <===
##
## Import the train_test_split function from sklearn
##

from sklearn.model_selection import train_test_split
##
## For modeling, notice the new import command for
## the formula API and the summary option
##

import statsmodels.api as sm
import statsmodels.formula.api as smf 

##
## Import the r2_score function from the sklearn metrics package
##
from sklearn.metrics import r2_score
##
## Import confusion functions for classification
##
from sklearn.metrics import confusion_matrix, classification_report
##
## Import decision tree classifier functions
##
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn import tree
from sklearn import preprocessing
from sklearn.tree import export_graphviz
##
from sklearn.preprocessing import LabelEncoder

## Set folder path to data

In [118]:
## Folder that holds the input CSV files, built with os.path.join so the
## separators are right on every operating system. The empty last component
## keeps a trailing separator, so `path + fileName` still yields a valid path.
path = os.path.join(os.getcwd(), 'data', '')

## Import data

In [119]:
## Load the orders table; Tdate is parsed into a datetime column on read.
## os.path.join is used so the load works whether or not `path` ends with a
## path separator.
ordersFile = 'orders.csv'
df_orders = pd.read_csv(os.path.join(path, ordersFile), parse_dates = ['Tdate'])
df_orders.head()

Unnamed: 0,Onum,CID,Tdate,Pline,Pclass,Usales,Return,returnAmount,Mcost,Lprice,Ddisc,Cdisc,Odisc,Pdisc
0,585,1015,2004-01-25,Living Room,Window Treatment: Blinds,36,No,0,0.95,5.4,,,0.043,0.042
1,586,1015,2004-01-25,Living Room,Window Treatment: Blinds,57,No,0,0.95,5.4,0.157,0.075,0.041,0.031
2,587,1015,2004-01-25,Living Room,Window Treatment: Blinds,27,No,0,0.95,5.4,,0.048,0.053,0.021
3,588,1015,2004-01-25,Living Room,Window Treatment: Blinds,21,No,0,0.95,5.4,,0.072,,0.033
4,589,1015,2004-01-25,Living Room,Window Treatment: Blinds,56,No,0,0.95,5.4,0.14,0.056,0.041,0.055


## Explore table

In [120]:
## Dimensions of the orders table as (rows, columns)
df_orders.shape

(70270, 14)

In [121]:
## Column labels of the orders table
df_orders.columns

Index(['Onum', 'CID', 'Tdate', 'Pline', 'Pclass', 'Usales', 'Return',
       'returnAmount', 'Mcost', 'Lprice', 'Ddisc', 'Cdisc', 'Odisc', 'Pdisc'],
      dtype='object')

In [122]:
## Data type of each column (Tdate is datetime because it was parsed on load)
df_orders.dtypes

Onum                     int64
CID                      int64
Tdate           datetime64[ns]
Pline                   object
Pclass                  object
Usales                   int64
Return                  object
returnAmount             int64
Mcost                  float64
Lprice                 float64
Ddisc                  float64
Cdisc                  float64
Odisc                  float64
Pdisc                  float64
dtype: object

**Onum** is the order ID

In [123]:
## Number of distinct order IDs; compare with the row count to see whether
## each row is its own order
df_orders.Onum.nunique()

70270

**CID** is the customer ID

In [124]:
## Number of distinct customer IDs that appear in the orders table
df_orders.CID.nunique()

779

Missing values

In [125]:
## Per-column non-null counts and dtypes, plus the frame's memory usage
df_orders.info(verbose = True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70270 entries, 0 to 70269
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   Onum          70270 non-null  int64         
 1   CID           70270 non-null  int64         
 2   Tdate         70270 non-null  datetime64[ns]
 3   Pline         70270 non-null  object        
 4   Pclass        70270 non-null  object        
 5   Usales        70270 non-null  int64         
 6   Return        70270 non-null  object        
 7   returnAmount  70270 non-null  int64         
 8   Mcost         70270 non-null  float64       
 9   Lprice        70270 non-null  float64       
 10  Ddisc         70262 non-null  float64       
 11  Cdisc         70261 non-null  float64       
 12  Odisc         70266 non-null  float64       
 13  Pdisc         70268 non-null  float64       
dtypes: datetime64[ns](1), float64(6), int64(4), object(3)
memory usage: 7.5+ MB


In [126]:
## Count of missing values in each column
df_orders.isna().sum()

Onum            0
CID             0
Tdate           0
Pline           0
Pclass          0
Usales          0
Return          0
returnAmount    0
Mcost           0
Lprice          0
Ddisc           8
Cdisc           9
Odisc           4
Pdisc           2
dtype: int64

Number of distinct values.

In [127]:
## One row per column of df_orders, holding the number of distinct non-missing
## values in that column. DataFrame.nunique() computes this for every column
## at once, so no explicit loop or accumulator lists are needed.
distinctValues = df_orders.nunique().to_frame('nDistinct')
distinctValues

Unnamed: 0,nDistinct
Onum,70270
CID,779
Tdate,439
Pline,1
Pclass,1
Usales,195
Return,2
returnAmount,111
Mcost,12
Lprice,13


## Prepare data

### customers.csv file

In [128]:
## Load the customers table. os.path.join is used so the load works whether
## or not `path` ends with a path separator.
customersFile = 'customers.csv'
df_cust = pd.read_csv(os.path.join(path, customersFile))
df_cust.head()

Unnamed: 0,CID,State,ZIP,Region
0,1700,MT,59821,West
1,850,ND,58068,Midwest
2,280,NY,10007,Northeast
3,1574,WY,83120,West
4,110,CO,80403,West


### Prep orders.csv

In [130]:
## Preview both tables before they are combined
display(df_orders.head(), 
        df_cust.head())

## The four discount components that are summed into the total discount
discColumns = ['Ddisc', 'Odisc', 'Cdisc', 'Pdisc']

## Columns the join brings in from df_cust (everything except the CID key)
custColumns = [col for col in df_cust.columns if col != 'CID']

## Add the derived pricing/revenue columns to the orders table, attach the
## customer attributes, and store the low-cardinality text columns as categories.
##
## NOTE(review): Con subtracts Mcost once per order row, not Usales * Mcost --
## confirm against the data dictionary that Mcost is an order-level cost.
df_orders = (df_orders
             ## Drop customer columns left over from an earlier run of this cell so it
             ## can be re-executed; otherwise the join raises on overlapping column names
             .drop(columns = custColumns, errors = 'ignore')
             .assign(Tdisc = lambda _df: _df[discColumns].sum(axis = 'columns'),               ## Total discount column (sum skips NaN, so a missing discount counts as 0)
                     Pprice = lambda _df: _df.Lprice * (1 -  _df.Tdisc),                       ## Pocket price ( Pprice = Lprice x (1-Tdisc)) 
                     Rev = lambda _df: _df.Usales * _df.Pprice,                                ## Total Revenue
                     Con = lambda _df:_df.Rev - _df.Mcost,                                     ## Contribution: Subtract Material Cost (Mcost from the Data Dictionary) from Revenue 
                     CM = lambda _df: _df.Con/_df.Rev,                                         ## Contribution Margin: Divide Contribution by Revenue
                     netRev = lambda _df:(_df.Usales - _df.returnAmount) * _df.Pprice,         ## NetRevenue = (Unit Sales - Returns) * PocketPrice
                     lostRev = lambda _df: _df.Rev - _df.netRev,                               ## LostRevenue = Revenue - net Revenue
                    ) 
             .join(df_cust.set_index('CID'), on = 'CID', how = 'inner')                        ## Join orders and customers; inner keeps only orders whose CID is in df_cust
             .astype({'Region':'category', 'Pline':'category', 'Pclass':'category', 'Return':'category'})
            )

df_orders.head()

Unnamed: 0,Onum,CID,Tdate,Pline,Pclass,Usales,Return,returnAmount,Mcost,Lprice,Ddisc,Cdisc,Odisc,Pdisc
0,585,1015,2004-01-25,Living Room,Window Treatment: Blinds,36,No,0,0.95,5.4,,,0.043,0.042
1,586,1015,2004-01-25,Living Room,Window Treatment: Blinds,57,No,0,0.95,5.4,0.157,0.075,0.041,0.031
2,587,1015,2004-01-25,Living Room,Window Treatment: Blinds,27,No,0,0.95,5.4,,0.048,0.053,0.021
3,588,1015,2004-01-25,Living Room,Window Treatment: Blinds,21,No,0,0.95,5.4,,0.072,,0.033
4,589,1015,2004-01-25,Living Room,Window Treatment: Blinds,56,No,0,0.95,5.4,0.14,0.056,0.041,0.055


Unnamed: 0,CID,State,ZIP,Region
0,1700,MT,59821,West
1,850,ND,58068,Midwest
2,280,NY,10007,Northeast
3,1574,WY,83120,West
4,110,CO,80403,West


Unnamed: 0,Onum,CID,Tdate,Pline,Pclass,Usales,Return,returnAmount,Mcost,Lprice,Ddisc,Cdisc,Odisc,Pdisc,Tdisc,Pprice,Rev,Con,CM,netRev,lostRev,State,ZIP,Region
0,585,1015,2004-01-25,Living Room,Window Treatment: Blinds,36,No,0,0.95,5.4,,,0.043,0.042,0.085,4.941,177.876,176.926,0.994659,177.876,0.0,MI,49093,Midwest
1,586,1015,2004-01-25,Living Room,Window Treatment: Blinds,57,No,0,0.95,5.4,0.157,0.075,0.041,0.031,0.304,3.7584,214.2288,213.2788,0.995565,214.2288,0.0,MI,49093,Midwest
2,587,1015,2004-01-25,Living Room,Window Treatment: Blinds,27,No,0,0.95,5.4,,0.048,0.053,0.021,0.122,4.7412,128.0124,127.0624,0.992579,128.0124,0.0,MI,49093,Midwest
3,588,1015,2004-01-25,Living Room,Window Treatment: Blinds,21,No,0,0.95,5.4,,0.072,,0.033,0.105,4.833,101.493,100.543,0.99064,101.493,0.0,MI,49093,Midwest
4,589,1015,2004-01-25,Living Room,Window Treatment: Blinds,56,No,0,0.95,5.4,0.14,0.056,0.041,0.055,0.292,3.8232,214.0992,213.1492,0.995563,214.0992,0.0,MI,49093,Midwest
