# Requirements
 - Input the data
 - Aggregate the data to the years each customer made an order
 - Calculate the year each customer made their First Purchase
 - Scaffold the dataset so that there is a row for each year after a customer's First Purchase, even if they did not make an order
 - Create a field to flag these new rows, making it clear whether a customer placed an order in that year or not
 - Calculate the Year on Year difference in the number of customers from each Cohort in each year
 - Cohort = Year of First Purchase
 - Create a field which flags whether or not a customer placed an order in the previous year
 - Create the Customer Classification using the above definitions
 - Join back to the original input data
 - Ensure that in rows where a customer did not place an order, the majority of the original fields are null. The exceptions to this are the Customer Name and Customer ID fields.
 - Output the data


In [1]:
import os
import pandas as pd
import numpy as np
import datetime as dt

### Input data

In [426]:
# Load the Superstore sample workbook; the path is relative to the working directory.
df = pd.read_excel('Sample - Superstore.xls')

In [427]:
# Inspect column dtypes and non-null counts of the raw input.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9994 entries, 0 to 9993
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   Row ID          9994 non-null   int64         
 1   Order ID        9994 non-null   object        
 2   Order Date      9994 non-null   datetime64[ns]
 3   Ship Date       9994 non-null   datetime64[ns]
 4   Ship Mode       9994 non-null   object        
 5   Customer ID     9994 non-null   object        
 6   Customer Name   9994 non-null   object        
 7   Segment         9994 non-null   object        
 8   Country/Region  9994 non-null   object        
 9   City            9994 non-null   object        
 10  State           9994 non-null   object        
 11  Postal Code     9983 non-null   float64       
 12  Region          9994 non-null   object        
 13  Product ID      9994 non-null   object        
 14  Category        9994 non-null   object        
 15  Sub-

In [428]:
# Summary statistics for the numeric columns.
df.describe()

Unnamed: 0,Row ID,Postal Code,Sales,Quantity,Discount,Profit
count,9994.0,9983.0,9994.0,9994.0,9994.0,9994.0
mean,4997.5,55245.233297,229.858001,3.789574,0.156203,28.656896
std,2885.163629,32038.715955,623.245101,2.22511,0.206452,234.260108
min,1.0,1040.0,0.444,1.0,0.0,-6599.978
25%,2499.25,23223.0,17.28,2.0,0.0,1.72875
50%,4997.5,57103.0,54.49,3.0,0.2,8.6665
75%,7495.75,90008.0,209.94,5.0,0.2,29.364
max,9994.0,99301.0,22638.48,14.0,0.8,8399.976


In [429]:
# Preview the first rows of the raw input.
df.head()

Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country/Region,City,...,Postal Code,Region,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit
0,1,CA-2020-152156,2020-11-08,2020-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420.0,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96,2,0.0,41.9136
1,2,CA-2020-152156,2020-11-08,2020-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420.0,South,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94,3,0.0,219.582
2,3,CA-2020-138688,2020-06-12,2020-06-16,Second Class,DV-13045,Darrin Van Huff,Corporate,United States,Los Angeles,...,90036.0,West,OFF-LA-10000240,Office Supplies,Labels,Self-Adhesive Address Labels for Typewriters b...,14.62,2,0.0,6.8714
3,4,US-2019-108966,2019-10-11,2019-10-18,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,33311.0,South,FUR-TA-10000577,Furniture,Tables,Bretford CR4500 Series Slim Rectangular Table,957.5775,5,0.45,-383.031
4,5,US-2019-108966,2019-10-11,2019-10-18,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,33311.0,South,OFF-ST-10000760,Office Supplies,Storage,Eldon Fold 'N Roll Cart System,22.368,2,0.2,2.5164


In [430]:
# List the column labels available to the steps below.
df.columns

Index(['Row ID', 'Order ID', 'Order Date', 'Ship Date', 'Ship Mode',
       'Customer ID', 'Customer Name', 'Segment', 'Country/Region', 'City',
       'State', 'Postal Code', 'Region', 'Product ID', 'Category',
       'Sub-Category', 'Product Name', 'Sales', 'Quantity', 'Discount',
       'Profit'],
      dtype='object')

### Aggregate the data to the years each customer made an order

In [431]:
# Work on an independent copy holding only the order date and the customer key.
cust_year = df.loc[:, ['Order Date', 'Customer ID']].copy()

In [432]:
# Replace each order date with its calendar year (the column keeps the name 'Order Date').
cust_year = cust_year.assign(**{'Order Date': cust_year['Order Date'].dt.year})

In [433]:
# Collapse to one row per (customer, order year): rows with a missing key are
# dropped, duplicates removed, and the result ordered by customer then year.
cust_year = (
    cust_year[['Customer ID', 'Order Date']]
    .dropna()
    .drop_duplicates()
    .sort_values(['Customer ID', 'Order Date'])
    .reset_index(drop=True)
)

### Calculate the year each customer made their First Purchase

In [434]:
# First and last order year per customer.
# The aggregations are named by string because passing the NumPy callables
# (np.min / np.max) to .agg() emits a FutureWarning in recent pandas releases.
cust_min_max = cust_year.groupby(['Customer ID']).agg({'Order Date': ['min', 'max']})

In [435]:
# Flatten the two-level column header produced by the aggregation into plain names.
cust_min_max = cust_min_max.set_axis(['min_year', 'max_year'], axis=1)

In [436]:
# Move 'Customer ID' out of the index so it is available as an ordinary merge key.
cust_min_max = cust_min_max.reset_index()

### Scaffold the dataset so that there is a row for each year after a customer's First Purchase, even if they did not make an order

In [437]:
# Every distinct order year that appears in the data, one per row.
all_years = pd.DataFrame({'year': cust_year['Order Date'].unique()})

In [438]:
# A constant key on both frames lets an ordinary merge behave as a cross join.
all_years = all_years.assign(key=1)
cust_year = cust_year.assign(key=1)

In [439]:
# Cross join customers with every year seen in the data: one row per (customer, year).
# drop(columns=...) replaces the positional axis argument, which pandas 2.0 no longer accepts.
cust_all_year = (
    cust_year[['key', 'Customer ID']]
    .merge(all_years, on='key')
    .drop(columns='key')
    .drop_duplicates()
)
cust_all_year

Unnamed: 0,Customer ID,year
0,AA-10315,2018
1,AA-10315,2019
2,AA-10315,2020
3,AA-10315,2021
16,AA-10375,2018
...,...,...
9971,ZC-21910,2021
9984,ZD-21925,2018
9985,ZD-21925,2019
9986,ZD-21925,2020


In [440]:
# Sanity check: every customer should show the same number of scaffold years.
cust_all_year.groupby('Customer ID')[['year']].count()


Unnamed: 0_level_0,year
Customer ID,Unnamed: 1_level_1
AA-10315,4
AA-10375,4
AA-10480,4
AA-10645,4
AB-10015,4
...,...
XP-21865,4
YC-21895,4
YS-21880,4
ZC-21910,4


###  Create a field to flag these new rows, making it clear whether a customer placed an order in that year or not

In [441]:
# Outer-join the real (customer, order year) pairs onto the scaffold. Scaffold-only
# rows come through with a missing 'Order Date', which the next step turns into a flag.
# drop(columns=...) replaces the positional axis argument removed in pandas 2.0.
cust_year_test = (
    cust_year
    .merge(
        cust_all_year,
        left_on=['Customer ID', 'Order Date'],
        right_on=['Customer ID', 'year'],
        how='outer',
    )
    .drop(columns='key')
)

In [442]:
# A row represents a real order year exactly when 'Order Date' is present after the outer join.
cust_year_test['placed_order_test'] = cust_year_test['Order Date'].notna()
# Keyword form: the positional axis argument to drop() was removed in pandas 2.0.
cust_year_test = cust_year_test.drop(columns='Order Date')

###  Cohort = Year of First Purchase

In [443]:
def testing(test):
    if test :
        return 1
    else:
        return 0
# Recode the boolean order flag as 1 / 0.
cust_year_test['placed_order_test'] = cust_year_test['placed_order_test'].map(testing)

In [444]:
# Attach each customer's first and last order year to every scaffold row.
cust_year_test_add_calcs = pd.merge(
    cust_year_test,
    cust_min_max,
    how='left',
    on='Customer ID',
)

In [445]:
# True on the row whose year is the customer's first order year.
cust_year_test_add_calcs['new'] = cust_year_test_add_calcs['year'].eq(
    cust_year_test_add_calcs['min_year']
)
cust_year_test_add_calcs

Unnamed: 0,Customer ID,year,placed_order_test,min_year,max_year,new
0,AA-10315,2018,1,2018,2021,True
1,AA-10315,2019,1,2018,2021,False
2,AA-10315,2020,1,2018,2021,False
3,AA-10315,2021,1,2018,2021,False
4,AA-10375,2018,1,2018,2021,True
...,...,...,...,...,...,...
3167,VT-21700,2020,0,2018,2019,False
3168,VT-21700,2021,0,2018,2019,False
3169,YS-21880,2018,0,2019,2021,False
3170,YS-21880,2020,0,2019,2021,False


### Create a field which flags whether or not a customer placed an order in the previous year

In [446]:
# Index by (customer, year) so the following steps can sort and group on the index.
cust_year_test_add_calcs = cust_year_test_add_calcs.set_index(['Customer ID', 'year'])
cust_year_test_add_calcs

Unnamed: 0_level_0,Unnamed: 1_level_0,placed_order_test,min_year,max_year,new
Customer ID,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AA-10315,2018,1,2018,2021,True
AA-10315,2019,1,2018,2021,False
AA-10315,2020,1,2018,2021,False
AA-10315,2021,1,2018,2021,False
AA-10375,2018,1,2018,2021,True
...,...,...,...,...,...
VT-21700,2020,0,2018,2019,False
VT-21700,2021,0,2018,2019,False
YS-21880,2018,0,2019,2021,False
YS-21880,2020,0,2019,2021,False


In [447]:
# Order rows by customer and then year so a within-customer shift looks one row back in time.
cust_year_test_add_calcs = cust_year_test_add_calcs.sort_index()

In [448]:
# Order flag of the previous row within each customer (missing on a customer's first row).
# NOTE(review): this equals "previous year" only if every customer has a row for each
# consecutive year — confirm the scaffold has no gaps.
cust_year_test_add_calcs['placed_order_shifted'] = (
    cust_year_test_add_calcs.groupby(level=0)['placed_order_test'].shift()
)

In [449]:
cust_year_test_add_calcs.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,placed_order_test,min_year,max_year,new,placed_order_shifted
Customer ID,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AA-10315,2018,1,2018,2021,True,
AA-10315,2019,1,2018,2021,False,1.0
AA-10315,2020,1,2018,2021,False,1.0
AA-10315,2021,1,2018,2021,False,1.0
AA-10375,2018,1,2018,2021,True,


###  Create the Customer Classification using the above definitions

 - New = this is the first year the customer has ordered
 - Consistent = the customer ordered this year and last year
 - Sleeping = the customer has ordered in the past, but not this year
 - Returning = the customer did not order last year, but has ordered this year

In [450]:
# Label first-purchase rows 'New'; every other row keeps its False value.
cust_year_test_add_calcs['level'] = cust_year_test_add_calcs['new'].map(
    lambda flag: 'New' if flag else flag
)

In [451]:
# Consistent: not the first-purchase row, ordered on this row and on the previous row.
cust_year_test_add_calcs['level2'] = (
    ~cust_year_test_add_calcs['new']
    & cust_year_test_add_calcs['placed_order_test'].eq(1)
    & cust_year_test_add_calcs['placed_order_shifted'].eq(1)
)
cust_year_test_add_calcs['level2'] = cust_year_test_add_calcs['level2'].map(
    lambda flag: 'Consistent' if flag else flag
)

In [452]:
# Sleeping: the customer has ordered in the past but not this year.
# The test is "no order on this row AND the year is later than the first purchase year".
# Requiring an order in the immediately preceding year (the earlier rule) dropped every
# second and later consecutive dormant year, because those rows matched no classification.
cust_year_test_add_calcs['level3'] = (
    cust_year_test_add_calcs['placed_order_test'].eq(0)
    & cust_year_test_add_calcs['min_year'].lt(
        # 'year' lives in the index at this point; compare positionally as a plain array.
        cust_year_test_add_calcs.index.get_level_values('year').to_numpy()
    )
)
cust_year_test_add_calcs['level3'] = cust_year_test_add_calcs['level3'].apply(
    lambda x: 'Sleeping' if x else x
)

In [453]:
# Returning: not the first-purchase row, ordered on this row but not on the previous row.
cust_year_test_add_calcs['level4'] = (
    ~cust_year_test_add_calcs['new']
    & cust_year_test_add_calcs['placed_order_test'].eq(1)
    & cust_year_test_add_calcs['placed_order_shifted'].eq(0)
)
cust_year_test_add_calcs['level4'] = cust_year_test_add_calcs['level4'].map(
    lambda flag: 'Returning' if flag else flag
)

In [454]:
# Collapse the four per-class columns into one label. Each class column holds its own
# label where it applies; rows matching none of them are marked 'delete'.
choices = ['New', 'Consistent', 'Sleeping', 'Returning']

conditions = [
    cust_year_test_add_calcs[column] == label
    for column, label in zip(['level', 'level2', 'level3', 'level4'], choices)
]

cust_year_test_add_calcs['level5'] = np.select(conditions, choices, default='delete')
cust_year_test_add_calcs

Unnamed: 0_level_0,Unnamed: 1_level_0,placed_order_test,min_year,max_year,new,placed_order_shifted,level,level2,level3,level4,level5
Customer ID,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
AA-10315,2018,1,2018,2021,True,,New,False,False,False,New
AA-10315,2019,1,2018,2021,False,1.0,False,Consistent,False,False,Consistent
AA-10315,2020,1,2018,2021,False,1.0,False,Consistent,False,False,Consistent
AA-10315,2021,1,2018,2021,False,1.0,False,Consistent,False,False,Consistent
AA-10375,2018,1,2018,2021,True,,New,False,False,False,New
...,...,...,...,...,...,...,...,...,...,...,...
ZC-21910,2021,1,2018,2021,False,1.0,False,Consistent,False,False,Consistent
ZD-21925,2018,1,2018,2021,True,,New,False,False,False,New
ZD-21925,2019,0,2018,2021,False,1.0,False,False,Sleeping,False,Sleeping
ZD-21925,2020,1,2018,2021,False,0.0,False,False,False,Returning,Returning


In [459]:
# Discard rows that received no classification, then keep only the label column
# (the customer and year remain available in the index).
cust_year_test_add_calcs = cust_year_test_add_calcs.loc[
    cust_year_test_add_calcs['level5'].ne('delete')
]
cust_year_final = cust_year_test_add_calcs.loc[:, ['level5']].copy()
cust_year_final

Unnamed: 0_level_0,Unnamed: 1_level_0,level5
Customer ID,year,Unnamed: 2_level_1
AA-10315,2018,New
AA-10315,2019,Consistent
AA-10315,2020,Consistent
AA-10315,2021,Consistent
AA-10375,2018,New
...,...,...
ZC-21910,2021,Consistent
ZD-21925,2018,New
ZD-21925,2019,Sleeping
ZD-21925,2020,Returning


### Join back to the original input data
 - Ensure that in rows where a customer did not place an order, the majority of the original fields are null. The exceptions to this are the Customer Name and Customer ID fields.

In [460]:
# Confirm that de-duplicating (customer, year, label) rows removes nothing.
len(cust_year_final.reset_index().drop_duplicates()) == len(cust_year_final)

True

In [461]:
# Prepare both sides of the join: expose customer/year as columns on the classification
# frame, and add the matching order-year key to the original data.
cust_year_final = cust_year_final.reset_index()
df = df.assign(**{'order year': df['Order Date'].dt.year})

In [462]:
# Left-join the original order lines onto the classified (customer, year) rows.
# Years without an order keep a single row whose original fields are null.
df_final = (
    cust_year_final
    .merge(
        df,
        left_on=['Customer ID', 'year'],
        right_on=['Customer ID', 'order year'],
        suffixes=('', '_x'),
        how='left',
    )
    .drop('order year', axis=1)
)
# Customer Name must stay populated on no-order rows, so back-fill it from a
# Customer ID -> name lookup built from the original data.
# NOTE(review): assumes one name per Customer ID; the first occurrence is used.
df_final['Customer Name'] = df_final['Customer Name'].fillna(
    df_final['Customer ID'].map(
        df.drop_duplicates('Customer ID').set_index('Customer ID')['Customer Name']
    )
)

In [463]:
# Give the classification column its output name.
df_final = df_final.rename(columns={'level5': 'Customer classification'})

In [479]:
# Write the result next to the notebook.
df_final.to_csv('preppindata_20220306.csv',index=False)
# os.startfile exists only on Windows; skip opening the file elsewhere so the
# export cell does not fail with AttributeError.
if hasattr(os, 'startfile'):
    os.startfile('preppindata_20220306.csv')