In [None]:
### Dependencies

from glob import glob
import pandas as pd
import numpy as np
from datetime import datetime

In [None]:
### Pull in Amazon and Shopify order data

def _stack_csvs(paths):
    """Read each csv in `paths` (latin-1) and stack them into one frame.

    Every row is tagged with the path it came from in a `filename` column,
    and the result gets a fresh 0..n-1 index.
    """
    frames = [pd.read_csv(path, encoding='latin-1').assign(filename=path)
              for path in paths]
    return pd.concat(frames, ignore_index=True)


# amazon FBA & FBM exports can only be downloaded one month at a time
amazon_customers = sorted(glob('.../fba_order_data_****_**.csv'))
amazon_all = _stack_csvs(amazon_customers)

# shopify export comes in 2 files because it is too large to download as 1
shopify_customers = sorted(glob('.../orders_export_*.csv'))
shopify_all = _stack_csvs(shopify_customers)

print("shopify order records: ", shopify_all.shape," - amazon order records: ",amazon_all.shape)

In [None]:
### Remove duplicate Fulfilled by Amazon orders that are tracked in Shopify and clean data
#
# Output: shopify_all4 - one row per Shopify order, carrying the customer-level
# totals (total_spent / total_orders) of the customer it was grouped into and a
# 'shop_<n>' customer id in shop_id.

df = shopify_all

# delete amazon when source is 'Amazon' or '338151' (amz equivalent) to avoid duplicates
df1 = (df.loc[(df['Source'] == 'Amazon')|(df['Source']  == '338151')])
shopify_all2 = df.loc[df.index.difference(df1.index), ]

# if order number was the same, combined to be just 1 order
# (.copy() so the column edits below act on an independent frame instead of a
# filtered view, which is what triggers SettingWithCopyWarning)
df = shopify_all2.drop_duplicates(subset='Name',keep='first').copy()

# make total orders column and set to 1 since orders haven't been combined
df['total_orders'] = 1

# delete unnecessary columns
df = df.drop(columns=['Financial Status','Paid at','Fulfillment Status','Fulfilled at',
                      'Accepts Marketing','Currency','Subtotal','Shipping','Taxes','Discount Code','Discount Amount',
                      'Shipping Method','Lineitem quantity','Lineitem name','Lineitem price',
                      'Lineitem compare at price','Lineitem requires shipping','Lineitem taxable',
                      'Lineitem fulfillment status','Payment Method','Payment Reference','Refunded Amount',
                      'Vendor','Employee','Cancelled at','Outstanding Balance','Location','Phone',
                      'Risk Level','Lineitem discount','Tax 1 Name','Tax 1 Value','Tax 2 Name','Tax 2 Value',
                      'Tax 3 Name','Tax 3 Value','Tax 4 Name','Tax 4 Value','Tax 5 Name','Tax 5 Value','Receipt Number',
                      'Device ID','Id','Notes','Note Attributes','Tags','Billing Address1','Billing Address2',
                      'Shipping Address1','Shipping Address2','Lineitem sku'])

# add Id to each record: reset_index() exposes the row label as an 'index'
# column, which is renamed to shop_id just below
df = df.reset_index()

# rename columns
df = df.rename(columns={'index':'shop_id','Total':'total_spent','Created at':'created_at',
                        'Name': 'order_id', 'Email':'email', 'Billing Name':'billing_name',
                        'Billing Street':'billing_street', 'Billing Company':'billing_company',
                        'Billing City':'billing_city', 'Billing Zip':'billing_zip',
                        'Billing Province':'billing_province', 'Billing Country':'billing_country', 'Billing Phone':'billing_phone',
                        'Shipping Name':'shipping_name','Shipping Street':'shipping_street',
                        'Shipping Company':'shipping_company', 'Shipping City':'shipping_city', 'Shipping Zip':'shipping_zip',
                        'Shipping Province':'shipping_province', 'Shipping Country':'shipping_country',
                        'Shipping Phone':'phone', 'Source':'source'})

# terminate created_at, order_id, source with a comma so that the 'sum'
# aggregation below concatenates them into a splittable list
df[['created_at','order_id','source']] = df[['created_at','order_id','source']].astype(str)
df['created_at'] = df['created_at'].str[:10]
df[['created_at','order_id','source']] = df[['created_at','order_id','source']] + ','

# email, name, and addresses are same: sum totals, keep first
# NOTE(review): groupby excludes rows with a missing value in any key column by
# default, so orders lacking email / name / street do not reach the later cells.
# Confirm that this is intended.
df = df.groupby(['email','billing_name','shipping_name','billing_street','shipping_street']).agg({
    'shop_id':'min','total_spent':'sum','total_orders':'sum',
    'billing_company':'first','shipping_company':'first',
    'billing_city':'first','billing_zip':'first','billing_province':'first',
    'billing_country':'first','billing_phone':'first',
    'shipping_province':'first','shipping_city':'first',
    'shipping_zip':'first','shipping_country':'first','phone':'first',
    'created_at':'sum','order_id':'sum','source':'sum'}).reset_index()

# create distinct row for each order
shopify_all3 = df

# customer-level columns are parked in the index while the order-level columns
# are split and exploded. 'phone' belongs with them: it holds one value per
# customer, so exploding it alongside the comma-joined order columns would give
# columns of different lengths.
customer_cols = ['email', 'billing_name', 'shipping_name', 'billing_street',
                 'shipping_street', 'shop_id', 'total_spent', 'total_orders', 'billing_company',
                 'shipping_company', 'billing_city', 'billing_zip', 'billing_province',
                 'billing_country', 'billing_phone', 'shipping_province',
                 'shipping_city', 'shipping_zip', 'shipping_country', 'phone']
order_cols = ['order_id', 'source', 'created_at']

explode_df3 = (shopify_all3[customer_cols + order_cols]
               .set_index(customer_cols)
               .apply(lambda x: x.str.split(',').explode())
               .reset_index())

# the trailing comma leaves one empty piece per customer; drop it
explode_mask = ~explode_df3['order_id'].isin([""])
shopify_all4 = explode_df3[explode_mask].copy()
shopify_all4['shop_id'] = 'shop_' + shopify_all4['shop_id'].astype(str)

# shopify_all4.head() # check your work

In [None]:
### Clean Amazon data
#
# Output: amazon_all3 - one row per Amazon order, carrying the customer-level
# totals (total_spent / total_orders) of the customer it was grouped into and an
# 'amz_<n>' customer id in amz_id.

# NOTE(review): `df` is the same object as `amazon_all`, so the three column
# assignments below modify `amazon_all` itself (values become str and gain a
# trailing ','). A later cell reads `amazon_all` again and sees those changes.
df = amazon_all

# cast to str, keep only the first 10 characters of purchase-date, and terminate
# each value with a comma so that 'sum' aggregations concatenate them into a
# splittable list
df[['purchase-date','sku','amazon-order-id']] = df[['purchase-date','sku','amazon-order-id']].astype(str)
df['purchase-date'] = df['purchase-date'].str[:10]
df[['purchase-date','sku','amazon-order-id']] = df[['purchase-date','sku','amazon-order-id']] + ','

# same orders combined into one with totals summed
# (sku strings are concatenated by 'sum'; the remaining columns keep the first value)
df = df.groupby(['amazon-order-id']).agg({'sku':'sum','item-price':'sum','item-tax':'sum','shipping-price':'sum',
                                     'shipping-tax':'sum','ship-promotion-discount':'sum','item-promotion-discount':'sum',
                                     'buyer-email':'first','buyer-name':'first','purchase-date':'first','recipient-name':'first',
                                     'ship-address-1':'first','ship-address-2':'first','ship-address-3':'first','ship-city':'first',
                                     'ship-state':'first','ship-postal-code':'first','ship-country':'first'}).reset_index()

# make total orders column and set to 1 since orders haven't been combined yet
df['total_orders'] = 1

# add Id to each record: reset_index() adds an 'index' column that is renamed
# to amz_id below
df = df.reset_index()

# calculate Total Spent equivalent
# NOTE(review): the two promotion-discount columns are added, not subtracted -
# this is only right if the export reports discounts as negative numbers; confirm.
sum_column = (df['item-price'] + df['item-tax'] + df['shipping-price'] + df['shipping-tax'] + df['item-promotion-discount'] + df['ship-promotion-discount'])
df['Total Spent'] = sum_column
df['Order Value'] = sum_column

# join shipping address fields into one (no billing address info given by amz)
df['Address'] = df['ship-address-1'].fillna('').astype(str) +' '+ df['ship-address-2'].fillna('').astype(str) +' '+ df['ship-address-3'].fillna('').astype(str)
df['Address'] = df['Address'].str.strip()
df['Address'] = df['Address'].str.title()

# delete unnecessary columns
df.drop(columns=['item-price','shipping-price','item-tax','shipping-tax','item-promotion-discount',
                 'ship-promotion-discount','ship-address-1','ship-address-2','ship-address-3'],axis=1,inplace=True)

# rename columns
df.rename(columns={'index':'amz_id','buyer-email':'email','ship-city':'shipping_city','ship-state':'shipping_state','ship-postal-code':'shipping_zip',
                    'ship-country':'shipping_country','purchase-date':'created_at','buyer-name':'billing_name','recipient-name':'shipping_name',
                    'amazon-order-id':'order_id','Total Spent':'total_spent', 'Order Value':'order_value', 'Address':'address'},inplace=True)

# sum totals and delete duplicates when email, name, and address are same
# Three passes with progressively looser keys. In each pass the comma-terminated
# created_at / order_id strings are concatenated by 'sum'. Columns not listed in
# the agg dict (sku, order_value) are not carried forward.
# NOTE(review): groupby excludes rows with a missing key value by default, so
# orders without e.g. an email are not carried forward - confirm this is intended.
df = df.groupby(['email','shipping_name','address']).agg({'amz_id':'min','total_orders':'sum','total_spent':'sum','shipping_city':'first','created_at':'sum',
                                                        'shipping_state':'first','shipping_zip':'first','shipping_country':'first',
                                                        'billing_name':'first','order_id':'sum'}).reset_index()

df = df.groupby(['email','address','billing_name']).agg({'amz_id':'min','total_orders':'sum','total_spent':'sum','shipping_city':'first','shipping_state':'first',
                                                        'shipping_zip':'first','shipping_country':'first','shipping_name':'first',
                                                        'created_at':'sum','order_id':'sum'}).reset_index()

df = df.groupby(['email','address']).agg({'amz_id':'min','total_orders':'sum','total_spent':'sum','shipping_city':'first','shipping_state':'first','shipping_zip':'first',
                                            'shipping_country':'first','shipping_name':'first','billing_name':'first','created_at':'sum',
                                            'order_id':'sum'}).reset_index()

# set channel to amazon
df['source'] = 'amazon'

# create distinct row for each order: every customer-level column is parked in
# the index, then created_at / order_id are split on ',' and exploded together
amazon_all2 = df
explode_amz = (amazon_all2[[
                            'email', 
                            'address', 'amz_id', 'total_orders', 'total_spent', 'shipping_city',
                            'shipping_state', 'shipping_zip', 'shipping_country', 'shipping_name',
                            'billing_name', 'source',
                            'created_at', 'order_id'
                            ]].set_index([
                                        'email',
                                        'address', 'amz_id', 'total_orders', 'total_spent', 'shipping_city',
                                        'shipping_state', 'shipping_zip', 'shipping_country', 'shipping_name',
                                        'billing_name', 'source'
                                        ])
                .apply(lambda x: x.str.split(',').explode())
                .reset_index())

# the trailing comma leaves one empty piece per customer; drop it
explode_mask_amz = ~explode_amz['order_id'].isin([""])
amazon_all3 = explode_amz[explode_mask_amz]
# NOTE(review): this assigns onto a filtered frame and may emit
# SettingWithCopyWarning; the value is still written to amazon_all3.
amazon_all3['amz_id'] = 'amz_' + amazon_all3['amz_id'].astype(str)

# amazon_all3.head() # check your work

In [None]:
### Concatenate all data and use group bys to identify like records
#
# Output: df3 - one row per linked customer. Order-level columns (created_at,
# order_id, source) and any merged identity columns hold separator-terminated
# strings that a later cell splits back into one row per order.

# Columns carried through every merge pass; anything else is dropped by the first pass.
_RECORD_COLS = ['id', 'shipping_name', 'billing_name', 'email', 'address',
                'total_orders', 'total_spent', 'shipping_city', 'shipping_state',
                'shipping_zip', 'shipping_country', 'shipping_company', 'phone',
                'created_at', 'order_id', 'source']

# Columns accumulated on every pass after the first (numbers are added,
# separator-terminated strings are concatenated).
_ALWAYS_SUMMED = ['total_orders', 'total_spent', 'created_at', 'order_id', 'source']


def _merge_records(frame, keys, summed):
    """Collapse the rows of `frame` that share `keys` into a single record.

    Columns listed in `summed` are aggregated with 'sum'; every other column of
    _RECORD_COLS keeps its first value. Returns a frame with a fresh index and
    the key columns restored as ordinary columns.

    Note: no extra keyword arguments are passed to .agg(). The original
    `axis=1, inplace=True` are not parameters of GroupBy.agg; depending on the
    pandas version they are either ignored or forwarded to the reducer.
    """
    how = {col: ('sum' if col in summed else 'first')
           for col in _RECORD_COLS if col not in keys}
    return frame.groupby(keys).agg(how).reset_index()


df1 = shopify_all4
df2 = amazon_all3

# combine files
# NOTE(review): in this notebook the Shopify frame is built without 'address'
# and 'shipping_state' columns (it has shipping_street / shipping_province), so
# after the concat those two are empty for every Shopify row and the
# address-based passes below only discriminate between Amazon rows. Confirm
# whether the Shopify street/province should be mapped into them.
df3 = pd.concat([df1,df2])

# remove beginning and end spaces
for col in ['shipping_name', 'billing_name', 'shipping_city', 'email', 'address', 'shipping_state']:
    df3[col] = df3[col].str.strip()

# uniform capitalization
for col in ['address', 'shipping_name', 'billing_name', 'shipping_city', 'email']:
    df3[col] = df3[col].str.lower()

# all zip codes = first 5 characters, apostrophes removed
# Cast to str *before* using .str: on an object column .str.strip() turns
# non-string values (e.g. zips parsed as integers) into NaN, which would then be
# stored as the text 'nan'. Missing zips still become 'nan', as before.
df3['shipping_zip'] = df3['shipping_zip'].astype(str).str.strip()
df3['shipping_zip'] = df3['shipping_zip'].replace("'", '', regex=True)
df3['shipping_zip'] = df3['shipping_zip'].str[:5] + ' , '

# delete double spaces in names (raw string: '\s' is not a valid str escape)
df3['shipping_name'] = df3['shipping_name'].replace(r'\s+', ' ', regex=True)
df3['billing_name'] = df3['billing_name'].replace(r'\s+', ' ', regex=True)

# add separator between strings when summed so able to split & remove duplicates
df3['email'] = df3['email'] + ','
df3['address'] = df3['address'] + '  &  '
df3['shipping_city'] = df3['shipping_city'] + ' , '
df3['shipping_state'] = df3['shipping_state'] + ' , '
df3['order_id'] = df3['order_id'] + ' , '
df3['source'] = df3['source'] + ' , '

df3['created_at'] = df3['created_at'].astype(str)
df3['created_at'] = df3['created_at'] + ' , '

# update id columns: each row has either an amz_id or a shop_id, the other is
# blank after the NaN replacement, so joining and stripping leaves the one id
df3 = df3.astype(object).replace(np.nan, '')
df3['id']=[f"{a} {b}" for a,b in zip(df3.amz_id,df3.shop_id)]
df3 = df3.drop(columns=['amz_id','shop_id'])
df3['id'] = df3['id'].str.strip()

# id: keep first, concatenate order columns - total_spent / total_orders are
# already customer-level totals repeated on every order row, so 'first' preserves them
df3 = _merge_records(df3, ['id'], ['created_at', 'order_id', 'source'])

# name, email, address are same: sum totals, keep first
df3 = _merge_records(df3, ['shipping_name', 'email', 'address'], _ALWAYS_SUMMED)

# when shipping name and address same: keep first, sum totals (including email)
df3 = _merge_records(df3, ['shipping_name', 'address'], _ALWAYS_SUMMED + ['email'])

# when billing name and address same: keep first, sum totals (including email)
df3 = _merge_records(df3, ['billing_name', 'address'], _ALWAYS_SUMMED + ['email'])

# email and address same: sum totals (including billing name)
df3 = _merge_records(df3, ['email', 'address'], _ALWAYS_SUMMED + ['billing_name'])

# when shipping name and city are same: keep first, sum totals (including billing name and email)
df3 = _merge_records(df3, ['shipping_name', 'shipping_city'], _ALWAYS_SUMMED + ['billing_name', 'email'])

# shipping name and email same: sum totals (including billing name)
df3 = _merge_records(df3, ['shipping_name', 'email'], _ALWAYS_SUMMED + ['billing_name'])

# billing name and email same: sum totals (including shipping name)
df3 = _merge_records(df3, ['billing_name', 'email'], _ALWAYS_SUMMED + ['shipping_name'])

# split shipping name into first and last names on the last space
# (n must be passed by keyword: pandas 2.x rejects it as a positional argument)
name = df3['shipping_name'].str.rsplit(' ', n=1)
df3['last_name'] = name.str.get(1)
df3['first_name'] = name.str.get(0)

# df3.head() # check your work

In [None]:
### Continue using group bys to identify like records
#
# Output: combined_df3 - one row per order, sorted by customer id and date, with
# order_num giving the 1-based position of the order within its customer.

def _regroup(frame, keys, summed):
    """Collapse the rows of `frame` that share `keys` into a single record.

    Columns listed in `summed` are aggregated with 'sum' (numbers added,
    separator-terminated strings concatenated); every other non-key column keeps
    its first value.

    Note: no extra keyword arguments are passed to .agg(). The original
    `axis=1, inplace=True` are not parameters of GroupBy.agg; depending on the
    pandas version they are either ignored or forwarded to the reducer.
    """
    how = {col: ('sum' if col in summed else 'first')
           for col in frame.columns if col not in keys}
    return frame.groupby(keys).agg(how).reset_index()


# accumulated in every pass
base_summed = ['total_orders', 'total_spent', 'created_at', 'order_id', 'source']

df = df3

# take first 3 characters of zipcode as a coarse region key
df['first3'] = df['shipping_zip'].str[:3]

df['created_at'] = df['created_at'].astype(str)

# group by same billing name and first 3 of zip
df = _regroup(df, ['billing_name', 'first3'], base_summed + ['shipping_name', 'email'])

# group by same shipping name and first 3 of zip
df = _regroup(df, ['shipping_name', 'first3'], base_summed + ['billing_name', 'email'])

# group by same email and first 3 of zip
df = _regroup(df, ['email', 'first3'], base_summed + ['billing_name', 'shipping_name'])

# get same intro of email (e.g. johnsmith@gmail.com & johnsmith@yahoo.com)
# (n must be passed by keyword: pandas 2.x rejects it as a positional argument)
begin = df['email'].str.rsplit('@', n=1)
df['intro'] = begin.str.get(0)

# group by same email intro and last name
df = _regroup(df, ['intro', 'last_name'],
              base_summed + ['billing_name', 'shipping_name', 'email', 'first3'])

df = df.drop(columns=['intro'])

# strip spaces from end of row values
# created_at must be stripped here: it turns the trailing ' , ' into ' ,' so
# the last piece after splitting is exactly "" and is caught by the mask below
for col in ['email', 'shipping_city', 'address', 'shipping_zip', 'source',
            'shipping_state', 'created_at']:
    df[col] = df[col].str.strip()

combined_df2 = df

# explode rows: customer-level columns are parked in the index while the three
# order-level columns are split on ',' and exploded together
customer_cols = ['id', 'email',
                 'last_name', 'total_orders', 'total_spent', 'address', 'billing_name',
                 'shipping_name', 'first_name',  'shipping_state', 'shipping_zip',
                 'shipping_country', 'shipping_city', 'shipping_company', 'phone']
order_cols = ['created_at', 'order_id', 'source']  # to be aggregated

explode_comb = (combined_df2[customer_cols + order_cols]
                .set_index(customer_cols)
                .apply(lambda x: x.str.split(',').explode())
                .reset_index())

# drop the empty piece left behind by the trailing separator
# (.copy() so the assignments below act on an independent frame)
explode_mask_comb = ~explode_comb['created_at'].isin([""])
combined_df3 = explode_comb[explode_mask_comb].copy()

combined_df3['created_at'] = combined_df3['created_at'].str.strip()
combined_df3['order_id'] = combined_df3['order_id'].str.strip()
combined_df3 = combined_df3.sort_values(by=['id','created_at'])
combined_df3[['shipping_zip','shipping_state','shipping_city']] = combined_df3[['shipping_zip','shipping_state','shipping_city']].replace(',', ' ', regex=True)
# 1-based order sequence within each customer (rows are sorted by date above)
combined_df3['order_num'] = combined_df3.groupby(['id']).cumcount()+1

# combined_df3.shape # check your work

In [None]:
### Clean up dates

# date for csv export, as an ISO 'YYYY-MM-DD' string
# strftime needs %-directives; the literal 'yyyy-mm-dd' contains none, so it
# was returned unchanged instead of being filled in with today's date
date = pd.to_datetime('today').date()
date = date.strftime('%Y-%m-%d')

In [None]:
### Produce DF of orders by linked customer
# this is the primary output of this process. all calculations from this
# point rely on this dataframe

# collect order_id, order_email, and skus from shopify
# Work on a copy: adding 'sku' and renaming Email/Name/Total directly on
# shopify_all would change the raw frame and break a re-run of the earlier
# cell that reads those columns.
df1 = shopify_all.copy()
df1['sku'] = df1['Lineitem sku'] + ','
df1 = df1.rename(columns={'Email':'order_email','Name':'source_ord_id','Total':'order_total'})
# no extra kwargs on .agg(): axis / inplace are not parameters of GroupBy.agg
shopify_orders = df1.groupby(['source_ord_id']).agg({
                                        'sku':'sum', 'order_email':'first','order_total':'sum'
                                        }).reset_index()

# collect order_id, order_email, and skus from amazon
# Copy for the same reason: appending to 'sku' in place would add one more
# comma to amazon_all on every run of this cell.
df2 = amazon_all.copy()
df2['sku'] = df2['sku'] + ','
sum_column = (df2['item-price'] + df2['item-tax'] + df2['shipping-price'] + df2['shipping-tax'] + df2['item-promotion-discount'] + df2['ship-promotion-discount'])
df2['order_total'] = sum_column
df2 = df2.rename(columns={'buyer-email':'order_email','amazon-order-id':'source_ord_id'})
amazon_orders = df2.groupby(['source_ord_id']).agg({
                                        'sku':'sum', 'order_email':'first','order_total':'sum'
                                        }).reset_index()

# combine amz and shopify order_ids, emails, skus
df3 = pd.concat([shopify_orders,amazon_orders])
# order ids / skus may already carry a separator comma added by an earlier
# cell: remove it from the ids and collapse doubled commas in the sku list
df3['source_ord_id'] = df3['source_ord_id'].replace(',', ' ', regex=True)
df3['sku'] = df3['sku'].replace(',,', ',', regex=True)
df3['source_ord_id'] = df3['source_ord_id'].str.strip()
to_merge = df3

# merge sku and email data into record linkage
merged_df = combined_df3.merge(to_merge,how='left', left_on='order_id', right_on='source_ord_id')
merged_df = merged_df.drop('source_ord_id', axis=1)

# identify orders that include skincare: sku list contains any of these markers
skincare_markers = ['TS-CG', 'TS-FM', 'TS-BL', 'TS-FC', 'TS-BW', 'FACIAL', 'SKIN', 'JAMIE']

# `== True` turns the NaN that str.contains returns for a missing sku into False
conditions = [(merged_df['sku'].str.contains(marker) == True) for marker in skincare_markers]
values = ['1'] * len(conditions)

# default must be a string like the choices: mixing str choices with the
# integer default 0 has no common dtype on newer NumPy. Result is '1' / '0'.
merged_df['skincare'] = np.select(conditions, values, default='0')

# merged_df.columns # check your work

In [None]:
### Identify customers who placed a repeat order in a given year
#
# Output: final_rl_df - merged_df plus first_source / first_year per customer
# and 0/1 flags: repeat (not the customer's first order) and repeat_<year>
# (repeat order placed in that year).

REPEAT_YEARS = (2018, 2019, 2020, 2021, 2022)

# create columns listing year and channel of first purchase
# 'first' takes the first row of each id in the frame's current row order
# (assign() adds first_year to a temporary frame only, so merged_df is untouched;
# no extra kwargs on .agg(): axis / inplace are not parameters of GroupBy.agg)
df2 = (merged_df
       .assign(first_year=merged_df['created_at'].str[:4])
       .groupby(['id'])
       .agg({'source':'first','first_year':'first'})
       .reset_index())

df2 = df2.rename(columns={'source':'first_source'})
df2['first_source'] = df2['first_source'].str.strip()

merged_df2 = merged_df.merge(df2, how='left', left_on='id', right_on='id')

# create boolean columns for repeat year
df = merged_df2

# any order after a customer's first one is a repeat order
df['repeat'] = np.where(df['order_num'] > 1, 1, 0)
# 'YYYY-' + repeat flag, e.g. '2019-1' for a repeat order placed in 2019
df['year_rep'] = df['created_at'].str[:5] + df['repeat'].astype(str)

for year in REPEAT_YEARS:
    # `== True` turns the NaN that str.contains returns for a missing date into False
    is_repeat_in_year = df['year_rep'].str.contains(f'{year}-1') == True
    df[f'repeat_{year}'] = np.where(is_repeat_in_year, 1, 0)

# df.head() # check your work
final_rl_df = df


In [None]:
### Format data for analysis - i.e boolean ids
#
# Output: unique_cust - one row per (first_source, first_year, id) with the
# number of repeat orders that customer placed in each year.

df = final_rl_df
# no extra kwargs on .agg(): axis / inplace are not parameters of GroupBy.agg
df = df.groupby(['first_source','first_year','id']).agg({
        'repeat_2018':'sum','repeat_2019':'sum','repeat_2020':'sum','repeat_2021':'sum','repeat_2022':'sum'
    }).reset_index()

unique_cust = df

# return count of unique customers
print(unique_cust.shape[0]," unique customer records")

In [None]:
### Calculate repeat counts by year of first purchase

df2 = unique_cust

# customers counted via 'id'; repeat orders summed per calendar year
# NOTE(review): unique_cust also carries repeat_2022, which is not summarised
# here - confirm whether leaving 2022 out is intentional.
repeat_count_spec = {
    'id':'count','repeat_2018':'sum','repeat_2019':'sum','repeat_2020':'sum','repeat_2021':'sum'
}

# repeat order count by year of first purchase
# (no extra kwargs on .agg(): axis / inplace are not parameters of GroupBy.agg)
df3 = df2.groupby(['first_year']).agg(repeat_count_spec).reset_index()

rep_by_first_year = df3

# repeat order count by year and channel of first purchase
df4 = df2.groupby(['first_source','first_year']).agg(repeat_count_spec).reset_index()

rep_by_first_chan = df4

# rep_by_first_year.head() # check your work
# rep_by_first_chan.head() # check your work

In [None]:
### Calculate cumulative repeat rates

# df2 is the same object as unique_cust, so the rep_by_* columns are added to it
df2 = unique_cust

# rep_by_<yy> = 1 when the customer has a repeat order in any year up to and
# including 20<yy>, else 0. Vectorised (column-wise comparison + any) instead of
# a row-by-row apply: same values, and it also works on an empty frame.
yearly_cols = ['repeat_2018', 'repeat_2019', 'repeat_2020', 'repeat_2021']
cumulative_cols = ['rep_by_18', 'rep_by_19', 'rep_by_20', 'rep_by_21']
for n_years, cumulative_col in enumerate(cumulative_cols, start=1):
    df2[cumulative_col] = (df2[yearly_cols[:n_years]] != 0).any(axis=1).astype(int)

cumulative_spec = {
    'id':'count','rep_by_18':'sum','rep_by_19':'sum','rep_by_20':'sum','rep_by_21':'sum'
}

# cumulative repeat by year of first purchase
# (no extra kwargs on .agg(): axis / inplace are not parameters of GroupBy.agg)
df3 = df2.groupby(['first_year']).agg(cumulative_spec).reset_index()

cum_rep_by_first_year = df3

# cumulative repeat by year and channel of first purchase
df4 = df2.groupby(['first_source','first_year']).agg(cumulative_spec).reset_index()

cum_rep_by_first_chan = df4

# cum_rep_by_first_year.head() # check your work 
# cum_rep_by_first_chan.head() # check your work