In [1]:
import pandas as pd
import pickle

In [8]:
# import data: parse the workbook once and pull every required sheet from it
# (passing a list as sheet_name makes read_excel return a dict keyed by sheet name)
workbook_sheets = pd.read_excel(
    "data/grocery_database.xlsx",
    sheet_name = ['loyalty_scores', 'customer_details', 'transactions']
)
loyalty_scores = workbook_sheets['loyalty_scores']
customer_details = workbook_sheets['customer_details']
transactions = workbook_sheets['transactions']

In [17]:
# create customer level dataset

# attach the existing loyalty scores to customer_details
# (left join: customers without a loyalty score are kept, with NaN in the score column)
customers_summary = customer_details.merge(loyalty_scores, how = 'left', on = 'customer_id')

# aggregate transactions to one row per customer; named aggregation sets the
# output column names directly instead of renaming them by position afterwards
# NOTE(review): 'count' counts rows per customer - if transactions can hold several
# rows per transaction_id this is not the number of distinct transactions; confirm
sales_summary = (
    transactions
    .groupby('customer_id')
    .agg(
        total_sales = ('sales_cost', 'sum'),
        total_items = ('num_items', 'sum'),
        transaction_count = ('transaction_id', 'count'),
        product_area_count = ('product_area_id', 'nunique')
    )
    .reset_index()
    # average basket value = total sales divided by transaction count
    .assign(avg_basket_value = lambda summary: summary['total_sales'] / summary['transaction_count'])
)

# combine customer attributes with their sales summary
# (inner join: only customer_ids present in both frames are kept)
customers_sales = customers_summary.merge(sales_summary, how = 'inner', on = 'customer_id')
customers_sales.head()

Unnamed: 0,customer_id,distance_from_store,gender,credit_score,customer_loyalty_score,total_sales,total_items,transaction_count,product_area_count,avg_basket_value
0,74,3.38,F,0.59,0.263,2563.71,297,44,5,58.266136
1,524,4.76,F,0.52,0.298,2996.02,357,49,5,61.143265
2,607,4.45,F,0.49,0.337,2853.82,350,49,5,58.241224
3,343,0.91,M,0.54,0.873,2388.31,272,54,5,44.227963
4,322,3.02,F,0.63,0.35,2401.64,278,50,5,48.0328


In [25]:
# split customers_sales by loyalty_score
# compute the "score present" mask once and reuse its negation for the other half
has_loyalty_score = customers_sales['customer_loyalty_score'].notna()

regression_modelling = customers_sales.loc[has_loyalty_score] # loyalty_score present

regression_scoring = (
    customers_sales
    .loc[~has_loyalty_score] # loyalty_score missing
    .drop(columns = ['customer_loyalty_score']) # the score is NaN on every one of these rows
)

In [26]:
# save final dfs as pickle objects
# 'wb' = write binary; each with-block closes (and flushes) its file handle as soon
# as the dump finishes instead of leaving it open for the life of the kernel
with open('data/abc_regression_modeling.p', 'wb') as modelling_file:
    pickle.dump(regression_modelling, modelling_file)

with open('data/abc_regression_scoring.p', 'wb') as scoring_file:
    pickle.dump(regression_scoring, scoring_file)

In [27]:
# save final dfs as excel files
# map each output path to the frame written there, then export them in one loop
excel_exports = {
    'data/abc_regression_modeling.xlsx': regression_modelling,
    'data/abc_regression_scoring.xlsx': regression_scoring,
}
for excel_path, export_df in excel_exports.items():
    export_df.to_excel(excel_path)