In [14]:
import sys
import os
import pandas as pd

share_libs_path =os.path.abspath(os.path.join('../..', 'libs'))
# Add the path to sys.path if it's not already there
if share_libs_path not in sys.path:
    sys.path.append(share_libs_path)

try:
    import minio_helper
except ImportError:
    raise ImportError("minio_reader is not found. Please install it from the shared_code folder.")

Start reading data from minio

In [15]:
bucket_name = "csv-data-files"
purchase_csv_file = "purchases.csv"
purchases_df = minio_helper.read_csv(bucket_name, purchase_csv_file)

# s3fs client object
s3fs_client = minio_helper.minio_client()

# define output spaces for re-structured dataframe
FORMATTED_DATA_OUTPUT_BUCKET = "formatted-data-files"

In [16]:
print(purchases_df.shape)
purchases_df.head()

(71519, 11)


Unnamed: 0,event_time,product_id,category_id,category_code,brand,price,session_id,customer_id,guest_first_name,guest_surname,guest_postcode
0,2022-10-01 02:26:08+00:00,32701106,2055156924466332447,,shimano,95.21,64c68405-7002-4ce0-9604-a4c2e1f7384b,,MICHAEL,MASON,RG497ZQ
1,2022-10-01 02:28:32+00:00,9400066,2053013566067311601,,jaguar,164.2,3b7d6741-3c82-4c75-8015-6f54b52612e0,7466.0,,,
2,2022-10-01 02:31:01+00:00,1004238,2053013555631882655,electronics.smartphone,apple,1206.4,38c6d3f7-6c32-4fed-bca6-ef98e1746386,,COLE,WILKINSON,SW75TQ
3,2022-10-01 02:33:31+00:00,11300059,2053013555531219353,electronics.telephone,texet,17.48,3398c966-7846-4186-89be-323daad735b9,,MOHAMMED,RICHARDS,RG150RE
4,2022-10-01 02:40:18+00:00,17300751,2053013553853497655,,versace,77.22,11e3a573-01b9-4794-b513-e7d8a4fcac83,31266.0,,,


Profiling dataset

In [17]:
print(purchases_df.isnull().sum())

event_time              0
product_id              0
category_id             0
category_code       16739
brand                5707
price                   0
session_id              0
customer_id         18448
guest_first_name    53071
guest_surname       53071
guest_postcode      53071
dtype: int64


Do analysis

In [18]:
purchases_df['is_guest'] = purchases_df['customer_id'].isnull()

# Check if there are any invalid records that purchase is guest, but the customer_id is presented
invalid_guest_records = purchases_df[purchases_df['is_guest'] & purchases_df['customer_id'].notnull()]
invalid_non_guest_records = purchases_df[(purchases_df['is_guest'] == False) & purchases_df['customer_id'].isnull()]
if invalid_guest_records.empty & invalid_non_guest_records.empty:
    print(f"No invalid records found for guest purchases")

No invalid records found for guest purchases


Check proportion of guest and non-guest checkouts

In [19]:
purchases_df['is_guest'].value_counts(normalize=True)

is_guest
False    0.742055
True     0.257945
Name: proportion, dtype: float64

In [20]:
# Calculate the actual number of guests
guest_names = ['guest_first_name', 'guest_surname', 'guest_postcode']
guest_unique_count = len(purchases_df[guest_names].drop_duplicates())
print(f'There are {guest_unique_count} unique guests')

# Calculate the actual number of customers
unique_customers_count = len(purchases_df['customer_id'].unique())
print(f'There are {unique_customers_count} unique customers')

actual_total = guest_unique_count + unique_customers_count
print(f'Actual total of customers is: {actual_total} from the {len(purchases_df)} purchases records')

# Calculate the proportion
#NOTE: we need to subtract 1 because the null value is included in the count
guest_proportion = guest_unique_count / (actual_total - 1)
print(f'Guest proportion: {guest_proportion}')

There are 8301 unique guests
There are 24962 unique customers
Actual total of customers is: 33263 from the 71519 purchases records
Guest proportion: 0.2495640671036017


Transformation Stage: Restructuring a dataset to the common structure

In [21]:
# first, extract guest data first
guest_columns = ["guest_first_name", "guest_surname", "guest_postcode", "is_guest"]
guests_df = purchases_df.loc[ purchases_df['is_guest'] == True , guest_columns]
guests_df.drop_duplicates()
guests_df.head()

Unnamed: 0,guest_first_name,guest_surname,guest_postcode,is_guest
0,MICHAEL,MASON,RG497ZQ,True
2,COLE,WILKINSON,SW75TQ,True
3,MOHAMMED,RICHARDS,RG150RE,True
7,KIAN,MILLS,SW332TF,True
13,RUBY,OWEN,PO377YS,True


In [22]:
customer_ids_df = purchases_df.loc[purchases_df['is_guest'] == False, ['customer_id']].drop_duplicates()
customer_ids_df.head()

Unnamed: 0,customer_id
1,7466.0
4,31266.0
5,534142828.0
6,1035.0
8,6985.0


Start to combine customer_ids_df and guests_df into a final sales_customer_df

In [23]:
sales_customer_df = pd.concat([customer_ids_df, guests_df], axis=0, ignore_index=True)
sales_customer_df.head()

Unnamed: 0,customer_id,guest_first_name,guest_surname,guest_postcode,is_guest
0,7466.0,,,,
1,31266.0,,,,
2,534142828.0,,,,
3,1035.0,,,,
4,6985.0,,,,


Do data transformation for the union dataframe

In [24]:
# Start rename columns data union dataset for better structure format
column_names = ["customer_id", "first_name", "surname", "postcode", "is_guest"]
sales_customer_df = sales_customer_df.set_axis(column_names, axis=1)

# Fill all empty is_guest blank values to False
sales_customer_df['is_guest'] = sales_customer_df['is_guest'].fillna(False)

# Mark that all records from this dataframe are from the purchase source
sales_customer_df['in_purchase_data'] = True

# Do trim all spaces for strings values (we also apply lower case for names components)
for col_name in ['first_name', 'surname']:
    sales_customer_df[col_name] = sales_customer_df[col_name].str.lower().str.strip()
sales_customer_df['postcode'] = sales_customer_df['postcode'].str.strip()
filtered_df = sales_customer_df[sales_customer_df['first_name'].notnull()]
filtered_df

  sales_customer_df['is_guest'] = sales_customer_df['is_guest'].fillna(False)


Unnamed: 0,customer_id,first_name,surname,postcode,is_guest,in_purchase_data
24961,,michael,mason,RG497ZQ,True,True
24962,,cole,wilkinson,SW75TQ,True,True
24963,,mohammed,richards,RG150RE,True,True
24964,,kian,mills,SW332TF,True,True
24965,,ruby,owen,PO377YS,True,True
...,...,...,...,...,...,...
43404,,harley,thomas,M625UB,True,True
43405,,harley,thomas,M625UB,True,True
43406,,harley,thomas,M625UB,True,True
43407,,harley,thomas,M625UB,True,True


In [25]:
# Save the generated dataframe into Minio bucket for later analysis
minio_helper.upload_df_to_remote(s3fs_client, FORMATTED_DATA_OUTPUT_BUCKET, sales_customer_df, "sales_customer_df.csv")

True