In [107]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math
import json
import re

#Firstly, load the .json file to a DataFrame
#JSON text is UTF-8 by specification, so name the encoding explicitly; without it
#open() falls back to the platform's locale encoding and the same file could be
#decoded differently (or fail to decode) on another machine.
with open('transaction-data-adhoc-analysis.json', encoding='utf-8') as f:
    data = pd.read_json(f)

#Function to split multiple transactions by one buyer
def item_split_to_list(transaction_item):
    """Split a ';'-delimited transaction string into its individual items.

    Returns the list of ';'-separated pieces when the string contains at
    least one ';'. A string without any ';' is returned unchanged, as a
    plain string rather than a one-element list.
    """
    if ';' not in transaction_item:
        return transaction_item
    return transaction_item.split(';')
    
#Function to split the unique items into 'brand | item', and 'qty'
def brand_item(unique_item):
    """Build a 'brand | item' label from a comma-separated item string.

    The string is split on ','; the first two fields are joined with ' | '
    and any further fields are ignored. Raises IndexError when the string
    has fewer than two comma-separated fields.
    """
    fields = unique_item.split(',')
    brand = fields[0]
    item = fields[1]
    return brand + ' | ' + item
def qty(unique_item):
    """Return the third comma-separated field of an item string, unmodified.

    The field is returned exactly as it appears in the string (no stripping
    or numeric conversion). Raises IndexError when the string has fewer than
    three comma-separated fields.
    """
    return unique_item.split(',')[2]

#Split every 'transaction_items' string on ';' and store the result in a new column,
#so the original 'transaction_items' column is preserved
data['transaction_item_list'] = data['transaction_items'].map(item_split_to_list)

#.explode() gives each list element its own row; the result is a pandas Series in
#which the items of one transaction share that transaction's index label
transaction_items_exploded = data['transaction_item_list'].explode()

#Element-wise wrappers around the 'brand | item' and 'qty' helpers
vfunc_brand = np.vectorize(brand_item)
vfunc_qty = np.vectorize(qty)

#Build the per-item DataFrame from the arrays the two wrappers return
transaction_items_df = pd.DataFrame({
    'brand | item': vfunc_brand(transaction_items_exploded),
    'qty': vfunc_qty(transaction_items_exploded),
})

#Keep only the digits of each 'qty' string, then convert the column to integers
transaction_items_df['qty'] = (
    transaction_items_df['qty']
    .str.replace(r'[^0-9]', '', regex=True)
    .astype(str)
    .astype(int)
)

#Total quantity sold per unique 'brand | item'.
#One groupby pass sums 'qty' for every item at once, instead of walking all rows of
#'transaction_items_df' again for each item. sort=False keeps the items in order of
#first appearance, i.e. the same order .unique() would list them in.
inventory_df = (
    transaction_items_df
    .groupby('brand | item', sort=False)['qty']
    .sum()
    .rename('total qty')
    .reset_index()
)
inventory_df

Unnamed: 0,brand | item,total qty
0,Exotic Extras | Beef Chicharon,59302
1,HealthyKid 3+ | Nutrional Milk,58728
2,Candy City | Orange Beans,59406
3,HealthyKid 3+ | Gummy Vitamins,59576
4,HealthyKid 3+ | Yummy Vegetables,59429
5,Candy City | Gummy Worms,59319
6,Exotic Extras | Kimchi and Seaweed,59390


In [113]:
#Identify the unit price of each item.
#'dummy_df' starts out empty; the statements below fill it from 'data', using only
#transactions whose item string has no ';' and contains '(x1)', and taking the
#transaction value of such a row as the item's unit price.
dummy_df = pd.DataFrame()
def filter_single_items(trans_item):
    """Return trans_item if it has no ';' and contains '(x1)'; otherwise None."""
    is_single_unit = '(x1)' in trans_item and ';' not in trans_item
    return trans_item if is_single_unit else None
#Item string for rows that pass filter_single_items, None for every other row
dummy_df['brand | item'] = data['transaction_items'].map(filter_single_items)
def filter_unit_price(trans_item, value):
    """Return value if trans_item has no ';' and contains '(x1)'; otherwise None."""
    if ';' in trans_item or '(x1)' not in trans_item:
        return None
    return value
#Transaction value for rows that pass filter_unit_price, missing for every other row
dummy_df['unit price'] = data.apply(
    lambda row: filter_unit_price(row['transaction_items'], row['transaction_value']),
    axis=1,
)
#Discard rows with a missing value, then collapse repeated (item string, price) pairs
dummy_df = dummy_df.dropna().drop_duplicates(keep='first')
#Replace the raw item string with its 'brand | item' label
dummy_df['brand | item'] = dummy_df['brand | item'].map(brand_item)
dummy_df

Unnamed: 0,brand | item,unit price
3,HealthyKid 3+ | Yummy Vegetables,500.0
8,Candy City | Gummy Worms,150.0
55,Exotic Extras | Beef Chicharon,1299.0
61,Exotic Extras | Kimchi and Seaweed,799.0
142,Candy City | Orange Beans,199.0
204,HealthyKid 3+ | Gummy Vitamins,1500.0
227,HealthyKid 3+ | Nutrional Milk,1990.0


In [109]:
#Display the current contents of 'inventory_df'
inventory_df

Unnamed: 0,brand | item,total qty
0,Exotic Extras | Beef Chicharon,59302
1,HealthyKid 3+ | Nutrional Milk,58728
2,Candy City | Orange Beans,59406
3,HealthyKid 3+ | Gummy Vitamins,59576
4,HealthyKid 3+ | Yummy Vegetables,59429
5,Candy City | Gummy Worms,59319
6,Exotic Extras | Kimchi and Seaweed,59390


In [115]:
#Attach each item's unit price from 'dummy_df'. No 'on' is given, so pandas joins on
#the columns the two frames have in common. validate='one_to_one' makes the merge
#raise a MergeError instead of silently duplicating an item's row (and with it the
#item's quantity) should 'dummy_df' ever hold more than one row for the same key.
inventory_df = pd.merge(inventory_df, dummy_df, how='inner', validate='one_to_one')
inventory_df

Unnamed: 0,brand | item,total qty,unit price
0,Exotic Extras | Beef Chicharon,59302,1299.0
1,HealthyKid 3+ | Nutrional Milk,58728,1990.0
2,Candy City | Orange Beans,59406,199.0
3,HealthyKid 3+ | Gummy Vitamins,59576,1500.0
4,HealthyKid 3+ | Yummy Vegetables,59429,500.0
5,Candy City | Gummy Worms,59319,150.0
6,Exotic Extras | Kimchi and Seaweed,59390,799.0


In [117]:
def total_sale_per_item(qty,unit_price):
    """Return the sale amount for an item: qty multiplied by unit_price."""
    sale = qty * unit_price
    return sale

#Sale amount per item: the helper is called once on the whole 'total qty' and
#'unit price' columns, which multiplies them element by element
inventory_df['total sale'] = total_sale_per_item(
    inventory_df['total qty'],
    inventory_df['unit price'],
)
inventory_df

Unnamed: 0,brand | item,total qty,unit price,total sale
0,Exotic Extras | Beef Chicharon,59302,1299.0,77033298.0
1,HealthyKid 3+ | Nutrional Milk,58728,1990.0,116868720.0
2,Candy City | Orange Beans,59406,199.0,11821794.0
3,HealthyKid 3+ | Gummy Vitamins,59576,1500.0,89364000.0
4,HealthyKid 3+ | Yummy Vegetables,59429,500.0,29714500.0
5,Candy City | Gummy Worms,59319,150.0,8897850.0
6,Exotic Extras | Kimchi and Seaweed,59390,799.0,47452610.0
