<a href="https://colab.research.google.com/github/nethranatarajan3/nethranatarajan3.github.io/blob/main/portfolio_code/CC8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Working with Supermarket Big Data




In [1]:
# Import Required Libraries
import pandas as pd
import numpy as np
import altair as alt

## Loading Data



In [2]:
# Locations of the two cleaned parquet files (price observations and item metadata)
PRICES_URL = 'https://autocpi-public.s3.eu-west-2.amazonaws.com/pp434/pp434_semi_anonymised_prices.parquet'
ITEMS_URL = 'https://autocpi-public.s3.eu-west-2.amazonaws.com/pp434/pp434_semi_anonymised_items.parquet'

# Read each file straight from its URL into a DataFrame
prices_df = pd.read_parquet(PRICES_URL)
items_df = pd.read_parquet(ITEMS_URL)

In [3]:
# Preview five randomly chosen rows of the prices table
prices_df.sample(n=5)

Unnamed: 0,store_id,product_id,date,price,unit_price,loyalty_price,original_price
5936321,2,7880069,2023-10-16,3.0,,,3.0
24848679,1,309749119,2024-09-13,8.5,8.5 per 75cl,,
25355557,2,7748241,2024-06-05,9.5,£14.25 / 75cl,,9.5
20377276,1,301443856,2023-09-15,2.95,0.22 per 100ml,,
23262936,2,2390109,2023-07-22,20.0,£57.14 / ltr,,20.0


In [4]:
# Preview five randomly chosen rows of the items table
items_df.sample(n=5)

Unnamed: 0,store_id,product_id,segment_code,description
76214,4,337012011,CP0112301,COOKED HAM AND CONTINENTAL MEATS (E.G. SALAMI)
8423,9,58869,CP0118902,"SWEETS, SOFT AND HARD (EXCL. MINTS AND CHEWING..."
71987,3,1000383193344,CP0111311,"CAKES, TARTS AND SWEET PIES"
27486,3,1000383235856,CP0111310,"BISCUITS, SAVOURY"
61074,2,8191861,CP0119401,SPICES AND CULINARY HERBS (EXCL. SEEDS)


<br><br>


# Merging the data



In [5]:
# Attach item metadata (segment_code, description) to every price observation.
# Inner join: only (store_id, product_id) pairs present in both tables are kept.
join_keys = ['store_id', 'product_id']
df = prices_df.merge(items_df, how='inner', on=join_keys)

# Preview five randomly chosen rows of the merged table
df.sample(n=5)

Unnamed: 0,store_id,product_id,date,price,unit_price,loyalty_price,original_price,segment_code,description
2619358,4,591344011,2023-12-12,3.79,1.4 per 100ml,,,CP0118602,"ICE CREAM BARS, LOLLIES AND CONES"
4436004,4,110487067,2025-02-14,13.0,,,15.0,CP0213002,"BEER, LAGER"
2233576,1,312350367,2023-09-20,3.5,15.49 per kg,,,CP0117906,VEGETARIAN AND VEGAN MEAT SUBSTITUTES
578348,9,93813,2023-10-15,3.5,,,,CP0111401,BREAKFAST CEREALS
4320244,5,4061464331235,2025-03-09,6.49,£6.49 / 75cl,,6.49,CP0212104,"WINE, CHAMPAGNE AND SPARKLING"


In [6]:
# List every distinct product description present in the merged data
df['description'].unique()

array(['RICE, IN ALL FORMS (EXCL. RICE FLOUR)', 'FLOUR, WHEAT-BASED',
       'BREAD, WHITE', 'BREAD, BROWN OR SEEDED',
       'BREAD ROLLS, BUNS, BAGUETTES AND OTHER LOAVES',
       'FLATBREADS, THINS AND PITTAS',
       'BREAD SIDE DISHES (E.G. GARLIC BREAD)',
       'OTHER BREAKFAST BAKERY PRODUCTS', 'BISCUITS, SWEET',
       'BISCUITS, SAVOURY', 'CAKES, TARTS AND SWEET PIES',
       'BREAKFAST CEREALS', 'CEREAL BARS AND CEREAL-BASED SNACKS',
       'OATS AND PORRIDGE', 'PASTA AND NOODLES, DRY OR FRESH',
       'PASTA AND NOODLES, PACKET OR POT', 'COUSCOUS',
       'MEAT OF COWS, FRESH, CHILLED OR FROZEN',
       'MEAT OF PIGS, FRESH, CHILLED OR FROZEN',
       'MEAT OF GOATS, LAMBS AND SHEEP, FRESH, CHILLED OR FROZEN',
       'MEAT OF CHICKEN, FRESH, CHILLED OR FROZEN',
       'COOKED HAM AND CONTINENTAL MEATS (E.G. SALAMI)',
       'COOKED POULTRY, SLICES AND DELI FOODS',
       'PORK, DRIED, SALTED OR SMOKED',
       'SAUSAGES AND SIMILAR MEAT PRODUCTS',
       'BREADED CHICKEN AN

<br><br><br><br>

# Ice Creams Chart

Let's chart the average price of ice cream bars, lollies and cones over time.

In [9]:
# Keep only the observations in the ice cream bars, lollies and cones category
ice_cream_df = df[df['description'] == 'ICE CREAM BARS, LOLLIES AND CONES']

# That frame holds every individual price observation for the category;
# collapse it to one mean price per date so the trend can be drawn as a line
ice_cream_avg_df = ice_cream_df.groupby('date')['price'].mean().reset_index()

# Line chart of the per-date mean price, drawn with monotone interpolation
alt.Chart(ice_cream_avg_df).mark_line(interpolate='monotone').encode(
    alt.X('date:T', title=''),
    alt.Y('price:Q', title='Mean price'),
).properties(title='Average Price of Ice Cream Bars, Lollies and Cones')


## An Example: Ice Cream Prices by Store

Let's filter for ice cream products and compare their average prices across stores.

In [10]:
# Broader selection than an exact category match: keep every row whose
# description mentions "ice cream" (case-insensitive). Rows with a missing
# description are treated as non-matches.
mentions_ice_cream = df['description'].str.contains('ICE CREAM', case=False, na=False)
ice_cream_df = df[mentions_ice_cream]

In [14]:
# Mean ice cream price per store, with a display label, ordered from the
# cheapest store to the most expensive. Built as one chain with a named
# aggregation, so no columns are renamed in place after the fact.
store_avg_prices = (
    ice_cream_df
    .groupby('store_id', as_index=False)
    .agg(avg_price=('price', 'mean'))
    .assign(store_label=lambda d: 'Store ' + d['store_id'].astype(str))
    .sort_values('avg_price', ascending=True)
)

# Bar chart of the per-store averages. `alt` is already imported in the
# imports cell at the top of the notebook, so it is not re-imported here.
# sort=None keeps the bars in the frame's row order (ascending price)
# instead of Altair's default alphabetical ordering of the labels.
alt.Chart(store_avg_prices).mark_bar(
    color='#4169E1'
).encode(
    x=alt.X('store_label:N', title='Store', sort=None),
    y=alt.Y('avg_price:Q', title='Average Price (£)')
).properties(
    title='Average Ice Cream Prices by Store',
    width=500,
    height=300
)