In [27]:
import requests
import pandas as pd
import json
from stats_can import StatsCan
sc = StatsCan(data_folder='../data_sources/raw_data/')

Note: to install the stats_can library, I used Anaconda Prompt with the following command:

    conda install -c conda-forge stats_can

other documentation on this library: https://stats-can.readthedocs.io/en/latest/ 

In [5]:
# test api access with the url provided in documentation
url = 'https://www150.statcan.gc.ca/t1/wds/rest/getFullTableDownloadCSV/14100287/en'

params = {
}

api_result = requests.get(url=url, params=params)

api_response = api_result.json()

print(api_response)

{'status': 'SUCCESS', 'object': 'https://www150.statcan.gc.ca/n1/tbl/csv/14100287-eng.zip'}


### Retrieve time series data for monthly retail averages (MRA)
- use fulltabledownloadcsv end point with MRA table ID

In [3]:
# retrieve data for monthly average retail prices for selected products (table id # 1810024501)
url1 = 'https://www150.statcan.gc.ca/t1/wds/rest/getFullTableDownloadCSV/18100245/en'

params = {
}

api_result = requests.get(url=url1, params=params)

api_response = api_result.json()

print(api_response)

{'status': 'SUCCESS', 'object': 'https://www150.statcan.gc.ca/n1/tbl/csv/18100245-eng.zip'}


In [4]:
# retrieve data for farm product prices, crops and livestock (table id # 3210007701)
url2 = 'https://www150.statcan.gc.ca/t1/wds/rest/getFullTableDownloadCSV/32100077/en'

params = {
}

api_result = requests.get(url=url2, params=params)

api_response = api_result.json()

print(api_response)

{'status': 'SUCCESS', 'object': 'https://www150.statcan.gc.ca/n1/tbl/csv/32100077-eng.zip'}


In [None]:
# retrieve data for meat, fruits/veg
# no API resource for AgriCanada

### Attempt to download specific time period data using "getDataFromVectorsAndLatestNPeriods" endpoint

In [None]:
# retrieve changed data for specific products (target using VectorID) over a specific time period

# POST URL = 'https://www150.statcan.gc.ca/t1/wds/rest/getDataFromVectorsAndLatestNPeriods'

# POST BODY = [{"vectorId":1353834271, "latestN":3}]


### Attempt to download data using stats_can module instead of the API


In [20]:
# download same MRA table as when using API method above, using full table ID
scdf = sc.table_to_df("181-002-45-01")

Downloading and loading table_18100245


18100245-eng.zip: 100%|██████████| 1.17M/1.17M [00:01<00:00, 663kB/s]


In [21]:
scdf.head()

Unnamed: 0,REF_DATE,GEO,DGUID,Products,UOM,UOM_ID,SCALAR_FACTOR,SCALAR_ID,VECTOR,COORDINATE,VALUE,STATUS,SYMBOL,TERMINATED,DECIMALS
0,2017-01-01,Canada,2016A000011124,"Beef stewing cuts, per kilogram",Dollars,81,units,0,v1353834271,11.1,12.66,,,,2
1,2017-01-01,Canada,2016A000011124,"Beef striploin cuts, per kilogram",Dollars,81,units,0,v1353834272,11.2,21.94,,,,2
2,2017-01-01,Canada,2016A000011124,"Beef top sirloin cuts, per kilogram",Dollars,81,units,0,v1353834273,11.3,13.44,,,,2
3,2017-01-01,Canada,2016A000011124,"Beef rib cuts, per kilogram",Dollars,81,units,0,v1353834311,11.41,20.17,,,,2
4,2017-01-01,Canada,2016A000011124,"Ground beef, per kilogram",Dollars,81,units,0,v1353834274,11.4,9.12,,,,2


In [22]:
scdf.tail()

Unnamed: 0,REF_DATE,GEO,DGUID,Products,UOM,UOM_ID,SCALAR_FACTOR,SCALAR_ID,VECTOR,COORDINATE,VALUE,STATUS,SYMBOL,TERMINATED,DECIMALS
99133,2023-10-01,British Columbia,2016A000259,"Sunflower seeds, 400 grams",Dollars,81,units,0,v1458870265,10.109,4.26,,,,2
99134,2023-10-01,British Columbia,2016A000259,"Deodorant, 85 grams",Dollars,81,units,0,v1353834714,10.75,7.42,,,,2
99135,2023-10-01,British Columbia,2016A000259,"Toothpaste, 100 millilitres",Dollars,81,units,0,v1353834715,10.76,4.06,,,,2
99136,2023-10-01,British Columbia,2016A000259,"Shampoo, 400 millilitres",Dollars,81,units,0,v1353834716,10.77,7.14,,,,2
99137,2023-10-01,British Columbia,2016A000259,"Laundry detergent, 4.43 litres",Dollars,81,units,0,v1458870250,10.11,16.48,,,,2


In [28]:
# investigate vectorID-based approach

# create list of irrelevant products
irrelevant = ['Baby food', 'Infant formula', 'Deodorant', 'Toothpaste', 'Shampoo', 'Laundry detergent']

# remove rows where the product is irrelevant

for term in irrelevant:
    scdf = scdf[~scdf.Products.str.contains(term)]

scdf

Unnamed: 0,REF_DATE,GEO,DGUID,Products,UOM,UOM_ID,SCALAR_FACTOR,SCALAR_ID,VECTOR,COORDINATE,VALUE,STATUS,SYMBOL,TERMINATED,DECIMALS
0,2017-01-01,Canada,2016A000011124,"Beef stewing cuts, per kilogram",Dollars,81,units,0,v1353834271,11.1,12.66,,,,2
1,2017-01-01,Canada,2016A000011124,"Beef striploin cuts, per kilogram",Dollars,81,units,0,v1353834272,11.2,21.94,,,,2
2,2017-01-01,Canada,2016A000011124,"Beef top sirloin cuts, per kilogram",Dollars,81,units,0,v1353834273,11.3,13.44,,,,2
3,2017-01-01,Canada,2016A000011124,"Beef rib cuts, per kilogram",Dollars,81,units,0,v1353834311,11.41,20.17,,,,2
4,2017-01-01,Canada,2016A000011124,"Ground beef, per kilogram",Dollars,81,units,0,v1353834274,11.4,9.12,,,,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99129,2023-10-01,British Columbia,2016A000259,"Pasta sauce, 650 millilitres",Dollars,81,units,0,v1458870255,10.105,3.18,,,,2
99130,2023-10-01,British Columbia,2016A000259,"Salad dressing, 475 millilitres",Dollars,81,units,0,v1458870257,10.106,3.35,,,,2
99131,2023-10-01,British Columbia,2016A000259,"Almonds, 200 grams",Dollars,81,units,0,v1458870235,10.107,4.80,,,,2
99132,2023-10-01,British Columbia,2016A000259,"Peanuts, 450 grams",Dollars,81,units,0,v1458870256,10.108,3.86,,,,2


In [29]:
# generate a list of unique products in monthly retail averages table
vector_list = scdf['VECTOR'].unique().tolist()
vector_list

['v1353834271',
 'v1353834272',
 'v1353834273',
 'v1353834311',
 'v1353834274',
 'v1353834275',
 'v1353834276',
 'v1353834312',
 'v1353834277',
 'v1353834278',
 'v1353834279',
 'v1353834313',
 'v1353834280',
 'v1353834281',
 'v1458869929',
 'v1458869931',
 'v1353834314',
 'v1353834282',
 'v1458869922',
 'v1353834283',
 'v1353834284',
 'v1353834285',
 'v1458869932',
 'v1458869923',
 'v1353834286',
 'v1353834287',
 'v1458869921',
 'v1353834288',
 'v1353834289',
 'v1353834290',
 'v1353834291',
 'v1353834292',
 'v1353834293',
 'v1353834294',
 'v1353834295',
 'v1353834296',
 'v1353834315',
 'v1353834297',
 'v1353834298',
 'v1458869934',
 'v1353834299',
 'v1353834300',
 'v1353834316',
 'v1353834317',
 'v1353834301',
 'v1353834302',
 'v1353834303',
 'v1353834304',
 'v1353834305',
 'v1353834306',
 'v1353834307',
 'v1353834308',
 'v1353834318',
 'v1353834319',
 'v1353834309',
 'v1353834310',
 'v1458869933',
 'v1458869928',
 'v1353834320',
 'v1353834321',
 'v1353834322',
 'v1353834323',
 'v13538

In [30]:
# count number of items in list
len(vector_list)

1143

In [None]:
# 

In [15]:
vdf = sc.vectors_to_df(vector_list)
vdf

Unnamed: 0_level_0,v1159446976,v1159446977,v1159446978,v1159446979,v1159446980,v1159446981,v1159446982,v1159446983,v1159446984,v1159446985,...,v1458870258,v1458870259,v1458870260,v1458870261,v1458870262,v1458870263,v1458870264,v1458870265,v1458870266,v1458870267
refPer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2023-10-01,17.45,23.89,21.61,11.78,8.04,10.26,6.4,14.99,8.11,6.6,...,4.75,26.11,4.73,7.72,4.37,3.49,4.68,4.26,2.67,9.72


In [17]:
sc.list_downloaded_tables

<function stats_can.sc.list_downloaded_tables(path=None, h5file='stats_can.h5')>

Ideas about what functions/features the tool will have:
- predict cost fluctuations in food products
    - Clarify: what LOD? Category or individual products? (individual products would not be a comprehensive resource; not all ingredients are available with StatsCan database)
    - If prediction at the categorical level:
        - users would input their menu items, including ingredients
            - if the user has an ingredient not listed in our database, they can simply select its category
        - we will use Stats Canada retail database to predict monthly price average for its selected products
        
        - our tool would break down their menu into weighted average by category (30% wheat products, 25% protein, etc)

- two stages of analysis:
    - time-series prediction:
        - use historic product price data to predict fluctuations in monthly average cost
    - farm product price data:
        - track changes in supplier sale prices
        - if suppliers start selling at higher/lower price points, that percentage change will also be applied to our tool's predicted product price
