In [2]:
import requests
import json
import pandas as pd
import time
import random
import os
import numpy as np
from google.colab import drive
# Mount Google Drive at /content/gdrive; the notebook's input/output paths live under this mount point
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
# Project folders on the mounted Drive, plus the search endpoint every request is posted to
_project_root = '/content/gdrive/MyDrive/COMP631-CravingMeter/'
input_path = _project_root + 'Tracking & Reports/'  # brand lists and the field dictionary are read from here
output_path = _project_root + 'Files/'  # downloaded items are written here
URL = "https://api.nutritionix.com/v1_1/search"

In [24]:
# Number of files sitting directly in the output folder (sub-folders are not descended into)
top_level_listing = next(os.walk(output_path))  # the first tuple yielded describes output_path itself
path, dirs, files = top_level_listing
len(files)

80724

### Get unique brand IDs: Run this if new brands are added to new_snack_brands.csv ###

**List of API Fields:** https://docs.google.com/spreadsheets/d/1TTQ8eJDViI0M1Wh8Qf0rcDiHU3euS_7WiM6MO2JPn0w/edit#gid=0

**Snack brands source:** https://today.yougov.com/ratings/food/popularity/food-snack-brands/all

In [71]:
# Load the brand list from the tracking folder
input_file = input_path + 'new_snack_brands.csv'
df_brands = pd.read_csv(input_file)
# Keep only the rows whose brand_id is still empty, renumbered from 0
df_loop = df_brands.loc[df_brands['brand_id'].isnull()].reset_index(drop=True)
df_loop

Unnamed: 0,brand,brand_id
0,Sun Chips,


In [72]:
# For every brand in df_loop: request the top-scoring items for that brand name,
# compare each returned brand name with the requested one (case-insensitively)
# and keep the brand id of the first exact match.
# If nothing matches, a message is printed and None is recorded for the brand.

num_first_return = 50  # sent as "limit"; max limit 50
page_offset = 0

# NOTE(review): the credential and session cookie below are hard-coded in the
# notebook; load them from the environment or a secrets store before sharing.
# The headers do not depend on the brand, so they are built once, outside the loop.
headers = {
  'Authorization': 'Basic YmY2YWI3NTY6NTNlZmU3YzQ1YTY1Y2RhNjdhOTllZjVmMjIxNDY5ZDY=',
  'Content-Type': 'application/json',
  'Cookie': 'session=zDEiK5uaDkWx8Vh2bBfHdg.SdSaJuX1GfMWwXbHaS08SgucHtQiAqQ3iVJ8uw6FZTc.1644622973899.86400000.kWaSXCx9djJZ_tj3_5jQfJjgPcnci2kYb3kN084x7Ho'
}

dict_brand_id = {}
for brand_name in df_loop['brand']:
    # Randomised pause between consecutive requests, never shorter than 3 seconds
    time.sleep(max(random.gauss(5, 1), 3))
    # Posted data body
    payload = json.dumps({
      "fields": [
        "brand_name",
        "brand_id",
        "item_description"
      ],

      "offset": page_offset,
      "limit": num_first_return, # max limit 50
      "sort": {
        "field": "_score",
        "order": "desc"
      },
      "queries": {
          "brand_name": brand_name
      },
      "filters": {
        "item_type": 2
      }
    })

    response = requests.request("POST", URL, headers=headers, data=payload)
    dict_json = response.json()

    brand_id = None
    if 'total' in dict_json:
        # Walk the hits that actually came back instead of indexing up to the
        # reported total, which raises IndexError when fewer hits are returned.
        for hit in dict_json.get('hits', [])[:num_first_return]:
            hit_fields = hit['fields']
            # Guard against a hit whose brand_name is missing or null: treat it as a non-match
            returned_name = hit_fields.get('brand_name') or ''
            if returned_name.lower() == brand_name.lower():
                brand_id = hit_fields['brand_id']
                break
    else:
        print('No total value was returned for brand', brand_name)
    if brand_id is None:
        print('No brand match with %s found in first %d returns' % (brand_name, num_first_return))
    else:
        print('Brand ID for %s is %s' % (brand_name, brand_id))

    dict_brand_id[brand_name] = brand_id

Brand ID for Sun Chips is 51db37d1176fe9790a899dee


In [73]:
# Write the ids collected in dict_brand_id back into the matching rows of the brand table.
# The loop variable is deliberately not named `id`: at notebook scope that name would
# keep shadowing the built-in id() for the rest of the kernel session.
for brand, found_id in dict_brand_id.items():
    df_brands.loc[df_brands['brand'] == brand, 'brand_id'] = found_id
# overwrite original csv file with new data
df_brands.to_csv(input_file, index=False)
# Manual follow-up: append the updated rows to snack_brands_merge.csv before downloading new files

### Retrieve data by brand ID ###

In [4]:
# Collect the field names listed in the 'csv column name' column of the data-dictionary workbook
df_fields = pd.read_excel(
    input_path + 'Nutritionix CSV Bulk Export 2.0 Data Dictionary [PUBLIC].xlsx'
)
lst_fields = [field_name for field_name in df_fields['csv column name'].values]

In [74]:
# load brands that have not been downloaded yet
df_brands = pd.read_csv(input_path + 'snack_brands_merge.csv')
# Assign the filtered rows to df_loop (not just display them): the download loop
# iterates df_loop, and without this it would reuse the frame left over from the
# brand-id lookup section, whose brand_id values are empty.
# The index is reset because df_loop is addressed by position 0..n-1.
df_loop = df_brands[df_brands['downloaded'].isnull()].reset_index(drop=True)
df_loop

Unnamed: 0,brand,brand_id,downloaded


In [5]:
# helper function for sending repeated post requests
def send_api_request(page_offset, brand_name, brand_id, page_limit=50,
                     file_save=True, file_path=output_path):
    """Fetch one page of search results for a brand and optionally save each hit.

    Posts a query to ``URL`` asking for every field in the notebook-level
    ``lst_fields``, sorted by descending ``_score`` and filtered to
    ``item_type`` 2 and the given ``brand_id``.

    Args:
        page_offset: Value sent as the query's "offset".
        brand_name: Value sent as the "brand_name" query; also used in messages.
        brand_id: Value sent as the "brand_id" filter.
        page_limit: Value sent as "limit" (max limit 50).
        file_save: When true, every returned hit is written to its own
            ``<_id>.json`` file.
        file_path: Directory that receives the files; works with or without
            a trailing path separator.

    Returns:
        A ``(dict_json, num_files)`` tuple: the decoded response body and the
        number of hits written to disk (0 when ``file_save`` is false or the
        response has no "hits" key).
    """
    payload = json.dumps({
      "fields": lst_fields,

      "offset": page_offset,
      "limit": page_limit, # max limit 50
      "sort": {
        "field": "_score",
        "order": "desc"
      },
    #   "min_score": 4,
      "queries": {
          "brand_name": brand_name
      },
      "filters": {
        "item_type": 2,
        "brand_id": brand_id
      }
    })

    # NOTE(review): hard-coded credential and session cookie; load them from the
    # environment or a secrets store before sharing this notebook.
    headers = {
      'Authorization': 'Basic YmY2YWI3NTY6NTNlZmU3YzQ1YTY1Y2RhNjdhOTllZjVmMjIxNDY5ZDY=',
      'Content-Type': 'application/json',
      'Cookie': 'session=zDEiK5uaDkWx8Vh2bBfHdg.SdSaJuX1GfMWwXbHaS08SgucHtQiAqQ3iVJ8uw6FZTc.1644622973899.86400000.kWaSXCx9djJZ_tj3_5jQfJjgPcnci2kYb3kN084x7Ho'
    }

    response = requests.request("POST", URL, headers=headers, data=payload)
    # json() decoder turns data into dictionary
    dict_json = response.json()

    if not file_save:
        return dict_json, 0
    if 'hits' not in dict_json:
        print('A request for brand %s has no returns.' % brand_name)
        return dict_json, 0

    # Save each returned item into a file, named by the ID field
    hits = dict_json['hits']
    for item in hits:
        # os.path.join keeps the target correct even if file_path lacks a trailing separator
        target = os.path.join(file_path, item['_id'] + '.json')
        with open(target, 'w') as outfile:
            json.dump(item, outfile)
    return dict_json, len(hits)

In [15]:
# Download every item of every brand in df_loop, page by page.
num_total = 0    # hits saved across all brands so far
num_kfiles = 0   # highest thousand already reported; starts at 0 so the first 1000 is announced
page_limit = 50  # page size sent with every request
# retrieve data by brand name and brand id
for brand_name, brand_id in zip(df_loop['brand'], df_loop['brand_id']):
    page_offset = 0
    num_brand_files = 0
    print('Retrieving files for brand:', brand_name)
    # Returns on first page
    time.sleep(max(random.gauss(5, 1), 3))
    dict_json, num_files = send_api_request(page_offset, brand_name, brand_id,
                                            page_limit=page_limit)
    num_brand_files += num_files
    # if returns more than one page, loop through every page
    if 'total' in dict_json:
        total_returns = dict_json['total']
        # Offsets page_limit, 2*page_limit, ... strictly below the total: no empty
        # trailing request is sent when the total is an exact multiple of the page size.
        for page_offset in range(page_limit, total_returns, page_limit):
            time.sleep(max(random.gauss(5, 1), 3))
            dict_json, num_files = send_api_request(page_offset, brand_name, brand_id,
                                                    page_limit=page_limit)
            num_brand_files += num_files
        # Report what was actually saved rather than the total the API announced
        print(num_brand_files, 'files have been downloaded for', brand_name)
    else:
        print('A request for brand %s has not returned a total value.' % brand_name)
    num_total += num_brand_files
    # Progress message each time the running total passes another thousand
    if num_total // 1000 > num_kfiles:
        print('A total of %d files have been downloaded.' % num_total)
        num_kfiles = num_total // 1000

Retrieving files for brand: Annies
13 files have been downloaded for Annies
Retrieving files for brand: Archway
25 files have been downloaded for Archway
Retrieving files for brand: Astor
11 files have been downloaded for Astor
Retrieving files for brand: Blue Diamond
300 files have been downloaded for Blue Diamond
Retrieving files for brand: Buc-ee's
418 files have been downloaded for Buc-ee's
Retrieving files for brand: Catalina
5 files have been downloaded for Catalina
Retrieving files for brand: Central Market
221 files have been downloaded for Central Market
Retrieving files for brand: Chicken in a Biskit
1 files have been downloaded for Chicken in a Biskit
Retrieving files for brand: Clif
268 files have been downloaded for Clif
Retrieving files for brand: Daelmans
56 files have been downloaded for Daelmans
Retrieving files for brand: Duke's
120 files have been downloaded for Duke's
Retrieving files for brand: Dunkaroos
6 files have been downloaded for Dunkaroos
Retrieving files f

### Single request for testing

In [None]:
# Ad-hoc request for inspecting the raw API response; the body is assembled from named parts
test_fields = [
    "item_name",
    "brand_name",
    "brand_id",
    "item_type",
    "item_description",
    "nf_calories",
    "nf_sodium",
    "nf_sugars"
]
test_queries = {
    # "item_name": "chocolate AND ice cream",
    "brand_name": "Andes"
}
test_filters = {
    "item_type": 2
    # optional range filters that can be enabled here:
    # "nf_calories": {"from": 0, "to": 200},
    # "nf_sodium": {"lte": 100}
}

# Posted data body (key order kept as in the original request)
payload = json.dumps({
    "fields": test_fields,
    "offset": 0,
    "limit": 15,
    "sort": {"field": "nf_sugars", "order": "asc"},
    "min_score": 0.5,
    "queries": test_queries,
    "filters": test_filters
})

headers = {
    'Authorization': 'Basic YmY2YWI3NTY6NTNlZmU3YzQ1YTY1Y2RhNjdhOTllZjVmMjIxNDY5ZDY=',
    'Content-Type': 'application/json',
    'Cookie': 'session=zDEiK5uaDkWx8Vh2bBfHdg.SdSaJuX1GfMWwXbHaS08SgucHtQiAqQ3iVJ8uw6FZTc.1644622973899.86400000.kWaSXCx9djJZ_tj3_5jQfJjgPcnci2kYb3kN084x7Ho'
}

response = requests.request("POST", URL, headers=headers, data=payload)
# Decode the JSON body and show it as the cell output
dict_json = response.json()
dict_json

{'total': 133,
 'max_score': None,
 'hits': [{'_index': 'f762ef22-e660-434f-9071-a10ea6691c27',
   '_type': 'item',
   '_id': '55d20dd0d3ca12132e60d71f',
   '_score': None,
   'sort': [0],
   'fields': {'item_name': 'Landbrot Rustic German Rye',
    'brand_name': "Andy's",
    'brand_id': '51db37c5176fe9790a89937f',
    'item_type': 2,
    'item_description': None,
    'nf_calories': 90,
    'nf_sodium': 220,
    'nf_sugars': 0}},
  {'_index': 'f762ef22-e660-434f-9071-a10ea6691c27',
   '_type': 'item',
   '_id': '5e3284d4be89478477299bf4',
   '_score': None,
   'sort': [0],
   'fields': {'item_name': 'Kabanosy',
    'brand_name': "Andy's Deli",
    'brand_id': '597ae24918e2c9951b101262',
    'item_type': 2,
    'item_description': None,
    'nf_calories': 200,
    'nf_sodium': 620,
    'nf_sugars': 0}},
  {'_index': 'f762ef22-e660-434f-9071-a10ea6691c27',
   '_type': 'item',
   '_id': '5b62b16987b39d2c40edeba2',
   '_score': None,
   'sort': [0],
   'fields': {'item_name': 'Golden Fish

https://stackabuse.com/reading-and-writing-json-to-a-file-in-python/

https://docs.python-requests.org/en/latest/

https://docs.python.org/3/library/json.html#encoders-and-decoders