In [120]:
import requests
import json
import pysolr

In [121]:
URL = 'http://localhost:8983/solr/craving/'

## Using Pysolr

In [122]:
# health check
solr = pysolr.Solr(URL, always_commit=True)
print(solr.ping())

SolrError: Failed to connect to server at http://localhost:8983/solr/craving/admin/ping/?: HTTPConnectionPool(host='localhost', port=8983): Max retries exceeded with url: /solr/craving/admin/ping/ (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7f7204cfd520>: Failed to establish a new connection: [Errno 111] Connection refused'))

In [6]:
# query
results = solr.search('\"vanilla ice cream\"')
print("Returned {0} result(s).".format(len(results)))
for result in results:
    print("The item name is '{0}'.".format(result['fields.item_name']))

Returned 10 result(s).
The item name is '['Soft Dipped Vanilla Ice Cream Bars']'.
The item name is '['Vanilla Ice Cream Bars, Soft Dipped']'.
The item name is '['Vanilla Ice Cream, Cookies & Cream']'.
The item name is '['Vanilla Ice Cream']'.
The item name is '['Vanilla Ice Cream, Cookie Dough']'.
The item name is '['Philly Vanilla Ice Cream']'.
The item name is '['Vanilla Ice Cream Sandwich']'.
The item name is '['Vanilla Ice Cream Cup']'.
The item name is '['Vanilla Ice Cream Sandwich']'.
The item name is '['Vanilla, Ice Cream']'.


In [4]:
# delete individual files by id
# solr.delete(id='f121af0a-6af0-47c7-8836-ff19f9b3d43b')


'<?xml version="1.0" encoding="UTF-8"?>\n<response>\n\n<lst name="responseHeader">\n  <int name="status">0</int>\n  <int name="QTime">97</int>\n</lst>\n</response>\n'

In [5]:

# delete individual files by query
# solr.delete(q='*')

'<?xml version="1.0" encoding="UTF-8"?>\n<response>\n\n<lst name="responseHeader">\n  <int name="status">0</int>\n  <int name="QTime">325</int>\n</lst>\n</response>\n'

## Using Requests

In [10]:
# health check
response = requests.get(URL+'admin/ping')
response.json()

{'responseHeader': {'zkConnected': None,
  'status': 0,
  'QTime': 2,
  'params': {'q': '{!lucene}*:*',
   'distrib': 'false',
   'df': '_text_',
   'rows': '10',
   'echoParams': 'all',
   'rid': '-1'}},
 'status': 'OK'}

In [7]:
# query
# JSON field: https://solr.apache.org/guide/8_11/json-request-api.html
payload = json.dumps({
    'query': 'fields.brand_name:glico, fields.item_name:pocky',   
    'limit': 5,
    'fields': 'fields.item_name, fields.nf_sugars',
    'sort': 'fields.nf_sugars desc'
    })
headers = {'Content-Type': 'application/json'}
response = requests.request("POST", URL+'query', headers=headers, data=payload)
response.json()

{'responseHeader': {'status': 0,
  'QTime': 50,
  'params': {'json': '{"query": "fields.brand_name:glico, fields.item_name:pocky", "limit": 5, "fields": "fields.item_name, fields.nf_sugars", "sort": "fields.nf_sugars desc"}'}},
 'response': {'numFound': 59,
  'start': 0,
  'numFoundExact': True,
  'docs': [{'fields.item_name': ['Pocky Biscuits, Chocolate'],
    'fields.nf_sugars': [26]},
   {'fields.item_name': ['Pocky, Biscuit Sticks, Milk Chocolate Cream'],
    'fields.nf_sugars': [25]},
   {'fields.item_name': ['Almond Peak'], 'fields.nf_sugars': [25]},
   {'fields.item_name': ['Pocky Matcha Green Tea Cream Covered Biscuit Sticks'],
    'fields.nf_sugars': [24]},
   {'fields.item_name': ['Kobe Roasted Chocolate Snack'],
    'fields.nf_sugars': [24]}]}}

In [5]:
# query
# JSON field: https://solr.apache.org/guide/8_11/json-request-api.html
payload = json.dumps({
    'query': 'item_id:57c4fe021505159a29f2d2ef'
    })
headers = {'Content-Type': 'application/json'}
response = requests.request("POST", URL+'query', headers=headers, data=payload)
response.json()

{'responseHeader': {'status': 0,
  'QTime': 26,
  'params': {'json': '{"query": "item_id:57c4fe021505159a29f2d2ef"}'}},
 'response': {'numFound': 2,
  'start': 0,
  'numFoundExact': True,
  'docs': [{'_index': ['f762ef22-e660-434f-9071-a10ea6691c27'],
    '_type': ['item'],
    '_id': ['57c4fe021505159a29f2d2ef'],
    '_score': [2.3352942],
    'fields.brand_name': ['Wegmans'],
    'fields.item_name': ['Dried Pineapple Chunks'],
    'fields.brand_id': ['51db37b0176fe9790a8983be'],
    'fields.item_id': ['57c4fe021505159a29f2d2ef'],
    'item_id': ['57c4fe021505159a29f2d2ef'],
    'fields.item_type': [2],
    'fields.nf_calories': [140],
    'fields.nf_total_fat': [0],
    'fields.nf_saturated_fat': [0.0],
    'fields.nf_trans_fatty_acid': [0],
    'fields.nf_cholesterol': [0],
    'fields.nf_sodium': [0],
    'fields.nf_total_carbohydrate': [33],
    'fields.nf_dietary_fiber': [3],
    'fields.nf_sugars': [26],
    'fields.nf_protein': [2],
    'fields.nf_servings_per_container': [4],


WIP TESTS  

Grocery: Snack Foods, Nuts & Seeds, Dietary Supplement Foods, Non-Supplement Nutritional Foods
Beverages: Coffee, Water, Soda / Flavored Drinks, Fruit & Vegetable Drinks, Drink Mixes & Flavorings, Dairy-Based Drinks (Shelf-Stable), Tea, Dairy Substitute Based Drinks (Shelf Stable), Energy Drinks, Coffee / Tea Variety Packs, Sports Drinks

In [1]:
import pandas as pd


In [108]:
df = pd.read_csv('product_categories_v2.csv') #read in file
df.loc[df['category_level_1'] != "categoryNotFound"] #remove items without category
df.drop(['category_level_1', 'category_level_2', 'category_level_5','category_level_6', 'category_level_7'], axis=1, inplace=True) #remove irrelevant category levels
df = df[df['category_level_3'].notna()] #drop empty
for col in df.columns: #strip out white space from both ends
    df[col] = df[col].str.strip()


In [109]:
df

Unnamed: 0,item_id,category_level_3,category_level_4
0,61b0b8fc4ce3c40008bf94b5,Meals (Frozen),
1,61acc46e244643000aabe8f1,Cheeses,
3,61a238616296b40008d37ee2,Prepared Fruits / Vegetables,
4,619e43edabc5600008f0d2b4,Condiments,Sauces / Dressings / Dips
9,618a7d7d8bbb540008091dbe,Prepared Fruits / Vegetables,
...,...,...,...
87705,572b9efa160628e0525d704b,Confectionery / Desserts / Sweeteners,Candy & Confectioneries (Shelf-Stable)
87707,5aab6d73e85c33764aeb1218,Confectionery / Desserts / Sweeteners,Candy & Confectioneries (Shelf-Stable)
87708,56b05ab5bcf5e2ce1bae2c09,Confectionery / Desserts / Sweeteners,Candy & Confectioneries (Shelf-Stable)
87709,555d90f2b1e133a57cd56a6e,Confectionery / Desserts / Sweeteners,Brownies & Bars (Shelf-Stable)


In [117]:
tester = df.loc[df['category_level_4'] == "Nutritional Powders"] #Non-Supplement Nutritional Foods
len(tester)
# products.values.tolist()

30

In [111]:
products = df.loc[df['category_level_3'] == "Water", 'item_id'] #Non-Supplement Nutritional Foods
products.values.tolist()

['60365befa2560e1b186f6bec',
 '600445f242a96e1d3af1cc79',
 '5f89a9355390877547ba3678',
 '5557da5ff22407741e5efd26',
 '5f44bc79b1968369292328f5',
 '552d69848483e34608cbef3c',
 '567f35fc4bba658c0f6e7ba8',
 '54f07091a5bcc6203013bda5',
 '54dff6b06df06b4e2e2bd245',
 '57b001a3daf2c23d4922df81',
 '603f9605e685523535bb13fc',
 '606f0bb2169ccdaf4f6fbcbf',
 '60608bc79a9c704062df3bbb',
 '565fe60b12f2e37f6d7d0eb4',
 '5ef05b574a00b41960bd8b4e',
 '5489b8a47c2c7c370b776d42',
 '54c816ca0bcc130b6f1b7a88',
 '554cdb864f2b93ca635d9e3c',
 '55e005ccb626b5ae307b831b',
 '55bba4180d5bbccc530febd8',
 '60365bed4499242a6e5c2939',
 '5a97a73002f3db872fbe5de8',
 '552332ae74c9001e282fbddf',
 '5afe7cdb7691e85a169e555d',
 '54b581d704e843bd1952166a',
 '5908315cb86637d13a70dd88',
 '569c7850558730481b50bce9',
 '56a98ac6c8251708020d7e71',
 '54dfbe99bb4de4a11179650c',
 '5b4d98f90a09aee53ed07605',
 '60969869ea2f3e72616c0770',
 '5938f90b8613ab613856d7f6',
 '5aab6e0f855a8ca36d597fcb',
 '54c9137d619eb3416596e4bc',
 '54c86b260bcc

In [103]:
lvl4 = df.drop(columns='item_id').loc[df['category_level_3'] == "Water"]
a = lvl4['category_level_4'].unique()
a.tolist()

['Sparkling Water', 'Still Water', 'Coconut Water', 'Distilled Water']

In [None]:
name = "water"


In [102]:
type(a)

numpy.ndarray

In [87]:
valid_categories = ["Snack Foods","Nuts & Seeds","Dietary Supplement Foods","Non-Supplement Nutritional Foods","Coffee","Water","Soda / Flavored Drinks","Fruit & Vegetable Drinks","Drink Mixes & Flavorings","Dairy-Based Drinks (Shelf-Stable)","Tea","Dairy Substitute Based Drinks (Shelf Stable)","Energy Drinks","Coffee / Tea Variety Packs","Sports Drinks"]
fdf = df.loc[df['category_level_3'].isin(valid_categories)]
# len(fdf['category_level_3'].unique())

In [88]:
len(fdf)

16311

In [89]:
fdf

Unnamed: 0,item_id,category_level_3,category_level_4
10,618a7d7c8bbb540008091dba,Snack Foods,Snack Bites & Clusters
22,6116a39594371967c00544f6,Coffee,Single-Serve Coffee Cups / Bags / Pods
25,610aade4ae6ed04c788787b6,Coffee,Ready-to-Drink Coffee
32,60d5e004436d3a00088404f6,Nuts & Seeds,Almonds
52,603e458936073c4e14da8f26,Nuts & Seeds,Pecans
...,...,...,...
87144,568d301666cf48c115e5b085,Snack Foods,"Cereal, Energy, Granola, & Protein Bars"
87148,5835404c335eddb7472e478e,Snack Foods,Fruit-Based Snacks
87162,5463671e3a1aadb564d3be79,Snack Foods,Fruit-Based Snacks
87169,54cb461edc305e7b040d9125,Snack Foods,Fruit-Based Snacks


In [92]:
ckk = fdf.groupby(['category_level_3', 'category_level_4'])
ckk.first()

Unnamed: 0_level_0,Unnamed: 1_level_0,item_id
category_level_3,category_level_4,Unnamed: 2_level_1
Coffee,Coffee Beans,5fda13b8f6be773967c7c7f1
Coffee,Ground Coffee,60d39bc36650033f0ff5e442
Coffee,Instant Coffee,5b48cc1031ef2fc16b794519
Coffee,Liquid Coffee Concentrates,5c7b7bef943e3019324f4f80
Coffee,Ready-to-Drink Coffee,610aade4ae6ed04c788787b6
...,...,...
Tea,Tea Leaves,5d204a8c13ab2f0d2167b0c3
Water,Coconut Water,5938f90b8613ab613856d7f6
Water,Distilled Water,60e483c8ca61be15d195ff70
Water,Sparkling Water,60365befa2560e1b186f6bec


In [77]:
fdf.to_csv("valid_ids")

In [80]:
fdf.drop(columns=['item_id'])
fdf.groupby(['category_level_3']).size().reset_index().rename(columns={0:'count'})

Unnamed: 0,category_level_3,count
0,Coffee,751
1,Coffee / Tea Variety Packs,3
2,Dairy Substitute Based Drinks (Shelf Stable),100
3,Dairy-Based Drinks (Shelf-Stable),209
4,Dietary Supplement Foods,465
5,Drink Mixes & Flavorings,308
6,Energy Drinks,250
7,Fruit & Vegetable Drinks,1291
8,Non-Supplement Nutritional Foods,101
9,Nuts & Seeds,1108


In [57]:
narrow

Unnamed: 0,category_level_3,category_level_4,count
0,Coffee,Coffee Beans,13
1,Coffee,Ground Coffee,282
2,Coffee,Instant Coffee,131
3,Coffee,Liquid Coffee Concentrates,5
4,Coffee,Ready-to-Drink Coffee,102
5,Coffee,Single-Serve Coffee Cups / Bags / Pods,218
6,Dairy-Based Drinks (Shelf-Stable),Milks (Shelf-Stable),24
7,Dairy-Based Drinks (Shelf-Stable),Powdered Milk-Based Drinks,20
8,Dairy-Based Drinks (Shelf-Stable),Powdered Milks,14
9,Dairy-Based Drinks (Shelf-Stable),Smoothies / Shakes (Shelf-Stable),151


In [39]:
some_values = ["Snack Foods","Nuts & Seeds","Dietary Supplement Foods","Non-Supplement Nutritional Foods","Coffee"," Water","Soda / Flavored Drinks","Fruit & Vegetable Drinks","Drink Mixes & Flavorings","Dairy-Based Drinks (Shelf-Stable)","Tea","Dairy Substitute Based Drinks (Shelf Stable)","Energy Drinks","Coffee / Tea Variety Packs","Sports Drinks"]
df2 = df.loc[df['category_level_3'].isin(some_values)]
#valid_ids = df.loc[df['category_level_3'].isin(some_values)]

In [42]:
df2.head()

Unnamed: 0,item_id,category_level_3,category_level_4,category_level_5
10,618a7d7c8bbb540008091dba,Snack Foods,Snack Bites & Clusters,
22,6116a39594371967c00544f6,Coffee,Single-Serve Coffee Cups / Bags / Pods,
25,610aade4ae6ed04c788787b6,Coffee,Ready-to-Drink Coffee,
32,60d5e004436d3a00088404f6,Nuts & Seeds,Almonds,
52,603e458936073c4e14da8f26,Nuts & Seeds,Pecans,


In [40]:
a = df2['category_level_4'].unique()
a

array(['Snack Bites & Clusters', 'Single-Serve Coffee Cups / Bags / Pods',
       'Ready-to-Drink Coffee', 'Almonds', 'Pecans',
       'Pumpkin & Squash Seeds', 'Cashews', 'Pistachios',
       'Trail & Snack Mixes', 'Cereal, Energy, Granola, & Protein Bars',
       'Dietary Supplement Drinks & Mixes', 'Chips, Puffs, & Crisps',
       'Walnuts', 'Soda', 'Crackers & Crispbreads', nan, 'Peanuts',
       'Jerky & Meat-Based Snacks', 'Dietary Supplement Foods',
       'Dipper & Dunker Combos', 'Sunflower Seeds',
       'Smoothies / Shakes (Shelf-Stable)', 'Fruit-Based Snacks',
       'Pretzels & Breadsticks', 'Dried & Baked Vegetable Snacks',
       'Nut & Seed Mixtures', 'Protein / Dietary Supplement Powders',
       'Ready to Drink Tea', 'Coffee Beans', 'Powdered Milks',
       'Powdered Milk-Based Drinks', 'Instant Tea', 'Sandwich Crackers',
       'Popcorn', 'Single-Serve Tea Pods', 'Chia', 'Corn Nuts',
       'Milks (Shelf-Stable)', 'Nutritional Foods', 'Soda Syrups',
       'Ground Co

In [41]:
b = df2['category_level_5'].unique()
b

array([nan], dtype=object)

In [2]:
df_category = pd.read_csv('product_categories_v2.csv')
df_category.head()

Unnamed: 0,item_id,category_level_1,category_level_2,category_level_3,category_level_4,category_level_5,category_level_6,category_level_7
0,61b0b8fc4ce3c40008bf94b5,Food / Beverages,Frozen Foods,Meals (Frozen),,,,
1,61acc46e244643000aabe8f1,Food / Beverages,Dairy & Egg Products,Cheeses,,,,
2,619e4484abc5600008f0d530,categoryNotFound,,,,,,
3,61a238616296b40008d37ee2,Food / Beverages,Produce,Prepared Fruits / Vegetables,,,,
4,619e43edabc5600008f0d2b4,Food / Beverages,Grocery,Condiments,Sauces / Dressings / Dips,Sauces (Shelf-Stable),,


In [10]:
df = df_category.loc[df_category['category_level_1'] != "categoryNotFound"]
# df.head()

In [11]:
for col in df.columns:
    df[col] = df[col].str.strip()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].str.strip()


In [None]:
df.head()

In [12]:
df_lim = df.loc[df['category_level_3'] == "Water"]
print(f'number of items in category: {len(df_lim)}')
df_lim.head()

number of items in category: 507


Unnamed: 0,item_id,category_level_1,category_level_2,category_level_3,category_level_4,category_level_5,category_level_6,category_level_7
56,60365befa2560e1b186f6bec,Food / Beverages,Beverages,Water,Sparkling Water,,,
90,600445f242a96e1d3af1cc79,Food / Beverages,Beverages,Water,Sparkling Water,,,
116,5f89a9355390877547ba3678,Food / Beverages,Beverages,Water,Sparkling Water,,,
146,5557da5ff22407741e5efd26,Food / Beverages,Beverages,Water,Sparkling Water,,,
182,5f44bc79b1968369292328f5,Food / Beverages,Beverages,Water,Sparkling Water,,,


In [13]:
a = df_lim['category_level_4'].unique()
a

array(['Sparkling Water', 'Still Water', 'Coconut Water',
       'Distilled Water'], dtype=object)

In [2]:
# strip whitespaces of categories
for col in df_category.columns:
    df_category[col] = df_category[col].str.strip()
df_category.head()

Unnamed: 0,item_id,category_level_1,category_level_2,category_level_3,category_level_4,category_level_5,category_level_6,category_level_7
0,61b0b8fc4ce3c40008bf94b5,Food / Beverages,Frozen Foods,Meals (Frozen),,,,
1,61acc46e244643000aabe8f1,Food / Beverages,Dairy & Egg Products,Cheeses,,,,
2,619e4484abc5600008f0d530,categoryNotFound,,,,,,
3,61a238616296b40008d37ee2,Food / Beverages,Produce,Prepared Fruits / Vegetables,,,,
4,619e43edabc5600008f0d2b4,Food / Beverages,Grocery,Condiments,Sauces / Dressings / Dips,Sauces (Shelf-Stable),,


In [20]:
import json

r = json.loads(response.text)

In [22]:
type(r)

dict

In [24]:
print(json.dumps(r, sort_keys=True, indent=4, separators=(',', ':')))

{
    "response":{
        "docs":[
            {
                "_id":[
                    "59856db67dba91b97b43db28"
                ],
                "_index":[
                    "f762ef22-e660-434f-9071-a10ea6691c27"
                ],
                "_score":[
                    8.079004
                ],
                "_type":[
                    "item"
                ],
                "_version_":1729914426594689024,
                "fields.brand_id":[
                    "51db37e5176fe9790a89a5e7"
                ],
                "fields.brand_name":[
                    "Glico"
                ],
                "fields.item_id":[
                    "59856db67dba91b97b43db28"
                ],
                "fields.item_name":[
                    "Pocky Biscuits, Chocolate"
                ],
                "fields.item_type":[
                    2
                ],
                "fields.nf_calories":[
                    121
                ],
       

In [33]:
#len(r['response'])
(r['response']['docs'])

[{'_index': ['f762ef22-e660-434f-9071-a10ea6691c27'],
  '_type': ['item'],
  '_id': ['59856db67dba91b97b43db28'],
  '_score': [8.079004],
  'fields.brand_name': ['Glico'],
  'fields.item_name': ['Pocky Biscuits, Chocolate'],
  'fields.brand_id': ['51db37e5176fe9790a89a5e7'],
  'fields.item_id': ['59856db67dba91b97b43db28'],
  'item_id': ['59856db67dba91b97b43db28'],
  'fields.item_type': [2],
  'fields.nf_ingredient_statement': ['Wheat Flour, Sugar, Cocoa Mass, Vegetable Oil (Palm Oil, Sunflower Oil, Shea Butter, Illipe Butter, Sal Butter, Hydrogenated Rapeseed Oil), Whole Milk Powder, Vegetable Shortening (Palm Oil, Soybean Oil, Coconut Oil, Palm Kernel Oil, Sunflower Oil, Hydrogenated Rapeseed Oil, Hydrogenated Palm Oil), Malt Extract, Tapioca Starch, Salt, Yeast, Emulsifiers (Soy Lecithin, Polyglycerol Polyricinoleate), Cocoa Butter, Artificial Flavors, Leavening Agents (Sodium Bicarbonate), Annatto, Seasoning Agents (Trisodium Phosphate).'],
  'fields.nf_calories': [121],
  'fields

In [None]:
def parseSolrResponse(res):
    

In [None]:
def productBuilder:
    

In [32]:
import nu_product as product

tp = product.NuProduct("59856db67dba91b97b43db28", "Pocky Biscuits, Chocolate", "Glico", 121, 8, 23, 11.2, 196, 26, 62, " Food / Beverages ", " Grocery ")
print(tp._brand_name)

Glico
