In [1]:
import os
import pandas as pd
import pickle
import re
import sys
import numpy as np
from tqdm import tqdm
# Project root: the part of the current working directory up to and including
# the last case-insensitive occurrence of "GRE" (the leading `.*` is greedy).
# re.search returns None when the cwd contains no "GRE"; .group(1) then raises
# AttributeError, so this notebook must be started from inside that tree.
work_dir = re.search('(.*GRE).*', os.getcwd(), re.IGNORECASE).group(1)
# Put the project root and its RecStudio subdirectory on the import path.
sys.path.append(work_dir)
sys.path.append(os.path.join(work_dir, 'RecStudio'))
from RecStudio.recstudio.data import SeqDataset
from RecStudio.recstudio.data.dataset import TensorFrame
from RecStudio.recstudio.utils import parser_yaml
from RecStudio.recstudio.utils import *
from collections import defaultdict
import random
import json
import csv
# Raise the csv module's per-field size limit to sys.maxsize.
# NOTE(review): none of the visible cells read CSV — confirm this is still needed.
csv.field_size_limit(sys.maxsize)

from transformers import set_seed
# Fix the random seed through transformers' helper (see the transformers docs
# for which RNGs it covers).
set_seed(42)

def get_data_from_json_by_line(json_file_path, fields):
    """Read a JSON-lines file and collect the requested fields into a DataFrame.

    Each line of the file is parsed as an independent JSON document. A record
    contributes a row only if it is a JSON object that contains every name in
    ``fields``; every other line is skipped.

    Args:
        json_file_path: Path to a UTF-8 text file holding one JSON document
            per line.
        fields: Iterable of keys to extract from each record.

    Returns:
        pandas.DataFrame with one column per entry of ``fields`` (in that
        order) and one row per accepted record. The columns are present even
        when no record is accepted, so downstream column lookups do not fail
        on an empty result.
    """
    fields = list(fields)  # materialise once so one-shot iterables also work
    required = set(fields)  # hoisted: the original rebuilt this set per line
    data = {f: [] for f in fields}
    with open(json_file_path, 'r', encoding='utf-8') as rf:
        for line in rf:
            if not line.strip():
                # Blank line (e.g. a trailing newline at end of file).
                continue
            try:
                datum = json.loads(line)
            except Exception as e:
                # Best effort: report the malformed line and keep going.
                print(e, line)
                continue
            # Valid JSON that is not an object (list, number, string, null)
            # has no keys to look up, so it cannot supply the fields.
            if not isinstance(datum, dict) or not required.issubset(datum):
                continue
            for f in fields:
                data[f].append(datum[f])
    return pd.DataFrame(data)


  from .autonotebook import tqdm as notebook_tqdm


## Alabama

In [2]:
# US state whose metadata file is analysed in this section. Despite the name,
# this is not the per-business 'category' column that the cells below count.
category = 'Alabama'

In [3]:
# Record keys to keep from the raw metadata.
used_features = ['gmap_id', 'name', 'category']
meta_path = os.path.join(work_dir, 'data/GoogleLocalData/meta', f'{category}.json')
# item metadata
item_df = get_data_from_json_by_line(json_file_path=meta_path, fields=used_features)
# Keep only rows whose 'category' value is non-null; copy so later edits do not
# touch the unfiltered frame.
item_df = item_df.loc[item_df['category'].notna()].copy()

In [4]:
# Inspect the category column; per the output below, each value is a list of labels.
item_df['category']

0                                               [Boutique]
1                 [Baby store, Children's furniture store]
2                     [Soccer club, Service establishment]
3                                           [Fishing pier]
4                                       [Corporate office]
                               ...                        
74962                                        [Gas station]
74963                                 [E-commerce service]
74964                    [Grocery store, ATM, Supermarket]
74965    [Hotel, Indoor lodging, Meeting planning servi...
74966                         [Shopping mall, Outlet mall]
Name: category, Length: 74635, dtype: object

In [5]:
# Flatten the per-business label lists into one list, one entry per label occurrence.
all_categories = [label for labels in item_df['category'] for label in labels]
# This prints the total number of label occurrences, not the number of distinct labels.
print('num_category: ', len(all_categories))
# Occurrence count per label, keyed in order of first appearance.
num_cat = defaultdict(int)
for label in all_categories:
    num_cat[label] += 1

num_category:  169259


In [6]:
# Show the full label -> occurrence-count mapping (the output is long).
num_cat

defaultdict(int,
            {'Boutique': 256,
             'Baby store': 104,
             "Children's furniture store": 102,
             'Soccer club': 7,
             'Service establishment': 794,
             'Fishing pier': 29,
             'Corporate office': 407,
             'Fishing charter': 55,
             'Parasailing ride service': 11,
             'Tour agency': 7,
             'Movie rental kiosk': 145,
             'DVD store': 161,
             'Movie rental store': 147,
             'Flooring store': 202,
             'Carpet store': 67,
             'Rug store': 125,
             'Wood and laminate flooring supplier': 16,
             'Baptist church': 1321,
             'Wedding photographer': 45,
             'Social services organization': 255,
             'RV park': 200,
             'Herbal medicine store': 26,
             'Cannabis store': 24,
             'Plumber': 281,
             'Home builder': 213,
             'Balloon artist': 3,
             'Ball

In [9]:
# Labels containing "restaurant" (case-insensitive) with their occurrence counts.
# A business carrying several such labels contributes once per label, and
# substring matching also picks up labels such as 'Restaurant supply store'.
restaurant_counts = {
    label: cnt for label, cnt in num_cat.items() if 'restaurant' in label.lower()
}
for label, cnt in restaurant_counts.items():
    print(label, cnt)
num_rest = sum(restaurant_counts.values())
num_rest

Pizza restaurant 924
Chicken wings restaurant 564
Asian fusion restaurant 34
Dessert restaurant 54
Restaurant 4907
Fast food restaurant 2914
Breakfast restaurant 1259
Burrito restaurant 192
Lunch restaurant 232
Takeout Restaurant 1691
Mexican restaurant 937
Taco restaurant 222
Tex-Mex restaurant 189
Vegetarian restaurant 184
American restaurant 1701
Health food restaurant 88
Chicken restaurant 809
Barbecue restaurant 594
Italian restaurant 212
Cajun restaurant 118
Seafood restaurant 561
Chinese restaurant 407
Raw food restaurant 1
Vegan restaurant 44
Delivery Restaurant 306
Cheesesteak restaurant 47
Sushi restaurant 160
Restaurant supply store 26
Brunch restaurant 134
Family restaurant 807
Restaurant or cafe 2
Hamburger restaurant 1230
Creole restaurant 8
Asian restaurant 183
Fine dining restaurant 45
Soul food restaurant 72
Japanese restaurant 165
Country food restaurant 15
Pho restaurant 1
Puerto Rican restaurant 5
Vietnamese restaurant 26
Organic restaurant 8
Latin American restaura

23807

In [13]:
# Fold labels containing "canteen" into the restaurant total.
# The total is rebuilt from num_cat instead of being added onto the existing
# num_rest, so re-running this cell (or running it on a fresh kernel without
# the restaurant cell) cannot inflate or break the number.
canteen_counts = {k: v for k, v in num_cat.items() if 'canteen' in k.lower()}
for k, v in canteen_counts.items():
    print(k, v)
num_rest = (
    sum(v for k, v in num_cat.items() if 'restaurant' in k.lower())
    + sum(canteen_counts.values())
)
num_rest

23807

In [12]:
# Labels containing "cloth" (case-insensitive; covers both "clothing" and
# "clothes") with their occurrence counts.
cloth_counts = {
    label: cnt for label, cnt in num_cat.items() if 'cloth' in label.lower()
}
for label, cnt in cloth_counts.items():
    print(label, cnt)
num_cloth = sum(cloth_counts.values())
num_cloth

Clothing store 1339
Men's clothing store 453
Women's clothing store 744
Clothing alteration service 106
Children's clothing store 315
Plus size clothing store 119
Used clothing store 35
Youth clothing store 11
Protective clothing supplier 4
Work clothes store 25
Baby clothing store 72
Clothing wholesale market place 2
Clothing wholesaler 1
Outdoor clothing and equipment shop 33
Vintage clothing store 24
Beach clothing store 9
Clothes market 2
Clothes and fabric manufacturer 1
Clothing supplier 1


3296

## California

In [15]:
# State whose metadata file is analysed in this section.
category = 'California'

# Record keys to keep from the raw metadata.
used_features = ['gmap_id', 'name', 'category']
meta_path = os.path.join(work_dir, 'data/GoogleLocalData/meta', f'{category}.json')
# item metadata
item_df = get_data_from_json_by_line(json_file_path=meta_path, fields=used_features)
# Keep only rows whose 'category' value is non-null.
item_df = item_df.loc[item_df['category'].notna()].copy()

In [16]:
# Flatten the per-business label lists into one list, one entry per label occurrence.
all_categories = [label for labels in item_df['category'] for label in labels]
# Total label occurrences (not the number of distinct labels).
print('num_category: ', len(all_categories))
# Occurrence count per label, keyed in order of first appearance.
num_cat = defaultdict(int)
for label in all_categories:
    num_cat[label] += 1

num_category:  1180076


In [17]:
# Total occurrences of labels containing "restaurant" (case-insensitive).
num_rest = sum(cnt for label, cnt in num_cat.items() if 'restaurant' in label.lower())
num_rest

197256

In [18]:
# Total occurrences of labels containing "cloth" (case-insensitive).
num_cloth = sum(cnt for label, cnt in num_cat.items() if 'cloth' in label.lower())
num_cloth

23257

In [20]:
# Labels containing "fashion" are tallied under their own name, matching the
# Florida and Texas cells, so the clothing total held in num_cloth is not
# overwritten by this cell.
num_fashion = 0
for k, v in num_cat.items():
    if 'fashion' in k.lower():
        print(k, v)
        num_fashion += v
num_fashion

Fashion accessories store 3472
Fashion designer 342
Haute couture fashion house 13
Fashion design school 1


3828

## Florida

In [22]:
# State whose metadata file is analysed in this section.
category = 'Florida'

# Record keys to keep from the raw metadata.
used_features = ['gmap_id', 'name', 'category']
meta_path = os.path.join(work_dir, 'data/GoogleLocalData/meta', f'{category}.json')
# item metadata
item_df = get_data_from_json_by_line(json_file_path=meta_path, fields=used_features)
# Keep only rows whose 'category' value is non-null.
item_df = item_df.loc[item_df['category'].notna()].copy()

# Flatten the per-business label lists; the printed figure is the total number
# of label occurrences, not the number of distinct labels.
all_categories = [label for labels in item_df['category'] for label in labels]
print('num_category: ', len(all_categories))
num_cat = defaultdict(int)
for label in all_categories:
    num_cat[label] += 1

# Occurrence totals for labels matching each keyword (case-insensitive substring).
num_rest = sum(cnt for label, cnt in num_cat.items() if 'restaurant' in label.lower())
print(num_rest)

num_cloth = sum(cnt for label, cnt in num_cat.items() if 'cloth' in label.lower())
print(num_cloth)

# For "fashion" the individual labels are listed before the total.
fashion_counts = {
    label: cnt for label, cnt in num_cat.items() if 'fashion' in label.lower()
}
for label, cnt in fashion_counts.items():
    print(label, cnt)
num_fashion = sum(fashion_counts.values())
print(num_fashion)

num_category:  909522
116913
16369
Fashion accessories store 2597
Fashion designer 245
Haute couture fashion house 9
Fashion design school 3
2854


## Texas

In [23]:
# State whose metadata file is analysed in this section.
category = 'Texas'

# Record keys to keep from the raw metadata.
used_features = ['gmap_id', 'name', 'category']
meta_path = os.path.join(work_dir, 'data/GoogleLocalData/meta', f'{category}.json')
# item metadata
item_df = get_data_from_json_by_line(json_file_path=meta_path, fields=used_features)
# Keep only rows whose 'category' value is non-null.
item_df = item_df.loc[item_df['category'].notna()].copy()

# Flatten the per-business label lists; the printed figure is the total number
# of label occurrences, not the number of distinct labels.
all_categories = [label for labels in item_df['category'] for label in labels]
print('num_category: ', len(all_categories))
num_cat = defaultdict(int)
for label in all_categories:
    num_cat[label] += 1

# Occurrence totals for labels matching each keyword (case-insensitive substring).
num_rest = sum(cnt for label, cnt in num_cat.items() if 'restaurant' in label.lower())
print(num_rest)

num_cloth = sum(cnt for label, cnt in num_cat.items() if 'cloth' in label.lower())
print(num_cloth)

# For "fashion" the individual labels are listed before the total.
fashion_counts = {
    label: cnt for label, cnt in num_cat.items() if 'fashion' in label.lower()
}
for label, cnt in fashion_counts.items():
    print(label, cnt)
num_fashion = sum(fashion_counts.values())
print(num_fashion)

num_category:  1042781
151834
19158
Fashion accessories store 2964
Fashion designer 226
Haute couture fashion house 9
3199


In [24]:
# Dump the whole Texas label -> count mapping as plain text (very long output).
print(num_cat)

defaultdict(<class 'int'>, {'Convenience store': 9441, 'Transportation service': 1205, 'Pharmacy': 2046, 'Drug store': 2982, 'Medical supply store': 471, 'Vitamin & supplements store': 1503, 'Delivery service': 1048, 'Employment agency': 686, 'Auto electrical service': 686, 'Cell phone store': 5362, 'Shredding service': 588, 'Campground': 991, 'Restaurant': 29880, 'Towing service': 1444, 'Auto wrecker': 754, 'Service establishment': 6615, 'Massage therapist': 2851, 'Lutheran church': 249, 'Place of worship': 353, 'Yoga studio': 464, 'Meditation instructor': 29, 'Physical fitness program': 1384, 'Tai chi school': 18, 'Wellness program': 251, 'Loan agency': 1885, 'Medical clinic': 2925, 'Apartment building': 4552, 'Home builder': 2755, 'Computer repair service': 1050, 'City government office': 393, 'College': 278, 'Thrift store': 1004, 'Home health care service': 401, 'State government office': 134, 'Mattress store': 1581, 'Bed shop': 115, 'Criminal justice attorney': 770, 'Vacation home

In [29]:
from transformers import AutoModel, AutoTokenizer
import torch

# Local checkpoint directory shared by the model and its tokenizer.
bge_path = '/data1/home/xingmei/GRE/bge-base-en-v1.5'
model = AutoModel.from_pretrained(bge_path)
tokenizer = AutoTokenizer.from_pretrained(bge_path)
model.eval()

# Rows 0 and 1 of the embedding matrix are the two anchor words; rows 2 onward
# line up one-to-one with num_cat_keys.
num_cat_keys = list(num_cat)
texts = ['restaurant', 'food'] + num_cat_keys
inputs = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')

with torch.no_grad():
    model_output = model(**inputs)
    # Position 0 of every sequence in the first model output
    # (presumably the [CLS] vector for this checkpoint — TODO confirm).
    cls_vectors = model_output[0][:, 0]
# L2-normalise each row so that dot products are cosine similarities.
sentence_embeddings = torch.nn.functional.normalize(cls_vectors, p=2, dim=1)

In [44]:
# Similarity of every category label (rows 2+) to the anchors 'restaurant'
# (row 0) and 'food' (row 1); rows are unit-normalised, so this is cosine similarity.
sim1 = sentence_embeddings[0] @ sentence_embeddings[2:].T
sim2 = sentence_embeddings[1] @ sentence_embeddings[2:].T
num_rest = 0  # number of labels above the threshold for either anchor
no_rest = 0   # summed occurrences of matched labels lacking the word 'restaurant'
for rest_sim, food_sim, label in zip(sim1, sim2, num_cat_keys):
    is_match = rest_sim > 0.7 or food_sim > 0.7
    if not is_match:
        continue
    num_rest += 1
    if 'restaurant' in label.lower():
        continue
    # Matched by embedding only: list it so it can be reviewed by eye.
    print(rest_sim, food_sim, label)
    no_rest += num_cat[label]
print(num_rest, no_rest)


tensor(0.7084) tensor(0.6315) Bar
tensor(0.7125) tensor(0.6363) Sandwich shop
tensor(0.7978) tensor(0.6341) Cafe
tensor(0.7328) tensor(0.6503) Salad shop
tensor(0.6883) tensor(0.7013) Snack bar
tensor(0.7909) tensor(0.6455) Bistro
tensor(0.7072) tensor(0.5709) Wine bar
tensor(0.7294) tensor(0.6247) Bar & grill
tensor(0.7313) tensor(0.5552) Coffee shop
tensor(0.6153) tensor(0.7090) Food manufacturer
tensor(0.5648) tensor(0.7221) Food producer
tensor(0.7426) tensor(0.6276) Pub
tensor(0.6749) tensor(0.7223) Food court
tensor(0.6390) tensor(0.7059) Health food store
tensor(0.7041) tensor(0.5425) Cocktail bar
tensor(0.7210) tensor(0.5653) Steak house
tensor(0.7002) tensor(0.5885) Lounge
tensor(0.7040) tensor(0.6129) Noodle shop
tensor(0.7164) tensor(0.6981) Cafeteria
tensor(0.6131) tensor(0.7205) Food products supplier
tensor(0.7119) tensor(0.6141) Crêperie
tensor(0.7838) tensor(0.5908) Hotel
tensor(0.7175) tensor(0.6221) Pastry shop
tensor(0.5480) tensor(0.7267) Meat products
tensor(0.7102

In [41]:
# Embed the two clothing anchor words with the same model and pooling as the
# category labels.
anchor_words = ['clothing', 'fashion']
query = tokenizer(anchor_words, padding=True, truncation=True, return_tensors='pt')

with torch.no_grad():
    model_output = model(**query)
    cls_vectors = model_output[0][:, 0]
# L2-normalise each row so that dot products are cosine similarities.
query_embeddings = torch.nn.functional.normalize(cls_vectors, p=2, dim=1)

In [43]:
# Similarity of every category label (rows 2+ of sentence_embeddings) to the
# anchors 'clothing' (query row 0) and 'fashion' (query row 1).
sim1 = query_embeddings[0] @ sentence_embeddings[2:].T
sim2 = query_embeddings[1] @ sentence_embeddings[2:].T
num_cloth = 0  # number of labels above the threshold for either anchor
for cloth_sim, fashion_sim, label in zip(sim1, sim2, num_cat_keys):
    if not (cloth_sim > 0.7 or fashion_sim > 0.7):
        continue
    num_cloth += 1
    # Only list the labels that plain substring matching on 'cloth' would miss.
    if 'cloth' not in label.lower():
        print(cloth_sim, fashion_sim, label, num_cat[label])


tensor(0.6564) tensor(0.7115) Fashion accessories store 2964
tensor(0.6218) tensor(0.7087) Design 22
tensor(0.7394) tensor(0.5863) Laundry 434
tensor(0.7016) tensor(0.6600) Tailor 470
tensor(0.6503) tensor(0.7601) Fashion designer 226
tensor(0.7409) tensor(0.6171) Textiles 2
tensor(0.7682) tensor(0.6861) Costumes 1
