In [2]:
import requests
from tqdm.notebook import tqdm as tqdm_notebook
import pandas as pd
import os
import json
import boto3

# Yelp

## Authorization

In [17]:
# Page size for business search requests: sent as `limit` and used as the offset step.
SEARCH_LIMIT = 50

In [87]:
# Yelp API key, read from the YELP_API_KEY environment variable so the secret
# never has to be pasted into the notebook. Falls back to "" when unset.
api_key = os.environ.get("YELP_API_KEY", "")

def construct_url_params(url, params:dict):
    """
        Construct a new url by appending query params to the url.

        Keys and values are percent-encoded (spaces become '+'), so values
        containing spaces, '&' or '=' cannot corrupt the query string.

        url: base url, with or without an existing query string
        params: mapping of query parameter name -> value

        Returns: new url (the url unchanged when params is empty)
    """
    # local import keeps this cell self-contained
    from urllib.parse import urlencode

    if not params:
        return url

    # pick the joiner: start a query string, extend one, or reuse a trailing ?/&
    if '?' not in url:
        separator = '?'
    elif url.endswith(('?', '&')):
        separator = ''
    else:
        separator = '&'

    return f'{url}{separator}{urlencode(params)}'
        

def business_search(params, api_key, timeout=10):
    """
        Call the Yelp business search endpoint.

        params: dict of query parameters appended to the search url
        api_key: API key sent as a Bearer token in the Authorization header
        timeout: seconds to wait for the server before requests raises a Timeout

        Returns: tuple (data, status_code, error)
            - HTTP 200:   (parsed JSON body, 200, None)
            - otherwise:  (None, status_code, parsed JSON error body, or the raw
                           response text when the body is not valid JSON)
    """
    url = "https://api.yelp.com/v3/businesses/search?"

    headers = {
        'Accept': 'application/json',
        'Authorization': f'Bearer {api_key}'
    }

    search_url = construct_url_params(url, params)
    # without a timeout a stalled connection would hang the notebook forever
    result = requests.get(search_url, headers=headers, timeout=timeout)
    if result.status_code == 200:
        #return data, status_code, error as None
        return (result.json(), 200, None)

    #return data as None, failed status code, and the response (error)
    try:
        error = result.json()
    except ValueError:
        # error pages (e.g. from a proxy/gateway) are not always JSON
        error = result.text
    return (None, result.status_code, error)

In [88]:
# Fetch one page of results. `page` is defined here explicitly: the original relied on
# a leftover loop variable `i`, which does not exist on a fresh kernel.
page = 0
res = business_search(
    {
        'location': 'manhattan',
        'term': 'Indian restaurants',
        'offset': page * SEARCH_LIMIT,
        'limit': SEARCH_LIMIT,
    },
    api_key,
)

In [37]:
def create_folder(path='', folder_name='data'):
    """
        Create folder_name inside path and return the resulting folder path.

        path: where the folder is to be created; if empty, the current dir is used
        folder_name: name (or relative sub-path) of the folder to create

        Returns: the folder path when it exists as a directory after the call
        (newly created or already present), otherwise None after printing the error.
    """
    try:
        folder_path = os.path.join(path, folder_name)
        # exist_ok=True makes re-runs a no-op for an existing directory, but still
        # raises when the path is occupied by a regular file, so a file is never
        # reported back as a usable folder
        os.makedirs(folder_path, exist_ok=True)
    except Exception as err:
        # best effort: report the problem and let the caller check for None
        print(err)
        return None
    return folder_path

In [38]:
# Ensure the data folder exists in the current directory and keep its path
data_folder_path = create_folder(path='', folder_name='data')

## DynamoDB

In [4]:
def create_table(table_name, partition_key, region_name='us-east-1',
                 read_capacity=10, write_capacity=10):
    """
        Create a DynamoDB table with a single string partition key and print its status.

        table_name: name of the table to create
        partition_key: attribute name used as the HASH key (declared as type "S")
        region_name: AWS region the boto3 resource is created in
        read_capacity / write_capacity: provisioned throughput units

        If the table already exists (e.g. the cell is re-run), the existing table's
        status is printed instead of raising.
    """
    dynamodb = boto3.resource('dynamodb', region_name=region_name)
    try:
        table = dynamodb.create_table(
            TableName=table_name,
            KeySchema=[
                {
                    'AttributeName': partition_key,
                    'KeyType': 'HASH'  #Partition_key
                }
            ],
            AttributeDefinitions=[
                {"AttributeName": partition_key, "AttributeType": "S"}
            ],
            ProvisionedThroughput={
                'ReadCapacityUnits': read_capacity,
                'WriteCapacityUnits': write_capacity
            }
        )
    except dynamodb.meta.client.exceptions.ResourceInUseException:
        # the table is already there; report on it rather than failing the re-run
        table = dynamodb.Table(table_name)

    print("Table status:", table.table_status)


In [57]:
import boto3

# Table keyed by user_id
create_table(table_name="past-restaurant-suggestions", partition_key='user_id')

Table status: CREATING


In [84]:
import boto3

# Table keyed by business_id
create_table(table_name="yelp-restaurants", partition_key='business_id')

Table status: CREATING


In [86]:
from datetime import datetime
import time
from decimal import Decimal

# Maximum number of new restaurants written per cuisine in this run
ITEMS_PER_CUISINE = 50

all_id = []        # ids written so far, in insertion order
seen_ids = set()   # same ids as all_id, kept as a set for O(1) duplicate checks
count = 0

cuisines = ['italian', 'indian', 'french', 'chinese', 'mexican', 'thai', 'japanese']
ind_count = {cuisine: 0 for cuisine in cuisines}   # new restaurants written per cuisine

dynamodb = boto3.resource('dynamodb', region_name='us-east-1')

table = dynamodb.Table('yelp-restaurants')

# only the first three cuisines are loaded
for cuisine in tqdm_notebook(cuisines[:3]):
    #open each cuisine data file
    file_path = f"{data_folder_path}/{cuisine}.json"
    df = pd.read_json(file_path)
    print(cuisine, len(df))
    cnt = 0
    for i in tqdm_notebook(range(len(df))):
        # materialise the row once instead of re-indexing the frame per field
        row = df.iloc[i]
        business_id = row['id']

        # a business may appear under several cuisines; write it only once
        if business_id not in seen_ids:
            seen_ids.add(business_id)
            all_id.append(business_id)
            ind_count[cuisine] += 1

            location = row['location']
            coordinates = row['coordinates']

            # numbers go through Decimal(str(...)) before being stored
            data = {
                'insertedAtTimestamp': str(datetime.now()),
                'business_id': business_id,
                'name': row['name'],
                'address': ','.join(location['display_address']),
                'coordinates': {
                    'latitude': Decimal(str(coordinates['latitude'])),
                    'longitude': Decimal(str(coordinates['longitude'])),
                },
                'rating': Decimal(str(row['rating'])),
                'reviews': Decimal(str(row['review_count'])),
                'zip_code': location['zip_code'],
            }

            table.put_item(Item=data)
            cnt += 1
            # pause between writes; presumably to stay under the table's
            # provisioned write capacity (TODO confirm)
            time.sleep(0.25)

        if cnt == ITEMS_PER_CUISINE:
            break

  0%|          | 0/3 [00:00<?, ?it/s]

italian 1000


  0%|          | 0/1000 [00:00<?, ?it/s]

indian 932


  0%|          | 0/932 [00:00<?, ?it/s]

french 815


  0%|          | 0/815 [00:00<?, ?it/s]

## Elasticsearch

In [88]:
from elasticsearch import Elasticsearch, RequestsHttpConnection
from requests_aws4auth import AWS4Auth
import boto3
from elasticsearch.helpers import bulk


# Set up AWS credentials and region
# NOTE(review): awsauth is built here but the client below authenticates with
# basic auth instead; confirm which of the two is intended.
region = 'us-east-1'
service = 'es'
credentials = boto3.Session().get_credentials()
awsauth = AWS4Auth(credentials.access_key, credentials.secret_key, region, service)

# Set up Elasticsearch connection
host = 'search-es-cc-brsklwm7vfelhgxjchwyvm4nue.aos.us-east-1.on.aws'
index = 'restaurants'
doc_type = 'your-doc-type'

# Basic-auth credentials are read from the environment so the password is never
# stored in the notebook. ES_PASSWORD is required: a missing value fails fast here
# with a KeyError instead of producing an authentication error later.
es_username = os.environ.get('ES_USERNAME', 'master_test')
es_password = os.environ['ES_PASSWORD']

# Create Elasticsearch client
es = Elasticsearch(
    hosts=[{'host': host, 'port': 443}],
    http_auth=(es_username, es_password),
    use_ssl=True,
    verify_certs=True,
    connection_class=RequestsHttpConnection
)

# Index mapping: each document only carries its cuisine type
mappings = {
    "properties": {
        "cuisine_type": {"type": "text"}
    }
}

# ignore=400 lets a re-run continue when the index already exists
resp = es.indices.create(index=index, body={'mappings': mappings}, ignore=400)

# Show the index definition as stored by the cluster
resp = es.indices.get(index=index, ignore=400)
print(resp)


{'restaurants': {'aliases': {}, 'mappings': {'properties': {'cuisine_type': {'type': 'text'}}}, 'settings': {'index': {'creation_date': '1708462333791', 'number_of_shards': '5', 'number_of_replicas': '1', 'uuid': 'sQT-xkpsRlSF7lnFmbuUaQ', 'version': {'created': '7100299'}, 'provided_name': 'restaurants'}}}}


In [87]:
# resp = es.indices.delete(index=index)

In [89]:
# Maximum number of new restaurants indexed per cuisine in this run
ITEMS_PER_CUISINE = 50

documents = []     # bulk actions waiting to be sent
all_id = []        # ids indexed so far, in insertion order
seen_ids = set()   # same ids as all_id, kept as a set for O(1) duplicate checks
cuisines = ['italian', 'indian', 'french', 'chinese', 'mexican', 'thai', 'japanese']

# only the first three cuisines are indexed
for cuisine in tqdm_notebook(cuisines[:3]):
    #open each cuisine data file
    file_path = f"{data_folder_path}/{cuisine}.json"
    df = pd.read_json(file_path)
    print(cuisine, len(df))
    cnt = 0
    for business_id in tqdm_notebook(df['id'].tolist()):
        # a business may appear under several cuisines; index it only once
        if business_id not in seen_ids:
            seen_ids.add(business_id)
            all_id.append(business_id)

            documents.append({
                '_index': index,
                '_id': business_id,
                '_source': {
                    'cuisine_type': cuisine,
                }
            })

            cnt += 1

        if cnt == ITEMS_PER_CUISINE:
            break

    # Flush once per cuisine, whether or not the cap was reached. Previously a
    # cuisine with fewer than ITEMS_PER_CUISINE new ids was never sent, or was
    # carried over into the next cuisine's batch.
    if documents:
        bulk(es, documents)
        documents = []

  0%|          | 0/3 [00:00<?, ?it/s]

italian 1000


  0%|          | 0/1000 [00:00<?, ?it/s]

indian 932


  0%|          | 0/932 [00:00<?, ?it/s]

french 815


  0%|          | 0/815 [00:00<?, ?it/s]

In [90]:
# Ask the cluster how many documents the index currently holds
index_count = es.cat.count(index=index, format="json")
print(index_count)

[{'epoch': '1708462348', 'timestamp': '20:52:28', 'count': '150'}]


In [50]:
# Query settings: the field name and the number of candidates
script_query = dict(
    field="restaurant_id",
    num_candidates=1000,
)

In [91]:
# Search for documents whose cuisine_type matches "italian", returning at most 100 hits
response = es.search(
    index=index,
    q='cuisine_type:italian',
    size=100,
)