## Yelp Image Data Scrapper

* Author: Peerapak Adsavakulchai
* Email: padsavak@uchicago.edu
* Purpose: University of Chicago MSCA 31009 Final Project


### Installation of the Yelp API
Run if not already installed

In [1]:
!pip install yelpapi --quiet

### Importing Packages and Dependencies

In [2]:
import requests
import json
import yelpapi
import itertools
import urllib.request
import time
import os
from google.cloud import storage

### Setting up data scrapper Parameters

* Interest: Italian restaurants
* Location: All Zip Codes in Manhattan
* Criteria: Restaurants with pictures
* Order: Not specified - receiving results with Yelp's default's params
* Result Limit: None - as many as API would allow

In [3]:
#Setting up the API Key
api_key = "WX-9PSoMpwTQk9QwiYb9YrBC_n4-vNGXz_Upii1Njd565n2ZX3xE7jrkE2ymTBhxUCEuAx4mjMhsOsGkQ-1XAmWgNfo80H0kayOzEaFshhaCO-KQo77tgSJsZ3RUZHYx"
yelp_api = yelpapi.YelpAPI(api_key)

# Define search parameters
params = {
    'term': 'italian',
    'categories': 'restaurants',
    'sort_by': 'rating',
    'photos': True
}


manhattan_zips = [
    '10026', '10027', '10030', '10037', '10039', # Central Harlem
    '10001', '10011', '10018', '10019', '10020', '10036', # Chelsea and Clinton
    '10029', '10035', # East Harlem
    '10010', '10016', '10017', '10022', # Gramercy Park and Murray Hill
    '10012', '10013', '10014', # Greenwich Village and Soho
    '10004', '10005', '10006', '10007', '10038', '10280', # Lower Manhattan
    '10002', '10003', '10009', # Lower East Side
    '10021', '10028', '10044', '10065', '10075', '10128', # Upper East Side
    '10023', '10024', '10025', # Upper West Side
    '10031', '10032', '10033', '10034', '10040' # Inwood and Washington Heights
]


### Business ID Query
* Queried based on search parameters 
* Results split based on price range (target class variable)

In [4]:
#Querying a list of business IDs according to the parameters 

biz_id = [[],[],[],[],[]]

for zip_code in manhattan_zips:
    params['location'] = 'New York City, ' + zip_code
    response = yelp_api.search_query(**params)
    
    for business in response['businesses']:

        if 'price' in business:
            if business['price'] == '$':
                biz_id[0].append(business['id'])
            elif business['price'] == '$$':
                biz_id[1].append(business['id'])
            elif business['price'] == '$$$':
                biz_id[2].append(business['id'])
            elif business['price'] == '$$$$':
                biz_id[3].append(business['id'])
        else:
            biz_id[4].append(business['id'])

In [5]:
#Check the number of businesses obtained with and without price filter 

print(len(biz_id[4])) # Number of businesses without price filter

c = 0
for i in [0,1,2,3]:
    c = c + len(biz_id[i])

c # Number of businesses with price filter

213


647

### Image URL Query
* Queried based on business IDs obtained from the previous block
* Results split based on price range (target class variable)

In [6]:
#Querying a list of image urls according to previously obtained business IDs 

pics_url = [[],[],[],[],[]]

for i in range(4):
    for j in range(0, len(biz_id[i])):
        id_ = biz_id[i][j]
        
        response = yelp_api.business_query(id_)
        pics_url[i].extend(list(itertools.chain(response['photos'])))
        
        time.sleep(0.5)

In [7]:
#Check the number of image urls with and without price filter 

print(len(pics_url[4])) # Number of urls without price filter

c = 0
for i in [0,1,2,3]:
    c = c + len(pics_url[i])

c # Number of urls with price filter

0


1941

### URL to Excel
* Saving the list of URLs to Excel in order to not have to query again after the storage refreshes

In [13]:
#Writting the resulting file to the storage in the GCP instance 
import csv


#gcloud compute scp path 31009finalprojectpak@image-scrape:remote-directory



for i in range(4):
    name = f'image-url{i}.csv'

    #file_path = path + name

    with open(name, 'w', newline='') as file:
        writer = csv.writer(file)
        
        for j in range(len(pics_url[i])):
            row = [pics_url[i][j]]
            writer.writerow(row)
        


### Saving Images in GCP buckets
* Automatically train-test splitting data with train split = 0.7 

In [17]:

train_split = 0.7

path_train0 = 'Data/Train Data/Class 0'
path_train1 = 'Data/Train Data/Class 1'
path_train2 = 'Data/Train Data/Class 2'
path_train3 = 'Data/Train Data/Class 3'

path_test0 = 'Data/Test Data/Class 0'
path_test1 = 'Data/Test Data/Class 1'
path_test2 = 'Data/Test Data/Class 2'
path_test3 = 'Data/Test Data/Class 3'

In [18]:
storage_client = storage.Client()
bucket_name = 'final-project-31009-peerapak-a'
bucket = storage_client.get_bucket(bucket_name)

In [None]:
for i in [0,1,2,3]:
    var_name_train = f'path_train{i}'
    var_name_test = f'path_test{i}'
    
    
    var_value_train = locals()[var_name_train]
    var_value_test = locals()[var_name_test]
    
    train = []
    test = []
    

    for j in range(0,len(pics_url[i])):
        if len(train) < round((train_split * len(pics_url[i]))):
            train.append(j)
            
            image_content = urllib.request.urlopen(pics_url[i][j]).read()
            filename = f'price_class{i}_image_{j}_train.jpg'
            bucket_path = os.path.join(var_value_train, filename)
            
            blob = bucket.blob(bucket_path)
            blob.upload_from_string(image_content)

        else:    
            test.append(j)
            
            image_content = urllib.request.urlopen(pics_url[i][j]).read()
            filename = f'price_class{i}_image_{j}_test.jpg'
            bucket_path = os.path.join(var_value_test, filename)
            
            blob = bucket.blob(bucket_path)
            blob.upload_from_string(image_content)
    
        
    print(f'All done for batch {i}')

All done for batch 0
All done for batch 1


In [16]:
print(len(pics_url[0]))
print(len(pics_url[1]))
print(len(pics_url[2]))
print(len(pics_url[3]))

168
1062
618
93
