# Scraping Businesses from Yelp

In [2]:
import pandas as pd

## `16zpallagi.csv` is the IRS dataset that contains all tax return informations 
- **Note**: Every zip code has 6 rows (based on the income class rated 1 to 6)

The following image displays the income classification based on zip code
<img src= "./visuals/income_class_irs.png">

In [3]:
affluence = pd.read_csv('./datasets/16zpallagi.csv')
affluence.head()

Unnamed: 0,STATEFIPS,STATE,zipcode,agi_stub,N1,mars1,MARS2,MARS4,PREP,N2,...,N10300,A10300,N85530,A85530,N85300,A85300,N11901,A11901,N11902,A11902
0,1,AL,0,1,815440,477700,105350,221200,440830,1296920,...,367320,330066,0,0,0,0,63420,51444,711580,1831661
1,1,AL,0,2,495830,211930,142340,128890,272440,996240,...,398050,984297,0,0,0,0,74090,110889,416090,1173463
2,1,AL,0,3,263390,83420,137870,36340,154880,584000,...,253180,1349246,0,0,0,0,64000,143060,195130,543284
3,1,AL,0,4,167190,29420,124060,10610,99700,421720,...,165830,1425430,0,0,0,0,45020,128920,117410,381329
4,1,AL,0,5,217440,20240,188080,4880,129410,601040,...,216720,3922449,390,155,60,19,82940,423629,126130,506526


In [4]:
affluence.shape

(179796, 147)

In [5]:
affluence['zipcode'].nunique()

#29,874 unique zip codes

29874

**According to the documentation, we must look at agi stub.**

In [6]:
features = ['agi_stub', 'zipcode']
df = affluence[features]

In [7]:
#summarizing income class based on zip code
mother_df = df.groupby('zipcode', as_index=False)['agi_stub'].mean()

In [8]:
mother_df.head()

Unnamed: 0,zipcode,agi_stub
0,0,3.5
1,1001,3.5
2,1002,3.5
3,1003,3.4
4,1005,3.5


In [10]:
mother_df.shape

(29874, 2)

In [11]:
mother_df.dtypes

zipcode       int64
agi_stub    float64
dtype: object

In [12]:
mother_df['zipcode']=mother_df['zipcode'].apply(lambda x: '{0:0>5}'.format(x))
#shoutout to stack overflow- this code adds a zero in front of the zipcode if there are digits less than 5

In [13]:
mother_df.head()

Unnamed: 0,zipcode,agi_stub
0,0,3.5
1,1001,3.5
2,1002,3.5
3,1003,3.4
4,1005,3.5


In [14]:
#locating the index where this dataset starts the 9 of the zip code
mother_df['zipcode'][27097]

'90001'

In [15]:
mother_df = mother_df.iloc[27097:] #dropping first row 

In [16]:
mother_df.head(20)

Unnamed: 0,zipcode,agi_stub
27097,90001,3.5
27098,90002,3.5
27099,90003,3.5
27100,90004,3.5
27101,90005,3.5
27102,90006,3.5
27103,90007,3.5
27104,90008,3.5
27105,90010,3.5
27106,90011,3.5


### Note: All `agi_stub` for every zip code is 3.5. We can't proceed to use this as our target variable to predict neighborhood affluence. 

### But we will use this data to extract businesses from Yelp using the `zip_code` column. 

In [16]:
#creating a list of zip codes to set up our web scraper
zipcodes = mother_df['zipcode']

In [17]:
type(zipcodes)

pandas.core.series.Series

In [18]:
import requests
import time

#### Due to confidentiality, we will hide our Yelp API keys
- Since we have a total of three people in our group, `list_of_keys` is a list of 3 Yelp API keys. 

In [19]:
list_of_keys = []

In [20]:
#importing time since we are ethical!
import time

In [1]:
mother = []
ENDPOINT = 'https://api.yelp.com/v3/businesses/search'

i = 0
key = list_of_keys[i]
for codes in zipcodes:
    for offsetnum in range(0,150, 50):
        HEADERS = {'Authorization': 'bearer %s' % key}
        PARAMETERS = {'location': codes,
                        'limit': 50,
                        'offset': offsetnum

                            }  
        response = requests.get(url = ENDPOINT, params= PARAMETERS, headers = HEADERS)
        #print(response.status_code)
        
        #if response code is 429, we have reached our daily max; thus, this code will move onto the 2nd key
        if response.status_code == 429:
            i = i + 1
            key = list_of_keys[i]
            HEADERS = {'Authorization': 'bearer %s' % key}
            response = requests.get(url = ENDPOINT, params= PARAMETERS, headers = HEADERS)
            business_data = response.json()
            mother.extend(business_data['businesses'])
            print(f'hit {codes} using {key}') #print the specific API key we are using and the zip code we're scraping
        
        #if response code is 500, break the loop and move onto the next zip code
        elif response.status_code == 500:
            break
        #if the response code is not 500, scrape more and print the appropriate info
        else:
            business_data = response.json()
            mother.extend(business_data['businesses'])
            print(f'hit {codes} using {key}')
                


            
        #wait .3 seconds then scrape the next 50        
        time.sleep(.3)
        
#due to the confidentiality of our api keys, this function will return an error (but this is the final code for scraping)

NameError: name 'list_of_keys' is not defined

In [22]:
import pandas as pd

### The following code will get the scraped dictionary and organize the data into a pandas dataframe
- What does this mean: every row represents a single business 

In [23]:
# initialize list of dicts
post_info = []
# loop through dicts and get specific info
for b in mother:
    
    info = {}
    info['id'] = b['id']
    info['latitude'] = b['coordinates']['latitude']
    info['longitude'] = b['coordinates']['longitude']
    info['rating'] = b['rating']
    info['address'] = b['location']['address1']
    info['city'] = b['location']['city']
    info['zip_code'] = b.get('location').get('zip_code', None)
    info['phone'] = b['phone']
    info['name'] = b['name']
    info['price'] = b.get('price', None)
    #info['category1'] = b['categories'][0].get('alias', None)
    try:
        info['category1'] = b['categories'][0].get('alias', None)
        info['category2'] = b['categories'][1].get('alias', None)
        info['category3'] = b['categories'][2].get('alias', None)
        info['category4'] = b['categories'][3].get('alias', None)
        info['category5'] = b['categories'][4].get('alias', None)
    except: None
    post_info.append(info)   
# create dataframe    
post_info_df = pd.DataFrame(post_info)

In [24]:
#we have scraped approximately 277k businesses 
post_info_df.shape

(277159, 15)

In [25]:
#there's a chance there are duplicate businesses, let's drop them based on the unique ID number called id
post_info_df.drop_duplicates(subset = 'id', keep= 'last', inplace = True)

In [26]:
#total of 85,011 unique businesses 
post_info_df.shape

(85011, 15)

In [27]:
#sanity check that all duplicated businesses are dropped
post_info_df[post_info_df['id'].duplicated()]

Unnamed: 0,address,category1,category2,category3,category4,category5,city,id,latitude,longitude,name,phone,price,rating,zip_code


### Let's see what our data looks like!

In [28]:
post_info_df.head()

Unnamed: 0,address,category1,category2,category3,category4,category5,city,id,latitude,longitude,name,phone,price,rating,zip_code
71,1517 E Florence Ave,mexican,,,,,Los Angeles,RVJ7CtzHbSIXySMJ7uJw9w,33.97499,-118.24696,El Senor Taco,13235824709,$,3.5,90001
109,7600 Graham Ave,parks,playgrounds,recreation,,,Los Angeles,73_9SM3HCAPZi6VmjjY2Rg,33.9702,-118.24204,Franklin D Roosevelt Park,13235867201,,4.5,90001
111,,desserts,chocolate,,,,Los Angeles,XuDsBBkKMgyb19qrCG-Vxw,33.97363,-118.24989,Brigadier Sweets,12132811821,$$,5.0,90001
118,,foodtrucks,african,,,,Los Angeles,E02jtyN7b9LJW8bJwX97kA,33.97853,-118.2497,African Chop,14084295458,,4.5,90001
129,757 S La Brea Ave,foodtrucks,mexican,,,,Los Angeles,KWKkQHHwVBPS_4abj-DaYw,34.060716,-118.344931,Huitlacoche,13233810688,$,4.5,90017


In [29]:
#calling and saving this dataframe CA
post_info_df.to_csv('./datasets/CA')

In [33]:
# I like Ruen Pair (my go-to thai restaurant in Hollywood)
# I just wanted to make sure I scraped it!
post_info_df[post_info_df['name'] == 'Ruen Pair']

Unnamed: 0,address,category1,category2,category3,category4,category5,city,id,latitude,longitude,name,phone,price,rating,zip_code
8762,5257 Hollywood Blvd,thai,,,,,Los Angeles,UdpE_5k2c3zsGBFdcAjMGA,34.101885,-118.30513,Ruen Pair,13234660153,$$,4.0,90027


In [38]:
#printing unique number of zip codes 
post_info_df['zip_code'].nunique()

3444