# Generating Businesses Data .CSV

## Imports

In [1]:
import requests
import pandas as pd
import json
import csv
import numpy as np

with open(r'C:\Users\bmcca\.secret\yelp_api.json') as f:
    keys = json.load(f)

client_id = keys['id']
yelp_key = keys['key']

## Creating the Pull Request

In [2]:
def yelp_request_offset(term, location, yelp_key, offset=0, verbose=False):
    '''Adapted from Yelp API Lab: https://github.com/BenJMcCarty/dsc-yelp-api-lab/tree/solution'''
    
    url = 'https://api.yelp.com/v3/businesses/search'

    headers = {
            'Authorization': 'Bearer {}'.format(yelp_key),
        }

    url_params = {
                    'term': term.replace(' ', '+'),
                    'location': location.replace(' ', '+'),
                    'limit': 50,
                    'offset': offset
                        }
    
    response = requests.get(url, headers=headers, params=url_params)
    
    if verbose == True:
        print(response)
        print(type(response.text))
        print(response.text[:1000])
        
    return response.json()

In [26]:
def parse_data(list_of_data):
    '''Adapted from Tyrell's code'''  

    parsed_data = []
    
    # Loop through each business in the list of businesses
    # Add specific k:v pairs to a dictionary
    
    for business in list_of_data:
        
        # Insert NaN for missing Price values
        if 'price' not in business:
            business['price'] = np.nan
            
        details = {'Name': business['name'],
                     'Location': ' '.join(business['location']['display_address']),
                     'Business ID': business['id'],
                     'Alias': business['categories'][0]['alias'],
                     'Title': business['categories'][0]['title'],
                     'Rating': business['rating'],
                     'Review Count': business['review_count'],
                     'Price': business['price'],
                     'Latitude': business['coordinates']['latitude'],
                     'Longitude': business['coordinates']['longitude']
                    }
    
        parsed_data.append(details)
    
    # Create a DataFrame from the resulting dictionary
    
    df_parsed_data = pd.DataFrame(parsed_data)

    
    return df_parsed_data

In [13]:
def get_full_data(term, location, yelp_key, file_name='data/wineries_raw.csv'):
    '''Requests all results from Yelp API; saves as a .csv; 
    and returns a DataFrame.'''

    # Create a .csv to store results
    blank_df = pd.DataFrame()#columns= ['Name', 'Location', 'Business ID',
                                      #'Alias', 'Title', 'Rating', 
                                      #'Review Count', 'Price', 'Latitude', 
                                      #'Longitude'])
    blank_df.to_csv(file_name)
    
    # Process first request to Yelp API and calculate number of pages 
    results = yelp_request_offset(term, location, yelp_key)
    num_pages = results['total']//50+1
    
    # Print out confirmation feedback
    print(f'For {term} and {location}: ')
    print(f"    Total number of results: {results['total']}.")
    print(f'    Total number of pages: {num_pages}.')
    
    # Create offset for additional results
    offset = 0

    # Retrieves remaining pages
    for num in range(num_pages-1):
        try:
            # Process API request
            results = yelp_request_offset(term, location, yelp_key,
                                          offset=offset)
            
            # From results, take values from "Businesses" key and save
            parsed_results = parse_data(results['businesses'])
          
            # Save resulting DF to .csv from top
            parsed_results.to_csv(file_name, mode='a', index = False)
            
            # Increase offset to move to next "page" of data
            offset += 50
            
        except:
            # If error, print where the error happens
            print(f'Error on page {num}.')
            # Then save the results so far to the .csv
            parsed_results.to_csv(file_name, mode='a', index = False)


    return parsed_results

In [19]:
def get_full_data(term, location, yelp_key, file_name = 'data/wineries_raw.csv'):
    '''Requests all results from Yelp API; saves as a .csv; and returns a DataFrame.'''

    # Create a .csv to store results
    blank_df = pd.DataFrame()
    blank_df.to_csv(file_name)
    
    # Process first request to Yelp API and calculate number of pages 
    results = yelp_request_offset(term, location, yelp_key, offset=0, 
                                  verbose=False)
    num_pages = results['total']//50+1
    
    # Print out confirmation feedback
    print(f'For {term} and {location}: ')
    print(f"    Total number of results: {results['total']}.")
    print(f'    Total number of pages: {num_pages}.')
    
    # Create offset for additional results
    offset = 0

    # Retrieves remaining pages
    for num in range(num_pages-1):
        try:
            # Process API request
            results = yelp_request_offset(term, location, yelp_key,
                                          offset=offset, verbose=False)
            
            # From results, take values from "Businesses" key and save
            parsed_results = parse_data(results['businesses'])
          
            # Save resulting DF to .csv from top
            parsed_results.to_csv(file_name, mode='a', index = False)
            
            # Increase offset to move to next "page" of data
            offset += 50
            
        except:
            # If error, print where the error happens
            print(f'Error on page {num}.')
            # Then save the results so far to the .csv
            parsed_results.to_csv(file_name, mode='a', index = False)


    return parsed_results

## Cleaning Data

In [20]:
def sort_by_aliases(raw_data = 'data/wineries_raw.csv'):

    # Read in businesses
    df2 = pd.read_csv(raw_data, header = 1)

    alias_index = df2['alias'].value_counts()[:2].index

    # Filtering rows based on condition

    df3 = df2[df2['alias'].isin(alias_index)]
    
    # Resetting index
    df3.reset_index(drop=True, inplace=True)

    df3.to_csv('data/wineries_filtered_alias.csv', index = False)

    print("Saved to 'data/wineries_filtered_alias.csv'")
    
    return df3

## Testing Functions

### Get Data

In [21]:
get_full_data('winery','San Diego', yelp_key)

For winery and San Diego: 
    Total number of results: 262.
    Total number of pages: 6.
Error on page 4.


Unnamed: 0,Name,Location,Business ID,Alias,Title,Rating,Review Count,Price,Latitude,Longitude
0,BevMo!,"168 S Solana Hills Dr Solana Beach, CA 92075",m2LBkxbaDBg2VDFAW4Q9Gg,beer_and_wine,"Beer, Wine & Spirits",3.0,31,$$,32.994626,-117.257886


In [None]:
df_test = pd.read_csv('data/wineries_raw.csv', header = 1)
df_test

### Clean Data

In [None]:
df_test_clean = sort_by_aliases(raw_data = 'data/wineries_raw.csv')
df_test_clean

In [None]:
df_test2 = pd.read_csv('data/wineries_filtered_alias.csv')
df_test2

In [None]:
# list_test = df_test2.id
# list_test[list_test.str.startswith("-")]

In [None]:
# for item in df_test2['id']:
#     if item.startswith("-") == True:
#         item = None

In [None]:
def fix_csv_issue(df_var_name):
    for item in df_var_name['id']:
        if item.startswith("-") == True:
            df_var_name['id'].replace(to_replace=item, value = 0, inplace=True)
#     df_var_name = df_var_name['id'].dropna(inplace=True)
    
    return df_var_name

In [None]:
fix_csv_issue(df_test2)

In [None]:
df_test2[~df_test2['id'].isna()]

In [None]:
df_test2.dropna(axis=0, inplace=True)
df_test2

## Saving Data