In [59]:
api_key="INSERT API KEY HERE"
#above is the api key, which we won't show for security purposes

In [1]:

import requests, json, time, csv
from plotly.io import write_html
import plotly.express as px
import pandas as pd
import numpy as np
from ast import literal_eval

In [55]:
def map(API_KEY, company_name, size):
    """
    Purpose:
    Calls API with API_KEY to return size amount of employees from company of company_name.
    
    Return: 
    CSV file named "{company_name}{size}_employee.csv" if a file of that name doesn't already exist. This is so costly
    calls aren't repeated if we already have the dataset for them
    A plotly plot of the addresses of employees from company_name and the company HQ address.
    
    Parameters: 
    API_KEY is the string of your API key with PDL
    company_name is a string of the company's name
    size is an int of how many employees you want retrieved max. If you want all available employees, use -1"""
    
    #basic test cases
    if type(company_name) != str:
        raise TypeError
    if type(size) != int:
        raise TypeError
    if type(API_KEY) != str:
        raise TypeError   
    if len(API_KEY) < 5:
        raise ValueError
        
    csv_filename = company_name + str(size) + "_employees.csv"
    try:
        df = pd.read_csv(csv_filename)
    except (FileNotFoundError):
        #the following is code from PDL's Query Builder https://www.peopledatalabs.com/main/query-builder
        MAX_NUM_RECORDS = size

    # NO CHANGES NEEDED BELOW HERE
        PDL_URL = "https://api.peopledatalabs.com/v5/person/search"
        request_header = {
            "Content-Type": "application/json",
            "X-api-key": API_KEY
        }
        
        ES_QUERY = {
            "query": {
                "bool": {
                    "must": [
                        {
                            "term": {
                                "job_company_name": company_name
                            }
                        }
                    ]
                }
            }
        }

        num_records_to_request = 100
        params = {
            "dataset": "street_address",
            "query": json.dumps(ES_QUERY),
            "size": num_records_to_request,
            "pretty": True
        }
        
        # Pull all results in multiple batches
        batch = 1
        all_records = []
        start_time = time.time()
        while batch == 1 or params["scroll_token"]:
            if MAX_NUM_RECORDS != -1:
                # Update num_records_to_request
                # Compute the number of records left to pull
                num_records_to_request = MAX_NUM_RECORDS - len(all_records)
                # Clamp this number between 0 and 100
                num_records_to_request = max(0, min(num_records_to_request, 100))

            if num_records_to_request == 0:
                break

            params["size"] = num_records_to_request
            response = requests.get(PDL_URL, headers=request_header, params=params).json()

            if batch == 1:
                print(f"{response['total']} available records in this search")

            all_records.extend(response.get("data", []))
            params["scroll_token"] = response.get("scroll_token")
            print(f"Retrieved {len(response.get('data', []))} records in batch {batch}")
            batch += 1

            if params["scroll_token"]:
                time.sleep(6)   # avoid hitting rate limit thresholds
                
        end_time = time.time()
        runtime = end_time - start_time

        print(f"Successfully recovered {len(all_records)} profiles in "
              f"{batch} batches [{runtime} seconds]")
        print("status:",response["status"], "total:",response["total"])
        #there should be a test case to check status and run an error if not 200
        
        def save_profiles_to_csv(profiles, filename, fields=[], delim=","):
            """Save profiles to csv (utility function)"""

            # Define header fields
            if fields == [] and len(profiles) > 0:
                fields = profiles[0].keys()

            with open(filename, "w", encoding="utf-8") as csvfile:
                # Write csv file
                writer = csv.writer(csvfile, delimiter=delim)

                # Write Header:
                writer.writerow(fields)

                count = 0
                for profile in profiles:
                    # Write Body:
                    writer.writerow([profile[field] for field in fields])
                    count += 1
                    print(f"Wrote {count} lines to: '{filename}'")
        # Use utility function to save profiles to csv
        csv_header_fields = ['job_company_location_geo', 'job_company_location_street_address',
                                 'job_company_location_address_line_2',
                                 'location_street_address', 'location_address_line_2', 'location_geo', 'job_title', 'education']
        save_profiles_to_csv(all_records, csv_filename, csv_header_fields)
        
    df = pd.read_csv(csv_filename)
    #turns education array string into array. Extracts school
    df["education1"]=df["education"].apply(literal_eval)
    def get_school(x):
        try:
            return x[-1]['school']['name']
        except:
            return 'null'
    df["school"]=df['education1'].apply(get_school)
    df = df[df['location_geo'].notnull()]
    df['lat'] = df['location_geo'].apply(lambda s: float(s.split(',')[0]))
    df['lon'] = df['location_geo'].apply(lambda s: float(s.split(',')[1]))
    company_lat=float(df["job_company_location_geo"][0].split(',')[0])
    company_lon=float(df["job_company_location_geo"][0].split(',')[1])
    company=pd.DataFrame(np.array([[company_lat, company_lon]]), columns=['lat', 'lon'])
    # used Mapbox for map with streets
    token_name = 'pk.eyJ1IjoidHJhY3ljaGFybGVzMTA4IiwiYSI6ImNsMjJmdmUzajFmeXcza3BkOXgwOWZoNW4ifQ.bFv6klpNU8XWRbEJ0zk1Dw'
    px.set_mapbox_access_token(token_name)
    fig = px.density_mapbox(df, lat = 'lat', lon = 'lon', zoom=3, mapbox_style='carto-positron', radius = 10, hover_data=['job_title', "school"], color_continuous_scale = 'viridis')
    fig.add_densitymapbox(lat = [company_lat], lon = [company_lon], colorscale = 'Reds', radius = 15, hovertext = 'company HQ')  

    fig.update_layout(margin={"r":0, "t":0, "l":0, "b":0})
    fig.show()
    write_html(fig, company_name + '_employees.html')

In [68]:
map(api_key, "twitch", -1)

401 available records in this search
Retrieved 100 records in batch 1
Retrieved 100 records in batch 2
Retrieved 100 records in batch 3
Retrieved 100 records in batch 4
Retrieved 1 records in batch 5
Retrieved 0 records in batch 6
Successfully recovered 401 profiles in 7 batches [40.91098380088806 seconds]
status: 404 total: 401
Wrote 1 lines to: 'twitch-1_employees.csv'
Wrote 2 lines to: 'twitch-1_employees.csv'
Wrote 3 lines to: 'twitch-1_employees.csv'
Wrote 4 lines to: 'twitch-1_employees.csv'
Wrote 5 lines to: 'twitch-1_employees.csv'
Wrote 6 lines to: 'twitch-1_employees.csv'
Wrote 7 lines to: 'twitch-1_employees.csv'
Wrote 8 lines to: 'twitch-1_employees.csv'
Wrote 9 lines to: 'twitch-1_employees.csv'
Wrote 10 lines to: 'twitch-1_employees.csv'
Wrote 11 lines to: 'twitch-1_employees.csv'
Wrote 12 lines to: 'twitch-1_employees.csv'
Wrote 13 lines to: 'twitch-1_employees.csv'
Wrote 14 lines to: 'twitch-1_employees.csv'
Wrote 15 lines to: 'twitch-1_employees.csv'
Wrote 16 lines to:

In [60]:
map(api_key, "google", 1000)

7447 available records in this search
Retrieved 100 records in batch 1
Retrieved 100 records in batch 2
Retrieved 100 records in batch 3
Retrieved 100 records in batch 4
Retrieved 100 records in batch 5
Retrieved 100 records in batch 6
Retrieved 100 records in batch 7
Retrieved 100 records in batch 8
Retrieved 100 records in batch 9
Retrieved 100 records in batch 10
Successfully recovered 1000 profiles in 11 batches [95.61114978790283 seconds]
status: 200 total: 7447
Wrote 1 lines to: 'google1000_employees.csv'
Wrote 2 lines to: 'google1000_employees.csv'
Wrote 3 lines to: 'google1000_employees.csv'
Wrote 4 lines to: 'google1000_employees.csv'
Wrote 5 lines to: 'google1000_employees.csv'
Wrote 6 lines to: 'google1000_employees.csv'
Wrote 7 lines to: 'google1000_employees.csv'
Wrote 8 lines to: 'google1000_employees.csv'
Wrote 9 lines to: 'google1000_employees.csv'
Wrote 10 lines to: 'google1000_employees.csv'
Wrote 11 lines to: 'google1000_employees.csv'
Wrote 12 lines to: 'google1000_

In [62]:
map(api_key, "netflix", -1)

1197 available records in this search
Retrieved 100 records in batch 1
Retrieved 100 records in batch 2
Retrieved 100 records in batch 3
Retrieved 100 records in batch 4
Retrieved 100 records in batch 5
Retrieved 100 records in batch 6
Retrieved 100 records in batch 7
Retrieved 100 records in batch 8
Retrieved 100 records in batch 9
Retrieved 100 records in batch 10
Retrieved 100 records in batch 11
Retrieved 97 records in batch 12
Retrieved 0 records in batch 13
Successfully recovered 1197 profiles in 14 batches [110.40964937210083 seconds]
status: 404 total: 1197
Wrote 1 lines to: 'netflix-1_employees.csv'
Wrote 2 lines to: 'netflix-1_employees.csv'
Wrote 3 lines to: 'netflix-1_employees.csv'
Wrote 4 lines to: 'netflix-1_employees.csv'
Wrote 5 lines to: 'netflix-1_employees.csv'
Wrote 6 lines to: 'netflix-1_employees.csv'
Wrote 7 lines to: 'netflix-1_employees.csv'
Wrote 8 lines to: 'netflix-1_employees.csv'
Wrote 9 lines to: 'netflix-1_employees.csv'
Wrote 10 lines to: 'netflix-1_em