### 1. Import Python Packages

In [None]:
import requests
import json
import pandas as pd
from sqlalchemy import create_engine

### 2. Retrieve Organization Data via Crunchbase API

a. Gathering data on 1000 companies located across the globe

In [None]:
# Organization-search endpoint the requests are posted to
api_url = "https://crunchbase-crunchbase-v1.p.rapidapi.com/searches/organizations"

# Fields requested for every organization in the result
requested_fields = [
    "identifier",
    "location_identifiers",
    "short_description",
    "rank_org",
]

# Location identifiers an organization may match to be included
target_regions = [
    "europe",
    "north-america",
    "latin-america",
    "asia",
    "australia",
    "new-zealand",
]

# Request body: up to 1000 results, sorted by rank_org ascending, restricted to
# the regions above and to the "company" facet
payload = dict(
    field_ids=requested_fields,
    limit=1000,
    order=[dict(field_id="rank_org", sort="asc")],
    query=[
        dict(
            field_id="location_identifiers",
            operator_id="includes",
            type="predicate",
            values=target_regions,
        ),
        dict(
            field_id="facet_ids",
            operator_id="includes",
            type="predicate",
            values=["company"],
        ),
    ],
)

# Request headers (API_KEY and HOST are presumably placeholders for real credentials)
headers = dict(
    [
        ("content-type", "application/json"),
        ("x-rapidapi-key", "API_KEY"),
        ("x-rapidapi-host", "HOST"),
    ]
)

In [None]:
# Posting the search request. A timeout is set because requests otherwise waits
# indefinitely, and raise_for_status() turns a non-2xx answer into an immediate
# error instead of a later KeyError while parsing the body.
response = requests.post(
    api_url,
    data=json.dumps(payload),
    headers=headers,
    timeout=30,
)
response.raise_for_status()

# Displaying the response object to see the HTTP status code
response

In [None]:
# Decoding the API response body from JSON text and displaying the result for inspection
json.loads(response.text)

In [None]:
# Storing the decoded response body so the following cells can read from it
api_response = json.loads(response.text)

In [None]:
# Finding the key that lists the returned organizations and storing it in a variable that we can loop through
company_list = api_response["entities"]

# Checking variable type to confirm it is a list
type(company_list)

In [None]:
# Displaying the list of organizations for inspection
company_list

In [None]:
# Initializing empty dictionary to append values to
company_details = {
    "company_name": [],
    "company_country": [],
}

# Looping through JSON response to extract required data
for company in company_list:
    properties = company["properties"]

    company_name = properties["identifier"]["value"]
    company_details["company_name"].append(company_name)
    print("company_name:", company_name)

    # location_identifiers holds several location dicts per company. Prefer the
    # entry tagged as a country (Crunchbase labels entries with location_type;
    # confirm against a live response) instead of trusting a fixed position:
    # companies with fewer or differently ordered entries would otherwise raise
    # IndexError or yield a city/region.
    locations = properties.get("location_identifiers") or []
    company_country = next(
        (loc["value"] for loc in locations if loc.get("location_type") == "country"),
        None,
    )

    # Fallback: the array position that stored the country in the inspected data
    country_dict_array_position = 2
    if company_country is None and len(locations) > country_dict_array_position:
        company_country = locations[country_dict_array_position]["value"]

    # None is appended when no country is available so both lists stay aligned
    company_details["company_country"].append(company_country)
    print("company_location:", company_country)

    print("-" * 70)

In [None]:
# Displaying the dictionary to make sure names and countries were appended correctly
company_details

In [None]:
# Building a pandas DataFrame from the extracted columns
company_df = pd.DataFrame(data=company_details)

# Previewing the first five rows
company_df.head(n=5)

In [None]:
# Writing page 1 to a CSV file without the index column
company_df.to_csv(path_or_buf='company_info_pg1.csv', index=False)

b. Paginating to page 2

In [None]:
# Changing payload to pull the next 1000 companies (page 2) via the after_id key.
# after_id is read from the last organization of page 1 rather than hard-coded:
# a fixed uuid goes stale as soon as the ranking changes. (Crunchbase search
# entities carry a top-level "uuid"; confirm against a live response.)
if not company_list:
    raise ValueError("Page 1 returned no organizations; nothing to paginate from")

# Reusing the existing search payload so the query stays identical across pages
payload = {**payload, "after_id": company_list[-1]["uuid"]}

# Posting request; fail fast on HTTP errors and never wait indefinitely
response = requests.post(
    api_url,
    data=json.dumps(payload),
    headers=headers,
    timeout=30,
)
response.raise_for_status()

In [None]:
# Displaying the page 2 response object to check for a 200 status code
response

In [None]:
# Decoding the page 2 response body to JSON and locating the key that stores the list of companies
api_output = json.loads(response.text)

pg2_company_list = api_output["entities"]

# Checking the type to confirm it is a list
type(pg2_company_list)

In [None]:
# Displaying the decoded page 2 output for inspection
api_output

In [None]:
# Looping through the page 2 list to extract data and append to dictionary
pg2_company_details = {
    "company_name": [],
    "company_country": [],
}

for company in pg2_company_list:
    properties = company["properties"]

    company_name = properties["identifier"]["value"]
    pg2_company_details["company_name"].append(company_name)
    print("company_name:", company_name)

    # Prefer the location entry tagged as a country (Crunchbase labels entries
    # with location_type; confirm against a live response). A fixed position
    # raises IndexError or returns a city/region when a company has fewer or
    # differently ordered location entries.
    locations = properties.get("location_identifiers") or []
    company_country = next(
        (loc["value"] for loc in locations if loc.get("location_type") == "country"),
        None,
    )

    # Fallback: the array position that stored the country in the inspected data
    country_value_array_location = 2
    if company_country is None and len(locations) > country_value_array_location:
        company_country = locations[country_value_array_location]["value"]

    # None is appended when no country is available so both lists stay aligned
    pg2_company_details["company_country"].append(company_country)
    print("company_country:", company_country)

    print('-' * 70)

In [None]:
# Displaying the page 2 dictionary to check the appended values
pg2_company_details

In [None]:
# Building a pandas DataFrame from the page 2 columns
pg2_companies_df = pd.DataFrame(data=pg2_company_details)

# Previewing the first five rows
pg2_companies_df.head(n=5)

In [None]:
# Writing page 2 to a CSV file without the index column
pg2_companies_df.to_csv(path_or_buf='company_info_pg2.csv', index=False)

c. Paginating to page 3

In [None]:
# Changing payload to pull the next 1000 companies (page 3) via the after_id key.
# after_id is read from the last organization of page 2 rather than hard-coded:
# a fixed uuid goes stale as soon as the ranking changes. (Crunchbase search
# entities carry a top-level "uuid"; confirm against a live response.)
if not pg2_company_list:
    raise ValueError("Page 2 returned no organizations; nothing to paginate from")

# Reusing the existing search payload so the query stays identical across pages
payload = {**payload, "after_id": pg2_company_list[-1]["uuid"]}

# Posting request; fail fast on HTTP errors and never wait indefinitely
api_response = requests.post(
    api_url,
    data=json.dumps(payload),
    headers=headers,
    timeout=30,
)
api_response.raise_for_status()

In [None]:
# Displaying the page 3 response object to check for a 200 status code
api_response

In [None]:
# Decoding the page 3 response body to JSON and locating the key that stores the list of companies

api_output = json.loads(api_response.text)

pg3_company_list = api_output["entities"]

# Checking the type to confirm it is a list
type(pg3_company_list)

In [None]:
# Displaying the decoded page 3 output for inspection
api_output

In [None]:
# Looping through pg3 to extract data and store in dictionary
pg3_company_details = {
    "company_name": [],
    "company_country": [],
}

for company in pg3_company_list:
    properties = company["properties"]

    company_name = properties["identifier"]["value"]
    pg3_company_details['company_name'].append(company_name)
    print(company_name)

    # Prefer the location entry tagged as a country (Crunchbase labels entries
    # with location_type; confirm against a live response). A fixed position
    # raises IndexError or returns a city/region when a company has fewer or
    # differently ordered location entries.
    locations = properties.get("location_identifiers") or []
    company_country = next(
        (loc["value"] for loc in locations if loc.get("location_type") == "country"),
        None,
    )

    # Fallback: the array position that stored the country in the inspected data
    country_value_array_location = 2
    if company_country is None and len(locations) > country_value_array_location:
        company_country = locations[country_value_array_location]["value"]

    # None is appended when no country is available so both lists stay aligned
    pg3_company_details['company_country'].append(company_country)
    print(company_country)

    print('-' * 70)

In [None]:
# Displaying the page 3 dictionary to check the appended values
pg3_company_details

In [None]:
# Building a pandas DataFrame from the page 3 columns
pg3_company_df = pd.DataFrame(data=pg3_company_details)

# Previewing the first five rows
pg3_company_df.head(n=5)

In [None]:
# Writing page 3 to a CSV file without the index column
pg3_company_df.to_csv(path_or_buf="company_info_pg3.csv", index=False)

d. Paginating to page 4

In [None]:
# Changing payload to pull the next 1000 companies (page 4) via the after_id key.
# after_id is read from the last organization of page 3 rather than hard-coded:
# a fixed uuid goes stale as soon as the ranking changes. (Crunchbase search
# entities carry a top-level "uuid"; confirm against a live response.)
if not pg3_company_list:
    raise ValueError("Page 3 returned no organizations; nothing to paginate from")

# Reusing the existing search payload so the query stays identical across pages
payload = {**payload, "after_id": pg3_company_list[-1]["uuid"]}

# Posting request; fail fast on HTTP errors and never wait indefinitely
api_response = requests.post(
    api_url,
    data=json.dumps(payload),
    headers=headers,
    timeout=30,
)
api_response.raise_for_status()

In [None]:
# Displaying the page 4 response object to check its status code
api_response

In [None]:
# Decoding the page 4 response body to JSON and locating the key that stores the list of companies
api_output = json.loads(api_response.text)

pg4_company_list = api_output['entities']

# Checking the type to confirm it is a list
type(pg4_company_list)

In [None]:
# Displaying the decoded page 4 output for inspection
api_output

In [None]:
# Looping through pg4 to extract data and store in dictionary
pg4_company_details = {
    "company_name": [],
    "company_country": [],
}

for company in pg4_company_list:
    properties = company["properties"]

    company_name = properties["identifier"]["value"]
    pg4_company_details['company_name'].append(company_name)
    print(company_name)

    # Prefer the location entry tagged as a country (Crunchbase labels entries
    # with location_type; confirm against a live response). A fixed position
    # raises IndexError or returns a city/region when a company has fewer or
    # differently ordered location entries.
    locations = properties.get("location_identifiers") or []
    company_country = next(
        (loc["value"] for loc in locations if loc.get("location_type") == "country"),
        None,
    )

    # Fallback: the array position that stored the country in the inspected data
    country_value_array_location = 2
    if company_country is None and len(locations) > country_value_array_location:
        company_country = locations[country_value_array_location]["value"]

    # None is appended when no country is available so both lists stay aligned
    pg4_company_details['company_country'].append(company_country)
    print(company_country)

    print('-' * 70)

In [None]:
# Displaying the page 4 dictionary to check the appended values
pg4_company_details

In [None]:
# Building a pandas DataFrame from the page 4 columns
pg4_company_df = pd.DataFrame(data=pg4_company_details)

# Previewing the first five rows
pg4_company_df.head(n=5)

In [None]:
# Writing page 4 to a CSV file without the index column
pg4_company_df.to_csv(path_or_buf="company_info_pg4.csv", index=False)

### 3. Upload Cleaned Data Into MySQL Database

a.  Upload the CountryId and RegionId fields into CountryRegions Table

In [None]:
# Loading the CSV file containing the Country and Region Id.
# read_csv already returns a DataFrame, so no extra pd.DataFrame() wrapper is needed.
country_region_df = pd.read_csv('country_region.csv')

# Previewing the first five rows
country_region_df.head()

In [None]:
# Establishing connection to MySQL Database
# USERNAME, PASSWORD, HOST and DATABASE look like placeholders for the real connection details
# NOTE(review): MySQL's "utf8" charset is usually the 3-byte utf8mb3 variant; confirm whether utf8mb4 is needed for the company names being loaded
engine = create_engine('mysql+mysqldb://USERNAME:PASSWORD@HOST/DATABASE?charset=utf8')

In [None]:
# Appending the country/region rows to the CountryRegions table (index column not written)
country_region_df.to_sql(
    name='CountryRegions',
    con=engine,
    if_exists='append',
    index=False,
)

b. Upload Account data into Accounts Table

In [None]:
# Loading the CSV file containing account data.
# Companies under the AccountName column consist of companies pulled from the Crunchbase Search API above.
# read_csv already returns a DataFrame, so no extra pd.DataFrame() wrapper is needed.
account_data_df = pd.read_csv('account_data.csv')

# Previewing the first five rows
account_data_df.head()

In [None]:
# Appending the account rows to the Accounts table (index column not written)
account_data_df.to_sql(
    name='Accounts',
    con=engine,
    if_exists='append',
    index=False,
)