## Mapping Tech Companies in New York City

### Joyce Lee

Import necessary libraries

In [2]:
% matplotlib inline
from googleplaces import GooglePlaces, types, lang
import googlemaps
from datetime import datetime
import geopandas as gpd
import pandas as pd
import numpy as np
from shapely.geometry import Point
import geopy


import matplotlib.pyplot as plt
import re

In [3]:
# The API key is kept out of the notebook: it is read from the first line of
# a local file, taking the field that follows the first ":" on that line.
with open("API_joyce_ignore", "r") as key_file:
    lines = key_file.readlines()
google_places_api = lines[0].split(":")[1].strip()

# The same key is used for both Google clients.
API_KEY = google_places_api
google_places = GooglePlaces(API_KEY)
gmaps = googlemaps.Client(key=API_KEY)

### Finding most valuable tech companies in the US

Use BeautifulSoup to scrape a Business Insider article to get a list of the top 21 most valuable American tech companies. The web-scraping functions are taken from the following website: https://realpython.com/python-web-scraping-practical-introduction/

In [4]:
# Web-scraping helpers used to fetch the article listing the most valuable tech companies in America.

from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup

def simple_get(url):
    """
    Fetch `url` with an HTTP GET request.

    Returns the response's `content` when `is_good_response` accepts the
    response, and None otherwise. A failed request is reported through
    `log_error` and also yields None.
    """
    try:
        with closing(get(url, stream=True)) as response:
            return response.content if is_good_response(response) else None
    except RequestException as err:
        log_error('Error during requests to {0} : {1}'.format(url, str(err)))
        return None


def is_good_response(resp):
    """
    Returns True if the response seems to be HTML, False otherwise.

    A response counts as HTML when its status code is 200 and its
    Content-Type header contains 'html' (case-insensitive). A response
    with no Content-Type header at all is rejected rather than raising.
    """
    # .get() instead of [...]: a missing header must mean "not HTML",
    # not a KeyError.
    content_type = resp.headers.get('Content-Type')
    if content_type is None:
        return False
    return resp.status_code == 200 and 'html' in content_type.lower()


def log_error(e):
    """
    Report an error.

    The error is simply printed; replace the body with a real logger
    if something more durable is needed.
    """
    print(e)

If you view the page source of the Business Insider article, you can see that each company name is contained within an H2 tag with the class 'slide-title-text'. Use BeautifulSoup to find all the elements that meet these criteria, then clean up the list to get the names of the companies.

In [5]:
# Download the Business Insider slideshow and parse it.
raw_html = simple_get('http://www.businessinsider.com/amazon-google-apple-most-valuable-tech-brands-america-2018-6')
soup = BeautifulSoup(raw_html, 'html.parser')

# Company names live in <h2 class="slide-title-text"> elements.
BI_companies = soup.find_all("h2", class_="slide-title-text")

# Keep the piece of each heading's text that sits between its first and
# second "." (the headings appear to be numbered, e.g. "1. Name").
BI_company_list = [heading.text.split(".")[1].strip() for heading in BI_companies]
print(BI_company_list)

['Qualcomm', 'Western Digital', 'eBay', 'Adobe', 'HPE', '3M', 'HP', 'booking', 'NETFLIX', 'Dell', 'Uber', 'Cisco', 'Intel', 'YouTube', 'Oracle', 'IBM', 'Microsoft', 'Facebook', 'Google', 'Apple', 'Amazon']


Now, define a function that iterates through the list of companies, and for each company query google_places with a text search for its office in New York. This query returns an array of 'Places'. Store these results in a dict, where the key is the name of the company and the value is the array of 'Places'.

Next, the verify_places function takes this dictionary and returns a DataFrame. It iterates through the dictionary and, for each company, goes through the array of Places and attempts to verify whether each 'Place' really is affiliated with the company. A simple way to do this is to check whether the name of the company appears in the 'Place' name, and whether the substring '<company name>.com' appears in the 'Place' website. If both of these criteria are met, then the Place is likely to be truly associated with the company, and an entry for it is added to the DataFrame. Once the function has iterated through the entire dictionary, the populated DataFrame is returned.

Finally, the make_geodf function turns the dataframe into a geo-dataframe, so that this can then be written to a geojson file.

In [None]:
def make_company_places_dict(company_list):
    """
    Run one Google Places text search per company.

    Each name in `company_list` is searched for as
    '<name> office in New York City' through the module-level
    `google_places` client. Returns a dict mapping the company name to
    whatever `text_search` returned for it.
    """
    query_template = '{} office in New York City'
    return {
        name: google_places.text_search(query=query_template.format(name))
        for name in company_list
    }

def verify_places(company_place_dict):
    """
    Keep only the search hits that look like they belong to the company.

    `company_place_dict` maps a company name to a search result exposing
    a `.places` iterable (as built by `make_company_places_dict`). A place
    is accepted when the company name occurs in the place name AND
    '<company name>.com' occurs in the place website, both compared
    case-insensitively.

    Returns a pandas DataFrame with the columns 'company_name',
    'latitude' and 'longitude', one row per accepted place. The frame has
    zero rows (but the same columns) when nothing is accepted.
    """
    columns = ['company_name', 'latitude', 'longitude']

    # Rows are gathered in a list and the frame is built once at the end:
    # DataFrame.append was removed in pandas 2.0 and copied the whole
    # frame on every call.
    rows = []

    for company, search_result in company_place_dict.items():
        needle = company.lower()
        for place in search_result.places:
            # Called before the checks below, which read place.website.
            place.get_details()
            name_matches = needle in place.name.lower()
            # str() so that a missing website (None) simply fails the test.
            site_matches = (needle + ".com") in str(place.website).lower()
            if name_matches and site_matches:
                rows.append({
                    'company_name': company,
                    'latitude': place.geo_location['lat'],
                    'longitude': place.geo_location['lng'],
                })

    return pd.DataFrame(rows, columns=columns)

def make_geodf(company_df):
    """
    Turn the verified-places DataFrame into a GeoDataFrame.

    A 'coordinates' column of shapely Points built from
    (longitude, latitude) is added to `company_df` itself and used as
    the geometry of the returned GeoDataFrame. In the result, 'latitude'
    and 'longitude' are converted with `pd.to_numeric` and 'company_name'
    is cast to str.
    """
    lon_lat_pairs = list(zip(company_df['longitude'], company_df['latitude']))
    company_df['coordinates'] = lon_lat_pairs
    company_df['coordinates'] = company_df['coordinates'].apply(Point)

    company_geodf = gpd.GeoDataFrame(company_df).set_geometry('coordinates')

    for column in ('latitude', 'longitude'):
        company_geodf[column] = pd.to_numeric(company_geodf[column])
    company_geodf['company_name'] = company_geodf['company_name'].astype('str')

    return company_geodf

In [None]:
# One Google Places text search per Business Insider company, keyed by company name.
BI_dict = make_company_places_dict(BI_company_list)

In [None]:
# Keep only the places that pass the name/website check, then build the GeoDataFrame.
BI_verified_places_df = verify_places(BI_dict)
BI_geodf = make_geodf(BI_verified_places_df)

In [None]:
# Show the verified locations as the cell output.
BI_geodf

In [None]:
# Save the verified locations as GeoJSON.
BI_geodf.to_file("BI_companies.geojson", driver='GeoJSON')

In [None]:
# Read the saved GeoJSON back in; `companies` is what the plotting cell draws.
companies = gpd.read_file('BI_companies.geojson')

In [None]:
# Background layer and company points.
background = "community_districts.geojson"
bkg = gpd.read_file(background)
companies = companies.set_geometry('geometry')

# Draw the background outlines first, then overlay the company markers,
# coloured by the 'company_name' column.
fig, ax1 = plt.subplots(1, 1, figsize=(6, 8))
base = bkg.plot(ax=ax1, color='white', edgecolor='black')
fig = companies.plot(ax=base, markersize=50, legend=True, column='company_name')

## Top 100 NYC tech companies

Let's scrape a different article, this time an article published by BuiltInNYC listing the top 100 NYC-based tech companies.

In [None]:
# Download and parse the BuiltInNYC "top 100" article.
# NOTE: simple_get returns None on failure, which BeautifulSoup will not accept.
raw_html = simple_get('https://www.builtinnyc.com/2017/11/07/nyc-top-100-tech-companies-2017')
soup = BeautifulSoup(raw_html, 'html.parser')

# Collect every <div class="company-info-wrapper"> (presumably one per listed company).
top_100 = soup.find_all("div", class_="company-info-wrapper")

def remove_html_tags(text):
    """Return `text` with every '<...>' span on a single line deleted."""
    return re.sub('<.*?>', '', text)

# Matches links that point at a BuiltInNYC company page.
company_link = re.compile("https://www.builtinnyc.com/company/")

# For each wrapper: stringify its matching links, strip the markup, and keep
# the second comma-separated piece of what is left.
top_100_company_list = [
    remove_html_tags(str(wrapper.find_all("a", href=company_link))).split(",")[1].strip()
    for wrapper in top_100
]
top_100_company_list

In [None]:
# One Google Places text search per top-100 company, keyed by company name.
top_100_dict = make_company_places_dict(top_100_company_list)

In [None]:
# Keep only the places that pass the name/website check, then build the GeoDataFrame.
top_100_verified_places_df = verify_places(top_100_dict)
top_100_geodf = make_geodf(top_100_verified_places_df)

In [None]:
# Save the verified top-100 locations as GeoJSON.
top_100_geodf.to_file("top_100_tech_nyc.geojson", driver='GeoJSON')

In [None]:
# Load the saved company points and the background layer.
companies = gpd.read_file('top_100_tech_nyc.geojson').set_geometry('geometry')
background = "community_districts.geojson"
bkg = gpd.read_file(background)

# Draw the background outlines first, then overlay the company markers,
# coloured by the 'company_name' column.
fig, ax1 = plt.subplots(1, 1, figsize=(8, 8))
base = bkg.plot(ax=ax1, color='white', edgecolor='black')
fig = companies.plot(ax=base, markersize=50, legend=True, column='company_name')

In [None]:
# Download and parse the BuiltInNYC "50 startups to watch" article.
# NOTE: simple_get returns None on failure, which BeautifulSoup will not accept.
raw_html = simple_get('https://www.builtinnyc.com/2018/01/16/50-nyc-startups-watch-2018')
soup = BeautifulSoup(raw_html, 'html.parser')

# Collect every <div class="company-info-wrapper"> (presumably one per listed startup).
new_50_2018 = soup.find_all("div", class_="company-info-wrapper")

def remove_html_tags(text):
    """Strip every '<...>' span (within a single line) out of `text`."""
    tag_pattern = re.compile('<.*?>')
    return tag_pattern.sub('', text)

# Matches links that point at a BuiltInNYC company page.
profile_href = re.compile("https://www.builtinnyc.com/company/")

# Stringified list of matching links for each company wrapper.
company_list = [str(wrapper.find_all("a", href=profile_href)) for wrapper in new_50_2018]

# Strip the markup and keep the second comma-separated piece of each entry.
clean_company_list = [
    remove_html_tags(raw_links).split(",")[1].strip() for raw_links in company_list
]
clean_company_list

In [None]:
# One Google Places text search per startup, keyed by company name.
top_50 = make_company_places_dict(clean_company_list)

In [None]:
# Keep only the places that pass the name/website check, then build the GeoDataFrame.
top_50_df = verify_places(top_50)
top_50_geodf = make_geodf(top_50_df)

In [None]:
# Save the verified startup locations, then read them back from disk.
top_50_geodf.to_file("top_50_tech_nyc.geojson", driver='GeoJSON')
companies = gpd.read_file('top_50_tech_nyc.geojson').set_geometry('geometry')

# Background layer.
background = "community_districts.geojson"
bkg = gpd.read_file(background)

# Draw the background outlines first, then overlay the company markers,
# coloured by the 'company_name' column.
fig, ax1 = plt.subplots(1, 1, figsize=(8, 8))
base = bkg.plot(ax=ax1, color='white', edgecolor='black')
fig = companies.plot(ax=base, markersize=50, legend=True, column='company_name')