In [158]:
# Import modules
from pymongo import MongoClient
import pandas as pd
import json as js
import matplotlib.pyplot as plt
import folium
from folium import Choropleth, Circle, Marker
from folium.plugins import HeatMap, MarkerCluster
import os
import requests

In [159]:
client = MongoClient("mongodb://localhost/companies")
db = client.get_database()

In [160]:
df = pd.DataFrame(list(db["companies"].find({'founded_year': {'$gte':2003}})))
df.shape

(9297, 42)

In [161]:
# DataFrame with companies founded before 2003
df_oldestcomp = pd.DataFrame(list(db['companies'].find({'founded_year': {'$lt': 2003}}, {"name":1,"offices":1,"category_code":1})))
df_oldestcomp.shape

(3839, 4)

In [162]:
# Total companies with declared founded year = 13.136.
# Companies with founded year null = 5665

In [163]:
# Look for different currencies on raised_currency_code for a future money exchange

currency = list(db['companies'].distinct('funding_rounds.raised_currency_code'))
currency

[None, 'CAD', 'EUR', 'GBP', 'JPY', 'SEK', 'USD']

In [164]:
#To filter for tech startups that raised more than $1M, first know which types of startups are in the database

startups_type = list(db['companies'].distinct('category_code'))


In [165]:
# Data Frame with Tech Statups founded after 2002 that raised $1M on a funding round. 6977
df = pd.DataFrame(list(db['companies'].find({'$and': [{'founded_year': {'$gte':2003}}, {'category_code': {'$in': ['advertising', 'analytics', 'consulting', 'design', 'ecommerce', 'games_video', 'hardware', 'mobile', 'nanotech', 'network_hosting', 'software', 'web']}}, {'funding_rounds.raised_amount': {'$gte': 1000000}}]}, {"name":1,"offices":1,"category_code":1})))
df.shape

(1468, 4)

In [166]:
# Create a DataFrame with Starbucks location to add to paint markers on the offices map
df_st = pd.read_csv('input/starbucks.csv', engine='python')
df_st = df_st[['Brand', 'Name', 'City', 'Country', 'Latitude', 'Longitude']]
df_st = df_st.dropna(subset= ['Latitude', 'Longitude'])

In [170]:
#Create a DataFrame with USA airports from a csv and filter it
df_airports = pd.read_csv("input/airports.csv", engine = 'python')
df_airports = df_airports.dropna(subset=['wikipedia_link'])
df_airports = df_airports[(df_airports['type'] == 'large_airport')]
df_airports = df_airports[['name', 'latitude_deg', 'longitude_deg']]
df_airports.head()

Unnamed: 0,name,latitude_deg,longitude_deg
11825,Port Moresby Jacksons International Airport,-9.44338,147.220001
12289,Keflavik International Airport,63.985001,-22.6056
12336,Priština International Airport,42.5728,21.035801
16465,Edmonton International Airport,53.3097,-113.580002
16520,Halifax / Stanfield International Airport,44.880798,-63.508598


In [171]:
# Explode offices to have each office (dictionary, part of a list) on a row
df = df.explode('offices')

# Expand offices info into different columns
dfOffices = df[["offices"]].apply(lambda r: r.offices, result_type="expand", axis=1)

#Concanete both DataFrames
cleanData = pd.concat([df,dfOffices], axis=1)

In [172]:
#Same process for DataFrame of companies founded before 2003
df_oldestcomp = df_oldestcomp.explode('offices')
df_oldestcompOffices = df_oldestcomp[['offices']].apply(lambda r: r.offices, result_type='expand', axis=1)
oldestOffices = pd.concat([df_oldestcomp,df_oldestcompOffices], axis=1)

In [173]:
# Cleaning offices DataFrame dropping unnecessary columns and null info for latitude and longitude
cleanData = cleanData.drop(columns= ['_id', 'offices'])
cleanData = cleanData.dropna(subset= ['latitude', 'longitude'])

In [174]:
# Cleaning oldest offices DataFrame dropping unnecessary columns and null info for latitude and longitude
oldestOffices = oldestOffices.drop(columns= ['_id', 'offices'])
oldestOffices = oldestOffices.dropna(subset= ['latitude', 'longitude'])

In [175]:
#Create a function to apply to latitude and longitude series to create a new column with a JSON 'point'
#Why I need this column?
def asGeoJSON(lat,lng):
    lat = float(lat)
    lng = float(lng)
    return {
        "type":"Point",
        "coordinates":[lng,lat]
    }
        

cleanData["location"] = cleanData[["latitude","longitude"]].apply(lambda x:asGeoJSON(x.latitude,x.longitude), axis=1)

oldestOffices["location"] = oldestOffices[["latitude","longitude"]].apply(lambda x:asGeoJSON(x.latitude,x.longitude), axis=1)

df_st['location'] = df_st[['Latitude', 'Longitude']].apply(lambda x:asGeoJSON(x.Latitude,x.Longitude), axis=1)

df_airports['location'] = df_airports[['latitude_deg', 'longitude_deg']].apply(lambda x:asGeoJSON(x.latitude_deg,x.longitude_deg), axis=1)

In [176]:
# Select 3 cities from the original DataFrame based on the number of value_counts of startups in the original DataFrame 
cleanData['city'].value_counts().head(3)

# Cities to compare: San Francisco, New York, Mountain View
# Filter both DataFrames where cities are previous ones.

cleanData = cleanData[(cleanData['city'] == 'San Francisco') | (cleanData['city'] == 'New York') | (cleanData['city'] == 'Mountain View')]

df_st = df_st[(df_st['City'] == 'San Francisco') | (df_st['City'] == 'New York') | (df_st['City'] == 'Mountain View')]

oldestOffices = oldestOffices[(oldestOffices['city'] == 'San Francisco') | (oldestOffices['city'] == 'New York') | (oldestOffices['city'] == 'Mountain View')]

In [177]:
# Exporting data for MongoDB. 
cleanData.to_json("output/cleanOffices.json", orient="records")
oldestOffices.to_json("output/oldestOffices.json", orient="records")
df_st.to_json("output/starbucks.json", orient="records")
df_airports.to_json("output/airports.json", orient="records")

In [178]:
start_lat = cleanData['latitude'][0]
start_lon = cleanData['longitude'][0]
heat_m = folium.Map(location=[start_lat, start_lon],tiles='cartodbpositron', zoom_start=15)
heat_m

In [179]:
for index,row in cleanData.iterrows():
    m_lat = row['latitude']
    m_long = row['longitude']
    Marker([m_lat, m_long], icon=folium.Icon(color='red')).add_to(heat_m)

heat_m

In [180]:
for index,row in df_st.iterrows():
    s_lat = row['Latitude']
    s_long = row['Longitude']
    Marker([s_lat, s_long], icon=folium.Icon(color='green')).add_to(heat_m)
heat_m

In [181]:
for index,row in oldestOffices.iterrows():
    s_lat = row['latitude']
    s_long = row['longitude']
    Marker([s_lat, s_long], icon=folium.Icon(color='black')).add_to(heat_m)
heat_m

Filtering DataFrame with new columns based on the amount of companies founded before 2003, starbucks and airports nearby.

In [182]:
def officesComp(coord, bd, meters):
    '''Function to know distance from office to old company, starbucks, airport, bar, etc...'''
    coordinates = coord['coordinates']
    comp = (db[f'{bd}'].aggregate([{'$geoNear': {'near': { 'type': "Point", 'coordinates': coordinates },
        'distanceField': "dist.calculated",
        'maxDistance': meters,
        'spherical': 'true'}}]))
    officescomp = len(list(comp))
    return officescomp

In [183]:
cleanData['oldcompanies'] = cleanData['location'].apply(lambda x: officesComp(x, 'oldestOffices', 2000))
cleanData['starbucks'] = cleanData['location'].apply(lambda x: officesComp(x, 'oldestOffices', 500))
cleanData['airports'] = cleanData['location'].apply(lambda x: officesComp(x, 'airports', 10000))

In [184]:
cleanData.head()

Unnamed: 0,name,category_code,description,address1,address2,zip_code,city,state_code,country_code,latitude,longitude,location,oldcompanies,starbucks,airports
0,Wetpaint,web,,270 Lafayette Street,Suite 505,10012,New York,NY,USA,40.723731,-73.996431,"{'type': 'Point', 'coordinates': [-73.9964312,...",38,3,16
3,Joost,games_video,,100 5th Ave Fl 6,,10011-6903,New York,NY,USA,40.746497,-74.009447,"{'type': 'Point', 'coordinates': [-74.0094471,...",34,1,18
5,AddThis,advertising,New York Office,568 Broadway,"11th Floor, Suite 1105",10012,New York,NY,USA,40.724604,-73.996876,"{'type': 'Point', 'coordinates': [-73.996876, ...",36,3,16
9,Kyte,games_video,,442 Post Street,10th Floor,94102,San Francisco,CA,USA,37.788482,-122.409173,"{'type': 'Point', 'coordinates': [-122.409173,...",96,7,6
11,Jingle Networks,mobile,,475 Park Ave South,10th Floor,10016,New York,NY,USA,37.480999,-122.173887,"{'type': 'Point', 'coordinates': [-122.173887,...",0,0,3


In [185]:
def requestMap(lat, long):
    '''Requesting nearby night clubs to offices'''
    token = os.getenv('MAPS_KEY')
    url = f'https://maps.googleapis.com/maps/api/place/nearbysearch/json?key={token}&location={lat},{long}&radius=1000&type=night_club'
    res = requests.get(url)
    if res != 200:
        raise ValueError('Bad Response')
    return res.json()



In [186]:
cleanData['trainstation'] = cleanData[['latitude', 'longitude']].apply(lambda x: requestMap(x.latitude, x.longitude), axis=1)

ValueError: Bad Response

In [189]:
# Assign a weight
airport = 0.8
starbuck = 0.6
oldcompanies = 0.9

cleanData['Ranking'] = cleanData['airports'].rank(method='max') * airport + cleanData['starbucks'].rank(method='max') * starbuck - cleanData['oldcompanies'].rank(method='max') * oldcompanies

In [190]:
cleanData.head()

Unnamed: 0,name,category_code,description,address1,address2,zip_code,city,state_code,country_code,latitude,longitude,location,oldcompanies,starbucks,airports,Ranking
0,Wetpaint,web,,270 Lafayette Street,Suite 505,10012,New York,NY,USA,40.723731,-73.996431,"{'type': 'Point', 'coordinates': [-73.9964312,...",38,3,16,171.8
3,Joost,games_video,,100 5th Ave Fl 6,,10011-6903,New York,NY,USA,40.746497,-74.009447,"{'type': 'Point', 'coordinates': [-74.0094471,...",34,1,18,180.2
5,AddThis,advertising,New York Office,568 Broadway,"11th Floor, Suite 1105",10012,New York,NY,USA,40.724604,-73.996876,"{'type': 'Point', 'coordinates': [-73.996876, ...",36,3,16,175.4
9,Kyte,games_video,,442 Post Street,10th Floor,94102,San Francisco,CA,USA,37.788482,-122.409173,"{'type': 'Point', 'coordinates': [-122.409173,...",96,7,6,-31.6
11,Jingle Networks,mobile,,475 Park Ave South,10th Floor,10016,New York,NY,USA,37.480999,-122.173887,"{'type': 'Point', 'coordinates': [-122.173887,...",0,0,3,32.4


In [199]:
cleanData.sort_values(by=['Ranking'], ascending=False).reset_index(drop=True)

Unnamed: 0,name,category_code,description,address1,address2,zip_code,city,state_code,country_code,latitude,longitude,location,oldcompanies,starbucks,airports,Ranking
0,GameChanger Media,software,New York Office,86 Chambers St,2nd Floor,10007,New York,NY,USA,40.707834,-74.013661,"{'type': 'Point', 'coordinates': [-74.0136605,...",20,7,18,264.5
1,Mantara,software,Headquarters,215 Park Avenue South,20th Floor,10003,New York,NY,USA,40.726517,-74.032177,"{'type': 'Point', 'coordinates': [-74.032177, ...",0,0,21,256.4
2,Socialize,mobile,Headquarters,153 Townsend St,Suite 9057,94107,San Francisco,CA,USA,38.885210,-76.998641,"{'type': 'Point', 'coordinates': [-76.998641, ...",0,0,18,254.0
3,RayV,games_video,,"150 West 77, Suite 4",,,New York,NY,USA,40.781159,-73.978332,"{'type': 'Point', 'coordinates': [-73.978332, ...",3,0,18,245.9
4,TheLadders,web,,137 Varick Street,Floor 8,10013,New York,NY,USA,40.725883,-74.006196,"{'type': 'Point', 'coordinates': [-74.006196, ...",32,6,18,234.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
311,Splunk,software,Headquarters,250 Brannan St.,,94107,San Francisco,CA,USA,37.782740,-122.390945,"{'type': 'Point', 'coordinates': [-122.390945,...",90,10,5,-60.3
312,KODA,web,San Francisco,220 Montgomery Street,#991,94104,San Francisco,CA,USA,37.788796,-122.409710,"{'type': 'Point', 'coordinates': [-122.40971, ...",90,1,6,-61.9
313,Tiny Pictures,mobile,,454 Natoma Street,,94103,San Francisco,CA,USA,37.781002,-122.406912,"{'type': 'Point', 'coordinates': [-122.406912,...",102,4,6,-64.0
314,DanceJam,games_video,,965 Mission Street,Suite 730,94103,San Francisco,CA,USA,37.781557,-122.407959,"{'type': 'Point', 'coordinates': [-122.407959,...",102,4,6,-64.0


In [193]:
cleanData.head()

Unnamed: 0,name,category_code,description,address1,address2,zip_code,city,state_code,country_code,latitude,longitude,location,oldcompanies,starbucks,airports,Ranking
0,Wetpaint,web,,270 Lafayette Street,Suite 505,10012,New York,NY,USA,40.723731,-73.996431,"{'type': 'Point', 'coordinates': [-73.9964312,...",38,3,16,171.8
3,Joost,games_video,,100 5th Ave Fl 6,,10011-6903,New York,NY,USA,40.746497,-74.009447,"{'type': 'Point', 'coordinates': [-74.0094471,...",34,1,18,180.2
5,AddThis,advertising,New York Office,568 Broadway,"11th Floor, Suite 1105",10012,New York,NY,USA,40.724604,-73.996876,"{'type': 'Point', 'coordinates': [-73.996876, ...",36,3,16,175.4
9,Kyte,games_video,,442 Post Street,10th Floor,94102,San Francisco,CA,USA,37.788482,-122.409173,"{'type': 'Point', 'coordinates': [-122.409173,...",96,7,6,-31.6
11,Jingle Networks,mobile,,475 Park Ave South,10th Floor,10016,New York,NY,USA,37.480999,-122.173887,"{'type': 'Point', 'coordinates': [-122.173887,...",0,0,3,32.4
