In [1]:
# Import modules
from pymongo import MongoClient
import pandas as pd
import json as js
import folium
from folium import Choropleth, Circle, Marker
from folium.plugins import HeatMap, MarkerCluster
import fun

In [2]:
# Connect to the local MongoDB server; the database is the one named in the URI path.
MONGO_URI = "mongodb://localhost/companies"
client = MongoClient(MONGO_URI)
db = client.get_database()

In [3]:
# Load every company founded in 2003 or later, with all document fields.
# NOTE: `df` is reassigned by the narrower tech-startup query further down.
recent_filter = {'founded_year': {'$gte': 2003}}
recent_docs = list(db["companies"].find(recent_filter))
df = pd.DataFrame(recent_docs)
df.shape

(9297, 42)

In [4]:
# Companies founded before 2003, keeping only the fields needed later
# (MongoDB also returns `_id` unless it is explicitly excluded).
old_filter = {'founded_year': {'$lt': 2003}}
old_projection = {"name": 1, "offices": 1, "category_code": 1}
old_docs = list(db['companies'].find(old_filter, old_projection))
df_oldestcomp = pd.DataFrame(old_docs)
df_oldestcomp.shape

(3839, 4)

In [5]:
# Companies with a declared founded year: 13,136 (9,297 founded in 2003 or later + 3,839 founded before 2003).
# Companies whose founded year is null: 5,665.

In [6]:
# Before filtering for tech startups that raised more than $1M, list the
# category codes that exist in the collection so the tech ones can be picked.
category_codes = db['companies'].distinct('category_code')
startups_type = list(category_codes)


In [7]:
# Tech startups founded in 2003 or later with at least one funding round of
# $1M or more. Only name, offices and category_code are projected.
# NOTE(review): the original comment quoted "6977", which does not match the
# (1468, 4) shape displayed for this cell - confirm which figure is current.
tech_categories = [
    'advertising', 'analytics', 'consulting', 'design', 'ecommerce',
    'games_video', 'hardware', 'mobile', 'nanotech', 'network_hosting',
    'software', 'web',
]
tech_startup_filter = {
    '$and': [
        {'founded_year': {'$gte': 2003}},
        {'category_code': {'$in': tech_categories}},
        {'funding_rounds.raised_amount': {'$gte': 1000000}},
    ]
}
office_projection = {"name": 1, "offices": 1, "category_code": 1}

df = pd.DataFrame(list(db['companies'].find(tech_startup_filter, office_projection)))
df.shape

(1468, 4)

In [8]:
# Starbucks locations: keep the identifying columns and discard rows that
# have no coordinates.
starbucks_columns = ['Brand', 'Name', 'City', 'Country', 'Latitude', 'Longitude']
df_st = (
    pd.read_csv('input/starbucks.csv', engine='python')[starbucks_columns]
    .dropna(subset=['Latitude', 'Longitude'])
)

In [9]:
# Airports from CSV: keep rows that have a Wikipedia link and are typed
# 'large_airport', then reduce to name and coordinates.
# NOTE(review): the original comment says "USA airports", but no country
# filter is applied here - confirm the CSV is already restricted to the USA.
df_airports = pd.read_csv("input/airports.csv", engine='python').dropna(subset=['wikipedia_link'])
airport_columns = ['name', 'latitude_deg', 'longitude_deg']
is_large = df_airports['type'] == 'large_airport'
df_airports = df_airports[is_large][airport_columns]

In [10]:
# `offices` holds a list of office dictionaries per company; explode it so
# every office gets its own row (the company's index label is repeated).
df = df.explode("offices")

# Spread the keys of each office dictionary into separate columns.
dfOffices = df[["offices"]].apply(
    lambda row: row["offices"],
    axis="columns",
    result_type="expand",
)

# Put the company columns and the expanded office columns side by side.
cleanData = pd.concat([df, dfOffices], axis="columns")

In [11]:
# Same explode / expand / concatenate steps for the companies founded before 2003.
df_oldestcomp = df_oldestcomp.explode("offices")
df_oldestcompOffices = df_oldestcomp[["offices"]].apply(
    lambda row: row["offices"],
    axis="columns",
    result_type="expand",
)
oldestOffices = pd.concat([df_oldestcomp, df_oldestcompOffices], axis="columns")

In [12]:
# Remove the Mongo id and the raw offices column, then keep only offices
# that have both coordinates.
cleanData = (
    cleanData
    .drop(columns=['_id', 'offices'])
    .dropna(subset=['latitude', 'longitude'])
)

In [13]:
# Same clean-up for the pre-2003 offices: drop helper columns and rows
# without coordinates.
oldestOffices = (
    oldestOffices
    .drop(columns=['_id', 'offices'])
    .dropna(subset=['latitude', 'longitude'])
)

In [14]:
# Add a `location` column to each frame by passing every row's latitude and
# longitude to fun.asGeoJSON. The coordinate column names differ per source.
for frame, lat_col, lon_col in (
    (cleanData, "latitude", "longitude"),
    (oldestOffices, "latitude", "longitude"),
    (df_st, "Latitude", "Longitude"),
    (df_airports, "latitude_deg", "longitude_deg"),
):
    frame["location"] = frame[[lat_col, lon_col]].apply(
        lambda row: fun.asGeoJSON(row[lat_col], row[lon_col]),
        axis=1,
    )

In [15]:
# The three cities were picked from the startup counts per city. The
# expression below is not the last one in the cell, so its result is not displayed.
cleanData['city'].value_counts().head(3)

# Cities to compare; restrict the startup offices, Starbucks and pre-2003
# offices to them (the Starbucks frame spells the column `City`).
target_cities = ['San Francisco', 'New York', 'Mountain View']

cleanData = cleanData[cleanData['city'].isin(target_cities)]

df_st = df_st[df_st['City'].isin(target_cities)]

oldestOffices = oldestOffices[oldestOffices['city'].isin(target_cities)]

In [16]:
# Write each frame as a JSON array of records, intended for MongoDB.
# NOTE(review): presumably these files are imported into MongoDB outside
# this notebook before the proximity queries run - confirm.
exports = {
    "output/cleanOffices.json": cleanData,
    "output/oldestOffices.json": oldestOffices,
    "output/starbucks.json": df_st,
    "output/airports.json": df_airports,
}
for out_path, frame in exports.items():
    frame.to_json(out_path, orient="records")

In [17]:
# For every filtered office, ask fun.officesComp for the old companies,
# Starbucks and airports near it. Each row of the table is
# (new column, name passed to officesComp, third argument).
# NOTE(review): the third argument looks like a radius in metres - confirm in fun.
nearby_specs = (
    ('oldcompanies', 'oldestOffices', 2000),
    ('starbucks', 'starbucks', 500),
    ('airports', 'airports', 10000),
)
for column, source, reach in nearby_specs:
    cleanData[column] = cleanData['location'].apply(
        lambda loc, source=source, reach=reach: fun.officesComp(loc, source, reach)
    )

In [18]:
# Preview the first rows, including the oldcompanies / starbucks / airports columns.
cleanData.head()

Unnamed: 0,name,category_code,description,address1,address2,zip_code,city,state_code,country_code,latitude,longitude,location,oldcompanies,starbucks,airports
0,Wetpaint,web,,270 Lafayette Street,Suite 505,10012,New York,NY,USA,40.723731,-73.996431,"{'type': 'Point', 'coordinates': [-73.9964312,...",38,4,0
3,Joost,games_video,,100 5th Ave Fl 6,,10011-6903,New York,NY,USA,40.746497,-74.009447,"{'type': 'Point', 'coordinates': [-74.0094471,...",34,0,0
5,AddThis,advertising,New York Office,568 Broadway,"11th Floor, Suite 1105",10012,New York,NY,USA,40.724604,-73.996876,"{'type': 'Point', 'coordinates': [-73.996876, ...",36,4,0
9,Kyte,games_video,,442 Post Street,10th Floor,94102,San Francisco,CA,USA,37.788482,-122.409173,"{'type': 'Point', 'coordinates': [-122.409173,...",96,10,0
11,Jingle Networks,mobile,,475 Park Ave South,10th Floor,10016,New York,NY,USA,37.480999,-122.173887,"{'type': 'Point', 'coordinates': [-122.173887,...",0,0,0


In [19]:
# Weights for each criterion. Airports and Starbucks add to the score,
# while nearby old companies subtract from it.
airport = 0.8
starbuck = 0.6
oldcompanies = 0.9

# Rank each count column (ties take the highest rank of the group), then
# combine the ranks into one weighted score.
airport_rank = cleanData['airports'].rank(method='max')
starbucks_rank = cleanData['starbucks'].rank(method='max')
oldcomp_rank = cleanData['oldcompanies'].rank(method='max')

cleanData['Ranking'] = (
    airport_rank * airport
    + starbucks_rank * starbuck
    - oldcomp_rank * oldcompanies
)

In [20]:
# Show the best-scoring offices first, renumbered from 0 for display.
ranked_offices = cleanData.sort_values(by=['Ranking'], ascending=False)
ranked_offices.reset_index(drop=True).head()

Unnamed: 0,name,category_code,description,address1,address2,zip_code,city,state_code,country_code,latitude,longitude,location,oldcompanies,starbucks,airports,Ranking
0,RayV,games_video,,"150 West 77, Suite 4",,,New York,NY,USA,40.781159,-73.978332,"{'type': 'Point', 'coordinates': [-73.978332, ...",3,5,1,318.5
1,muzu tv,web,New York Office,345 Park Avenue,17th Floor,10154-0037,New York,NY,USA,40.757716,-73.972321,"{'type': 'Point', 'coordinates': [-73.972321, ...",70,17,1,290.6
2,The Receivables Exchange,ecommerce,New York Office,"437 Madison Avenue, 28th Floor",,10022,New York,NY,USA,40.757651,-73.975277,"{'type': 'Point', 'coordinates': [-73.9752768,...",75,20,1,283.7
3,Cellfish,web,,215 Lexington Avenue,,10016,New York,NY,USA,40.761855,-73.983754,"{'type': 'Point', 'coordinates': [-73.983754, ...",75,20,1,283.7
4,Vringo,software,HQ,780 Third Avenue,Fifteenth Floor,10017,New York,NY,USA,40.755001,-73.971736,"{'type': 'Point', 'coordinates': [-73.9717364,...",72,15,1,280.7


In [21]:
# Preview the first rows again, now that the Ranking column has been added.
cleanData.head()

Unnamed: 0,name,category_code,description,address1,address2,zip_code,city,state_code,country_code,latitude,longitude,location,oldcompanies,starbucks,airports,Ranking
0,Wetpaint,web,,270 Lafayette Street,Suite 505,10012,New York,NY,USA,40.723731,-73.996431,"{'type': 'Point', 'coordinates': [-73.9964312,...",38,4,0,196.4
3,Joost,games_video,,100 5th Ave Fl 6,,10011-6903,New York,NY,USA,40.746497,-74.009447,"{'type': 'Point', 'coordinates': [-74.0094471,...",34,0,0,152.6
5,AddThis,advertising,New York Office,568 Broadway,"11th Floor, Suite 1105",10012,New York,NY,USA,40.724604,-73.996876,"{'type': 'Point', 'coordinates': [-73.996876, ...",36,4,0,200.0
9,Kyte,games_video,,442 Post Street,10th Floor,94102,San Francisco,CA,USA,37.788482,-122.409173,"{'type': 'Point', 'coordinates': [-122.409173,...",96,10,0,95.6
11,Jingle Networks,mobile,,475 Park Ave South,10th Floor,10016,New York,NY,USA,37.480999,-122.173887,"{'type': 'Point', 'coordinates': [-122.173887,...",0,0,0,237.2


In [22]:
# Reference office = first row of cleanData by position.
# `.iloc[0]` is used instead of `[0]`: the index still carries the original
# company labels (repeated by `explode`, thinned by the city filter), so a
# label lookup of 0 can return several rows or raise KeyError.
closest_starbucks = fun.nearestStarbucks(cleanData['latitude'].iloc[0], cleanData['longitude'].iloc[0], cleanData)
closest_airports = fun.nearestAirports(cleanData['latitude'].iloc[0], cleanData['longitude'].iloc[0], cleanData)
closest_comp = fun.nearestComp(cleanData['latitude'].iloc[0], cleanData['longitude'].iloc[0], cleanData)

In [23]:
# Centre the map on the first office by position. `.iloc[0]` avoids the
# label lookup `[0]`, which is ambiguous or missing when the index holds
# repeated or filtered company labels.
lat = cleanData['latitude'].iloc[0]
lon = cleanData['longitude'].iloc[0]
map_ = folium.Map(location=[lat, lon], tiles='cartodbpositron', zoom_start=15)

# Red marker: the reference office itself.
Marker([lat, lon], icon=folium.Icon(color='red')).add_to(map_)

# One marker colour per kind of nearby place, added in the same order as before.
for points, color in (
    (closest_starbucks, 'green'),
    (closest_airports, 'blue'),
    (closest_comp, 'black'),
):
    for point in points:
        Marker(point, icon=folium.Icon(color=color)).add_to(map_)

print(lat, lon)
map_

# I could not get the airports to show up on the map.
# The nearest* helpers had to be written three times because latitude and
# longitude are named differently in each dictionary that is returned.

40.7237306 -73.9964312
