In [153]:
# Import modules
from pymongo import MongoClient
import pandas as pd
import json as js
import matplotlib.pyplot as plt
import folium
from folium import Choropleth, Circle, Marker
from folium.plugins import HeatMap, MarkerCluster

In [135]:
# Open a client against the local MongoDB server and select the database
# named in the connection URI.
mongo_uri = "mongodb://localhost/companies"
client = MongoClient(mongo_uri)
db = client.get_database()

In [136]:
# Load every company founded in 2003 or later into a DataFrame and show its size.
recent_filter = {'founded_year': {'$gte':2003}}
recent_companies = list(db["companies"].find(recent_filter))
df = pd.DataFrame(recent_companies)
df.shape

(9297, 42)

In [137]:
# Companies founded before 2003, projected down to name, offices and
# category_code (the recorded shape has a fourth column; '_id' is dropped from
# this frame in a later cell).
oldest_filter = {'founded_year': {'$lt': 2003}}
oldest_fields = {"name":1,"offices":1,"category_code":1}
oldest_cursor = db['companies'].find(oldest_filter, oldest_fields)
df_oldestcomp = pd.DataFrame(list(oldest_cursor))
df_oldestcomp.shape

(3839, 4)

In [138]:
# Total companies with a declared founded year: 13,136 (9,297 founded in 2003 or later + 3,839 founded before 2003).
# Companies with a null founded year: 5,665

In [139]:
# List the distinct currency codes found in funding_rounds.raised_currency_code,
# in preparation for a future currency conversion.

distinct_codes = db['companies'].distinct('funding_rounds.raised_currency_code')
currency = list(distinct_codes)
currency

[None, 'CAD', 'EUR', 'GBP', 'JPY', 'SEK', 'USD']

In [140]:
# Before filtering for tech startups that raised more than $1M, find out which
# category codes exist in the collection.

category_codes = db['companies'].distinct('category_code')
startups_type = list(category_codes)


In [141]:
# Tech startups founded in 2003 or later with at least one funding round whose
# raised_amount is 1,000,000 or more (the amount is compared as-is; the query
# does not look at the round's currency).
# NOTE(review): this comment previously quoted 6977 matches, but the recorded
# output below shows 1468 rows — confirm which figure is current.
tech_categories = [
    'advertising', 'analytics', 'consulting', 'design', 'ecommerce',
    'games_video', 'hardware', 'mobile', 'nanotech', 'network_hosting',
    'software', 'web',
]
tech_query = {'$and': [
    {'founded_year': {'$gte':2003}},
    {'category_code': {'$in': tech_categories}},
    {'funding_rounds.raised_amount': {'$gte': 1000000}},
]}
tech_fields = {"name":1,"offices":1,"category_code":1}
df = pd.DataFrame(list(db['companies'].find(tech_query, tech_fields)))
df.shape

(1468, 4)

In [142]:
# Starbucks store locations, later drawn as markers on the offices map.
# Only the identifying and coordinate columns are kept, and rows missing a
# latitude or longitude are discarded.
starbucks_columns = ['Brand', 'Name', 'City', 'Country', 'Latitude', 'Longitude']
df_st = (
    pd.read_csv('input/starbucks.csv', engine='python')[starbucks_columns]
    .dropna(subset=['Latitude', 'Longitude'])
)

In [143]:
# Airport names and coordinates loaded from a CSV.
# NOTE(review): described as USA airports, but no country filter is applied
# here — confirm the CSV itself is already US-only.
df_airports = pd.read_csv("input/airports.csv", engine = 'python')
df_airports = df_airports[['name', 'latitude_deg', 'longitude_deg']]
# Drop rows lacking a latitude or longitude, as is done for the Starbucks and
# offices frames, so no Point with missing coordinates is built or exported.
df_airports = df_airports.dropna(subset=['latitude_deg', 'longitude_deg'])

In [144]:
# Explode offices so that each office (a dictionary inside the company's list)
# gets its own row.
# NOTE(review): explode repeats the company's index label for every office, so
# the index is not unique afterwards; df is also reassigned in place, so
# re-running this cell operates on the already-exploded frame.
df = df.explode('offices')

# Expand each office's fields into separate columns (later cells rely on the
# resulting latitude, longitude and city columns).
dfOffices = df[["offices"]].apply(lambda r: r.offices, result_type="expand", axis=1)

# Concatenate both DataFrames side by side: company columns + office columns.
cleanData = pd.concat([df,dfOffices], axis=1)

In [145]:
# Same process for the DataFrame of companies founded before 2003:
# one row per office, office fields expanded into columns, then joined
# side by side with the company columns.
# NOTE(review): as with the newer companies, explode leaves repeated index
# labels and df_oldestcomp is reassigned in place.
df_oldestcomp = df_oldestcomp.explode('offices')
df_oldestcompOffices = df_oldestcomp[['offices']].apply(lambda r: r.offices, result_type='expand', axis=1)
oldestOffices = pd.concat([df_oldestcomp,df_oldestcompOffices], axis=1)

In [146]:
# Tidy the offices frame: remove the '_id' and raw 'offices' columns, then
# discard rows missing a latitude or longitude.
cleanData = (
    cleanData
    .drop(columns=['_id', 'offices'])
    .dropna(subset=['latitude', 'longitude'])
)

In [147]:
# Tidy the oldest-companies offices frame the same way: remove the '_id' and
# raw 'offices' columns, then discard rows missing a latitude or longitude.
oldestOffices = (
    oldestOffices
    .drop(columns=['_id', 'offices'])
    .dropna(subset=['latitude', 'longitude'])
)

In [148]:
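# Helper applied to the latitude and longitude columns to build a new column holding a GeoJSON 'Point'.
# TODO: document why this column is needed (presumably for geospatial queries once the data is imported into MongoDB — confirm).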
def asGeoJSON(lat,lng):
    """Return a GeoJSON-style Point dict for the given latitude and longitude.

    Both arguments are converted with float(); the coordinates list is
    ordered [longitude, latitude].
    """
    latitude, longitude = float(lat), float(lng)
    point = {"type": "Point"}
    point["coordinates"] = [longitude, latitude]
    return point
        

# Add a "location" column of GeoJSON points to every frame that carries
# coordinates; only the names of the latitude/longitude columns differ.
for frame, lat_col, lng_col in (
    (cleanData, "latitude", "longitude"),
    (oldestOffices, "latitude", "longitude"),
    (df_st, "Latitude", "Longitude"),
    (df_airports, "latitude_deg", "longitude_deg"),
):
    frame["location"] = frame[[lat_col, lng_col]].apply(
        lambda row: asGeoJSON(row[lat_col], row[lng_col]), axis=1
    )

In [149]:
# Choose three cities to compare, based on how many startup offices each city
# has in the offices frame. (This count is not displayed: it is not the last
# expression of the cell.)
cleanData['city'].value_counts().head(3)

# Cities to compare: San Francisco, New York, Mountain View.
# Restrict the offices, Starbucks and oldest-offices frames to those cities.
compare_cities = ['San Francisco', 'New York', 'Mountain View']

cleanData = cleanData[cleanData['city'].isin(compare_cities)]

df_st = df_st[df_st['City'].isin(compare_cities)]

oldestOffices = oldestOffices[oldestOffices['city'].isin(compare_cities)]

In [150]:
# Write each frame out as a JSON array of records, ready to be imported into MongoDB.
exports = (
    (cleanData, "output/cleanOffices.json"),
    (oldestOffices, "output/oldestOffices.json"),
    (df_st, "output/starbucks.json"),
    (df_airports, "output/airports.json"),
)
for frame, path in exports:
    frame.to_json(path, orient="records")

In [154]:
# Preview the first rows of the Starbucks frame, including the new location column.
df_st.head()

Unnamed: 0,Brand,Name,City,Country,Latitude,Longitude,location
547,Starbucks,Washington Hgts/181st St,New York,US,40.851,-73.938421,"{'type': 'Point', 'coordinates': [-73.93842099..."
549,Starbucks,168th & Broadway,New York,US,40.84135,-73.939822,"{'type': 'Point', 'coordinates': [-73.93982199..."
557,Starbucks,145th & Bradhurst,New York,US,40.823382,-73.942611,"{'type': 'Point', 'coordinates': [-73.942611, ..."
570,Starbucks,Target East River Plaza T-2380,New York,US,40.795688,-73.932552,"{'type': 'Point', 'coordinates': [-73.932552, ..."
571,Starbucks,125th Street and Lenox,New York,US,40.807809,-73.945108,"{'type': 'Point', 'coordinates': [-73.94510799..."


In [164]:
# NOTE(review): same preview as the previous cell (identical output recorded);
# candidate for removal.
df_st.head()

Unnamed: 0,Brand,Name,City,Country,Latitude,Longitude,location
547,Starbucks,Washington Hgts/181st St,New York,US,40.851,-73.938421,"{'type': 'Point', 'coordinates': [-73.93842099..."
549,Starbucks,168th & Broadway,New York,US,40.84135,-73.939822,"{'type': 'Point', 'coordinates': [-73.93982199..."
557,Starbucks,145th & Bradhurst,New York,US,40.823382,-73.942611,"{'type': 'Point', 'coordinates': [-73.942611, ..."
570,Starbucks,Target East River Plaza T-2380,New York,US,40.795688,-73.932552,"{'type': 'Point', 'coordinates': [-73.932552, ..."
571,Starbucks,125th Street and Lenox,New York,US,40.807809,-73.945108,"{'type': 'Point', 'coordinates': [-73.94510799..."


In [159]:
# Centre the map on the first office remaining in cleanData.
# Positional .iloc[0] is used instead of the label lookup [0]: after explode
# the index labels repeat, and after the city filter label 0 may be gone, so
# a label lookup can raise KeyError or return several rows instead of a scalar.
start_lat = cleanData['latitude'].iloc[0]
start_lon = cleanData['longitude'].iloc[0]
heat_m = folium.Map(location=[start_lat, start_lon],tiles='cartodbpositron', zoom_start=15)
heat_m

In [162]:
# Red markers: one per row of the startup offices frame.
for m_lat, m_long in zip(cleanData['latitude'], cleanData['longitude']):
    red_icon = folium.Icon(color='red')
    Marker([m_lat, m_long], icon=red_icon).add_to(heat_m)

heat_m

In [166]:
# Green markers: one per row of the Starbucks frame.
for s_lat, s_long in zip(df_st['Latitude'], df_st['Longitude']):
    green_icon = folium.Icon(color='green')
    Marker([s_lat, s_long], icon=green_icon).add_to(heat_m)
heat_m

In [168]:
# Black markers: one per row of the oldest-companies offices frame.
for o_lat, o_long in zip(oldestOffices['latitude'], oldestOffices['longitude']):
    black_icon = folium.Icon(color='black')
    Marker([o_lat, o_long], icon=black_icon).add_to(heat_m)
heat_m