In [5]:
import re
import warnings

import numpy as np
import pandas as pd
import requests as req
from pymongo import MongoClient


# Connect to the local MongoDB server and select the "companies" database.
client = MongoClient(host='mongodb://localhost:27017/')
db = client["companies"]


#### First of all, let's query the "young" (valid) companies from our new database:

In [6]:
# Fetch every office flagged as young; the projection leaves out the Mongo _id,
# the stored index and the four flag fields.
excluded_fields = {"_id":0,"index":0,'is_young':0,'is_design':0,'is_similar':0,'is_succesful':0}
young = db.offices_ok.find({"is_young":1}, excluded_fields)
mydf = pd.DataFrame(young)

In [7]:
# Keep only the columns used downstream, in a fixed order.
columns_to_keep = [
    'name',
    'category_code',
    'description',
    'tag_list',
    'founded_year',
    'number_of_employees',
    'raised_money_dollars',
    'lat',
    'lng',
    'geo_point',
]
mydf = mydf[columns_to_keep]
mydf.shape

(3504, 10)

#### Now we want to make a geo query for each of our desired companies. We would like to know for each company:
#### 1. How many nearby design, similar and/or successful companies there are.
#### 2. How many nearby "old" companies there are.

In [8]:
def find_old(df1, radio_max_meters=2000):
    """Count, for every GeoJSON point in ``df1``, the nearby "old" offices.

    Parameters
    ----------
    df1 : iterable of dict
        GeoJSON geometries (for example the ``geo_point`` column of a
        DataFrame). Each one is used as the ``$geometry`` of a ``$near`` query
        on ``db.offices_ok``.
    radio_max_meters : int, optional
        Value passed as ``$maxDistance`` (2000 by default).

    Returns
    -------
    list of int
        One count per element of ``df1``, in iteration order: the number of
        matched documents that also have ``is_young == 0``.
    """
    lista = []
    # Iterate over the values themselves: ``df1[i]`` is a label lookup on a
    # pandas Series and fails once the index is no longer 0..n-1.
    for point in df1:
        cursor = db.offices_ok.find(
            {
                "geo_point": {
                    "$near": {
                        "$geometry": point,
                        "$maxDistance": radio_max_meters,
                    }
                },
                "is_young": 0,
            },
            # Only the number of matches is needed, so fetch just the _id
            # instead of building a DataFrame out of full documents.
            {"_id": 1},
        )
        lista.append(sum(1 for _ in cursor))
    return lista
       


In [9]:
def find_young(df1, radio_max_meters=2000):
    """Count, for every GeoJSON point in ``df1``, the nearby "young" offices.

    Parameters
    ----------
    df1 : iterable of dict
        GeoJSON geometries (for example the ``geo_point`` column of a
        DataFrame). Each one is used as the ``$geometry`` of a ``$near`` query
        on ``db.offices_ok``.
    radio_max_meters : int, optional
        Value passed as ``$maxDistance`` (2000 by default).

    Returns
    -------
    list of int
        One count per element of ``df1``, in iteration order: the number of
        matched documents that also have ``is_young == 1``.

    Notes
    -----
    NOTE(review): when the query point is itself a young office stored in the
    collection, that document presumably matches its own query, so the count
    would include it - confirm whether that is intended.
    """
    lista = []
    # Iterate over the values themselves: ``df1[i]`` is a label lookup on a
    # pandas Series and fails once the index is no longer 0..n-1.
    for point in df1:
        cursor = db.offices_ok.find(
            {
                "geo_point": {
                    "$near": {
                        "$geometry": point,
                        "$maxDistance": radio_max_meters,
                    }
                },
                "is_young": 1,
            },
            # Only the number of matches is needed, so fetch just the _id
            # instead of building a DataFrame out of full documents.
            {"_id": 1},
        )
        lista.append(sum(1 for _ in cursor))
    return lista

In [10]:
def find_design(df1, radio_max_meters=2000):
    """Count, for every GeoJSON point in ``df1``, the nearby design offices.

    Parameters
    ----------
    df1 : iterable of dict
        GeoJSON geometries (for example the ``geo_point`` column of a
        DataFrame). Each one is used as the ``$geometry`` of a ``$near`` query
        on ``db.offices_ok``.
    radio_max_meters : int, optional
        Value passed as ``$maxDistance`` (2000 by default).

    Returns
    -------
    list of int
        One count per element of ``df1``, in iteration order: the number of
        matched documents that also have ``is_design == 1``.
    """
    lista = []
    # Iterate over the values themselves: ``df1[i]`` is a label lookup on a
    # pandas Series and fails once the index is no longer 0..n-1.
    for point in df1:
        cursor = db.offices_ok.find(
            {
                "geo_point": {
                    "$near": {
                        "$geometry": point,
                        "$maxDistance": radio_max_meters,
                    }
                },
                "is_design": 1,
            },
            # Only the number of matches is needed, so fetch just the _id
            # instead of building a DataFrame out of full documents.
            {"_id": 1},
        )
        lista.append(sum(1 for _ in cursor))
    return lista

In [11]:
def find_similar(df1, radio_max_meters=2000):
    """Count, for every GeoJSON point in ``df1``, the nearby similar offices.

    Parameters
    ----------
    df1 : iterable of dict
        GeoJSON geometries (for example the ``geo_point`` column of a
        DataFrame). Each one is used as the ``$geometry`` of a ``$near`` query
        on ``db.offices_ok``.
    radio_max_meters : int, optional
        Value passed as ``$maxDistance`` (2000 by default).

    Returns
    -------
    list of int
        One count per element of ``df1``, in iteration order: the number of
        matched documents that also have ``is_similar == 1``.
    """
    lista = []
    # Iterate over the values themselves: ``df1[i]`` is a label lookup on a
    # pandas Series and fails once the index is no longer 0..n-1.
    for point in df1:
        cursor = db.offices_ok.find(
            {
                "geo_point": {
                    "$near": {
                        "$geometry": point,
                        "$maxDistance": radio_max_meters,
                    }
                },
                "is_similar": 1,
            },
            # Only the number of matches is needed, so fetch just the _id
            # instead of building a DataFrame out of full documents.
            {"_id": 1},
        )
        lista.append(sum(1 for _ in cursor))
    return lista

In [12]:
def find_succesful(df1, radio_max_meters=2000):
    """Count, for every GeoJSON point in ``df1``, the nearby successful offices.

    Parameters
    ----------
    df1 : iterable of dict
        GeoJSON geometries (for example the ``geo_point`` column of a
        DataFrame). Each one is used as the ``$geometry`` of a ``$near`` query
        on ``db.offices_ok``.
    radio_max_meters : int, optional
        Value passed as ``$maxDistance`` (2000 by default).

    Returns
    -------
    list of int
        One count per element of ``df1``, in iteration order: the number of
        matched documents that also have ``is_succesful == 1`` (the field name
        is spelled that way in the query).
    """
    lista = []
    # Iterate over the values themselves: ``df1[i]`` is a label lookup on a
    # pandas Series and fails once the index is no longer 0..n-1.
    for point in df1:
        cursor = db.offices_ok.find(
            {
                "geo_point": {
                    "$near": {
                        "$geometry": point,
                        "$maxDistance": radio_max_meters,
                    }
                },
                "is_succesful": 1,
            },
            # Only the number of matches is needed, so fetch just the _id
            # instead of building a DataFrame out of full documents.
            {"_id": 1},
        )
        lista.append(sum(1 for _ in cursor))
    return lista

In [13]:
# One proximity-count column per finder, computed from each company's geo_point.
near_counters = {
    "old_near": find_old,
    "young_near": find_young,
    "design_near": find_design,
    "similar_near": find_similar,
    "succesful_near": find_succesful,
}
for column_name, counter in near_counters.items():
    mydf[column_name] = counter(mydf["geo_point"])

In [14]:
# Preview the first rows, now including the five proximity-count columns.
mydf.head(3)

Unnamed: 0,name,category_code,description,tag_list,founded_year,number_of_employees,raised_money_dollars,lat,lng,geo_point,old_near,young_near,design_near,similar_near,succesful_near
0,Geni,web,Geneology social network site,"geni, geneology, social, family, genealogy",2006,18,16500000,34.090368,-118.393064,"{'type': 'Point', 'coordinates': [-118.393064,...",0,8,1,8,2
1,MeetMoi,social,Mobile Dating,"mobile, dating, location, realtime, phone",2007,15,5580000,40.757929,-73.985506,"{'type': 'Point', 'coordinates': [-73.985506, ...",33,104,3,81,37
2,Twitter,social,Real time communication platform,"text, messaging, social, community, twitter, t...",2006,1300,1160000000,37.776805,-122.416924,"{'type': 'Point', 'coordinates': [-122.4169244...",19,109,3,83,53


#### Now let's keep only the companies that have no "old" companies around them and create a "score" field for them. The score is computed by normalizing the *design_near, similar_near and succesful_near* columns by their respective maximum values and then summing the normalized values for each record. Each variable has the same weight, so the maximum score is 3:

In [15]:
# Keep only the companies whose old_near count is zero.
no_old_neighbours = mydf["old_near"].eq(0)
young = mydf[no_old_neighbours]

In [16]:
# Remove the columns that play no part in the scoring below.
young = young.drop(columns=["description","tag_list","geo_point","old_near","young_near"])

In [17]:
# Score1: each proximity count divided by its column maximum, then summed with
# equal weights (so the highest possible score is 3).
score1_columns = ["design_near", "similar_near", "succesful_near"]
young["Score1"] = sum(
    # ``or 1`` keeps a column whose maximum is 0 from turning every score into
    # NaN through 0/0; such a column simply contributes 0.
    young[col] / (young[col].max() or 1)
    for col in score1_columns
)

#### Now let's rank the companies by their score and keep the top 50:

In [18]:
# Best Score1 first; reset_index() keeps the previous index as an "index" column.
young = young.sort_values(by='Score1', ascending=False).reset_index()


In [22]:
# Slice first and copy afterwards so top50 is an independent DataFrame: later
# cells add columns to it, and assigning to a slice of a temporary copy can
# trigger pandas' SettingWithCopyWarning.
top50 = young.iloc[:50].copy()
top50.head(3)

Unnamed: 0,index,name,category_code,founded_year,number_of_employees,raised_money_dollars,lat,lng,design_near,similar_near,succesful_near,Score1
0,844,SocialVibe,advertising,2007,80,43900000,34.081524,-118.382674,2,14,2,2.183333
1,2192,Silver Tail Systems,analytics,2008,90,22100000,37.428088,-122.143368,0,15,8,2.0
2,129,SayNow,mobile,2005,20,7500000,37.42746,-122.143915,0,15,8,2.0


#### Now we want to know how these locations match the preferences of our employees. We will use the Google Maps Places API to find out:
#### 1. The number of schools in a radius of 1000 m.
#### 2. The number of Starbucks in a radius of 200 m.
#### 3. The number of airports in a radius of 10000 m.
#### 4. The number of bars in a radius of 500 m.
#### 5. The number of vegan restaurants in a radius of 200 m.


In [23]:
import os
from dotenv import load_dotenv
load_dotenv()

# Read the Places API key from the environment and fail early, with a pointer
# to the docs, when it has not been provided.
PLACES_KEY = os.environ.get("PLACES_KEY")
if PLACES_KEY is None:
    raise ValueError("You should pass a PLACES_KEY, see: https://developers.google.com/places/web-service/get-api-key")

def get_near(df,tipo,keyword,radius):
    """Count nearby places for every row of ``df`` using Places Nearby Search.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``lat`` and ``lng`` columns; one request is sent per row.
    tipo : str
        Value sent as the ``type`` query parameter.
    keyword : str
        Value sent as the ``keyword`` query parameter. It is inserted into the
        URL verbatim, so it must already be URL-encoded (e.g. ``%20`` for a
        space).
    radius : int
        Value sent as the ``radius`` query parameter.

    Returns
    -------
    list of int
        One entry per row, in row order: ``len(results)`` of the response when
        its status is ``'OK'``, otherwise 0.

    Notes
    -----
    Only the single response per row is counted (no pagination is performed).
    NOTE(review): per the Places documentation a response page holds at most
    20 results, so counts presumably saturate at 20 - confirm.
    """
    url_template = "https://maps.googleapis.com/maps/api/place/nearbysearch/json?location={},{}&radius={}&type={}&keyword={}&key={}"
    lst=[]
    # Pair the coordinates positionally: ``df["lat"][i]`` is a label lookup and
    # breaks when the DataFrame index is not 0..n-1.
    for lat, lng in zip(df["lat"], df["lng"]):
        # A timeout keeps one stalled connection from hanging the whole run.
        res = req.get(url_template.format(lat,lng,radius,tipo,keyword,PLACES_KEY), timeout=30)
        content=res.json()
        status=content['status']
        if status=='OK':
            lst.append(len(content["results"]))
        else:
            # ZERO_RESULTS is a genuine zero; any other status (quota, denied
            # key, bad request...) would otherwise silently distort the scores.
            if status!='ZERO_RESULTS':
                warnings.warn("Places API returned status {!r} for location {},{}; counting 0".format(status,lat,lng))
            lst.append(0)
    return lst


In [24]:
# Schools within 1000 of each company (radius passed straight to get_near).
top50["schools"] = get_near(top50, tipo='school', keyword='school', radius=1000)

In [25]:
# Cafes matching the "starbucks" keyword within a radius of 200.
top50["starbucks"] = get_near(top50, tipo='cafe', keyword='starbucks', radius=200)

In [26]:
# Bars within a radius of 500.
top50["bars"] = get_near(top50, tipo='bar', keyword='bar', radius=500)

In [27]:
# Restaurants matching the "vegan" keyword within a radius of 200.
top50["vegan_rests"] = get_near(top50, tipo='restaurant', keyword='vegan', radius=200)

In [28]:
# Airports within a radius of 10000; the keyword is passed pre-encoded (%20)
# because get_near inserts it into the URL as-is.
top50["airports"] = get_near(top50, tipo='airport', keyword='international%20airport', radius=10000)

In [30]:
# Preview the ranking together with the amenity counts added by get_near.
top50.head()

Unnamed: 0,index,name,category_code,founded_year,number_of_employees,raised_money_dollars,lat,lng,design_near,similar_near,succesful_near,Score1,schools,starbucks,bars,vegan_rests,airports
0,844,SocialVibe,advertising,2007,80,43900000,34.081524,-118.382674,2,14,2,2.183333,10,1,19,4,3
1,2192,Silver Tail Systems,analytics,2008,90,22100000,37.428088,-122.143368,0,15,8,2.0,15,1,7,5,2
2,129,SayNow,mobile,2005,20,7500000,37.42746,-122.143915,0,15,8,2.0,15,1,8,9,2
3,107,Doostang,web,2005,10,5750000,37.427235,-122.145783,0,15,8,2.0,14,2,9,6,2
4,994,PerfectBusiness,web,2008,10,0,33.995974,-118.456709,2,7,2,1.716667,11,1,2,1,11


#### Now we will create another score column that scores each company based on the employees' preferences. It is calculated the same way as score 1, but using the information requested from the API and assigning a different weight to each variable (*schools 0.6, airports 0.5, bars 0.4, starbucks 0.25 and vegan restaurants 0.25*), so the maximum value of this score is 2:

In [31]:
# Score2: each amenity count divided by its column maximum, multiplied by its
# weight and summed (the weights add up to 2, the highest possible score).
amenity_weights = {
    "schools": 0.6,
    "starbucks": 0.25,
    "bars": 0.4,
    "vegan_rests": 0.25,
    "airports": 0.5,
}
top50["Score2"] = sum(
    # ``or 1`` keeps a column whose maximum is 0 from turning every score into
    # NaN through 0/0; such a column simply contributes 0.
    weight * top50[col] / (top50[col].max() or 1)
    for col, weight in amenity_weights.items()
)

#### Also, let's create a total score that will be the sum of both scores and sort the list by total score:

In [32]:
# Total score = company-proximity score + amenity score.
top50["Total_score"] = top50["Score1"].add(top50["Score2"])

In [33]:
# Best total score first; reset_index() stores the previous index as an extra column.
top50 = top50.sort_values(by='Total_score', ascending=False).reset_index()

In [35]:
# Discard the two helper columns left behind by the successive reset_index() calls.
top50 = top50.drop(columns=["index","level_0"])
top50.head()

Unnamed: 0,name,category_code,founded_year,number_of_employees,raised_money_dollars,lat,lng,design_near,similar_near,succesful_near,Score1,schools,starbucks,bars,vegan_rests,airports,Score2,Total_score
0,SocialVibe,advertising,2007,80,43900000,34.081524,-118.382674,2,14,2,2.183333,10,1,19,4,3,0.971126,3.154459
1,Doostang,web,2005,10,5750000,37.427235,-122.145783,0,15,8,2.0,14,2,9,6,2,0.964719,2.964719
2,SayNow,mobile,2005,20,7500000,37.42746,-122.143915,0,15,8,2.0,15,1,8,9,2,0.944957,2.944957
3,plaYce,games_video,2008,7,0,37.80204,-122.438231,1,11,3,1.608333,20,3,20,2,0,1.285714,2.894048
4,Silver Tail Systems,analytics,2008,90,22100000,37.428088,-122.143368,0,15,8,2.0,15,1,7,5,2,0.853528,2.853528


In [85]:
# Persist the ranking to CSV so the API results can be reloaded later
# without calling the API again.
top50.to_csv(path_or_buf='../data/top50.csv', index=False)

In [86]:
# Reload the ranking saved in the previous cell.
top50 = pd.read_csv(filepath_or_buffer='../data/top50.csv')
top50.head()

Unnamed: 0,name,category_code,founded_year,number_of_employees,raised_money_dollars,lat,lng,design_near,similar_near,succesful_near,Score1,schools,starbucks,bars,vegan_rests,airports,Score2,Total_score
0,SocialVibe,advertising,2007,80,43900000,34.081524,-118.382674,2,14,2,2.183333,10,1,19,4,3,0.971126,3.154459
1,Doostang,web,2005,10,5750000,37.427235,-122.145783,0,15,8,2.0,14,2,9,6,2,0.964719,2.964719
2,SayNow,mobile,2005,20,7500000,37.42746,-122.143915,0,15,8,2.0,15,1,8,9,2,0.944957,2.944957
3,plaYce,games_video,2008,7,0,37.80204,-122.438231,1,11,3,1.608333,20,3,20,2,0,1.285714,2.894048
4,Silver Tail Systems,analytics,2008,90,22100000,37.428088,-122.143368,0,15,8,2.0,15,1,7,5,2,0.853528,2.853528


####  Here's a heat map of our top 50 companies:

In [87]:
import folium
from folium import plugins
from folium.plugins import MiniMap

# Heat map layer built from the (lat, lng) pairs of the top 50 companies.
stationArr = top50[['lat', 'lng']].values
m = folium.Map(zoom_start=5)
plugins.HeatMap(stationArr, radius=20).add_to(m)
m

#### And here's a map that also shows the number of companies in each zone (looks like California is our winner):

In [88]:
from folium.plugins import FastMarkerCluster
# Clustered markers: one (lat, lng) pair per company in top50.
company_coords = list(zip(top50['lat'].values, top50['lng'].values))
folium_map = folium.Map(location=[50, 20], zoom_start=2)
folium_map.add_child(FastMarkerCluster(data=company_coords))
folium_map.add_child(folium.LayerControl())
folium_map

#### And here we can see how well each company did based on their score represented by the size of their radius:

In [89]:
# One marker per company plus a circle whose radius is proportional to its
# total score.
m = folium.Map(zoom_start=5)

# Walk the columns positionally instead of indexing by label, so the loop does
# not depend on top50 having a 0..n-1 index.
for name, lat, lng, total in zip(top50["name"], top50["lat"], top50["lng"], top50["Total_score"]):
    tooltip = name
    loc = [lat, lng]
    # Keep the fractional part of the score: truncating it with int() gave
    # scores such as 2.96 and 2.43 exactly the same radius.
    score = float(total)
    folium.Marker(loc, popup=tooltip, tooltip=tooltip).add_to(m)
    folium.Circle(location=loc, radius=score*100000, fill=True, fill_color='blue').add_to(m)


In [41]:
# Render the marker/circle map built in the previous cell.
m

#### Now let's have a look at our top 10; we will choose our office location based on these:

In [91]:
# First ten rows of the ranking (top50 is already sorted by Total_score).
top10 = top50.copy().iloc[:10]
top10

Unnamed: 0,name,category_code,founded_year,number_of_employees,raised_money_dollars,lat,lng,design_near,similar_near,succesful_near,Score1,schools,starbucks,bars,vegan_rests,airports,Score2,Total_score
0,SocialVibe,advertising,2007,80,43900000,34.081524,-118.382674,2,14,2,2.183333,10,1,19,4,3,0.971126,3.154459
1,Doostang,web,2005,10,5750000,37.427235,-122.145783,0,15,8,2.0,14,2,9,6,2,0.964719,2.964719
2,SayNow,mobile,2005,20,7500000,37.42746,-122.143915,0,15,8,2.0,15,1,8,9,2,0.944957,2.944957
3,plaYce,games_video,2008,7,0,37.80204,-122.438231,1,11,3,1.608333,20,3,20,2,0,1.285714,2.894048
4,Silver Tail Systems,analytics,2008,90,22100000,37.428088,-122.143368,0,15,8,2.0,15,1,7,5,2,0.853528,2.853528
5,Boxbe,web,2005,3,1500000,37.800209,-122.442592,1,12,3,1.675,20,1,20,1,0,1.10119,2.77619
6,IGOpeople,network_hosting,2008,4,0,53.336139,-6.249789,1,9,2,1.35,20,3,20,1,3,1.404221,2.754221
7,PerfectBusiness,web,2008,10,0,33.995974,-118.456709,2,7,2,1.716667,11,1,2,1,11,0.97119,2.687857
8,ftopia,software,2009,7,0,53.332814,-6.249378,1,9,2,1.35,20,3,20,2,1,1.331169,2.681169
9,Making Sense,software,2006,100,0,-34.589281,-58.43297,2,3,0,1.2,20,1,20,3,2,1.227814,2.427814


#### Based on the scores, my first choice would be location 34.081524, -118.382674 (located in West Hollywood, California). Not only does it rank very well in employee amenities (it has more than enough schools, Starbucks, bars, vegan restaurants and airports around), but it also matches the "nearby companies" criteria. Also, it's only a 20-minute ride to the "Staples Center", the arena of LA's NBA teams, so our maintenance guy will be very happy:

In [92]:
# Map centred on the best-ranked location (row 0 of top10), with a marker and
# an overview minimap.
tooltip = top10.at[0, "name"]
loc = [top10.at[0, "lat"], top10.at[0, "lng"]]
m = folium.Map(location=list(loc), zoom_start=13)
m.add_child(folium.Marker(loc, popup=tooltip, tooltip=tooltip))
minimap = MiniMap(zoom_level_offset=-5)
minimap.add_to(m)
m

#### My second choice would be location 37.427235, -122.145783 (Palo Alto, California):

In [93]:
# Map centred on the second-ranked location (row 1 of top10), with a marker
# and an overview minimap.
tooltip = top10.at[1, "name"]
loc = [top10.at[1, "lat"], top10.at[1, "lng"]]
m = folium.Map(location=list(loc), zoom_start=13)
m.add_child(folium.Marker(loc, popup=tooltip, tooltip=tooltip))
minimap = MiniMap(zoom_level_offset=-5)
minimap.add_to(m)
m