In [104]:
import ast
import pprint
import re

import numpy as np
import pandas as pd
import requests as req
from pymongo import MongoClient

# Connect to the local MongoDB server and select the "companies" database.
client = MongoClient('mongodb://localhost:27017/')
db = client["companies"]

#### PyMongo query to keep only companies that match the following criteria:
#### **More than 0 employees, with information on foundation year, not deadpooled, with information on offices and coordinates**
#### Also, I will only be keeping the fields in the "projection" dictionary.

In [105]:
# Keep companies with a positive head-count, a founding year, no deadpool
# year, and latitude/longitude present on their offices.
query = {
    "$and": [
        {"number_of_employees": {"$ne": None}},
        {"number_of_employees": {"$gt": 0}},
        {"founded_year": {"$ne": None}},
        {"deadpooled_year": None},
        {"offices.latitude": {"$not": {"$eq": None}}},
        {"offices.longitude": {"$not": {"$eq": None}}},
        {"offices.latitude": {"$exists": True}},
        {"offices.longitude": {"$exists": True}},
    ]
}

# Return only the fields used later in the notebook; "_id" is excluded.
projection = {
    "_id": 0,
    "category_code": 1,
    "name": 1,
    "number_of_employees": 1,
    "founded_year": 1,
    "tag_list": 1,
    "description": 1,
    "total_money_raised": 1,
    "offices.latitude": 1,
    "offices.longitude": 1,
}
with_geoloc = db.companies.find(filter=query, projection=projection)

#### Make a dataframe from the previous query:

In [106]:
# Load every document returned by the query into a DataFrame, then list its columns.
df = pd.DataFrame(data=with_geoloc)
df.columns

Index(['category_code', 'description', 'founded_year', 'name',
       'number_of_employees', 'offices', 'tag_list', 'total_money_raised'],
      dtype='object')

#### Re-order columns:

In [107]:
# Identifying columns first, numeric columns next, nested "offices" data last.
df = df[[
    'name',
    'category_code',
    'description',
    'tag_list',
    'founded_year',
    'number_of_employees',
    'total_money_raised',
    'offices',
]]
df.head(3)

Unnamed: 0,name,category_code,description,tag_list,founded_year,number_of_employees,total_money_raised,offices
0,Digg,news,user driven social content website,"community, social, news, bookmark, digg, techn...",2004,60,$45M,"[{'latitude': 37.764726, 'longitude': -122.394..."
1,Geni,web,Geneology social network site,"geni, geneology, social, family, genealogy",2006,18,$16.5M,"[{'latitude': 34.090368, 'longitude': -118.393..."
2,Scribd,news,Read Unlimited Books,"book-subscription, digital-library, netflix-fo...",2007,50,$25.8M,"[{'latitude': 37.789634, 'longitude': -122.404..."


#### The "total_money_raised" column needs to be cleaned as the info is available in different currencies and as a string. First, I'll get the exchange rates from an API:

In [108]:
# Fetch the latest USD-based exchange rates for the non-USD currencies that
# the money-conversion step below handles (CAD, EUR, GBP, SEK).
# NOTE(review): this endpoint may now require an access key — confirm before re-running.
url = "https://api.exchangeratesapi.io/latest"
query_params = {"base": "USD", "symbols": ["CAD", "EUR", "GBP", "SEK"]}

# A timeout keeps the cell from hanging indefinitely on a stalled connection.
res = req.get(url, params=query_params, timeout=10)
# Fail loudly on an HTTP error instead of hitting a confusing KeyError on
# "rates" when the service returns an error payload.
res.raise_for_status()
content = res.json()
rates = content["rates"]

rates

{'CAD': 1.3113190731,
 'EUR': 0.8912655971,
 'GBP': 0.8014973262,
 'SEK': 9.4557040998}

#### Now I will transform the content of the column into a float number in USD:

In [109]:
def money_dollars(s, exchange_rates=None):
    """Convert a money string such as "$45M" or "€1.2B" to whole US dollars.

    Parameters
    ----------
    s : str
        Amount with an optional currency prefix ("C$", "€", "£" or "kr";
        any other prefix is left unconverted, i.e. treated as USD) and an
        optional magnitude suffix ("k", "M" or "B").
    exchange_rates : dict, optional
        Mapping of currency code ("CAD", "EUR", "GBP", "SEK") to units of that
        currency per one USD. When omitted, the notebook-level ``rates`` dict
        is used; it is only looked up if a non-USD prefix is present.

    Returns
    -------
    int
        The amount in USD, rounded to the nearest dollar.

    Raises
    ------
    ValueError
        If ``s`` contains no number.
    """
    suffix_multipliers = {"k": 1000, "M": 1000000, "B": 1000000000}
    prefix_currencies = {"C$": "CAD", "€": "EUR", "£": "GBP", "kr": "SEK"}

    s = s.strip()

    # Raw string: "\d" in a normal string literal is an invalid escape.
    # A digit is required on at least one side of the decimal point so a lone
    # "." (e.g. from "approx.") is never handed to float().
    match = re.search(r"\d+(?:\.\d*)?|\.\d+", s)
    if match is None:
        raise ValueError("no numeric amount found in %r" % (s,))
    amount = float(match.group(0))

    # Magnitude is encoded in the last character, if at all.
    amount *= suffix_multipliers.get(s[-1:], 1)

    for prefix, currency in prefix_currencies.items():
        if s.startswith(prefix):
            table = rates if exchange_rates is None else exchange_rates
            # Rates are quoted per USD, so multiply by the inverse.
            amount *= 1 / table[currency]
            break

    return round(amount)

# Add the USD amount, fix the column order, and expose the row number as an
# explicit "index" column.
df = df.assign(raised_money_dollars=df["total_money_raised"].apply(money_dollars))
df = df[[
    'name',
    'category_code',
    'description',
    'tag_list',
    'founded_year',
    'number_of_employees',
    'total_money_raised',
    'raised_money_dollars',
    'offices',
]]
df = df.reset_index()
df.head(3)

Unnamed: 0,index,name,category_code,description,tag_list,founded_year,number_of_employees,total_money_raised,raised_money_dollars,offices
0,0,Digg,news,user driven social content website,"community, social, news, bookmark, digg, techn...",2004,60,$45M,45000000,"[{'latitude': 37.764726, 'longitude': -122.394..."
1,1,Geni,web,Geneology social network site,"geni, geneology, social, family, genealogy",2006,18,$16.5M,16500000,"[{'latitude': 34.090368, 'longitude': -118.393..."
2,2,Scribd,news,Read Unlimited Books,"book-subscription, digital-library, netflix-fo...",2007,50,$25.8M,25800000,"[{'latitude': 37.789634, 'longitude': -122.404..."


#### The "offices" column contains lists of dictionaries with the info of every office each company has. I need to separate each element of these lists into separate rows so I can have a register for each office of each company:

In [110]:
# One row per (company, office): spread each company's office list across
# columns, stack the columns into a long Series, and keep the company's row
# position as the "index" column.
offices_list = df[["index", "offices"]].copy()
offices_list_clean = (
    pd.DataFrame(offices_list["offices"].tolist())
    .stack()
    # Companies with fewer offices than the widest row are padded with missing
    # values; drop them explicitly instead of relying on stack() to do so.
    .dropna()
    .reset_index(level=1, drop=True)
    .reset_index(name='offices')
)

offices_list_clean.head()

Unnamed: 0,index,offices
0,0,"{'latitude': 37.764726, 'longitude': -122.394523}"
1,1,"{'latitude': 34.090368, 'longitude': -118.393064}"
2,2,"{'latitude': 37.789634, 'longitude': -122.404052}"
3,3,"{'latitude': 40.757929, 'longitude': -73.985506}"
4,4,"{'latitude': 37.7768052, 'longitude': -122.416..."


In [111]:
# Attach every office record to its company's fields through the shared "index" key.
df_offices = pd.merge(df, offices_list_clean, on="index")
df_offices.head(3)

Unnamed: 0,index,name,category_code,description,tag_list,founded_year,number_of_employees,total_money_raised,raised_money_dollars,offices_x,offices_y
0,0,Digg,news,user driven social content website,"community, social, news, bookmark, digg, techn...",2004,60,$45M,45000000,"[{'latitude': 37.764726, 'longitude': -122.394...","{'latitude': 37.764726, 'longitude': -122.394523}"
1,1,Geni,web,Geneology social network site,"geni, geneology, social, family, genealogy",2006,18,$16.5M,16500000,"[{'latitude': 34.090368, 'longitude': -118.393...","{'latitude': 34.090368, 'longitude': -118.393064}"
2,2,Scribd,news,Read Unlimited Books,"book-subscription, digital-library, netflix-fo...",2007,50,$25.8M,25800000,"[{'latitude': 37.789634, 'longitude': -122.404...","{'latitude': 37.789634, 'longitude': -122.404052}"


In [112]:
# Keep only the per-office column (renamed back to "offices") and drop the raw
# money string now that the USD amount exists.
# Only column names are renamed: row labels stay as integers, so label-based
# lookups such as series[0] keep working downstream.
df_offices = (
    df_offices
    .drop(["offices_x", "total_money_raised"], axis=1)
    .rename(columns={"offices_y": "offices"})
)

In [113]:
# Preview the first three per-office rows.
df_offices.head(n=3)

Unnamed: 0,index,name,category_code,description,tag_list,founded_year,number_of_employees,raised_money_dollars,offices
0,0,Digg,news,user driven social content website,"community, social, news, bookmark, digg, techn...",2004,60,45000000,"{'latitude': 37.764726, 'longitude': -122.394523}"
1,1,Geni,web,Geneology social network site,"geni, geneology, social, family, genealogy",2006,18,16500000,"{'latitude': 34.090368, 'longitude': -118.393064}"
2,2,Scribd,news,Read Unlimited Books,"book-subscription, digital-library, netflix-fo...",2007,50,25800000,"{'latitude': 37.789634, 'longitude': -122.404052}"


#### Now that we have a record for each office, let's extract the coordinates from the "offices" column and create a geo-point we can use later on for geo queries:

In [114]:
def get_coords(data):
    """Build latitude/longitude fields and a GeoJSON point for one office row.

    Parameters
    ----------
    data : mapping (e.g. a DataFrame row)
        Must provide an ``'offices'`` entry holding a dict with ``'latitude'``
        and ``'longitude'`` keys.

    Returns
    -------
    dict
        ``{"lat": ..., "lng": ..., "geo_point": ...}`` where ``geo_point`` is a
        GeoJSON ``Point`` with ``[longitude, latitude]`` coordinates, or
        ``None`` when either coordinate is missing.
    """
    office = data['offices']
    lat = office['latitude']
    lng = office['longitude']

    def _present(value):
        # None and NaN (the only value unequal to itself) count as missing.
        # 0.0 is a valid coordinate (equator / prime meridian), so a plain
        # truthiness test would wrongly discard it.
        return value is not None and value == value

    # Only create the GeoJSON object if all geodata is available.
    geo_point = None
    if _present(lat) and _present(lng):
        geo_point = {
            "type": "Point",
            "coordinates": [lng, lat],
        }

    return {
        "lat": lat,
        "lng": lng,
        "geo_point": geo_point,
    }


# Run get_coords on every row; result_type="expand" turns the returned dict
# into one column per key.
office_column = df_offices[["offices"]]
offices = office_column.apply(get_coords, axis=1, result_type="expand")
offices.head()

Unnamed: 0,geo_point,lat,lng
0,"{'type': 'Point', 'coordinates': [-122.394523,...",37.764726,-122.394523
1,"{'type': 'Point', 'coordinates': [-118.393064,...",34.090368,-118.393064
2,"{'type': 'Point', 'coordinates': [-122.404052,...",37.789634,-122.404052
3,"{'type': 'Point', 'coordinates': [-73.985506, ...",40.757929,-73.985506
4,"{'type': 'Point', 'coordinates': [-122.4169244...",37.776805,-122.416924


In [115]:
# Place the coordinate columns beside the company fields, then discard the
# raw "offices" dicts they were extracted from.
df_geo = pd.concat([df_offices, offices], axis="columns").drop(columns=["offices"])

In [13]:
# Company fields first, then the coordinates and the GeoJSON point.
df_geo = df_geo[[
    'index',
    'name',
    'category_code',
    'description',
    'tag_list',
    'founded_year',
    'number_of_employees',
    'raised_money_dollars',
    'lat',
    'lng',
    'geo_point',
]]
df_geo.head()

Unnamed: 0,index,name,category_code,description,tag_list,founded_year,number_of_employees,raised_money_dollars,lat,lng,geo_point
0,0,Digg,news,user driven social content website,"community, social, news, bookmark, digg, techn...",2004,60,45000000,37.764726,-122.394523,"{'type': 'Point', 'coordinates': [-122.394523,..."
1,1,Geni,web,Geneology social network site,"geni, geneology, social, family, genealogy",2006,18,16500000,34.090368,-118.393064,"{'type': 'Point', 'coordinates': [-118.393064,..."
2,2,Scribd,news,Read Unlimited Books,"book-subscription, digital-library, netflix-fo...",2007,50,25800000,37.789634,-122.404052,"{'type': 'Point', 'coordinates': [-122.404052,..."
3,3,MeetMoi,social,Mobile Dating,"mobile, dating, location, realtime, phone",2007,15,5580000,40.757929,-73.985506,"{'type': 'Point', 'coordinates': [-73.985506, ..."
4,4,Twitter,social,Real time communication platform,"text, messaging, social, community, twitter, t...",2006,1300,1160000000,37.776805,-122.416924,"{'type': 'Point', 'coordinates': [-122.4169244..."


#### Now let's replace null values in the text columns with the string "None":

In [116]:
# Fill missing text with the literal string "None" so every value in these
# columns is a string.
for text_column in ("category_code", "description", "tag_list"):
    df_geo[text_column] = df_geo[text_column].fillna("None")
# Not the cell's last expression, so this null count is computed but not displayed.
df_geo.isnull().sum()
# Work on a copy from here on; its shape is the cell's output.
test_df = df_geo.copy()
test_df.shape

(4972, 11)

#### I still have almost 5000 companies in my dataframe. Not all of these companies meet our criteria. For instance, no one in the company wants to have companies that are more than 10 years old within a radius of 2 km (note that the data in our dataset is from 2013, so let's assume it's still 2013).
#### Therefore, let's create a new column that indicates if the company is "young" (10 years old or less):

In [117]:
def is_young(x):
    """Return 1 when the founding year ``x`` is 2003 or later, otherwise 0."""
    return 1 if x >= 2003 else 0
# Flag each row by founding year, then preview the result.
test_df["is_young"] = test_df["founded_year"].map(is_young)
test_df.head(n=3)


Unnamed: 0,index,name,category_code,description,tag_list,founded_year,number_of_employees,raised_money_dollars,geo_point,lat,lng,is_young
0,0,Digg,news,user driven social content website,"community, social, news, bookmark, digg, techn...",2004,60,45000000,"{'type': 'Point', 'coordinates': [-122.394523,...",37.764726,-122.394523,1
1,1,Geni,web,Geneology social network site,"geni, geneology, social, family, genealogy",2006,18,16500000,"{'type': 'Point', 'coordinates': [-118.393064,...",34.090368,-118.393064,1
2,2,Scribd,news,Read Unlimited Books,"book-subscription, digital-library, netflix-fo...",2007,50,25800000,"{'type': 'Point', 'coordinates': [-122.404052,...",37.789634,-122.404052,1


#### Note that I will not be excluding the "old" companies from my dataframe, as this would make it impossible to know whether a location is close to one of these companies (how else would you know that they are not close if you don't know where they are?)

#### Now we want to determine whether the companies in our dataframe meet the other criteria or not:
#### 1. Employees want the offices to be close to design companies. 
#### 2. Employees want the offices to be close to successful tech startups that have raised at least 1 million dollars.
#### 3. In my opinion, it would be good to have "similar" companies close to our offices. 
#### So, based on this, I've made a few functions to check whether each criterion is met. I will only evaluate the criteria for "young" companies, as old companies will be in my "go away" list:

In [118]:
def is_succesful(x, threshold=1000000):
    """Return 1 when the amount raised is at least ``threshold``, otherwise 0.

    Parameters
    ----------
    x : number
        Amount raised, in US dollars.
    threshold : number, optional
        Minimum amount that counts as successful. Defaults to one million,
        matching the notebook's "at least 1 Million dollars" criterion, so an
        amount of exactly the threshold qualifies.

    Returns
    -------
    int
        1 or 0.
    """
    return 1 if x >= threshold else 0

def is_similar(df1,df2,df3,lista):
    """Flag rows whose text mentions any of the given keywords.

    Parameters
    ----------
    df1, df2, df3 : iterables of str (e.g. pandas Series)
        Parallel text columns. They are walked by position, so the row labels
        of a Series do not matter.
    lista : iterable of str
        Keywords to look for as substrings. The text is lower-cased before
        matching but the keywords are used as given, so pass them in lower case.

    Returns
    -------
    list of int
        One entry per row: 1 if any keyword occurs in any of the three texts
        for that row, otherwise 0.
    """
    flags = []
    # Iterating over values (not `series[i]`) avoids depending on the index
    # labels happening to be 0..n-1.
    for row_texts in zip(df1, df2, df3):
        # Lower-case once per row rather than once per keyword.
        lowered = [text.lower() for text in row_texts]
        hit = any(keyword in text for keyword in lista for text in lowered)
        flags.append(1 if hit else 0)
    return flags

# Keywords that mark a company as "similar"; matched as substrings of the
# category, description and tag list.
similars = [
    'web',
    'software',
    'games_video',
    'games',
    'game',
    'mobile',
    'search',
    'social',
    'design',
]

In [119]:
# Each flag is multiplied by "is_young" so that only young companies can score 1.
text_columns = (test_df["category_code"], test_df["description"], test_df["tag_list"])
test_df["is_design"] = test_df["is_young"] * is_similar(*text_columns, ["design"])
test_df["is_similar"] = test_df["is_young"] * is_similar(*text_columns, similars)
test_df["is_succesful"] = test_df["is_young"] * test_df["raised_money_dollars"].apply(is_succesful)

In [120]:
# The helper "index" column is no longer needed.
test_df = test_df.drop(columns=["index"])


In [122]:
# Preview the first three rows with the new flag columns.
test_df.head(n=3)

Unnamed: 0,name,category_code,description,tag_list,founded_year,number_of_employees,raised_money_dollars,geo_point,lat,lng,is_young,is_design,is_similar,is_succesful
0,Digg,news,user driven social content website,"community, social, news, bookmark, digg, techn...",2004,60,45000000,"{'type': 'Point', 'coordinates': [-122.394523,...",37.764726,-122.394523,1,1,1,1
1,Geni,web,Geneology social network site,"geni, geneology, social, family, genealogy",2006,18,16500000,"{'type': 'Point', 'coordinates': [-118.393064,...",34.090368,-118.393064,1,0,1,1
2,Scribd,news,Read Unlimited Books,"book-subscription, digital-library, netflix-fo...",2007,50,25800000,"{'type': 'Point', 'coordinates': [-122.404052,...",37.789634,-122.404052,1,0,1,1


#### There are some duplicated rows, we need to delete those:

In [124]:
# "geo_point" holds dicts, so it is stringified before de-duplicating; the
# row counts before and after give the number of duplicates removed.
no_dups = test_df.assign(geo_point=test_df["geo_point"].astype(str))
before = len(no_dups)
no_dups = no_dups.drop_duplicates()
after = len(no_dups)
print('Number of duplicate records dropped: ', str(before - after))


Number of duplicate records dropped:  275


In [125]:
# Peek at the first remaining row. .iloc is explicitly positional, so this
# works whatever the row labels are.
no_dups["geo_point"].iloc[0]

"{'type': 'Point', 'coordinates': [-122.394523, 37.764726]}"

#### We had to convert our "geo_point" column to string to delete duplicate records. Now we need to make it into a dictionary again:

In [126]:
# Parse each stringified geo_point back into a Python object.
# `ast` is imported with the other modules in the notebook's first cell.
no_dups["geo_point"] = no_dups["geo_point"].apply(ast.literal_eval)

In [128]:
# Confirm the first row's geo_point was parsed back from its string form.
# .iloc is explicitly positional, so this works whatever the row labels are.
no_dups["geo_point"].iloc[0]

{'coordinates': [-122.394523, 37.764726], 'type': 'Point'}

In [129]:
# Shape of the subset of rows flagged as young.
no_dups.loc[no_dups["is_young"] == 1].shape

(3504, 14)

#### Out of nearly 5000 companies, 3504 of them are "young" companies we would like to be near! These will be the companies we'll be evaluating in our next step. Now, let's export the df to JSON so we can create the geo index:

In [131]:
# Write the de-duplicated frame as a JSON array with one object per row.
no_dups.to_json(path_or_buf='../data/df_geo.json', orient="records")