In [58]:
from pymongo import MongoClient
import pandas as pd
import numpy as np
import pprint
import re
import requests as req

# Connect to the MongoDB server on localhost and select the "companies" database.
client = MongoClient("mongodb://localhost:27017/")
db = client["companies"]

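PyMongo query to keep only companies that have not been deadpooled, have a known founding year and a positive employee count, and whose offices have city, latitude and longitude information available: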

In [59]:
# Every condition below must hold ($and) for a company to be selected.
query = {
    "$and": [
        {"offices.city": {"$ne": None}},
        {"number_of_employees": {"$ne": None}},
        {"number_of_employees": {"$gt": 0}},
        {"founded_year": {"$ne": None}},
        # Equality with None: keeps companies with no recorded deadpool year.
        {"deadpooled_year": None},
        {"offices.latitude": {"$not": {"$eq": None}}},
        {"offices.longitude": {"$not": {"$eq": None}}},
        {"offices.latitude": {"$exists": True}},
        {"offices.longitude": {"$exists": True}},
    ]
}

# Return only the fields used later in the notebook; drop Mongo's _id.
projection = {
    "_id": 0,
    "category_code": 1,
    "name": 1,
    "number_of_employees": 1,
    "founded_year": 1,
    "tag_list": 1,
    "description": 1,
    "total_money_raised": 1,
    "offices.city": 1,
    "offices.country_code": 1,
    "offices.latitude": 1,
    "offices.longitude": 1,
}

# find() returns a cursor that can be consumed only once. Materialise it so
# that cells reading with_geoloc can be re-run without silently getting an
# empty result.
with_geoloc = list(db.companies.find(query, projection))

Build a DataFrame from the documents returned by the previous query:

In [60]:
# Load the query results into a DataFrame and list the columns it received.
df = pd.DataFrame(data=with_geoloc)
df.columns

Index(['category_code', 'description', 'founded_year', 'name',
       'number_of_employees', 'offices', 'tag_list', 'total_money_raised'],
      dtype='object')

Re-order columns:

In [61]:
# Identifying and descriptive fields first, the nested office list last.
df = df[
    [
        "name",
        "category_code",
        "description",
        "tag_list",
        "founded_year",
        "number_of_employees",
        "total_money_raised",
        "offices",
    ]
]
df.head(n=3)

Unnamed: 0,name,category_code,description,tag_list,founded_year,number_of_employees,total_money_raised,offices
0,Digg,news,user driven social content website,"community, social, news, bookmark, digg, techn...",2004,60,$45M,"[{'city': 'San Francisco', 'country_code': 'US..."
1,Geni,web,Geneology social network site,"geni, geneology, social, family, genealogy",2006,18,$16.5M,"[{'city': 'West Hollywood', 'country_code': 'U..."
2,Scribd,news,Read Unlimited Books,"book-subscription, digital-library, netflix-fo...",2007,50,$25.8M,"[{'city': 'San Francisco', 'country_code': 'US..."


Fetch USD-based exchange rates from the API so that amounts raised in other currencies can be converted to US dollars:

In [62]:
# Request USD-based rates for the four non-USD currencies that money_dollars converts.
url = "https://api.exchangeratesapi.io/latest"
query_params = {"base": "USD", "symbols": ["CAD", "EUR", "GBP", "SEK"]}

# requests waits indefinitely unless a timeout is given; bound the wait so an
# unreachable service cannot hang the cell.
res = req.get(url, params=query_params, timeout=10)
# Stop here with a clear HTTP error instead of a confusing KeyError on "rates".
res.raise_for_status()
content = res.json()
rates = content["rates"]

rates

{'CAD': 1.3124497992,
 'EUR': 0.8924587238,
 'GBP': 0.8029897367,
 'SEK': 9.490227577}

In [63]:
def money_dollars(s, exchange_rates=None):
    """Convert a formatted amount such as ``"$45M"`` or ``"€5.5M"`` to whole US dollars.

    Parameters
    ----------
    s : str
        Amount text: an optional currency prefix (``C$``, ``€``, ``£``, ``kr``),
        a number, and an optional magnitude suffix (``k``, ``M``, ``B``).
    exchange_rates : dict, optional
        Units of each currency per one US dollar, keyed by ``"CAD"``, ``"EUR"``,
        ``"GBP"`` and ``"SEK"``. Defaults to the notebook-level ``rates``.

    Returns
    -------
    int
        The amount in US dollars, rounded to the nearest dollar.

    Raises
    ------
    TypeError
        If ``s`` is not a string (for example a missing value).
    ValueError
        If ``s`` contains no number.

    Notes
    -----
    Any prefix other than the four listed above is left unconverted, i.e. the
    number is treated as if it were already in US dollars.
    """
    if not isinstance(s, str):
        raise TypeError("expected an amount string, got %r" % (s,))

    text = s.strip()
    # Raw string avoids invalid-escape warnings; the pattern requires at least
    # one digit so a stray "." can never be handed to float().
    number = re.search(r"\d+(?:\.\d*)?|\.\d+", text)
    if number is None:
        raise ValueError("no numeric amount found in %r" % (s,))
    amount = float(number.group(0))

    # Magnitude suffix: thousands, millions or billions.
    for suffix, factor in (("k", 1000), ("M", 1000000), ("B", 1000000000)):
        if text.endswith(suffix):
            amount *= factor
            break

    # Currency prefix: rates are quoted per US dollar, so divide to get dollars.
    for prefix, code in (("C$", "CAD"), ("€", "EUR"), ("£", "GBP"), ("kr", "SEK")):
        if text.startswith(prefix):
            # Look the table up lazily so plain-dollar amounts need no rates at all.
            table = rates if exchange_rates is None else exchange_rates
            amount /= table[code]
            break

    return round(amount)

# Add the USD-normalised amount next to the original text amount.
df = df.assign(
    raised_money_dollars=df["total_money_raised"].apply(money_dollars)
)

# Place the new column right after total_money_raised, then expose the row
# number as an explicit "index" column.
df = df[
    [
        "name",
        "category_code",
        "description",
        "tag_list",
        "founded_year",
        "number_of_employees",
        "total_money_raised",
        "raised_money_dollars",
        "offices",
    ]
].reset_index()
df.head(n=3)

Unnamed: 0,index,name,category_code,description,tag_list,founded_year,number_of_employees,total_money_raised,raised_money_dollars,offices
0,0,Digg,news,user driven social content website,"community, social, news, bookmark, digg, techn...",2004,60,$45M,45000000,"[{'city': 'San Francisco', 'country_code': 'US..."
1,1,Geni,web,Geneology social network site,"geni, geneology, social, family, genealogy",2006,18,$16.5M,16500000,"[{'city': 'West Hollywood', 'country_code': 'U..."
2,2,Scribd,news,Read Unlimited Books,"book-subscription, digital-library, netflix-fo...",2007,50,$25.8M,25800000,"[{'city': 'San Francisco', 'country_code': 'US..."


In [64]:
# Work on an independent two-column copy: company row number and its office list.
offices_list = df[["index", "offices"]].copy()

# One row per office: spread each company's office list across columns, stack
# the columns back into rows, then keep only the company row number as "index".
offices_list_clean = (
    pd.DataFrame(offices_list["offices"].tolist())
    .stack()
    .reset_index(level=1, drop=True)
    .reset_index(name="offices")
)

offices_list_clean.head()

Unnamed: 0,index,offices
0,0,"{'city': 'San Francisco', 'country_code': 'USA..."
1,1,"{'city': 'West Hollywood', 'country_code': 'US..."
2,2,"{'city': 'San Francisco', 'country_code': 'USA..."
3,3,"{'city': 'New York City', 'country_code': 'USA..."
4,4,"{'city': 'San Francisco', 'country_code': 'USA..."


In [65]:
# Attach the company attributes to every individual office row.
df_offices = pd.merge(df, offices_list_clean, on="index")
df_offices.head(n=3)

Unnamed: 0,index,name,category_code,description,tag_list,founded_year,number_of_employees,total_money_raised,raised_money_dollars,offices_x,offices_y
0,0,Digg,news,user driven social content website,"community, social, news, bookmark, digg, techn...",2004,60,$45M,45000000,"[{'city': 'San Francisco', 'country_code': 'US...","{'city': 'San Francisco', 'country_code': 'USA..."
1,1,Geni,web,Geneology social network site,"geni, geneology, social, family, genealogy",2006,18,$16.5M,16500000,"[{'city': 'West Hollywood', 'country_code': 'U...","{'city': 'West Hollywood', 'country_code': 'US..."
2,2,Scribd,news,Read Unlimited Books,"book-subscription, digital-library, netflix-fo...",2007,50,$25.8M,25800000,"[{'city': 'San Francisco', 'country_code': 'US...","{'city': 'San Francisco', 'country_code': 'USA..."


In [66]:
# Keep only the single-office dict (offices_y) under the name "offices"; the
# original office list and the unparsed money text are no longer needed.
# Only the columns are renamed: passing index=str here would also convert
# every row label to a string, which nothing in the notebook needs.
df_offices = (
    df_offices
    .drop(["offices_x", "total_money_raised"], axis=1)
    .rename(columns={"offices_y": "offices"})
)

In [67]:
# Preview the first three per-office rows.
df_offices.head(n=3)

Unnamed: 0,index,name,category_code,description,tag_list,founded_year,number_of_employees,raised_money_dollars,offices
0,0,Digg,news,user driven social content website,"community, social, news, bookmark, digg, techn...",2004,60,45000000,"{'city': 'San Francisco', 'country_code': 'USA..."
1,1,Geni,web,Geneology social network site,"geni, geneology, social, family, genealogy",2006,18,16500000,"{'city': 'West Hollywood', 'country_code': 'US..."
2,2,Scribd,news,Read Unlimited Books,"book-subscription, digital-library, netflix-fo...",2007,50,25800000,"{'city': 'San Francisco', 'country_code': 'USA..."


In [68]:
def get_coords(data):
    """Flatten one office record into location fields plus a GeoJSON point.

    Parameters
    ----------
    data : mapping or pandas.Series
        Row whose ``'offices'`` entry is a single office dict, read for the
        keys ``'country_code'``, ``'city'``, ``'latitude'`` and ``'longitude'``.

    Returns
    -------
    dict
        Keys ``country``, ``city``, ``lat``, ``lng`` and ``geo_point``.
        ``geo_point`` is a GeoJSON ``Point`` with ``[longitude, latitude]``
        coordinates, or ``None`` when either coordinate is missing.
    """
    office = data['offices']
    lat = office.get('latitude')
    lng = office.get('longitude')

    # Only create the GeoJSON object if both coordinates are available.
    # Compare against None rather than relying on truthiness: 0.0 is a valid
    # latitude (equator) or longitude (prime meridian) and must not be dropped.
    principal = None
    if lat is not None and lng is not None:
        principal = {
            "type": "Point",
            "coordinates": [lng, lat]
        }

    return {
        "country": office.get('country_code'),
        "city": office.get('city'),
        "lat": lat,
        "lng": lng,
        "geo_point": principal
    }


# Run get_coords on every row; the returned dict is expanded into one column per key.
offices = df_offices[["offices"]].apply(
    get_coords,
    axis=1,
    result_type="expand",
)
offices.head()

Unnamed: 0,city,country,geo_point,lat,lng
0,San Francisco,USA,"{'type': 'Point', 'coordinates': [-122.394523,...",37.764726,-122.394523
1,West Hollywood,USA,"{'type': 'Point', 'coordinates': [-118.393064,...",34.090368,-118.393064
2,San Francisco,USA,"{'type': 'Point', 'coordinates': [-122.404052,...",37.789634,-122.404052
3,New York City,USA,"{'type': 'Point', 'coordinates': [-73.985506, ...",40.757929,-73.985506
4,San Francisco,USA,"{'type': 'Point', 'coordinates': [-122.4169244...",37.776805,-122.416924


In [69]:
# Put the flattened location columns beside the company columns, then discard
# the raw office dict they were derived from.
df_geo = pd.concat([df_offices, offices], axis=1).drop(columns=["offices"])

In [70]:
# Company attributes first, location fields last.
df_geo = df_geo[
    [
        "index",
        "name",
        "category_code",
        "description",
        "tag_list",
        "founded_year",
        "number_of_employees",
        "raised_money_dollars",
        "country",
        "city",
        "lat",
        "lng",
        "geo_point",
    ]
]
df_geo.head()

Unnamed: 0,index,name,category_code,description,tag_list,founded_year,number_of_employees,raised_money_dollars,country,city,lat,lng,geo_point
0,0,Digg,news,user driven social content website,"community, social, news, bookmark, digg, techn...",2004,60,45000000,USA,San Francisco,37.764726,-122.394523,"{'type': 'Point', 'coordinates': [-122.394523,..."
1,1,Geni,web,Geneology social network site,"geni, geneology, social, family, genealogy",2006,18,16500000,USA,West Hollywood,34.090368,-118.393064,"{'type': 'Point', 'coordinates': [-118.393064,..."
2,2,Scribd,news,Read Unlimited Books,"book-subscription, digital-library, netflix-fo...",2007,50,25800000,USA,San Francisco,37.789634,-122.404052,"{'type': 'Point', 'coordinates': [-122.404052,..."
3,3,MeetMoi,social,Mobile Dating,"mobile, dating, location, realtime, phone",2007,15,5580000,USA,New York City,40.757929,-73.985506,"{'type': 'Point', 'coordinates': [-73.985506, ..."
4,4,Twitter,social,Real time communication platform,"text, messaging, social, community, twitter, t...",2006,1300,1160000000,USA,San Francisco,37.776805,-122.416924,"{'type': 'Point', 'coordinates': [-122.4169244..."


In [71]:
# Replace missing values in the three text columns with the literal string "None",
# then count the nulls left in every column.
df_geo = df_geo.fillna(
    {
        "category_code": "None",
        "description": "None",
        "tag_list": "None",
    }
)
df_geo.isnull().sum()

index                   0
name                    0
category_code           0
description             0
tag_list                0
founded_year            0
number_of_employees     0
raised_money_dollars    0
country                 0
city                    0
lat                     0
lng                     0
geo_point               0
dtype: int64

In [73]:
# Write the frame as a JSON array with one record per office row.
df_geo.to_json(path_or_buf="../data/df_geo.json", orient="records")