In [37]:
import pymongo
import pandas as pd
# MongoClient() is called without arguments, so pymongo's default host and
# port are used.
MongoClient = pymongo.MongoClient
client = MongoClient()

# Handle to the "companies" database and to its collection of the same name.
db = client["companies"]
companies = db.companies

In [38]:
def convert_companies(company):
    """Project a company document onto the fields this notebook works with.

    Args:
        company: Mapping that contains at least the keys "name",
            "category_code", "founded_year", "number_of_employees",
            "total_money_raised" and "offices".

    Returns:
        A new dict holding exactly those six keys, in that order. Values are
        passed through unchanged (the "offices" value is not copied).

    Raises:
        KeyError: If any of the six keys is missing from ``company``.
    """
    kept_fields = (
        "name",
        "category_code",
        "founded_year",
        "number_of_employees",
        "total_money_raised",
        "offices",
    )
    return {field: company[field] for field in kept_fields}

# Category codes accepted by the query below.
CATEGORY_CODES = [
    "web",
    "software",
    "games_video",
    "mobile",
    "social",
    "search",
    "cleantech",
]

# Select companies that
#   * have an office document carrying a "latitude" field ("$exists" also
#     matches fields whose value is null),
#   * were founded in 2005 or later, and
#   * belong to one of CATEGORY_CODES.
# Conditions on different fields of one filter document are ANDed by MongoDB,
# and "$in" matches any of the listed values for a single field.
cursor = companies.find(
    {
        "offices.latitude": {"$exists": True},
        "founded_year": {"$gte": 2005},
        "category_code": {"$in": CATEGORY_CODES},
    }
)

# Consume the cursor, reducing every returned document to the fields kept by
# convert_companies; the cell output is the number of companies.
geoCompanies = [convert_companies(document) for document in cursor]
len(geoCompanies)

3791

In [39]:
def convert_offices(company):
    """Flatten company records into one record per office.

    Args:
        company: Iterable of company dicts, each with the keys "name",
            "category_code", "founded_year", "number_of_employees",
            "total_money_raised" and "offices" (a sequence of dicts that each
            have "latitude" and "longitude"). The parameter keeps its
            singular name for backward compatibility, but it is the whole
            collection of companies, not a single one.

    Returns:
        A list of dicts, one per office, in input order. Each dict repeats
        the company-level fields, carries the office's "latitude" and
        "longitude", and is named "<company name>_office_<n>" where n counts
        the company's offices starting at 1. Companies with no offices
        contribute no records.

    Raises:
        KeyError: If a company or office dict lacks one of the keys above.
    """
    office_records = []
    # Iterate over the argument itself so the function works on whatever the
    # caller passes instead of depending on a module-level variable.
    for record in company:
        for number, office in enumerate(record["offices"], start=1):
            office_records.append({
                "name": record["name"] + "_office_" + str(number),
                "category_code": record["category_code"],
                "founded_year": record["founded_year"],
                "number_of_employees": record["number_of_employees"],
                "total_money_raised": record["total_money_raised"],
                "latitude": office["latitude"],
                "longitude": office["longitude"],
            })
    return office_records

# Expand the company list into per-office records; the cell output is the
# number of records produced.
offices = convert_offices(geoCompanies)
len(offices)

4318

In [40]:
# Build a DataFrame from the list of office dicts (one row per dict) and
# preview its first rows.
df = pd.DataFrame(offices)
df.head()

Unnamed: 0,category_code,founded_year,latitude,longitude,name,number_of_employees,total_money_raised
0,web,2005,47.603122,-122.333253,Wetpaint_office_1,47.0,$39.8M
1,web,2005,40.723731,-73.996431,Wetpaint_office_2,47.0,$39.8M
2,software,2005,37.692934,-121.904945,Zoho_office_1,1600.0,$0
3,web,2006,34.090368,-118.393064,Geni_office_1,18.0,$16.5M
4,mobile,2005,34.057498,-118.446596,Helio_office_1,,$0


In [41]:
# Drop rows that lack a latitude or a longitude, then show how many nulls
# remain in each column that still has any.
df = df.dropna(subset=["latitude", "longitude"])
null_cols = df.isna().sum()
null_cols[null_cols.gt(0)]

number_of_employees    823
dtype: int64

In [42]:
# Put the columns into a fixed order. The explicit copy makes the result an
# independent DataFrame, so adding or overwriting columns on it afterwards
# does not trigger pandas' SettingWithCopyWarning.
df = df[[
    'name',
    'category_code',
    'founded_year',
    'number_of_employees',
    'total_money_raised',
    'latitude',
    'longitude',
]].copy()

In [43]:
# Number of rows (offices) per category code.
df["category_code"].value_counts()

web            1395
software        715
games_video     418
mobile          334
search          150
cleantech        81
social           40
Name: category_code, dtype: int64

In [44]:
# One GeoJSON Point per row. The coordinates are written in
# [longitude, latitude] order. The two columns are walked in a single pass
# rather than building a row object for every lookup.
position = [
    {"type": "Point", "coordinates": [lon, lat]}
    for lat, lon in zip(df["latitude"], df["longitude"])
]

df["position"] = position

In [45]:
# Multiplier for each magnitude suffix that can end a money string.
_MONEY_SUFFIXES = {"k": 1e3, "M": 1e6, "B": 1e9}
# Characters that make up the currency markers in front of the amount
# ("$", "€", "£", "¥" and the "C" of "C$").
_CURRENCY_CHARS = "$€£¥C"


def _parse_money(raw):
    """Convert a money string such as "$39.8M" into a plain float.

    The currency marker is discarded, so amounts are NOT converted between
    currencies; only the magnitude suffix (k, M, B) is applied.

    Args:
        raw: A string made of an optional currency marker, a number and an
            optional suffix "k", "M" or "B"; or a value that is already
            numeric, which is returned as a float so the cell can be re-run.

    Returns:
        The amount as a float.

    Raises:
        ValueError: If the text left after removing the marker and suffix is
            not a valid number.
    """
    if not isinstance(raw, str):
        return float(raw)
    text = raw.strip().lstrip(_CURRENCY_CHARS)
    suffix = text[-1:]
    if suffix in _MONEY_SUFFIXES:
        return float(text[:-1]) * _MONEY_SUFFIXES[suffix]
    return float(text)


# Replace the whole column in one assignment. Writing element by element
# through df[col].iloc[i] is chained assignment: it warns, and it does not
# update df at all when pandas Copy-on-Write is enabled.
df["total_money_raised"] = df["total_money_raised"].map(_parse_money)


In [46]:
# Store the amounts with a numeric dtype, then list the dtype of every column.
df["total_money_raised"] = pd.to_numeric(df["total_money_raised"])
df.dtypes

name                    object
category_code           object
founded_year             int64
number_of_employees    float64
total_money_raised     float64
latitude               float64
longitude              float64
position                object
dtype: object

In [47]:
# Export the frame as line-delimited JSON: one record (row) per line.
df.to_json('visualize_companies.json', orient="records", lines = True)

In [94]:
# Search radius around each office, in metres.
SEARCH_RADIUS_M = 5000
# "$centerSphere" takes its radius in radians: the distance divided by the
# Earth's equatorial radius (about 6378.1 km).
EARTH_RADIUS_M = 6378100

# For every office, count the documents of the "clean" collection whose
# "position" lies within SEARCH_RADIUS_M of it. The server does the counting,
# so no matching documents are downloaded or kept in memory.
# "$geoWithin"/"$centerSphere" is used because count_documents does not
# accept "$near".
# NOTE(review): presumably "clean" holds the offices exported to
# visualize_companies.json; the import step is not part of this notebook.
count = [
    db.clean.count_documents(
        {
            "position": {
                "$geoWithin": {
                    "$centerSphere": [[lon, lat], SEARCH_RADIUS_M / EARTH_RADIUS_M]
                }
            }
        }
    )
    for lat, lon in zip(df["latitude"], df["longitude"])
]
df["count"] = count


In [95]:
# Keep only the rows whose "count" value is at least 50.
softcp_areas = df[df["count"] >= 50]

In [96]:
# Look up one office by name among the selected rows.
softcp_areas[softcp_areas["name"] == "Wisevid_office_1"]

Unnamed: 0,name,category_code,founded_year,number_of_employees,total_money_raised,latitude,longitude,position,count


In [97]:
# Remove the columns listed below, then discard rows that are exact
# duplicates of an earlier row.
softcp_areas = (
    softcp_areas
    .drop(columns=[
        'founded_year',
        'number_of_employees',
        'total_money_raised',
        'position',
        'count',
    ])
    .drop_duplicates()
)

In [98]:
# Export the selected rows as line-delimited JSON: one record (row) per line.
softcp_areas.to_json('visualize_companies2.json', orient="records", lines = True)