In [216]:
import pandas as pd
import numpy as np
import re

In [217]:
from pymongo import MongoClient

client = MongoClient()

def connectCollection(database, collection):
    ''' return collection from db crunchbase'''
    db = client[database]
    coll = db[collection]
    return db, coll

In [218]:

db, coll = connectCollection('dbcompanies','companies')

In [219]:
# Primer filtro:
filter1 = [
    {"$match": {"$and":
            [{"founded_year": {"$gte": 2009}}, # Fundadas a partir del 2009
             {"deadpooled_year": None}, # Que no hayan quebrado
             {"offices.latitude": {"$exists": True, "$ne": None}}, # Que tengan latitud
             {"offices.longitude": {"$exists": True, "$ne": None}}]}}] # Que tengan longitud

In [220]:
offices = list(coll.aggregate(filter1))

In [221]:
def prepareData(bd):
    cleanedItems = []
    errors = 0
    for group in offices:
        for item in range(len(group['offices'])):
            try:
                cleanedItems.append({
                "name":group["name"],
                "employees": group['number_of_employees'],
                "year":group['founded_year'],
                "category":group['category_code'],
                "id":group['_id'],
                "money raised":group['total_money_raised'],
                "city" : group['offices'][item]['city'],
                "country" : group['offices'][item]['country_code'],
                "latitude" : group['offices'][item]['latitude'],
                "longitude" : group['offices'][item]['longitude'],
                "coordinates":{
                    "type":"Point",
                    "coordinates":[group['offices'][item]['longitude'],group['offices'][item]['latitude']]}})
            except Exception:
                errors += 1
                if errors > 0:
                    print(f"Hay {errors} errores")
    return cleanedItems

In [222]:
prepareData(offices)

[{'name': 'Mokitown',
  'employees': None,
  'year': 2011,
  'category': 'web',
  'id': ObjectId('52cdef7c4bab8bd675297ea7'),
  'money raised': '$0',
  'city': None,
  'country': 'USA',
  'latitude': 37.09024,
  'longitude': -95.712891,
  'coordinates': {'type': 'Point', 'coordinates': [-95.712891, 37.09024]}},
 {'name': 'PeekYou',
  'employees': 20,
  'year': 2012,
  'category': 'search',
  'id': ObjectId('52cdef7c4bab8bd675297f94'),
  'money raised': '$1.83M',
  'city': 'New York',
  'country': 'USA',
  'latitude': 40.757929,
  'longitude': -73.985506,
  'coordinates': {'type': 'Point', 'coordinates': [-73.985506, 40.757929]}},
 {'name': 'GENWI',
  'employees': 25,
  'year': 2010,
  'category': 'mobile',
  'id': ObjectId('52cdef7c4bab8bd675297f9e'),
  'money raised': '$7.1M',
  'city': 'Los Altos',
  'country': 'USA',
  'latitude': 33.8171,
  'longitude': -111.9035,
  'coordinates': {'type': 'Point', 'coordinates': [-111.9035, 33.8171]}},
 {'name': 'Fixya',
  'employees': 30,
  'year

In [223]:
data = pd.DataFrame(prepareData(offices))

In [224]:
data.head()

Unnamed: 0,name,employees,year,category,id,money raised,city,country,latitude,longitude,coordinates
0,Mokitown,,2011,web,52cdef7c4bab8bd675297ea7,$0,,USA,37.09024,-95.712891,"{'type': 'Point', 'coordinates': [-95.712891, ..."
1,PeekYou,20.0,2012,search,52cdef7c4bab8bd675297f94,$1.83M,New York,USA,40.757929,-73.985506,"{'type': 'Point', 'coordinates': [-73.985506, ..."
2,GENWI,25.0,2010,mobile,52cdef7c4bab8bd675297f9e,$7.1M,Los Altos,USA,33.8171,-111.9035,"{'type': 'Point', 'coordinates': [-111.9035, 3..."
3,Fixya,30.0,2013,web,52cdef7c4bab8bd675297fec,$8M,San Mateo,USA,37.566879,-122.323895,"{'type': 'Point', 'coordinates': [-122.323895,..."
4,alluc,7.0,2009,games_video,52cdef7c4bab8bd675298038,$0,Norderstedt,DEU,53.707739,10.023246,"{'type': 'Point', 'coordinates': [10.023246, 5..."


In [228]:
data.dtypes

name             object
employees       float64
year              int64
category         object
id               object
money raised     object
city             object
country          object
latitude        float64
longitude       float64
coordinates      object
dtype: object

In [229]:
def money_clean(data):
    
    ''' Function to convert money raised to int '''
    
    data['money raised']= data['money raised'].replace('[Kk]', '*100',regex = True).replace( 'M', '*1000000',regex=True)
    data['money raised']= data['money raised'].replace('[€$£]','',regex=True)
    #data['money raised']= data['money raised'].apply(np.int64)
    data['money raised']= data['money raised'].map(pd.eval)
    #data=data[data!=0].dropna()
    return money_comp

def category(data):
    
    ''' Function to clean category column '''
    
    web_design = {'search': 'web','mobile': 'web','web': 'web','games_video': 'web','ecommerce': 'web','advertising': 'web',
              'hardware': 'web','enterprise': 'web','network_hosting': 'web','software': 'web',
              'analytics': 'web','cleantech': 'web'}
    data = money_clean(data).replace(web_design, regex=True)
    return money_comp

    

In [230]:
data_new = category(data)


In [232]:
data_new

Unnamed: 0,name,employees,year,category,id,money raised,city,country,latitude,longitude,coordinates
3,Fixya,30.0,2013,web,52cdef7c4bab8bd675297fec,8000000.0,San Mateo,USA,37.566879,-122.323895,"{'type': 'Point', 'coordinates': [-122.323895,..."
21,BrandYourself,22.0,2009,web,52cdef7d4bab8bd675298ce2,1500000.0,New York,USA,42.275263,-71.24762,"{'type': 'Point', 'coordinates': [-71.24762, 4..."
56,Formspring,19.0,2009,web,52cdef7d4bab8bd67529a00e,14300000.0,San Francisco,USA,39.905226,-86.054702,"{'type': 'Point', 'coordinates': [-86.0547016,..."
58,Yipit,23.0,2010,web,52cdef7d4bab8bd67529a1e3,7550000.0,New York,USA,40.744618,-73.987764,"{'type': 'Point', 'coordinates': [-73.987764, ..."
112,Tinychat,5.0,2009,web,52cdef7e4bab8bd67529b0f2,1500000.0,Glen Cove,USA,42.375641,-72.519691,"{'type': 'Point', 'coordinates': [-72.5196907,..."
164,Tongal,8.0,2009,web,52cdef7e4bab8bd67529b572,16000000.0,Santa Monica,USA,34.007112,-118.489748,"{'type': 'Point', 'coordinates': [-118.489748,..."
295,Ykone,10.0,2009,web,52cdef7f4bab8bd67529c014,1500000.0,Paris,FRA,48.856667,2.350987,"{'type': 'Point', 'coordinates': [2.3509871, 4..."
325,ChallengePost,17.0,2009,web,52cdef7f4bab8bd67529c343,4600000.0,New York,USA,40.740804,-74.00717,"{'type': 'Point', 'coordinates': [-74.00717, 4..."
