# CrunchBase indicators: number of organisations and investment.

Here we produce indicators about the level of Venture & Seed Funding in the UK using proprietary CrunchBase data licensed by Nesta.

This involves:

* Download the data from Nesta DAPS system
* Merge organisations & funders to create org - funding matches
* Geocode with NUTS2 and LEPS geographies
* Create indicators
  * This will be based on a function that subsets by year and distinguishes between seed funding and venture capital

## Preamble

In [1]:
%run ../notebook_preamble.ipy

In [2]:
import re
import random
from zipfile import ZipFile
from io import BytesIO
import csv
from data_getters.labs.core import download_file
from ast import literal_eval
from data_getters.core import get_engine


In [3]:
#dirs

#Make sure the crunchbase folder exists at every stage of the data pipeline.
#exist_ok=True makes this safe to re-run, and makedirs also creates any missing
#parent folder (listing the parent first would fail if it did not exist yet).
for stage in ['raw','interim','processed']:
    os.makedirs(f'../../data/{stage}/crunchbase',exist_ok=True)

In [4]:
# %load ../utilities.py
# Some utilities

import random

def make_data_dict(table,name,path,sample=5):
    '''
    Write a blank data-dictionary template for a dataframe.

    The template has one row per column of the table, holding the column name,
    a type estimated from a random sample of its values and an empty
    description. It is saved as `{today_str}_{name}.csv` inside `path`.

    Args:
        -table (df) is the df we want to create the data dictionary for
        -name (str) of the df, used in the file name
        -path (str) is the place where we want to save the file
        -sample (int) is the number of values per column used to estimate its type

    '''

    variables = table.columns

    #One estimated type per column, in column order
    estimated = [estimate_type(table[col],sample=sample) for col in variables]

    template = pd.DataFrame({
        'variable':variables,
        'type':estimated,
        'description':['']*len(variables)})

    template.to_csv(os.path.join(path,f'{today_str}_{name}.csv'))
    

def estimate_type(variable,sample):
    '''
    Estimates the type of a column from a random sample of its values.

    Args:
        variable (iterable) with values
        sample (n) is the number of values to test. If the column has fewer
            values than this, all of them are used.

    Returns:
        The most frequent Python type among the sampled values, or None if
        there are no values to sample.

    '''

    values = list(variable)

    #random.sample raises ValueError when asked for more items than there are,
    #so we cap the sample size at the number of values (short tables are valid input)
    selection = random.sample(values,min(sample,len(values)))

    if len(selection)==0:
        return(None)

    #value_counts orders the types from most to least frequent
    types = pd.Series([type(x) for x in selection]).value_counts()

    return(types.index[0])



In [5]:
def save_data(df,name,path,today=today_str):
    '''
    Utility to save processed data quicker

    Arguments:
        df (df) is the dataframe we want to save
        name (str) is the name of the file
        path (str) is the path where we want to save the file
        today (str) is the day when the data is saved. It is used as the
            prefix of the file name and defaults to today_str

    The file is written to `{path}/{today}_{name}.csv`.

    '''

    #Build the file name from the `today` argument so that callers can override the date stamp
    df.to_csv(f'{path}/{today}_{name}.csv')
    

In [6]:
def get_daps_data(table,connection,chunksize=1000):
    '''
    Read a whole DAPS SQL table into a single dataframe.

    Args:
        -table is the name of the SQL table in DAPS that we are extracting
        -connection is the database connection we are using
        -chunksize is the number of rows read per chunk

    Returns:
        -A dataframe with all the chunks concatenated

    '''
    #Read the table chunk by chunk and stack the pieces into one df
    return(pd.concat(pd.read_sql_table(table,connection,chunksize=chunksize)))

In [7]:
def make_conversion(x,tid):
    '''
    Function to convert funding rounds from CrunchBase into GBP

    Args:
        x: a transaction (row) from the CB funding rounds dataset. It is read by key and
            needs 'raised_amount', 'raised_amount_currency_code' and 'announced_on'
        tid: transaction id (to track issues)

    Returns:
        The amount raised in GBP. np.nan if the currency code is missing or not supported,
        or if the conversion fails (in that case tid is printed).
    '''

    currency = x['raised_amount_currency_code']

    #Missing codes arrive as None or NaN, and neither of them is a string.
    #The currency converter doesn't work with Lebanese pounds so we will skip that too
    if (not isinstance(currency,str)) or (currency=='LBP'):
        return(np.nan)

    #Amounts that are already in GBP are kept as they are
    if currency=='GBP':
        return(x['raised_amount'])

    try:
        #c is the currency converter (CurrencyRates) defined at notebook level, outside this function
        return(x['raised_amount']*c.get_rate(currency,'GBP',x['announced_on']))

    except Exception:
        #Best effort: report the transaction and leave the value missing instead of
        #returning None implicitly. Catching Exception (rather than a bare except)
        #lets KeyboardInterrupt stop a long-running conversion loop.
        print(tid)
        return(np.nan)

In [8]:
def aggregate_investments(df,geography):
    '''
    Total funding by area, year and investment type.

    Arguments:
        df: (df) funding rounds. It needs the geography column plus 'announced_year', 'investment_type' and 'raised_amount_gbp'
        geography: (str) name of the column with the geography to aggregate over

    Returns a table indexed by geography and announced year, with one column per investment type.
    Each cell is the sum of raised_amount_gbp; missing combinations are filled with 0.

    '''

    totals = df.pivot_table(
        values='raised_amount_gbp',
        index=[geography,'announced_year'],
        columns='investment_type',
        aggfunc='sum')

    return(totals.fillna(0))
    

In [9]:
def make_indicator(table,target_path,var_lookup,year_var,nuts_var='nuts_code',nuts_spec=2018,decimals=3):
    '''
    We use this function to create and save indicators using our standardised format.

    The indicator is saved as `../../data/processed/{target_path}/{standardised name}.csv` with the
    columns year, nuts_id, nuts_year_spec and the standardised variable name. Its first rows are printed.

    Args:
        table (df) is a df with relevant information
        target_path (str) is the name of the directory inside ../../data/processed where we want to save the data
        var_lookup (dict) is a lookup to rename the variable into our standardised name. Only its first entry is used
        year_var (str) is the name of the year variable
        nuts_var (str) is the name of the NUTS code variable. We assume it is nuts_code
        nuts_spec (y) is the value of the NUTS specification. We assume we are working with 2018 NUTS
        decimals (int) is the number of decimals to round to. If it is 0 or less the values are rounded and stored as integers

    '''
    #Bring the index back as columns (we assume that the index holds the nuts code and year - this might need to be changed)
    t = table.reset_index(drop=False)

    #This is the variable name and code
    var_name,var_code = list(var_lookup.items())[0]

    #Focus on those. We take an explicit copy so that the changes below are made on our own df
    #and do not trigger SettingWithCopyWarning
    t = t[[year_var,nuts_var,var_name]].copy()

    #Add the nuts specification
    t['nuts_year_spec'] = nuts_spec

    #Rename variables
    t = t.rename(columns={var_name:var_code,year_var:'year',nuts_var:'nuts_id'})

    #Round variables. When no decimals are wanted we round first and then cast to int:
    #int() on its own truncates, so 2.9 would be stored as 2
    t[var_code] = [np.round(x,decimals) if decimals>0 else int(np.round(x,decimals)) for x in t[var_code]]

    #Reorder variables
    t = t[['year','nuts_id','nuts_year_spec',var_code]]

    print(t.head())

    #Save in the processed folder
    t.to_csv(f'../../data/processed/{target_path}/{var_code}.csv',index=False)

## 1. Load Data

### Setup

In [10]:
# Download CrunchBase data using DAPS

#Location of the database config file that is passed to get_engine
my_config = '../../mysqldb_team.config'

#Create connection with SQL. con is the connection we hand to get_daps_data when reading the tables
con = get_engine(my_config)

#### Organisations

This is the list of organisations we want to work with.

In [11]:
#Read data
cb_orgs = get_daps_data('crunchbase_organizations',con)

In [12]:
cb_orgs.head()

Unnamed: 0,id,company_name,roles,permalink,domain,homepage_url,country,state_code,region,city,...,twitter_url,aliases,created_at,updated_at,primary_role,type,long_description,parent_id,is_health,mesh_terms
0,00000aa4-ba42-9b68-a9c3-040c9f3bf9b9,Formel D GmbH,company,/organization/formel-d-gmbh,formeld.com,http://www.formeld.com,Germany,,DEU - Other,Troisdorf,...,https://www.twitter.com/formeld_es,,2016-06-01 06:58:37,2018-10-26 22:19:19,company,organization,Formel D GmbH is a automotive manufacturer and...,,0,
1,000014da-0c46-b9cb-0941-3a93c027b119,Resilio,company,/organization/resiliohq,resiliohq.com,http://www.resiliohq.com,Denmark,,,Aarhus,...,https://twitter.com/andersresilio,,2017-05-03 20:09:32,2018-10-26 23:15:32,company,organization,By combining state of the art signal processin...,,1,"Games, Recreational|Signal Processing, Compute..."
2,00001b2d-a4f7-55d5-d69a-17acbac9c17b,STD Risk Calculator,company,/organization/std-risk-calculator,stdriskcalculator.com,http://www.stdriskcalculator.com,,,,,...,http://twitter.com/STDRisk,,2012-03-18 06:23:27,2018-10-26 23:30:24,company,organization,STD Risk Calculator is a company dedicated to ...,,1,Gonorrhea|Syphilis|Sexual Health|Sexually Tran...
3,00001dba-e22c-2285-55ee-352a8b087a04,Buy Man Things,company,/organization/buy-man-things,buymanthings.com,http://buymanthings.com,,,,,...,http://twitter.com/buymanthings,,2013-08-10 00:59:51,2018-10-26 21:45:19,company,organization,Buy Man Things is an online men's magazine fea...,,0,
4,0000221d-6dfd-a049-dcfc-b4a5ca3b0fe7,Stabil Capital Management,company,/organization/stabil-capital-management,,,,,,,...,,,2016-10-03 16:35:46,2018-02-12 23:47:40,company,organization,,,0,


Every organisation has an id and a location id

### Funding rounds

Funding rounds for organisations

In [13]:
cb_funding_rounds = get_daps_data('crunchbase_funding_rounds',con)

In [14]:
cb_funding_rounds.head()

Unnamed: 0,funding_round_id,company_name,location_id,country,state_code,region,city,investment_type,announced_on,raised_amount_usd,...,post_money_valuation_usd,post_money_valuation,post_money_currency_code,investor_count,cb_url,company_id,created_at,updated_at,investor_names,investor_ids
0,00006645-9ef6-8958-129a-2e225e577eb8,OnPath Technologies,marlton_united-states,United States,NJ,NJ - Other,Marlton,series_a,2007-03-17,8500000.0,...,,,,,https://www.crunchbase.com/funding-round/onpat...,dfaccc94-3874-9fa7-f1f9-680dbd7d5f2c,2010-12-14 07:02:56,2018-02-12 23:37:51,{},{}
1,00014924-0e2b-bd9e-65b9-da6cc58959dd,Distributed ID,toronto_canada,Canada,ON,Toronto,Toronto,seed,2017-02-01,103461.0,...,1636226.0,2135000.0,CAD,,https://www.crunchbase.com/funding-round/distr...,abb6101f-f808-fa9a-ec1c-1f7b4cc64513,2017-07-08 03:31:12,2018-07-17 07:46:49,{},{}
2,00015b5a-54e6-4285-9c9e-2374cb608dd7,Snaptrude,bangalore_india,India,,Bangalore,Bangalore,seed,2017-08-30,,...,,,,1.0,https://www.crunchbase.com/funding-round/snapt...,ab4297e8-d9d7-99dc-6214-2bbb88bb7bcd,2019-02-07 06:07:57,2019-02-07 06:07:57,"{""Brigade Group""}",{47d6eee4-c487-c86e-2c21-dafcf1125f62}
3,0001c361-4849-c8fb-d6c6-157bba9e0847,Vapor Corp,florida_united-states,United States,NY,NY - Other,Florida,debt_financing,2014-11-26,1250000.0,...,,,,,https://www.crunchbase.com/funding-round/vapor...,97d154d6-423e-b046-b335-f60d9b9f2279,2014-12-08 11:22:01,2018-02-12 23:19:17,{},{}
4,0001cbd1-f7e2-4a56-607c-c57f46cc7dcb,CloudVelox,santa-clara_united-states,United States,CA,SF Bay Area,Santa Clara,series_c,2015-02-12,15000000.0,...,,,,3.0,https://www.crunchbase.com/funding-round/cloud...,3c8f1ae9-16f1-bd5f-2800-63756ed685de,2015-02-13 06:08:29,2018-02-12 23:42:38,"{""Third Point Ventures""}",{960ce830-967e-79c4-e98a-b402d7db9748}


Each funding round has the company name and location id, the investment type and the year. This means that we don't need the organisation data for the funding measurements

### Reverse geocoded place ids

We have reverse geocoded place ids with their NUTS and LEPS code in notebook `0_rev_geocoder`. 

We load that information here and use it to generate indicators of activity by NUTS and LEPS area in the UK.

In [15]:
places = pd.read_csv('../../data/interim/crunchbase/2020_02_18_rev_geocoded_places.csv')

In [16]:
places.head()

Unnamed: 0,location_id,nuts218cd,nuts218nm,lep17cd,lep17nm
0,abbots-langley_united-kingdom,UKH2,Bedfordshire and Hertfordshire,E37000017,Hertfordshire
1,ampthill_united-kingdom,UKH2,Bedfordshire and Hertfordshire,E37000041,South East Midlands
2,ardeley_united-kingdom,UKH2,Bedfordshire and Hertfordshire,E37000017,Hertfordshire
3,arlesey_united-kingdom,UKH2,Bedfordshire and Hertfordshire,E37000041,South East Midlands
4,ashwell_united-kingdom,UKH2,Bedfordshire and Hertfordshire,E37000017,Hertfordshire


## 2. Process data

### a. Number of technology companies indicator

This is the number of active companies in a NUTS or LEP.

In [20]:
#Add the NUTS and LEP codes to each organisation through its location id
cb_orgs_geo = pd.merge(cb_orgs,places,left_on='location_id',right_on='location_id')

#We remove organisations without founding year.
#.copy() gives us an independent df, so adding a column to it does not raise SettingWithCopyWarning
cb_orgs_geo_time = cb_orgs_geo.dropna(axis=0,subset=['founded_on']).copy()
cb_orgs_geo_time['founding_year'] = [x.year for x in cb_orgs_geo_time['founded_on']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [21]:
#Focus on active companies: organisations whose primary role is company, that are operating and are based in the UK

is_company = cb_orgs_geo['primary_role']=='company'
is_operating = cb_orgs_geo['status']=='operating'
in_uk = cb_orgs_geo['country']=='United Kingdom'

uk_comps = cb_orgs_geo.loc[is_company&is_operating&in_uk]


In [22]:
uk_comps.shape

(34721, 40)

In [23]:
uk_comps.shape

(34721, 40)

In [24]:
uk_comps['nuts218nm'].value_counts().head(n=10)

Inner London - West                                 16823
Berkshire, Buckinghamshire and Oxfordshire           1636
Surrey, East and West Sussex                         1181
East Anglia                                          1122
Greater Manchester                                   1081
Gloucestershire, Wiltshire and Bath/Bristol area     1055
Eastern Scotland                                      878
West Midlands                                         758
Hampshire and Isle of Wight                           610
Bedfordshire and Hertfordshire                        596
Name: nuts218nm, dtype: int64

In [25]:
uk_comps['lep17nm'].value_counts().head(n=10)

London                                        17606
Greater Manchester                             1081
South East                                      863
Greater Cambridge and Greater Peterborough      818
Coast to Capital                                796
Leeds City Region                               795
Enterprise M3                                   716
West of England                                 683
Greater Birmingham and Solihull                 622
Oxfordshire                                     599
Name: lep17nm, dtype: int64

In [26]:
#Create activity files: number of companies in each NUTS2 area and in each LEP, as series named company_n
nuts_orgs = uk_comps.groupby(['nuts218nm','nuts218cd']).size().rename('company_n')
leps_orgs = uk_comps.groupby(['lep17nm','lep17cd']).size().rename('company_n')

### b. Level of VC and seed funding indicator

Here we merge the geocoded df with the funding one and then create a function that aggregates funding by location & category for a threshold year.

In [27]:
inv_geo = pd.merge(cb_funding_rounds,places,left_on='location_id',right_on='location_id')

In [28]:
#We will need a python currency converter.
inv_geo['raised_amount_currency_code'].value_counts().head()

GBP    9194
EUR    1110
AUD      13
ISK      13
CAD       9
Name: raised_amount_currency_code, dtype: int64

In [29]:
inv_geo.columns

Index(['funding_round_id', 'company_name', 'location_id', 'country',
       'state_code', 'region', 'city', 'investment_type', 'announced_on',
       'raised_amount_usd', 'raised_amount', 'raised_amount_currency_code',
       'post_money_valuation_usd', 'post_money_valuation',
       'post_money_currency_code', 'investor_count', 'cb_url', 'company_id',
       'created_at', 'updated_at', 'investor_names', 'investor_ids',
       'nuts218cd', 'nuts218nm', 'lep17cd', 'lep17nm'],
      dtype='object')

#### Conversion strategy

We will use the announcement date and the currency information

In [30]:
from forex_python.converter import CurrencyRates
c = CurrencyRates()

In [31]:
#If an amount is not in GBP convert to GBP, if not, keep it as is
#make_conversion is called once per funding round with the row and its index label (rid).
#It prints the label of every round where the conversion raised an error.

inv_geo['raised_amount_gbp'] = [make_conversion(x,rid) for rid,x in inv_geo.iterrows()]

971
980
1158
2033
2242
3078
4419
5273
5961
6408
9648
15351
15356
15358
17646


In [32]:
inv_geo['announced_year'] = [x.year for x in inv_geo['announced_on']]

In [33]:
#Use the aggregate_investments function to calculate levels of investment by NUTS and LEPS area
#Each table is indexed by area code and announced year, with one column per investment type
inv_nuts_2,inv_leps = [aggregate_investments(inv_geo,var) for var in ['nuts218cd','lep17cd']]

In [34]:
inv_nuts_2_recent = inv_nuts_2.loc[inv_nuts_2.index.get_level_values('announced_year')>2010]

In [35]:
#inv_nuts_2.sum().sort_values(ascending=False).head()

In [39]:
#Venture capital is the sum of all the investment types with 'series_' in their name
series_rounds = [col for col in inv_nuts_2.columns if 'series_' in col]

inv_venture = inv_nuts_2_recent[series_rounds].sum(axis=1).to_frame('venture_capital_investment')

inv_venture

Unnamed: 0_level_0,Unnamed: 1_level_0,venture_capital_investment
nuts218cd,announced_year,Unnamed: 2_level_1
UKC1,2011,2836118.0
UKC1,2012,10000000.0
UKC1,2013,19031230.0
UKC1,2014,35292594.0
UKC1,2015,92000000.0
...,...,...
UKN0,2015,1732240.0
UKN0,2016,16600000.0
UKN0,2017,10300000.0
UKN0,2018,14382985.0


## 3. Save data

#### Org_data

In [40]:
save_path = '../../data/processed/crunchbase'

In [41]:
# for file,name in zip([nuts_orgs,inv_nuts_2,leps_orgs,inv_leps],['nuts_2_orgs','nuts_2_investment','leps_orgs','leps_investment']):
#     save_data(file,name,save_path)

In [42]:
make_indicator(inv_venture,'crunchbase',{'venture_capital_investment':'gbp_venture_capital_received'},
              year_var='announced_year',nuts_spec=2016,nuts_var='nuts218cd',decimals=0)

   year nuts_id  nuts_year_spec  gbp_venture_capital_received
0  2011    UKC1            2016                       2836118
1  2012    UKC1            2016                      10000000
2  2013    UKC1            2016                      19031230
3  2014    UKC1            2016                      35292594
4  2015    UKC1            2016                      92000000
