# CrunchBase indicators: number of organisations and investment.

Here we produce indicators about the number of organisations and the level of venture and seed funding in the UK, using proprietary CrunchBase data licensed by Nesta.

This involves:

* Download the data from Nesta DAPS system
* Merge organisations & funders to create org - funding matches
* Geocode with NUTS2 and LEPS geographies
* Create indicators
  * This will be based on a function that subsets by year and distinguishes between seed funding and venture capital

## Preamble

In [None]:
#Run the shared notebook preamble.
#NOTE(review): presumably this is what brings os, pd, np and today_str into scope, as none of them are imported in this notebook - confirm
%run ../notebook_preamble.ipy

In [None]:
import re
import random
from zipfile import ZipFile
from io import BytesIO
import csv
from data_getters.labs.core import download_file
from ast import literal_eval
from data_getters.core import get_engine


In [None]:
#dirs

#Make sure there is a crunchbase folder at every stage of the data pipeline.
#os.makedirs with exist_ok=True does nothing if the folder is already there and also
#creates any missing parent folder, so this doesn't fail when e.g. ../../data/raw
#has not been created yet (os.listdir would raise a FileNotFoundError in that case)
for stage in ['raw','interim','processed']:
    os.makedirs(f'../../data/{stage}/crunchbase',exist_ok=True)

In [None]:
# %load ../utilities.py
# Some utilities

import random

def make_data_dict(table,name,path,sample=5):
    '''
    Writes the template of a data dictionary for a dataframe into a csv file.

    The template has one row per column in the table with its name (variable), the type
    estimated by estimate_type (type) and an empty description to be filled in by hand.

    Args:
        -table (df) is the df we want to create the data dictionary for
        -name (str) of the df. It is used in the name of the output file
        -path (str) is the directory where we want to save the file
        -sample (int) is the number of values per column passed to estimate_type

    The file is saved as {today_str}_{name}.csv inside path (the index is written too).

    '''

    #Estimate the type of every column
    column_types = []

    for column in table.columns:
        column_types.append(estimate_type(table[column],sample=sample))

    #Fill the template
    data_dict = pd.DataFrame()
    data_dict['variable'] = table.columns
    data_dict['type'] = column_types
    data_dict['description'] = ['']*len(data_dict['variable'])

    #Save it
    data_dict.to_csv(os.path.join(path,f'{today_str}_{name}.csv'))
    

def estimate_type(variable,sample):
    '''
    Estimates the type of a column from a random sample of its values.

    Args:
        variable (iterable) with values
        sample (n) is the maximum number of values to test. If the variable has fewer
            values than this, all of them are tested.

    Returns:
        The most frequent type among the sampled values, or None if there is nothing to sample.

    '''

    values = list(variable)

    #random.sample raises a ValueError when asked for more values than are available,
    #so we cap the size of the sample at the number of values
    selection = random.sample(values,min(sample,len(values)))

    #An empty column (or a sample of 0) gives us nothing to estimate the type from
    if len(selection)==0:
        return(None)

    types = pd.Series([type(x) for x in selection]).value_counts().sort_values(ascending=False)

    return(types.index[0])



In [None]:
def save_data(df,name,path,today=today_str):
    '''
    Utility to save processed data quicker

    Arguments:
        df (df) is the dataframe we want to save
        name (str) is the name of the file
        path (str) is the path where we want to save the file
        today (str) is the day when the data is saved. It prefixes the name of the file and
            defaults to the value today_str had when this function was defined

    The data is saved as {path}/{today}_{name}.csv

    '''

    #Use the today argument (not the global today_str) so that callers can override the date
    df.to_csv(f'{path}/{today}_{name}.csv')
    

In [None]:
def get_daps_data(table,connection,chunksize=1000):
    '''
    Reads a whole SQL table from DAPS into a single dataframe

    Args:
        -table is the name of the SQL table in DAPS that we are extracting
        -connection is the database connection we read the table with
        -chunksize is the number of rows to read in each chunk

    Returns:
        -A dataframe with all the chunks concatenated

    '''
    #Read the table chunk by chunk and stack the chunks together
    return pd.concat(pd.read_sql_table(table, connection, chunksize=chunksize))

In [None]:
def make_conversion(x,tid):
    '''
    Function to convert funding rounds from CrunchBase into GBP

    Args:
        x: a transaction from the CB funding rounds dataset. We read its raised_amount,
            raised_amount_currency_code and announced_on values
        tid: transaction id (to track issues)

    Returns:
        The amount raised in GBP. Amounts already in GBP are returned as they are.
        We return np.nan if the currency is missing or LBP, or if the conversion fails
        (in that case we also print the transaction id).
    '''

    currency = x['raised_amount_currency_code']

    #The currency converter doesn't work with Lebanese pounds so we will skip that.
    #We can't convert anything without a currency code either (None or NaN)
    if pd.isna(currency) or currency=='LBP':
        return(np.nan)

    try:
        #If an amount is not in GBP convert to GBP, if not, keep it as is
        if currency=='GBP':
            return(x['raised_amount'])

        #c is the CurrencyRates converter created in the notebook before this function is called
        return(x['raised_amount']*c.get_rate(currency,'GBP',x['announced_on']))

    #We catch Exception instead of using a bare except so that interrupting
    #the notebook (KeyboardInterrupt) still stops a long run of conversions
    except Exception:
        print(tid)

        #Make the missing value explicit instead of returning None implicitly
        return(np.nan)

In [None]:
def aggregate_investments(df,geography):
    '''
    This function adds up the funding raised in every geography, year and investment type

    Arguments:
        df: df with investment levels by geocoded organisation, year and type. We use its
            announced_year, investment_type and raised_amount_gbp columns plus the geography one
        geography: (str) name of the column with the geography we aggregate over

    Returns a table where the rows are the geography and announced year, and the columns are levels
    of funding by investment type (0 when there are no values for a combination)

    '''

    totals = df.pivot_table(
        index=[geography,'announced_year'],
        columns='investment_type',
        values='raised_amount_gbp',
        aggfunc='sum',
    )

    return totals.fillna(0)
    

In [None]:
def make_indicator(table,target_path,var_lookup,year_var,nuts_var='nuts_code',nuts_spec=2018,decimals=3):
    '''
    We use this function to create and save indicators using our standardised format.

    The indicator is saved as ../../data/processed/{target_path}/{standardised name}.csv with
    the columns year, nuts_id, nuts_year_spec and the indicator itself. Its first rows are printed.

    Args:
        table (df) is a df with relevant information. We reset its index, so the year and NUTS
            variables can be columns or levels of the index
        target_path (str) is the name of the directory inside ../../data/processed where we want to
            save the data. It is created if it doesn't exist
        var_lookup (dict) is a lookup to rename the variable into our standardised name. Only its first item is used
        year_var (str) is the name of the year variable
        nuts_var (str) is the name of the NUTS code variable. We assume it is nuts_code
        nuts_spec (y) is the value of the NUTS specification. We assume we are working with 2018 NUTS
        decimals (int) is the number of decimals we round the indicator to. If it is 0 or less the values
            are converted with int(), which truncates them instead of rounding

    '''
    #This is the variable name and code
    var_name,var_code = list(var_lookup.items())[0]

    #Reset index (we assume that the index is the nuts code, var name and year - this might need to be changed)
    #and focus on the variables we need. We work on an explicit copy so that the assignments below
    #don't write into a slice of the reset table (which triggers pandas' SettingWithCopyWarning)
    t = table.reset_index(drop=False)[[year_var,nuts_var,var_name]].copy()

    #Add the nuts specification
    t['nuts_year_spec'] = nuts_spec

    #Rename variables
    t = t.rename(columns={var_name:var_code,year_var:'year',nuts_var:'nuts_id'})

    #Round variables
    t[var_code] = [np.round(x,decimals) if decimals>0 else int(x) for x in t[var_code]]

    #Reorder variables
    t = t[['year','nuts_id','nuts_year_spec',var_code]]

    print(t.head())

    #Save in the processed folder (we create the target directory if it isn't there yet)
    out_dir = f'../../data/processed/{target_path}'
    os.makedirs(out_dir,exist_ok=True)

    t.to_csv(f'{out_dir}/{var_code}.csv',index=False)

## 1. Load Data

### Setup

In [None]:
# Download CrunchBase data using DAPS

#Path to the config file we pass to get_engine (presumably it holds the database credentials - confirm)
my_config = '../../mysqldb_team.config'

#Create connection with SQL. This connection is reused to read every table below
con = get_engine(my_config)

#### Organisations

This is the list of organisations we want to work with.

In [None]:
#Read the crunchbase_organizations table from DAPS into a dataframe
cb_orgs = get_daps_data('crunchbase_organizations',con)

In [None]:
#Preview the first rows of the organisations table
cb_orgs.head()

Every organisation has an id and a location id

### Funding rounds

Funding rounds for organisations

In [None]:
#Read the crunchbase_funding_rounds table from DAPS into a dataframe
cb_funding_rounds = get_daps_data('crunchbase_funding_rounds',con)

In [None]:
#Preview the first rows of the funding rounds table
cb_funding_rounds.head()

Each funding round has the company name and location id, the investment type and the year. This means that we don't need the organisation data to measure funding.

### Reverse geocoded place ids

We have reverse geocoded place ids with their NUTS and LEPS code in notebook `0_rev_geocoder`. 

We load that information here and use it to generate indicators of activity by NUTS and LEPS area in the UK.

In [None]:
#Load the reverse geocoded places.
#NOTE(review): the file name is pinned to the 2020_02_18 output - update it if the geocoding is rerun
places = pd.read_csv('../../data/interim/crunchbase/2020_02_18_rev_geocoded_places.csv')

In [None]:
#Preview the first rows of the reverse geocoded places
places.head()

## 2. Process data

### a. Number of technology companies indicator

This is the number of active companies in a NUTS or LEP.

In [None]:
#Attach the reverse geocoded place to every organisation using its location id
#(organisations whose location id is not in places are dropped by the default inner join)
cb_orgs_geo = cb_orgs.merge(places,left_on='location_id',right_on='location_id')

In [None]:
#List the variables available in the organisations table
cb_orgs.columns

In [None]:
#Focus on active companies: organisations in the UK that are operating and whose primary role is company
is_company = cb_orgs_geo['primary_role']=='company'
is_operating = cb_orgs_geo['status']=='operating'
in_uk = cb_orgs_geo['country']=='United Kingdom'

uk_comps = cb_orgs_geo.loc[is_company & is_operating & in_uk]


In [None]:
#Number of rows and columns in the table of active UK companies
uk_comps.shape

In [None]:
#Ten most frequent nuts218nm values among the active UK companies
uk_comps['nuts218nm'].value_counts().head(n=10)

In [None]:
#Ten most frequent lep17nm values among the active UK companies
uk_comps['lep17nm'].value_counts().head(n=10)

In [None]:
#Create activity files: number of active UK companies by NUTS area and by LEP.
#Both series are named company_n
nuts_orgs = uk_comps.groupby(['nuts218nm','nuts218cd']).size().rename('company_n')
leps_orgs = uk_comps.groupby(['lep17nm','lep17cd']).size().rename('company_n')

### b. Level of VC and seed funding indicator

Here we merge the geocoded df with the funding one, convert the amounts raised to GBP, and then aggregate funding by location, year and investment type.

In [None]:
#Attach the reverse geocoded place to every funding round using its location id
#(funding rounds whose location id is not in places are dropped by the default inner join)
inv_geo = cb_funding_rounds.merge(places,left_on='location_id',right_on='location_id')

In [None]:
#We will need a python currency converter.
#These are the most frequent currencies in the funding rounds
inv_geo['raised_amount_currency_code'].value_counts().head()

In [None]:
#List the variables available in the geocoded funding rounds
inv_geo.columns

#### Conversion strategy

We will use the announcement date and the currency information

In [None]:
from forex_python.converter import CurrencyRates
#Currency converter. make_conversion looks it up by this name (c), so it has to be created before converting
c = CurrencyRates()

In [None]:
#Convert the amount raised in every funding round into GBP (amounts already in GBP are kept as they are).
#We pass the label of the row as the transaction id to keep track of the conversions that fail
inv_geo['raised_amount_gbp'] = [
    make_conversion(funding_round,round_id)
    for round_id,funding_round in inv_geo.iterrows()
]

In [None]:
#Extract the year when each funding round was announced
inv_geo['announced_year'] = [announced.year for announced in inv_geo['announced_on']]

In [None]:
#Use the aggregate_investments function to calculate levels of investment by year and investment type
#in every NUTS (nuts218cd) and LEPS (lep17cd) area
inv_nuts_2,inv_leps = [aggregate_investments(inv_geo,var) for var in ['nuts218cd','lep17cd']]

In [None]:
#Keep the investments announced after 2010 (i.e. from 2011 onwards)
announced_after_2010 = inv_nuts_2.index.get_level_values('announced_year')>2010

inv_nuts_2_recent = inv_nuts_2.loc[announced_after_2010]

In [None]:
#inv_nuts_2.sum().sort_values(ascending=False).head()

In [None]:
#Venture capital is the sum of all the investment types with series_ in their name
series_rounds = [col for col in inv_nuts_2.columns if 'series_' in col]

#Total per area and year, stored as a df with a single venture_capital_investment column
inv_venture = inv_nuts_2_recent[series_rounds].sum(axis=1).to_frame('venture_capital_investment')

inv_venture

## 3. Save data

#### Org_data

In [None]:
#Directory for the processed CrunchBase outputs (used by the save_data call that is commented out below)
save_path = '../../data/processed/crunchbase'

In [None]:
# for file,name in zip([nuts_orgs,inv_nuts_2,leps_orgs,inv_leps],['nuts_2_orgs','nuts_2_investment','leps_orgs','leps_investment']):
#     save_data(file,name,save_path)

In [None]:
#Save the venture capital indicator as gbp_venture_capital_received in the processed crunchbase folder.
#decimals=0 means that the amounts are saved as integers.
#NOTE(review): nuts_spec is 2016 although the codes come from nuts218cd - confirm which NUTS version applies
make_indicator(inv_venture,'crunchbase',{'venture_capital_investment':'gbp_venture_capital_received'},
              year_var='announced_year',nuts_spec=2016,nuts_var='nuts218cd',decimals=0)