# CrunchBase analysis

Here we load the CrunchBase (CB) data that we processed previously and report emerging findings.

## 0. Preamble

In [None]:
%run ../notebook_preamble.ipy

import random

In [None]:
def get_cb(file,file_path,progress=True):
    """ Fetch a file through `download_file`.

    Thin wrapper around `download_file` (defined outside this cell). The
    destination is built by plain string concatenation of `file_path` and
    `file`; no path separator is inserted between them.

    Args:
        file (`str`): Name of the file to fetch; forwarded as `file_to_fetch`.
        file_path (`str`): Prefix prepended to `file` to form `download_path`.
        progress (`bool`, optional): Forwarded unchanged to `download_file`.

    Returns:
        Whatever `download_file` returns.
    """

    destination = file_path + file

    return download_file(file_to_fetch=file, download_path=destination, progress=progress)

def get_example(df,number,length,var='long_description'):
    '''
    Prints randomly chosen examples of a text field

    Args:
        df is the dataframe we want to sample rows from
        number is the number of examples we want (capped at the number of rows in df)
        length is the maximum number of characters printed per example
        var is the column holding the text (defaults to 'long_description')

    Each truncated example is printed followed by a blank separator. Nothing is returned.

    '''

    labels = list(df.index)

    # random.sample raises ValueError when asked for more items than exist,
    # so small subsets simply print everything they have
    choose = random.sample(labels,min(number,len(labels)))

    for x in df.loc[choose][var]:

        # str() lets missing values (NaN floats) be shown instead of failing on slicing
        print(str(x)[:length])
        print('\n')
        
def create_lq(X, binary=False):
    """ Compute location quotients for a location-by-sector table.

    Each cell's share of its row (location) total is divided by the share of
    its column (sector) in the grand total of the table.

    Args:
        X (pandas.DataFrame): Rows are locations, columns are sectors and values are the activity of a sector in a location.
        binary (bool, optional): If True, return booleans flagging quotients above 1 instead of the quotients themselves.

    Returns:
        pandas.DataFrame: Same index and columns as the input.
    """
    activity = X.values

    # Row totals as a column vector so the division broadcasts across sectors
    location_totals = activity.sum(axis=1)[:, np.newaxis]
    sector_share_overall = activity.sum(axis=0) / activity.sum()

    quotients = pd.DataFrame((activity / location_totals) / sector_share_overall,
                             index=X.index, columns=X.columns)

    if binary:
        return quotients > 1

    return quotients

def flat_freqs(a_list):
    '''
    Counts how often each element appears across a nested list

    The nested list is flattened with flatten() and the counts are returned
    as a pandas Series (most frequent first).

    '''

    flat_elements = flatten(a_list)
    frequencies = pd.Series(flat_elements).value_counts()

    return frequencies

def flatten(a_list):
    '''
    Removes one level of nesting from a list of iterables

    Returns a new list with the items of every inner iterable, in order.

    '''

    flat = []

    for inner in a_list:
        flat.extend(inner)

    return flat

## 1. Load data

In [None]:
# Load the processed CrunchBase table (used below as the company-level dataframe)
cb = pd.read_csv('../../data/processed/20_9_2019_cb_fnf.csv')

##### Read CB funding data

In [None]:
# Path to the database config file handed to get_engine when connecting
my_config = '../../../mysqldb_team.config'

In [None]:
from data_getters.core import get_engine

# Build the database connection from the config file
con = get_engine(my_config)
# With chunksize set, read_sql_table returns an iterator of 1000-row DataFrames
# rather than a single DataFrame
funding_rounds = pd.read_sql_table('crunchbase_funding_rounds', con, chunksize=1000)

In [None]:
# Consume the chunk iterator and stitch the pieces into one DataFrame with a fresh index
cb_fr_df = pd.concat(funding_rounds).reset_index(drop=True)

In [None]:
# Preview the first rows of the funding rounds table
cb_fr_df.head()

In [None]:
# Most frequent company names, i.e. the companies with the most rows in the table
cb_fr_df['company_name'].value_counts().head()

We confirm that this is capturing round-company pairs

### Processing of funding data

In [None]:
# Year in which each round was announced
# (assumes announced_on holds date/datetime values exposing .year — TODO confirm)
cb_fr_df['year'] = [x.year for x in cb_fr_df['announced_on']]

#### Parse investor names

In [None]:
#Investor names are stored as brace-wrapped sets, sometimes malformed
#We parse them hackily: drop the first and last character (the braces), strip double quotes
#and split on commas. An empty set ('{}') becomes NaN.
#NOTE(review): a name that itself contains a comma will be split into several entries,
#and a missing (non-string) value would fail on the slice — confirm neither occurs in the data
cb_fr_df['investor_names_list'] = [re.sub('"','',str(x[1:-1])).split(',') if x!='{}' else np.nan for x in cb_fr_df['investor_names']]

## 2. Analysis

#### Activity and funding trends

In [None]:
# Line colours and sector columns passed to make_trends (matched by position)
# NOTE(review): there are six colours for five sectors, so the last colour is never matched to a sector
colors = ['lightcoral','red','darkorchid','deepskyblue','blue','cadetblue']
sectors = ['artificial_intelligence','advertising','creative_content','news_high','public_news']

In [None]:
def make_trends(df_act,df_fund,sectors,colors,thres=0.75):
    '''
    Plots activity and funding trends by sector and returns the tables behind the plots

    Two figures are drawn, each with a large top panel and a small bottom panel, and saved
    as PDFs in ../../reports/figures/research_slides/cb/:
        activity_trends.pdf: yearly % of companies above thres in each sector (top) and
            % of news companies that are above thres in 'public_news' (bottom)
        funding_trends.pdf: yearly $ billion raised by the companies of each sector (top) and
            % of the funding raised by news companies that goes to public interest news (bottom)

    The plotted lines are 3-year rolling means over 2000-2018. The returned tables hold the
    unsmoothed yearly values.

    Args:
        df_act contains the company data. It needs 'year' and 'id' columns, one column per
            sector, and the 'news_high' and 'public_news' columns
        df_fund contains the funding data. It needs 'company_id', 'year' and
            'raised_amount_usd' columns. It is copied, not modified
        sectors is the list of sector columns to plot
        colors is the list of line colors, matched to sectors by position
        thres is the threshold for considering a company in a sector (defaults to 0.75)

    Returns:
        A dict with the source dfs under the keys 'year_shares', 'pin_share_of_news',
        'total_raised' (in $ billion) and 'pin_shares_of_news_funding'

    '''

    output = {}

    #Years covered by every plot and table
    years = np.arange(2000,2019)

    #######
    #Activity plot
    #######

    fig,ax = plt.subplots(figsize=(12,7),nrows=2,sharex=True,gridspec_kw={'height_ratios':[3,1]})

    #First plot component (activity in a sector as a share of the total)

    #This stores the year shares
    year_shares = []

    for n,s in enumerate(sectors):
        #Row-normalised crosstab: % of the companies in each year that are above the threshold
        sh = (100*pd.crosstab(df_act['year'],df_act[s]>thres,normalize=0)).loc[years][True]

        #News sectors are drawn with a thicker line
        sh.rolling(window=3).mean().dropna().plot(ax=ax[0],color=colors[n],linewidth=3 if 'news' in s else 1)

        year_shares.append(sh)

    #Axes
    ax[0].set_ylabel('% of all companies')
    ax[0].legend(sectors,bbox_to_anchor=(1,1))

    year_shares_df = pd.concat(year_shares,axis=1)
    year_shares_df.columns = sectors

    output['year_shares'] = year_shares_df

    #Add the second figure with news as a share of the total

    #The mask is built from the dataframe that was passed in, not from a notebook global,
    #so the function also works on a subset or a different dataframe
    news = df_act.loc[df_act['news_high']==True]

    news_shares = (100*pd.crosstab(news['year'],news['public_news']>thres,normalize=0)).loc[years][True]
    news_shares.name = 'pin_share_of_news'

    news_shares.rolling(window=3).mean().dropna().plot(
        ax=ax[1],color='blue',linewidth=3)

    ax[1].set_ylabel('% of all \n news companies')

    output['pin_share_of_news'] = news_shares

    plt.tight_layout()

    plt.savefig('../../reports/figures/research_slides/cb/activity_trends.pdf')


    #########
    #Funding plot
    #########

    df_fund_2 = df_fund.copy()

    #Flag every funding round by whether its company is in each sector.
    #The two news columns are always flagged because the bottom panel needs them
    #even when they are not among the plotted sectors
    flag_cols = list(sectors) + [c for c in ['news_high','public_news'] if c not in sectors]

    for s in flag_cols:

        #This identifies the IDs of the companies in the sector
        s_set = set(df_act.loc[df_act[s]>thres]['id'])

        df_fund_2[s] = [x in s_set for x in df_fund_2['company_id']]


    fig,ax = plt.subplots(figsize=(12,7),nrows=2,sharex=True,gridspec_kw={'height_ratios':[3,1]})

    #This is calculating totals raised per sector and year, in $ billion
    total_raised = pd.concat([df_fund_2.loc[df_fund_2[s]==True].groupby('year')['raised_amount_usd'].sum() for s in sectors],axis=1).fillna(0).loc[
        years]/1e9

    total_raised.columns = sectors

    total_raised.rolling(window=3).mean().dropna().plot(color=colors,ax=ax[0])

    ax[0].set_ylabel('$ Billion')

    #Store total raised
    output['total_raised'] = total_raised


    #Now calculate funding raised by public interest news as a share of all news funding.
    #This uses the flags created above rather than columns that df_fund may or may not carry
    news_funding = df_fund_2.loc[df_fund_2['news_high']==True]

    #Amount raised per year, split by the public interest flag (one column per flag value)
    raised_by_flag = news_funding.groupby(['year','public_news'])['raised_amount_usd'].sum().reset_index(drop=False).pivot(
        index='year',columns='public_news',values='raised_amount_usd')

    #Turn each year into shares of that year's total, as percentages
    news_share_funding = (100*raised_by_flag.apply(lambda x: x/x.sum(),axis=1).loc[years].fillna(0))

    news_share_funding.rolling(window=3).mean().dropna()[True].plot(color='blue',ax=ax[1],linewidth=3)

    ax[1].set_ylabel('PI news as \n  % of all news')

    output['pin_shares_of_news_funding'] = news_share_funding

    plt.tight_layout()

    plt.savefig('../../reports/figures/research_slides/cb/funding_trends.pdf')

    return(output)


In [None]:
# Draw and save the activity and funding trend figures; keep the underlying tables for later cells
trend_outputs = make_trends(cb,cb_fr_df,sectors,colors)

What explains the surge in news funding?

In [None]:
# IDs of the companies above the 0.75 threshold in news and in public interest news
rel_sets = [set(cb.loc[cb[s]>0.75]['id']) for s in ['news_high','public_news']]

# Flag each funding round by whether its company is in each set.
# The public interest flag is named 'pi_news' because that is the column the later cells read
cb_fr_df['news'],cb_fr_df['pi_news'] = [[x in s_set for x in cb_fr_df['company_id']] for s_set in rel_sets]

# Keep the previous column name as an alias so anything relying on 'pin' still works
cb_fr_df['pin'] = cb_fr_df['pi_news']

In [None]:
#This compares the funding raised by news companies per country up to and after 2015:
#rounds are flagged by year>2015, amounts are summed per country and flag, and the
#ten countries with most funding after 2015 are plotted as bars
ax = cb_fr_df.loc[(cb_fr_df['news']==True)].assign(threshold=cb_fr_df['year']>2015).groupby([
    'country','threshold'])['raised_amount_usd'].sum().reset_index(drop=False).pivot(index='country',columns='threshold',
                                                                                     values='raised_amount_usd').sort_values(
    True,ascending=False)[:10].plot.bar()

#NOTE(review): the 'Before 2015' bar (threshold False) includes 2015 itself
ax.legend(['Before 2015','After 2015'])

In [None]:
# Funding raised per sector in 2018; total_raised is in $ billion so this shows $ million
trend_outputs['total_raised'].loc[2018]*1e3

#### Situation in the UK

In [None]:
# IDs of GB companies above the 0.75 threshold in news and in public interest news
uk_new_sets = [set(cb.loc[(cb[s]>0.75)&(cb['country_alpha_2']=='GB')]['id']) for s in ['news_high','public_news']]

In [None]:
# Flag each funding round by whether its company is in the UK news / UK public interest news sets
cb_fr_df['uk_news'],cb_fr_df['uk_pi_news'] = [[x in one_set for x in cb_fr_df['company_id']] for one_set in uk_new_sets]

In [None]:
# Total raised by the flagged UK news and UK public interest news companies
# NOTE(review): the values are divided by 1e6, so they are in $ million although the column is labelled USD
pd.DataFrame([cb_fr_df.groupby(var)['raised_amount_usd'].sum()[True] for var in ['uk_news','uk_pi_news']],
            index=['All News','Public Interest'],columns=['Total raised USD'])/1e6

#### Key investors in public interest news

In [None]:
# Count how often each investor appears in rounds flagged as public interest news and
# plot the 20 most frequent ones (reversed so the most frequent bar is at the top)
# NOTE(review): this relies on a 'pi_news' column in cb_fr_df — confirm it has been created before running this cell
ax = flat_freqs(cb_fr_df.loc[cb_fr_df['pi_news']==True,'investor_names_list'].dropna()).head(n=20)[::-1].plot.barh(color='blue',figsize=(8,8))

plt.tight_layout()

plt.savefig('../../reports/figures/research_slides/cb/top_funders.pdf')


In most UK deals the investor data is missing

#### Types of investment in public news globally

In [None]:
# Distribution of investment types among rounds with / without the public interest news flag
# (normalize=1 makes every column of the crosstab sum to 1)
all_pi_finance_comp = pd.crosstab(cb_fr_df['investment_type'],cb_fr_df['pi_news'],normalize=1).sort_values(True,ascending=False)


# Same distribution restricted to rounds whose country is the United Kingdom
finance_uk = cb_fr_df.loc[cb_fr_df['country']=='United Kingdom']
uk_pi_finance_comp = pd.crosstab(finance_uk['investment_type'],finance_uk['pi_news'],normalize=1)


# Side by side: global public interest news, UK public interest news and global non public interest news.
# Investment types missing from one of the tables get a 0
pi_finance_global_uk = pd.concat([all_pi_finance_comp[True],uk_pi_finance_comp[True],all_pi_finance_comp[False]],axis=1).fillna(0)

pi_finance_global_uk.columns = ['Global','UK','Non-PIN']

# Order the bars by the UK share and put the UK column first
pi_finance_global_uk = pi_finance_global_uk.sort_values('UK',ascending=True)[['UK','Global','Non-PIN']]

ax = pi_finance_global_uk.plot.barh(figsize=(8,10),color=['blue','coral','white'],edgecolor='black')

ax.legend(loc='lower right')
# NOTE(review): the plotted values are fractions (0-1) of the number of rounds, not percentages
# of the amount raised — confirm the axis label says what is intended
ax.set_xlabel('% of all funding')

plt.tight_layout()

plt.savefig('../../reports/figures/research_slides/cb/funding_modalities.pdf')

#### Geography

In [None]:
fig,ax = plt.subplots(figsize=(12,9),nrows=2,sharex=True)

# Top panel: the ten countries with most companies flagged as public interest news
pd.crosstab(cb['country'],cb['public_news']).sort_values(True,ascending=False)[:10][True].plot.bar(ax=ax[0],color='blue')

ax[0].set_ylabel('Total number of \n PIN companies')

# Bottom panel: location quotient for the same ten countries. The quotient is computed on the
# full country table before keeping the first ten rows
create_lq(pd.crosstab(cb['country'],cb['public_news']).sort_values(True,ascending=False))[:10][True].plot.bar(ax=ax[1],color='blue')

# Reference line at 1: the country's share of public interest news equals the overall share
ax[1].hlines(y=1,xmin=-0.5,xmax=25,linestyle=':',color='red',linewidth=3)
ax[1].set_ylabel('Specialisation in \n PIN')

plt.tight_layout()

plt.savefig('../../reports/figures/research_slides/cb/country_activity.pdf')


In [None]:
# Number of United Kingdom companies with and without the public interest news flag
uk_pin = pd.crosstab(cb['country'],cb['public_news']).loc['United Kingdom']

# Flagged companies as a % of all United Kingdom companies
100*uk_pin[True]/uk_pin.sum()

In [None]:
# Print the first 600 characters of 5 random descriptions of Dutch public interest news companies
get_example(cb.loc[(cb['country']=='Netherlands')&(cb['public_news']==True)],5,600)

#### Geography in the UK

In [None]:
# Companies whose country code is GB
uk_comps = cb.loc[(cb['country_alpha_2']=='GB')]

In [None]:
# Print the first 800 characters of 4 random descriptions of UK public interest news companies
get_example(uk_comps.loc[uk_comps['public_news']==True],4,800)

In [None]:
# Share of UK companies in each lad18nm area, computed separately for flagged and unflagged
# companies (normalize=1 makes each column sum to 1), sorted by the flagged share
pd.crosstab(uk_comps['lad18nm'],uk_comps['public_news'],normalize=1).sort_values(True,ascending=False)

### Sector crossover

Now we check what % of companies in news have overlaps with other sectors

In [None]:
# All sector_dom values except the most frequent one (value_counts sorts by descending frequency)
# NOTE(review): presumably the dropped value is a catch-all category — confirm
cb_sectors = cb['sector_dom'].value_counts().index[1:]

In [None]:
def sector_crossover(df,sectors,cb_sectors,normalise=True,thres=(0.75,0.3)):
    '''
    This outputs the number or share of companies in a sector overlapping with other sectors

    Args:
        df is the activity df, with one column per sector
        sectors is the sectors whose overlaps we want to study
        cb_sectors are all the cb_sectors that could overlap with a focus sector
        normalise if we want to get the share (between 0 and 1) of companies in the focus sector
            that overlap, instead of the number of companies
        thres holds two thresholds: the first for considering a company in a focus sector,
            the second for considering it in an overlapping cb_sector

    Returns:
        A dataframe with one row per cb_sector and one column per focus sector

    '''

    focus_thres,overlap_thres = thres[0],thres[1]

    crossovers = []

    for s in sectors:
        #Companies in the focus sector; computed once per sector rather than once per cb_sector
        in_focus = df[s]>focus_thres

        overlaps = pd.Series([len(df[in_focus&(df[cb_s]>overlap_thres)]) for cb_s in cb_sectors],index=cb_sectors)

        if normalise:
            #A focus sector without companies gives NaN shares
            overlaps = overlaps/len(df.loc[in_focus])

        crossovers.append(overlaps)

    crossover_df = pd.concat(crossovers,axis=1)
    crossover_df.columns = sectors

    return(crossover_df)
    

In [None]:
# Focus sectors for the overlap analysis (this replaces the longer sectors list defined for the trend plots)
sectors = ['news_high','public_news']

# Share of news / public interest news companies overlapping with every other sector;
# 'publishing_news' is removed from the candidate sectors first
cross = sector_crossover(cb,sectors,cb_sectors.drop('publishing_news'),normalise=True)

In [None]:
import seaborn as sns

fig,ax = plt.subplots(figsize=(10,2))

# Heatmap with one row per focus sector and one column for each of the 20 sectors
# with the largest public_news overlap
sns.heatmap(cross.sort_values('public_news',ascending=False).T.iloc[:,:20],ax=ax)