#### Linking Datasets: Foodtech

+ **Main input:** output of `create_foodtech.py` - the csv file `input/foodtech.csv`

+ **Other inputs:** csv files retrieved via Crunchbase's `bulk download` method (read from the `bulk_download/` folder)

+ **Other inputs:** the output of `impute_postal_codes.py` and `concat_all.py` - the csv file `output/postal_codes.csv`

+ **Reference files:** `variables.csv` for a list of variables for each csv

+ **Output file:** `output/foodtech_final.csv`

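*Creates a database that merges the datasets contained in the csv files retrieved via Crunchbase's `bulk download` method.
Each row is identified by an agrofood tech startup together with one of its funding rounds; a startup without any funding round appears once, with `NaN` as its funding round (see the example below).*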




| Organization | Funding Round   |
|--------------|-----------------|
| org_1        | funding_round_A |
| org_1        | funding_round_B |
| org_2        | NaN             |
| org_3        | funding_round_C |



In [35]:
import pandas as pd
import warnings

# Silence every warning for the rest of the session.
# NOTE(review): this also hides pandas deprecation warnings — confirm that is wanted.
warnings.filterwarnings("ignore")

# Folder holding the Crunchbase bulk-download csv files; later cells read `folder + file`.
folder = '../bulk_download/'

# Main input: the foodtech organizations table.
orgs = pd.read_csv('../input/foodtech.csv')

`organizations`

In [36]:
# Organization-level columns carried forward; every other column of the input is discarded.
org_columns = [
    # identity
    'uuid', 'name', 'cb_url', 'homepage_url', 'type', 'rank',
    # headquarters location
    'country_code', 'region', 'city', 'postal_code',
    # profile
    'status', 'short_description',
    'category_list', 'category_groups_list',
    'founded_on', 'closed_on', 'employee_count',
    # social links and exits
    'facebook_url', 'linkedin_url', 'twitter_url', 'num_exits',
]
orgs = orgs[org_columns]

`organization_descriptions`

In [37]:
# Load the organization descriptions file from the bulk-download folder.
file = 'organization_descriptions.csv'
desc = pd.read_csv(folder + file)

In [38]:
# Left join on uuid: every organization is kept, with its description where one exists.
description_columns = desc[['uuid', 'description']]
df = orgs.merge(description_columns, how='left', on='uuid')

In [39]:
# Both source frames are now merged into df; drop the references.
del orgs, desc

In [40]:
# Map the raw Crunchbase column names to the display names used in the final table.
# 'type' becomes the temporary 'Type_x' so it does not collide with the funding-round
# 'Type' column that is merged in later.
org_column_names = {
    'uuid':                 'org_uuid',
    'type':                 'Type_x',
    'name':                 'Organization Name',
    'cb_url':               'Organization Name URL',
    'homepage_url':         'Organization Website',
    'rank':                 'CB Rank Company',
    'country_code':         'Headquarters Location Country',
    'region':               'Headquarters Location Region',
    'city':                 'Headquarters Location City',
    'postal_code':          'Headquarters Location Postal Code',
    'status':               'Operating Status',
    'short_description':    'Short Description',
    'description':          'Full Description',
    'category_list':        'Organization Industries',
    'category_groups_list': 'Organization Industry Groups',
    'founded_on':           'Founded Date',
    'closed_on':            'Closed Date',
    'employee_count':       'Number of Employees',
    'facebook_url':         'Facebook',
    'linkedin_url':         'LinkedIn',
    'twitter_url':          'Twitter',
    'num_exits':            'Number of Exits',
}
df = df.rename(columns=org_column_names)

# The year is taken as the first four characters of the date string.
df['Founded Year'] = df['Founded Date'].str[:4]


In [41]:
# Row count after the organization-level merge.
print('Resulting number of entries: ', df.shape[0])

Resulting number of entries:  166742


`funding_rounds`

In [42]:
# Load the funding rounds file from the bulk-download folder.
file = 'funding_rounds.csv'
rounds = pd.read_csv(folder + file)

In [43]:
# Funding-round columns carried forward; 'org_uuid' is the join key to organizations.
round_columns = [
    'uuid', 'cb_url', 'rank', 'name', 'type',
    'investment_type', 'announced_on',
    'raised_amount_usd', 'raised_amount', 'raised_amount_currency_code',
    'investor_count', 'lead_investor_uuids',
    'org_uuid',
]
rounds = rounds[round_columns]


In [44]:
# Display names for the funding-round columns. 'lead_investor_uuids' and 'org_uuid'
# are not listed and therefore keep their raw names.
round_column_names = {
    'uuid':                        'transaction_uuid',
    'name':                        'Transaction Name',
    'cb_url':                      'Transaction Name URL',
    'rank':                        'CBRank Funding Round',
    'type':                        'Type',
    'investment_type':             'Funding Type',
    'announced_on':                'Announced Date',
    'raised_amount':               'Money Raised',
    'raised_amount_currency_code': 'Money Raised Currency',
    'raised_amount_usd':           'Money Raised Currency in USD',
    'investor_count':              'Number of Investors',
}
rounds = rounds.rename(columns=round_column_names)

# The year is taken as the first four characters of the date string.
rounds['Announced Year'] = rounds['Announced Date'].str[:4]


In [45]:
# Attach every funding round to its organization. The left join keeps an
# organization that has no round as a single row with empty round columns.
df = pd.merge(df, rounds, how='left', on='org_uuid')

# Where the round-level 'Type' is missing, fall back to the organization's own
# type ('Type_x'). The result is assigned back to the column on purpose:
# df['Type'].fillna(..., inplace=True) acts on a temporary Series, which is
# deprecated and does not update df when pandas Copy-on-Write is active.
df['Type'] = df['Type'].fillna(df['Type_x'])
df = df.drop(columns=['Type_x'])

print('Resulting number of entries: ', len(df))

Resulting number of entries:  189306


In [46]:
# The rounds frame has been merged into df; drop the reference.
del rounds  

`investment_partners`

In [47]:
file = 'investment_partners.csv'
partners = pd.read_csv(folder + file)

# Collapse to one row per funding round holding the number of partner rows for that round.
partners = (
    partners
    .groupby('funding_round_uuid')
    .size()
    .reset_index(name='Number of Partner Investors')
)

In [48]:
# Join the per-round partner count, then discard the right-hand key column,
# which duplicates 'transaction_uuid'.
df = (
    df.merge(partners, how='left',
             left_on='transaction_uuid', right_on='funding_round_uuid')
    .drop(columns=['funding_round_uuid'])
)

In [49]:
# The partner counts have been merged into df; drop the reference.
del partners

`investors`

In [50]:
file = 'investors.csv'
investors = pd.read_csv(folder + file)

# Working table: one row per df row, holding the round id and its
# 'lead_investor_uuids' string (split on commas in the next cell).
inv = df.loc[:, ['transaction_uuid', 'lead_investor_uuids']]

In [51]:
# Split the comma-separated uuid string into one column per lead investor.
expanded_columns = inv['lead_investor_uuids'].str.split(',', expand=True)

# Reshape to long form (one row per round / lead-investor pair) and discard the
# empty slots left for rounds with fewer lead investors than the widest row.
inv = (
    pd.concat([inv['transaction_uuid'], expanded_columns], axis=1)
    .melt(id_vars=['transaction_uuid'], value_name='lead_investor_uuid')
    .dropna(subset=['lead_investor_uuid'])
)

In [52]:
# Look up each lead investor's record and keep only its name and investor types.
inv = inv.merge(
    investors, how='left',
    left_on='lead_investor_uuid', right_on='uuid',
)[['transaction_uuid', 'name', 'investor_types']]

In [53]:
# Each investor's types value is wrapped in a one-element list first, so after the
# aggregation 'Investor Type' holds a list of lists (one inner list per lead investor).
inv['investor_types'] = inv['investor_types'].apply(lambda types: [types])

# One row per funding round, with the lead investors' names and types collected into lists.
inv = (
    inv.groupby('transaction_uuid')
    .agg({'name': list, 'investor_types': list})
    .reset_index()
    .rename(columns={'name': 'Investor Names',
                     'investor_types': 'Investor Type'})
)

In [54]:
# Attach the per-round lead-investor lists; rounds without lead investors get NaN.
df = df.merge(inv, how='left', on='transaction_uuid')

In [55]:
# The investor helper frames have been merged into df; drop the references.
del expanded_columns, inv, investors

`people`

In [56]:
# Load the people file from the bulk-download folder.
file = 'people.csv'
people = pd.read_csv(folder + file)

In [57]:
# A person counts as a founder when the featured job title contains the
# (case-sensitive) text 'Founder'. na=False makes a missing title count as
# "not a founder". This replaces the earlier fillna('', inplace=True) on the
# column, which acts on a temporary Series and has no effect under pandas
# Copy-on-Write; the NaN values left in the mask would then make the filter raise.
is_founder = people['featured_job_title'].str.contains('Founder', na=False)

# One row per organization, with its founders' names and genders collected into lists.
people = (
    people[is_founder]
    .groupby('featured_job_organization_uuid')
    .agg({'name': list, 'gender': list})
    .reset_index()
    .rename(columns={'name': 'Founders',
                     'gender': 'Founder Genders'})
)

In [58]:
def has_female(lst):
    """Return 1 if the exact string 'female' is an element of ``lst``, else 0."""
    return int('female' in lst)

# Flag (1/0) organizations whose founder gender list contains 'female'.
people['Female Founders'] = people['Founder Genders'].apply(has_female)

In [59]:
# Attach the founder lists to every row of the organization. The right-hand key
# column 'featured_job_organization_uuid' stays in df after this join.
df = df.merge(
    people, how='left',
    left_on='org_uuid', right_on='featured_job_organization_uuid',
)

print('Resulting number of entries: ', len(df))


Resulting number of entries:  189306


In [60]:
# The founders frame has been merged into df; drop the reference.
del people

`acquisitions`

In [61]:
# Load the acquisitions file from the bulk-download folder.
file = 'acquisitions.csv'
acq = pd.read_csv(folder + file)

In [62]:
# Acquisition columns carried forward; the acquiree identifies the acquired organization.
acquisition_columns = [
    'cb_url', 'name', 'rank', 'type',
    'acquiree_uuid', 'acquiree_name',
    'acquirer_name', 'acquirer_cb_url',
    'acquisition_type', 'acquired_on',
    'price_usd', 'price', 'price_currency_code',
]
acq = acq[acquisition_columns]

In [63]:
# Display names for the acquisition columns. The four '_x' names are temporary:
# they sit next to the funding-round columns of the same name after the merge
# and are used as fallbacks before being dropped.
acquisition_column_names = {
    'name':                'Transaction Name_x',
    'cb_url':              'Transaction Name URL_x',
    'rank':                'CBRank Funding Round_x',
    'type':                'Type_x',
    'acquiree_uuid':       'org_uuid',
    'acquiree_name':       'Organization Name',
    'acquirer_name':       'Acquired By',
    'acquirer_cb_url':     'Acquired By URL',
    'acquisition_type':    'Acquisition Type',
    'acquired_on':         'Acquisition Date',
    'price':               'Acquisition Price',
    'price_currency_code': 'Acquisition Currency',
    'price_usd':           'Acquisition Price in USD',
}
acq = acq.rename(columns=acquisition_column_names)

In [64]:
# Attach acquisition details to the acquired organization's rows, matching on
# both the organization uuid and its name.
# NOTE(review): the recorded outputs show the row count going from 189306 before
# this merge to 190335 after it, so some rows match more than one acquisition
# record and are repeated — confirm this is intended.
df = df.merge(acq, how='left', on=['org_uuid', 'Organization Name'])

In [65]:
# Where a funding-round value is missing, fall back to the acquisition's value
# held in the matching '<name>_x' column. Each result is assigned back to its
# column: calling fillna(..., inplace=True) on df[<name>] acts on a temporary
# Series, which is deprecated and does not update df under pandas Copy-on-Write.
fallback_columns = ['Transaction Name URL',
                    'Transaction Name',
                    'Type',
                    'CBRank Funding Round']

for column in fallback_columns:
    df[column] = df[column].fillna(df[column + '_x'])

# The temporary acquisition-side columns are no longer needed.
df = df.drop(columns=[column + '_x' for column in fallback_columns])

In [66]:
# The acquisitions frame has been merged into df; drop the reference.
del acq

In [67]:
# Row count after the acquisitions merge.
print('Resulting number of entries: ', df.shape[0])

Resulting number of entries:  190335


*Merging postal codes*

In [73]:
file = '../output/postal_codes.csv'

# Keep the organization key plus the postal-code and coordinate columns.
cols = ['uuid', 'pc_cb', 'pc_fill', 'lat_fill', 'long_fill']
codes = pd.read_csv(file)[cols]

In [76]:
# Attach the postal-code columns to each organization's rows. The right-hand key
# column 'uuid' stays in df after this join.
df = df.merge(codes, how='left', left_on='org_uuid', right_on='uuid')

In [77]:
# The postal codes frame has been merged into df; drop the reference.
del codes

In [79]:
# Report (rows, columns) of the fully merged frame before columns are reordered.
print('Resulting shape of dataframe: ', df.shape)

Resulting shape of dataframe:  (190335, 54)


*Cleaning*

In [80]:
# Final column layout. Selecting with this list both reorders the columns and
# drops every column that is not named here (for example the uuid join keys).
order = [
    # transaction
    'Type',
    'CBRank Funding Round',
    'Transaction Name',
    'Transaction Name URL',
    'Funding Type',
    'Announced Date',
    'Announced Year',
    # organization
    'Organization Name',
    'Organization Name URL',
    'Short Description',
    'Full Description',
    # investors
    'Investor Names',
    'Investor Type',
    'Number of Investors',
    'Number of Partner Investors',
    # industries
    'Organization Industries',
    'Organization Industry Groups',
    # 'Aggregated Mode Sector',
    # 'CCI',
    # money raised
    'Money Raised',
    'Money Raised Currency',
    'Money Raised Currency in USD',
    # company profile
    'Founded Date',
    'Founded Year',
    'Operating Status',
    'Headquarters Location City',
    'Headquarters Location Region',
    'Headquarters Location Country',
    'Headquarters Location Postal Code',
    'CB Rank Company',
    'Closed Date',
    'Number of Employees',
    # acquisition
    'Acquired By',
    'Acquired By URL',
    'Acquisition Type',
    'Acquisition Date',
    'Acquisition Price in USD',
    'Acquisition Price',
    'Acquisition Currency',
    # links
    'LinkedIn',
    'Organization Website',
    'Twitter',
    'Facebook',
    # founders
    'Founders',
    'Founder Genders',
    'Female Founders',
    # postal codes
    'pc_cb',
    'pc_fill',
    'lat_fill',
    'long_fill',
]

df = df.loc[:, order]


*Save as csv*

In [81]:
# Write the final table. index=True is the pandas default, spelled out here: the
# row index is written as the first, unnamed column of the csv.
df.to_csv('../output/foodtech_final.csv', index=True)