Library and Directory Setup

In [1]:
# Silence warnings for the rest of the session: a negative `warn` level
# tells R to ignore every warning.
options(warn = -1)

In [2]:
library(data.table)
library(dplyr)
library(readr)
library(purrr)
library(tidyr)
library(lubridate)
library(tidyverse)
library(stringr)
library(rio)
library(readxl)


Attaching package: 'dplyr'

The following objects are masked from 'package:data.table':

    between, first, last

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union


Attaching package: 'purrr'

The following object is masked from 'package:data.table':

    transpose


Attaching package: 'lubridate'

The following objects are masked from 'package:data.table':

    hour, isoweek, mday, minute, month, quarter, second, wday, week,
    yday, year

The following object is masked from 'package:base':

    date

-- Attaching packages --------------------------------------- tidyverse 1.3.0 --
v ggplot2 3.3.0     v stringr 1.4.0
v tibble  2.1.3     v forcats 0.5.0
-- Conflicts ------------------------------------------ tidyverse_conflicts() --
x lubridate::as.difftime() masks base::as.difftime()
x dplyr::between()         masks data.table::between()
x lubridate::date()        m

In [3]:
# Make the AI Index 2019 source folder the working directory so that the
# relative Excel path used in the input step resolves, then echo the
# directory that is now active.
setwd(dir = "F:/Thesis/Source_Data/AI_Index_2019")
getwd()

# 1) Input

In [4]:
# Load the "raw_event report" sheet of the investment-activity workbook
# (path is relative to the working directory) into a data.table, then
# list the column names that were read.
Venture_Raw <- data.table(
  readxl::read_excel(
    path = "Chapter 4.2. CAPIQ, CB, Quid - Investment Activity.xlsx",
    sheet = "raw_event report"
  )
)
colnames(Venture_Raw)

New names:
* `` -> ...1


Remove unnecessary columns

In [5]:
# Columns that are not used downstream. "...1" is the name readxl gave
# to the unnamed first column of the sheet.
DropFeatures <- c(
  "...1",
  "Date of Funding Event",
  "Event ID",
  "Target Company ID",
  "isProfileOrganization",
  "Quarter of Funding Event"
)

# Drop them, preview the result and record the starting row count.
Venture_Raw <- Venture_Raw %>% dplyr::select(-!!DropFeatures)
head(Venture_Raw)
n_rows0 <- nrow(Venture_Raw)

Event Type,Funding in USD,countries,clusters,Target Name,Year of Funding Event
Private Investment,1740000,Italy,"Crypto, Wealth management, Traders, Cryptocurrency",Kellify,2018
Private Investment,600000,Canada,"Sentiment, Customer feedback, Employee experience, Text analytics",Swae,2018
Private Investment,0,United Kingdom,"Invoices, Medium businesses, Cash flow, Receipts",Crystal Apps Ltd.,2017
Private Investment,1131601,United Kingdom,"Invoices, Medium businesses, Cash flow, Receipts",Crystal Apps Ltd.,2016
Private Investment,218623,United Kingdom,"Invoices, Medium businesses, Cash flow, Receipts",Crystal Apps Ltd.,2016
Private Investment,607927,United Kingdom,"Sql, Hadoop, Python, Data preparation",Lumilinks,2019


# 2) Data Preparation

## 2.1) Minor Adjustment

Header Adjustments

In [6]:
# Normalise the headers: upper-case them and replace spaces with
# underscores (e.g. "Target Name" -> "TARGET_NAME").
setnames(
  Venture_Raw,
  gsub(" ", "_", toupper(colnames(Venture_Raw)), fixed = TRUE)
)

# Rename the company-name column by NAME rather than by position, so a
# change in column order fails loudly instead of silently relabelling
# whichever column happens to be fifth.
setnames(Venture_Raw, "TARGET_NAME", "Original_Venture_Name")

colnames(Venture_Raw)

Capital Letters

In [7]:
# Upper-case the cluster and country labels in place (`:=` updates the
# table by reference), so later comparisons against upper-case
# constants match.
Venture_Raw[, CLUSTERS := toupper(CLUSTERS)]
Venture_Raw[, COUNTRIES := toupper(COUNTRIES)]

head(Venture_Raw)

EVENT_TYPE,FUNDING_IN_USD,COUNTRIES,CLUSTERS,Original_Venture_Name,YEAR_OF_FUNDING_EVENT
Private Investment,1740000,ITALY,"CRYPTO, WEALTH MANAGEMENT, TRADERS, CRYPTOCURRENCY",Kellify,2018
Private Investment,600000,CANADA,"SENTIMENT, CUSTOMER FEEDBACK, EMPLOYEE EXPERIENCE, TEXT ANALYTICS",Swae,2018
Private Investment,0,UNITED KINGDOM,"INVOICES, MEDIUM BUSINESSES, CASH FLOW, RECEIPTS",Crystal Apps Ltd.,2017
Private Investment,1131601,UNITED KINGDOM,"INVOICES, MEDIUM BUSINESSES, CASH FLOW, RECEIPTS",Crystal Apps Ltd.,2016
Private Investment,218623,UNITED KINGDOM,"INVOICES, MEDIUM BUSINESSES, CASH FLOW, RECEIPTS",Crystal Apps Ltd.,2016
Private Investment,607927,UNITED KINGDOM,"SQL, HADOOP, PYTHON, DATA PREPARATION",Lumilinks,2019


Funding Million Dollars

In [8]:
# Rescale funding from dollars to millions of dollars, in place.
# Note: the column keeps the name FUNDING_IN_USD although it now holds
# millions.
Venture_Raw[, FUNDING_IN_USD := FUNDING_IN_USD / 1e6]

## 2.2) Data Selection

In [9]:
# Show the distinct event types present in the data.
unique(Venture_Raw[["EVENT_TYPE"]])

Select only Private Investment 

In [10]:
# Keep only private-investment events, record the remaining row count
# and report how many distinct ventures are left.
Venture_Raw <- Venture_Raw[EVENT_TYPE == "Private Investment"]
n_rows1 <- nrow(Venture_Raw)
length(unique(Venture_Raw[["Original_Venture_Name"]]))

Select only ventures based in the selected countries

In [11]:
# Countries retained for the analysis (upper-case, matching COUNTRIES).
Top_Countries <- c(
  "UNITED STATES", "JAPAN", "UNITED KINGDOM", "GERMANY", "CHINA",
  "CANADA", "NETHERLANDS", "ISRAEL", "SOUTH KOREA", "FRANCE",
  "SWITZERLAND", "AUSTRALIA", "FINLAND", "ITALY", "SWEDEN",
  "INDIA", "BELGIUM", "SINGAPORE", "SPAIN", "DENMARK"
)

# Keep only events whose country is in that list, record the row count
# and report how many distinct ventures remain.
Venture_Raw <- Venture_Raw[COUNTRIES %in% Top_Countries]
n_rows2 <- nrow(Venture_Raw)
length(unique(Venture_Raw[["Original_Venture_Name"]]))

## 2.3) Removal of Legal Designations and Non-alphanumeric Characters

In [12]:
# Build Clean_Name from Original_Venture_Name:
#   1. transliterate to ASCII,
#   2. drop every character that is not alphanumeric, blank, "." or ",",
#   3. trim surrounding whitespace,
#   4. drop a single trailing ".",
#   5. trim again and upper-case.
# Whitespace is trimmed BEFORE the trailing dot is removed; otherwise a
# name such as "Foo Inc. " keeps its dot and the end-anchored legal
# designation patterns applied per country afterwards do not match it.
Venture_Raw[, Clean_Name := Original_Venture_Name %>%
                  iconv(., from = "UTF-8", to = "ASCII//TRANSLIT") %>%
                  gsub("[^[:alnum:][:blank:].,]", "", .) %>%
                  trimws() %>%
                  gsub("\\.$", "", .) %>%
                  trimws() %>%
                  toupper(.)]

head(Venture_Raw$Clean_Name, 50)

Legal Designation by Country

In [13]:
# United States: strip trailing legal designations from Clean_Name.
# Patterns are applied in order and anchored to the end of the name.
# The dots in "L.L.C" are escaped so they match literal periods only;
# unescaped, "." is a regex wildcard and would match any character.
Venture_Raw[COUNTRIES=="UNITED STATES", Clean_Name := Clean_Name %>%
                  gsub("INCORPORATED$","",.) %>%
                  gsub(" INC$","",.) %>%
                  gsub(",INC$","",.) %>%
                  gsub("LIMITED$","",.) %>%
                  gsub(" LTD$","",.) %>%
                  gsub(",LTD$","",.) %>%
                  gsub(" LLC$","",.) %>%
                  gsub(",LLC$","",.) %>%
                  gsub("L\\.L\\.C$","",.) %>%
                  gsub(" LLP$","",.) %>%
                  gsub(",LLP$","",.) %>%
                  gsub("CORPORATION$","",.) %>%
                  gsub(" CORP$","",.) %>%
                  gsub(",CORP$","",.) %>%
                  gsub(" CO INC$","",.) %>%
                  gsub(" CO LLC$","",.) %>%
                  gsub(" CO$","",.) %>%
                  gsub(",CO$","",.) %>%
                  gsub(" LP$","",.) %>%
                  gsub(",LP$","",.) %>%
                  trimws()]

In [14]:
# China: strip trailing legal designations from Clean_Name.
# Patterns are applied in order and anchored to the end of the name.
# NOTE: the "." inside the CO/LTD patterns is an unescaped regex
# wildcard, so those patterns also match other separators (for example
# "CO LTD"); they are kept as-is to preserve the existing matches.
Venture_Raw[COUNTRIES=="CHINA", Clean_Name := Clean_Name %>%
                  gsub("CO.,LTD$","",.) %>%
                  gsub("CO., LTD$","",.) %>%
                  gsub("CO. LTD$","",.) %>%
                  gsub("CO.LTD$","",.) %>%
                  gsub("CORPORATION$","",.) %>%
                  gsub(" CORP$","",.) %>%
                  gsub(",CORP$","",.) %>%
                  gsub(" CO$","",.) %>%
                  # ",CO" (not bare "CO"): a bare "CO$" would cut the last
                  # two letters off any name that merely ends in "CO".
                  gsub(",CO$","",.) %>%
                  gsub(" CORP LTD$","",.) %>%
                  gsub(",CORP LTD$","",.) %>%
                  gsub("LIMITED$","",.) %>%
                  gsub(" LTD$","",.) %>%
                  gsub(",LTD$","",.) %>%
                  gsub("INCORPORATED$","",.) %>%
                  gsub(" INC$","",.) %>%
                  gsub(",INC$","",.) %>%
                  trimws()]

In [15]:
# United Kingdom: strip trailing legal designations from Clean_Name.
# The patterns are applied one after another, in the order listed, and
# the result is trimmed of surrounding whitespace.
Venture_Raw[
  COUNTRIES == "UNITED KINGDOM",
  Clean_Name := trimws(Reduce(
    function(nm, pat) gsub(pat, "", nm),
    x = c(
      "LIMITED$", " LTD$", ",LTD$",
      " LLC$", ",LLC$",
      " PLC$", ",PLC$",
      " INC$", ",INC$"
    ),
    init = Clean_Name
  ))
]

In [16]:
# France: strip legal designations from Clean_Name.
# Patterns are applied in order; all but the last are anchored to the
# end of the name.
# - Dots are escaped ("S\\.A$") so they match literal periods only; the
#   unescaped form matched "S" + any character + "A" and truncated names
#   such as "VISTA".
# - Clean_Name has already been transliterated to ASCII, so accented
#   patterns can never match. The accented SARL pattern is dropped
#   (" SARL$" / ",SARL$" cover the transliterated spelling) and the
#   "societe anonyme" pattern is written without accents.
#   NOTE(review): assumes transliteration turns "É" into plain "E" on
#   this platform - confirm.
Venture_Raw[COUNTRIES=="FRANCE", Clean_Name := Clean_Name %>%
                  gsub(" SARL$","",.) %>%
                  gsub(",SARL$","",.) %>%
                  gsub(" SASU$","",.) %>%
                  gsub(",SASU$","",.) %>%
                  gsub(" SRL$","",.) %>%
                  gsub(",SRL$","",.) %>%
                  gsub("S\\.A\\.S$","",.) %>%
                  gsub(" SAS$","",.) %>%
                  gsub(",SAS$","",.) %>%
                  gsub("S\\.A$","",.) %>%
                  gsub(" SA$","",.) %>%
                  gsub(",SA$","",.) %>%
                  gsub(" SE$","",.) %>%
                  gsub(",SE$","",.) %>%
                  gsub("SOCIETE ANONYME$","",.) %>%
                  gsub("^SAS ","",.) %>%
                  trimws()]

In [17]:
# Israel: strip trailing legal designations from Clean_Name.
# Patterns are applied in order and anchored to the end of the name.
# - " CO LTD$" runs first: placed after " LTD$" it could never match,
#   because "LTD" had already been removed. It is anchored on a leading
#   space so names that merely end in "...CO LTD" are not truncated.
# - "INCORPORATED" is spelled correctly (was "INCOPORATED", which never
#   matched).
Venture_Raw[COUNTRIES=="ISRAEL", Clean_Name := Clean_Name %>%
                  gsub(" CO LTD$","",.) %>%
                  gsub("LIMITED$","",.) %>%
                  gsub(" LTD$", "",.) %>%
                  gsub(",LTD$", "",.) %>%
                  gsub("INCORPORATED$","",.) %>%
                  gsub(" INC$", "",.) %>%
                  gsub(",INC$", "",.) %>%
                  trimws()]

In [18]:
# Canada: strip trailing legal designations from Clean_Name.
# The patterns are applied one after another, in the order listed, and
# the result is trimmed of surrounding whitespace.
Venture_Raw[
  COUNTRIES == "CANADA",
  Clean_Name := trimws(Reduce(
    function(nm, pat) gsub(pat, "", nm),
    x = c(
      "INCORPORATED$", " INC$", ",INC$",
      "CORPORATION$", " CORP$", ",CORP$",
      "LIMITED$", " LTD$", ",LTD$",
      " ULC$", ",ULC$",
      " LP$", ",LP$"
    ),
    init = Clean_Name
  ))
]

In [19]:
# Japan: strip legal designations from Clean_Name.
# Patterns are applied in order; all but the last are anchored to the
# end of the name.
# - "K\\.K$" escapes the dot so only a literal "K.K" is removed; the
#   unescaped form matched "K" + any character + "K".
# - NOTE: the "." inside the CO/LTD patterns is an unescaped regex
#   wildcard, so those patterns also match other separators (for
#   example "CO LTD"); they are kept as-is to preserve existing matches.
Venture_Raw[COUNTRIES=="JAPAN", Clean_Name := Clean_Name %>%
                  gsub("CO.,LTD$","",.) %>%
                  gsub("CO., LTD$","",.) %>%
                  gsub("CO. LTD$","",.) %>%
                  gsub("CO.LTD$","",.) %>%
                  gsub("CORPORATION$","",.) %>%
                  gsub(" CORP$","",.) %>%
                  gsub(",CORP$","",.) %>%
                  gsub("CO INC$","",.) %>%
                  gsub(" CO$","",.) %>%
                  gsub(",CO$","",.) %>%
                  gsub("LIMITED$","",.) %>%
                  gsub(" LTD$","",.) %>%
                  gsub(",LTD$","",.) %>%
                  gsub("INCORPORATED$","",.) %>%
                  gsub(" INC$","",.) %>%
                  gsub(",INC$","",.) %>%
                  gsub("K\\.K$","",.) %>%
                  gsub(" KK$","",.) %>%
                  gsub(",KK$","",.) %>%
                  gsub("^CO LTD","",.) %>%
                  trimws()]

In [20]:
# India: strip trailing legal designations from Clean_Name.
# The patterns are applied one after another, in the order listed, and
# the result is trimmed of surrounding whitespace.
Venture_Raw[
  COUNTRIES == "INDIA",
  Clean_Name := trimws(Reduce(
    function(nm, pat) gsub(pat, "", nm),
    x = c(
      "PRIVATE LIMITED$", "PRIVATE LTD$",
      "PVT.LTD$", "PVT. LTD$", "PVT LTD$",
      " PVT$", ",PVT$",
      " INC$", ",INC$",
      " LTD$", ",LTD$"
    ),
    init = Clean_Name
  ))
]

In [21]:
# Germany: strip trailing legal designations from Clean_Name.
# Patterns are applied in order and anchored to the end of the name.
# The "&" of "GmbH & Co. KG" is removed when Clean_Name is built (only
# alphanumerics, blanks, "." and "," survive), so the name reads
# "GMBH  CO. KG" by the time this runs. The compound patterns therefore
# match on one or more spaces instead of a literal "&", which could
# never match.
Venture_Raw[COUNTRIES=="GERMANY", Clean_Name := Clean_Name %>%
                  gsub(" SLU GMBH$", "",.) %>%
                  gsub("GMBH +CO\\.? +KG$", "",.) %>%
                  gsub("GMBH +CO$", "",.) %>%
                  gsub(" KG$", "",.) %>%
                  gsub(",KG$", "",.) %>%
                  gsub("GMBH$", "",.) %>%
                  gsub("MBH$", "",.) %>%
                  gsub(" AG$", "",.) %>%
                  gsub(",AG$", "",.) %>%
                  gsub(" SE$", "",.) %>%
                  gsub(",SE$", "",.) %>%
                  trimws()]

In [22]:
# Singapore: strip trailing legal designations from Clean_Name.
# Patterns are applied in order and anchored to the end of the name.
# " CORP LTD$" runs first: placed after " LTD$" it could never match,
# because "LTD" had already been removed, leaving "CORP" in the name.
# It is anchored on a leading space so names that merely end in
# "...CORP LTD" are not truncated.
Venture_Raw[COUNTRIES=="SINGAPORE", Clean_Name := Clean_Name %>%
                  gsub(" CORP LTD$","",.) %>%
                  gsub("PRIVATE LIMITED$", "",.) %>%
                  gsub("PTE. LTD$", "",.) %>%
                  gsub("PTE LTD$", "",.) %>%
                  gsub("LIMITED$", "",.) %>%
                  gsub(" LTD$", "",.) %>%
                  gsub(",LTD$", "",.) %>%
                  trimws()]

In [23]:
# Australia: strip trailing legal designations from Clean_Name.
# Patterns are applied in order and anchored to the end of the name.
# " CORP$" replaces the misspelled " CROP$", which removed a trailing
# word "CROP" and left "CORP" untouched.
# NOTE: in "(PTY)$" the parentheses are a regex group, so the pattern
# matches a trailing "PTY" (parentheses were already stripped from the
# names when Clean_Name was built).
Venture_Raw[COUNTRIES=="AUSTRALIA", Clean_Name := Clean_Name %>%
                  gsub("PTY. LTD$","",.) %>%
                  gsub("PTY.LTD$","",.) %>%
                  gsub("PTY LTD$","",.) %>%
                  gsub("(PTY)$","",.) %>%
                  gsub("PTY LIMITED$","",.) %>%
                  gsub("LIMITED$","",.) %>%
                  gsub(" LTD$","",.) %>%
                  gsub(",LTD$","",.) %>%
                  gsub("CORPORATION$","",.) %>%
                  gsub(" CORP$","",.) %>%
                  gsub(" CO$","",.) %>%
                  gsub(",CO$","",.) %>%
                  trimws()]

In [24]:
# Sweden: strip legal designations from Clean_Name.
# Patterns are applied in order; "^AB " is anchored to the start, the
# rest to the end of the name.
# Parentheses are removed when Clean_Name is built, so "(publ)" reads
# " PUBL" by the time this runs. The pattern matches that form; the
# previous pattern required a literal "(" and could never match, which
# also kept the following " AB$" from applying.
Venture_Raw[COUNTRIES=="SWEDEN", Clean_Name := Clean_Name %>%
                  gsub(" PUBL$","",.) %>%
                  gsub("SE AB$","",.) %>%
                  gsub(" AB$","",.) %>%
                  gsub(",AB$","",.) %>%
                  gsub("^AB ","",.) %>%
                  gsub(" CORP$","",.) %>%
                  gsub(",CORP$","",.) %>%
                  trimws()]

In [25]:
# Spain: strip trailing legal designations from Clean_Name.
# Patterns are applied in order and anchored to the end of the name.
# Dots are escaped so they match literal periods only; unescaped,
# "S.L$" matched "S" + any character + "L" (e.g. "REPSOL" -> "REP") and
# "S.A$" matched "S" + any character + "A".
# The ", S.L" / ", S.A" variants run before the bare ones so that they
# can actually match.
Venture_Raw[COUNTRIES=="SPAIN", Clean_Name := Clean_Name %>%
                  gsub(" SL$","",.) %>%
                  gsub(",SL$","",.) %>%
                  gsub(", S\\.L$","",.) %>%
                  gsub("S\\.L$","",.) %>%
                  gsub(" SA$","",.) %>%
                  gsub(",SA$","",.) %>%
                  gsub(", S\\.A$","",.) %>%
                  gsub("S\\.A$","",.) %>%
                  gsub("S\\.L\\.U$","",.) %>%
                  gsub(" SLU$","",.) %>%
                  gsub(",SLU$","",.) %>%
                  trimws()]

In [26]:
# Switzerland: strip trailing legal designations from Clean_Name.
# Patterns are applied in order and anchored to the end of the name.
# - "S\\.A$" escapes the dot so only a literal "S.A" is removed; the
#   unescaped form matched "S" + any character + "A".
# - Clean_Name has already been transliterated to ASCII, so the
#   accented SARL pattern could never match and is dropped; " SARL$"
#   covers the transliterated spelling.
Venture_Raw[COUNTRIES=="SWITZERLAND", Clean_Name := Clean_Name %>%
                  gsub(" AG$","",.) %>%
                  gsub(",AG$","",.) %>%
                  gsub("GMBH$","",.) %>%
                  gsub("S\\.A$","",.) %>%
                  gsub(" SA$","",.) %>%
                  gsub(",SA$","",.) %>%
                  gsub(" SARL$","",.) %>%
                  gsub(" LTD$","",.) %>%
                  gsub(",LTD$","",.) %>%
                  trimws()]

In [27]:
# Belgium: strip trailing legal designations from Clean_Name.
# The patterns are applied one after another, in the order listed, and
# the result is trimmed of surrounding whitespace.
Venture_Raw[
  COUNTRIES == "BELGIUM",
  Clean_Name := trimws(Reduce(
    function(nm, pat) gsub(pat, "", nm),
    x = c(" NV SA$", " NV$", " SA$", " VZW$", " BVBA$", " SPRL$"),
    init = Clean_Name
  ))
]

In [28]:
# Netherlands: strip trailing legal designations from Clean_Name.
# Patterns are applied in order and anchored to the end of the name.
# Dots are escaped so only a literal "N.V" / "B.V" is removed; the
# unescaped forms matched "N" or "B" + any character + "V".
Venture_Raw[COUNTRIES=="NETHERLANDS", Clean_Name := Clean_Name %>%
                  gsub("N\\.V$","",.) %>%
                  gsub(" NV$","",.) %>%
                  gsub(",NV$","",.) %>%
                  gsub("B\\.V$","",.) %>%
                  gsub(" BV$","",.) %>%
                  gsub(",BV$","",.) %>%
                  trimws()]

In [29]:
# South Korea: strip trailing legal designations from Clean_Name.
# The patterns are applied one after another, in the order listed, and
# the result is trimmed of surrounding whitespace.
# NOTE: the "." inside the CO/LTD patterns is an unescaped regex
# wildcard, so those patterns also match other separators.
Venture_Raw[
  COUNTRIES == "SOUTH KOREA",
  Clean_Name := trimws(Reduce(
    function(nm, pat) gsub(pat, "", nm),
    x = c(
      "CO.,LTD$", "CO., LTD$", "CO. LTD$", "CO.LTD$",
      "CORPORATION$", "CORP$",
      " CO$", ",CO$",
      "LIMITED$", " LTD$", ",LTD$",
      "INCORPORATED$", " INC$", ", INC$"
    ),
    init = Clean_Name
  ))
]

In [30]:
# Italy: strip trailing legal designations from Clean_Name.
# Patterns are applied in order and anchored to the end of the name.
# Dots are escaped so only a literal "S.P.A" / "S.R.L" is removed; the
# unescaped forms treated each "." as a wildcard (e.g. "S.P.A$" matched
# the whole of "SUPRA").
Venture_Raw[COUNTRIES=="ITALY", Clean_Name := Clean_Name %>%
                  gsub("S\\.P\\.A$","",.) %>%
                  gsub(" SPA$","",.) %>%
                  gsub(",SPA$","",.) %>%
                  gsub("S\\.R\\.L$","",.) %>%
                  gsub(" SRL$","",.) %>%
                  gsub(",SRL$","",.) %>%
                  trimws()]

In [31]:
# Finland: strip legal designations from Clean_Name.
# The patterns are applied one after another, in the order listed
# ("^OY " is anchored to the start, the rest to the end), and the result
# is trimmed of surrounding whitespace.
# NOTE(review): " CORP$" is listed twice, as in the original sequence;
# the second entry may have been meant as ",CORP$" - confirm.
Venture_Raw[
  COUNTRIES == "FINLAND",
  Clean_Name := trimws(Reduce(
    function(nm, pat) gsub(pat, "", nm),
    x = c(
      "OYJ$", "OY AB$", "OY$",
      " INC$", ",INC$",
      " LTD$", ", LTD$",
      " CORP$", " CORP$",
      " AB$", " AB LTD$",
      " PLC$",
      "^OY "
    ),
    init = Clean_Name
  ))
]

In [32]:
# Denmark: strip trailing legal designations from Clean_Name.
# The filter uses "DENMARK", the spelling used in the country list; the
# previous "DEMARK" matched no rows, so Danish names were never cleaned.
Venture_Raw[COUNTRIES=="DENMARK", Clean_Name := Clean_Name %>%
                  gsub(" APS$","",.) %>%
                  gsub(" AS$","",.) %>%
                  trimws()]

In [33]:
# Final tidy-up for every row: drop one trailing "." and then one
# trailing "," left behind by the suffix removal, and trim whitespace.
Venture_Raw[
  ,
  Clean_Name := trimws(gsub(",$", "", gsub("\\.$", "", Clean_Name)))
]

# 3) Foundation Year

## 3.1) Input

In [34]:
# Read the manually collected venture data (founding year and
# alternative name per venture). Empty fields are read as NA.
Mannual_Venture_Data <- fread(
  input = "F:/Thesis/Source_Data/AI_Index_2019\\Mannual_Venture_Data.csv",
  stringsAsFactors = FALSE,
  na.strings = ""
)
head(Mannual_Venture_Data)

Original_Venture_Name,Year_Foundation,Alternative_Name
Zippy Inc.,2017,Zippy.ai
Blue Jay Finlease Ltd.,2015,ZipLoan
"Cognical, Inc.",2012,Zibby
"Zhuiyi Shenzhen Chaoyi Technology Co., Ltd.",2016,Zhuiyi Technology
"Shenzhen Zhongshunyi Financial Services Co., Ltd.",2015,Zhongshunyi Financial
"Anhui Zhongke Haoyin Technology Co., Ltd.",2019,Zhongke Haoyin


In [35]:
# Number of manually collected records, and how many of them have no
# founding year.
nrow(Mannual_Venture_Data)
with(Mannual_Venture_Data, sum(is.na(Year_Foundation)))

## 3.2) Data Combination

In [36]:
# Key both tables on the original venture name, then attach the manual
# data to every funding event. In a keyed X[Y] join each row of Y
# (Venture_Raw) is kept; events without a match get NA in the manual
# columns.
setkeyv(Venture_Raw, "Original_Venture_Name")
setkeyv(Mannual_Venture_Data, "Original_Venture_Name")

Venture_Raw <- Mannual_Venture_Data[Venture_Raw]
head(Venture_Raw)

Original_Venture_Name,Year_Foundation,Alternative_Name,EVENT_TYPE,FUNDING_IN_USD,COUNTRIES,CLUSTERS,YEAR_OF_FUNDING_EVENT,Clean_Name
"1-10 HOLDINGS, Inc.",1997,,Private Investment,4.26864,JAPAN,"AUGMENTED REALITY, VR, VIRTUAL REALITY, AR",2016,110 HOLDINGS
"101 Commerce, Inc.",2018,,Private Investment,2.532098,UNITED STATES,"ECOMMERCE, MARKETING AUTOMATION, SHOPPERS, RETAIL TECHNOLOGY",2019,101 COMMERCE
"101 Commerce, Inc.",2018,,Private Investment,12.74552,UNITED STATES,"ECOMMERCE, MARKETING AUTOMATION, SHOPPERS, RETAIL TECHNOLOGY",2018,101 COMMERCE
"10XTS, Inc.",2017,,Private Investment,0.0,UNITED STATES,"CRYPTO, WEALTH MANAGEMENT, TRADERS, CRYPTOCURRENCY",2018,10XTS
"10XTS, Inc.",2017,,Private Investment,1.069999,UNITED STATES,"CRYPTO, WEALTH MANAGEMENT, TRADERS, CRYPTOCURRENCY",2018,10XTS
10x Future Technologies Limited,2015,,Private Investment,19.642249,UNITED KINGDOM,"LENDING, LOANS, CREDIT SCORE, CONSUMER FINANCE",2019,10X FUTURE TECHNOLOGIES


## 3.3) Drop Startups for which the Founding Year is not available

In [38]:
# Keep only events whose venture has a known founding year, then confirm
# that no missing values remain.
Venture_Raw <- Venture_Raw[!is.na(Year_Foundation)]
sum(is.na(Venture_Raw[["Year_Foundation"]]))

# 4) WIPO Sector Mapping

## 4.1) Input

In [39]:
# Load the cluster-to-field mapping sheet into a data.table and preview
# it.
WIPO_Sector_Mapping <- data.table(
  readxl::read_excel(
    path = "F:/Thesis/Source_Data/AI_Index_2019\\AI Index - WIPO Mapping.xlsx",
    sheet = "AI Index - WIPO Mapping"
  )
)
head(WIPO_Sector_Mapping)

Cluster,Field,Justification
"AGRICULTURE, FARMERS, FARMING, CROP",Agriculture,"Key phrases ""Agriculture"""
"LENDING, LOANS, CREDIT SCORE, CONSUMER FINANCE",Banking and Finance,"Key phrases ""Finance"""
"CRYPTO, WEALTH MANAGEMENT, TRADERS, CRYPTOCURRENCY",Banking and Finance,"Key phrases ""Trading"""
"INSURTECH, INSURANCE INDUSTRY, UNDERWRITING, INSURANCE PRODUCTS",Banking and Finance,"Key phrases ""Insurance"""
"INVOICES, MEDIUM BUSINESSES, CASH FLOW, RECEIPTS",Business,Enterprise Computing is a subfield of Business
"SQL, HADOOP, PYTHON, DATA PREPARATION",Business,Enterprise Computing is a subfield of Business


## 4.2) Data Combination

In [40]:
# Key the events on CLUSTERS and the mapping on Cluster, then attach the
# mapping columns to every funding event. In a keyed X[Y] join each row
# of Y (Venture_Raw) is kept; clusters without a mapping get NA.
setkeyv(Venture_Raw, "CLUSTERS")
setkeyv(WIPO_Sector_Mapping, "Cluster")
Venture_Raw <- WIPO_Sector_Mapping[Venture_Raw]

head(Venture_Raw)

Cluster,Field,Justification,Original_Venture_Name,Year_Foundation,Alternative_Name,EVENT_TYPE,FUNDING_IN_USD,COUNTRIES,YEAR_OF_FUNDING_EVENT,Clean_Name
"ADVERTISERS, PROGRAMMATIC, MOBILE ADVERTISING, REAL TIME BIDDING",Business,E-commerce is a subfield of Business,3LOQ Labs Pvt. Ltd.,2012,,Private Investment,2.0,INDIA,2013,3LOQ LABS
"ADVERTISERS, PROGRAMMATIC, MOBILE ADVERTISING, REAL TIME BIDDING",Business,E-commerce is a subfield of Business,4w MarketPlace srl,2007,,Private Investment,3.240231,ITALY,2012,4W MARKETPLACE
"ADVERTISERS, PROGRAMMATIC, MOBILE ADVERTISING, REAL TIME BIDDING",Business,E-commerce is a subfield of Business,ADCORE Inc.,2003,,Private Investment,1.903872,ISRAEL,2019,ADCORE
"ADVERTISERS, PROGRAMMATIC, MOBILE ADVERTISING, REAL TIME BIDDING",Business,E-commerce is a subfield of Business,ADCORE Inc.,2003,,Private Investment,3.724201,ISRAEL,2019,ADCORE
"ADVERTISERS, PROGRAMMATIC, MOBILE ADVERTISING, REAL TIME BIDDING",Business,E-commerce is a subfield of Business,ADEXT Inc.,2017,Adext AI,Private Investment,3.0,UNITED STATES,2017,ADEXT
"ADVERTISERS, PROGRAMMATIC, MOBILE ADVERTISING, REAL TIME BIDDING",Business,E-commerce is a subfield of Business,"ADFIN Solutions, Inc.",2012,,Private Investment,4.15,UNITED STATES,2013,ADFIN SOLUTIONS


# 5) GPT Market Cluster Mapping

In [41]:
# Fields that count as application areas.
Field_Applications <- c(
  "Agriculture", "Arts and Humanities", "Banking and Finance",
  "Business", "Cartography", "Computing in Government",
  "Document Management and Text Processing", "Education",
  "Energy Management", "Entertainment", "Industry and manufacturing",
  "Law Social and Behavioral Sciences", "Life and Medical Sciences",
  "Military", "Networks", "Personal Devices Computing and Hc",
  "Physical Sciences and Engineering", "Publishing", "Security",
  "Telecommunications", "Transportation"
)

# Classify every event by its Field:
#   "Miscellaneous"              -> "Other"
#   listed in Field_Applications -> "Applied_AI"
#   any other field              -> "Core_AI"
# A missing Field yields a missing GPT_Scope.
Venture_Raw[, GPT_Scope := ifelse(
  Field == "Miscellaneous",
  "Other",
  ifelse(Field %in% Field_Applications, "Applied_AI", "Core_AI")
)]
head(Venture_Raw)

Cluster,Field,Justification,Original_Venture_Name,Year_Foundation,Alternative_Name,EVENT_TYPE,FUNDING_IN_USD,COUNTRIES,YEAR_OF_FUNDING_EVENT,Clean_Name,GPT_Scope
"ADVERTISERS, PROGRAMMATIC, MOBILE ADVERTISING, REAL TIME BIDDING",Business,E-commerce is a subfield of Business,3LOQ Labs Pvt. Ltd.,2012,,Private Investment,2.0,INDIA,2013,3LOQ LABS,Applied_AI
"ADVERTISERS, PROGRAMMATIC, MOBILE ADVERTISING, REAL TIME BIDDING",Business,E-commerce is a subfield of Business,4w MarketPlace srl,2007,,Private Investment,3.240231,ITALY,2012,4W MARKETPLACE,Applied_AI
"ADVERTISERS, PROGRAMMATIC, MOBILE ADVERTISING, REAL TIME BIDDING",Business,E-commerce is a subfield of Business,ADCORE Inc.,2003,,Private Investment,1.903872,ISRAEL,2019,ADCORE,Applied_AI
"ADVERTISERS, PROGRAMMATIC, MOBILE ADVERTISING, REAL TIME BIDDING",Business,E-commerce is a subfield of Business,ADCORE Inc.,2003,,Private Investment,3.724201,ISRAEL,2019,ADCORE,Applied_AI
"ADVERTISERS, PROGRAMMATIC, MOBILE ADVERTISING, REAL TIME BIDDING",Business,E-commerce is a subfield of Business,ADEXT Inc.,2017,Adext AI,Private Investment,3.0,UNITED STATES,2017,ADEXT,Applied_AI
"ADVERTISERS, PROGRAMMATIC, MOBILE ADVERTISING, REAL TIME BIDDING",Business,E-commerce is a subfield of Business,"ADFIN Solutions, Inc.",2012,,Private Investment,4.15,UNITED STATES,2013,ADFIN SOLUTIONS,Applied_AI


In [42]:
# Distinct ventures remaining, and rows still lacking a founding year.
with(Venture_Raw, length(unique(Original_Venture_Name)))
with(Venture_Raw, sum(is.na(Year_Foundation)))

# Rows without a founding year relative to the number of distinct
# ventures.
with(
  Venture_Raw,
  sum(is.na(Year_Foundation)) / length(unique(Original_Venture_Name))
)

# 6) Age of Startups at Time of Investment

In [43]:
# Age of the venture (in years) at the time of each funding event,
# added in place, followed by its summary statistics.
Venture_Raw[, Venture_Age := YEAR_OF_FUNDING_EVENT - Year_Foundation]
summary(Venture_Raw[["Venture_Age"]])

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 -9.000   1.000   2.000   3.511   4.000 217.000 

In [44]:
# Frequency of each venture age across funding events.
table(Venture_Raw[["Venture_Age"]])


  -9   -8   -7   -6   -5   -4   -3   -2   -1    0    1    2    3    4    5    6 
   1    1    2    2    5   13   20   27   49 1316 2513 2388 1850 1280  832  496 
   7    8    9   10   11   12   13   14   15   16   17   18   19   20   21   22 
 381  251  209  135  128   99   84   63   43   36   22   18   19   10   13    7 
  23   24   25   26   27   28   29   30   31   32   33   34   35   36   39   41 
  11    5    4   12    9    2    3    4    2    3    2    2    1    1    1    1 
  43   44   45   46   47   48   55   57   65   77   82   83   89   91   93   95 
   1    1    1    1    1    1    1    1    1    1    1    1    1    2    1    1 
 108  112  217 
   1    1    1 

Remove companies that received funding before being founded

In [50]:
# Keep only events that took place in or after the founding year, then
# re-check the age distribution and the number of distinct ventures.
Venture_Raw <- Venture_Raw[Venture_Age >= 0]
summary(Venture_Raw[["Venture_Age"]])
length(unique(Venture_Raw[["Original_Venture_Name"]]))

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  0.000   1.000   2.000   3.568   4.000 217.000 

# 7) OUTPUT

In [51]:
# Write the final dataset to CSV with a header row. The argument is
# spelled out as `col.names` instead of relying on partial matching of
# `col.name`.
fwrite(Venture_Raw, "F:/Thesis/Working_Data/Final\\Entrepreneurial_Dataset.csv", col.names = TRUE)