Library and Directory Setup

In [1]:
library(data.table)
library(dplyr)
library(readr)
library(purrr)
library(tidyr)
library(lubridate)
library(tidyverse)
library(stringr)
library(rio)

"package 'dplyr' was built under R version 3.6.3"
Attaching package: 'dplyr'

The following objects are masked from 'package:data.table':

    between, first, last

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

"package 'purrr' was built under R version 3.6.3"
Attaching package: 'purrr'

The following object is masked from 'package:data.table':

    transpose

"package 'lubridate' was built under R version 3.6.3"
Attaching package: 'lubridate'

The following objects are masked from 'package:data.table':

    hour, isoweek, mday, minute, month, quarter, second, wday, week,
    yday, year

The following object is masked from 'package:base':

    date

"package 'tidyverse' was built under R version 3.6.3"

ERROR: Error: package or namespace load failed for 'tidyverse' in loadNamespace(j <- i[[1L]], c(lib.loc, .libPaths()), versionCheck = vI[[j]]):
 there is no package called 'nlme'


In [2]:
# Point the session at the People_Data_Lab source folder and echo the result.
# NOTE(review): the read and write calls visible in this notebook use absolute
# paths, so this setwd() may be unnecessary -- confirm before removing it.
setwd("F:/Thesis/Source_Data/People_Data_Lab")
getwd()

# 1) Input

Due to the size of the Free Company dataset, only the relevant columns are imported from the original file.

In [3]:
# Import the company file, restricted to the columns listed in `select`.
# Empty strings are read as NA so that blank fields can be dropped later.
PDL_Raw <- fread(
  "F:/Thesis/Source_Data/People_Data_Lab/companies_sorted\\companies_sorted.csv",
  select = c("V1", "name", "year founded", "industry", "size range", "country"),
  header = TRUE,
  na.strings = "",
  stringsAsFactors = FALSE,
  encoding = "UTF-8"
)

# Overview statistics: table dimensions and the baseline row count used to
# report how many organisations each later step removes.
dim(PDL_Raw)
n_rows0 <- nrow(PDL_Raw)

# 2) Data Preparation

In [4]:
# Preview the first five imported rows.
head(PDL_Raw,5)

V1,name,year founded,industry,size range,country
5872184,ibm,1911,information technology and services,10001+,united states
4425416,tata consultancy services,1968,information technology and services,10001+,india
21074,accenture,1989,information technology and services,10001+,ireland
2309813,us army,1800,military,10001+,united states
1558607,ey,1989,accounting,10001+,united kingdom


## 2.1) Edit Column Headers

In [5]:
# Standardise the column headers: upper-case, rename the identifier and name
# columns, and replace spaces with underscores.
# setnames() renames by reference and the two key columns are addressed by
# name, so the result does not depend on the column order of the import.
setnames(PDL_Raw, toupper(names(PDL_Raw)))
setnames(PDL_Raw,
         old = c("V1", "NAME"),
         new = c("PDL_ID", "PDL_ORIGINAL_NAME"))
setnames(PDL_Raw, gsub(" ", "_", names(PDL_Raw), fixed = TRUE))
colnames(PDL_Raw)

## 2.2) Capital Letters

Convert the text columns (name, industry, country) from lowercase to uppercase.

In [6]:
# Upper-case the three free-text columns in place (`:=` updates by reference,
# so no re-assignment is needed), then preview the result.
upper_cols <- c("PDL_ORIGINAL_NAME", "INDUSTRY", "COUNTRY")
PDL_Raw[, (upper_cols) := lapply(.SD, toupper), .SDcols = upper_cols]
head(PDL_Raw, n = 5)

PDL_ID,PDL_ORIGINAL_NAME,YEAR_FOUNDED,INDUSTRY,SIZE_RANGE,COUNTRY
5872184,IBM,1911,INFORMATION TECHNOLOGY AND SERVICES,10001+,UNITED STATES
4425416,TATA CONSULTANCY SERVICES,1968,INFORMATION TECHNOLOGY AND SERVICES,10001+,INDIA
21074,ACCENTURE,1989,INFORMATION TECHNOLOGY AND SERVICES,10001+,IRELAND
2309813,US ARMY,1800,MILITARY,10001+,UNITED STATES
1558607,EY,1989,ACCOUNTING,10001+,UNITED KINGDOM


## 2.3) Size Clusters

In [7]:
# Identify the original size-range labels present in the data, so that the
# relabelling in the next cell can cover each of them.
unique(PDL_Raw$SIZE_RANGE)

In [8]:
# Clean classification: rewrite each raw size bucket to a compact label.
# Replacements run in the listed order, which matters: "1 - 10" also occurs
# inside "501 - 1000" and "5001 - 10000", so those must be rewritten first.
# NOTE(review): "501-1,000" is the only label with a thousands separator; it
# is kept as-is here because downstream consumers may rely on it.
PDL_Raw[, SIZE_RANGE := {
  relabel <- c("10001+"       = ">10000",
               "5001 - 10000" = "5001-10000",
               "1001 - 5000"  = "1001-5000",
               "501 - 1000"   = "501-1,000",
               "201 - 500"    = "201-500",
               "51 - 200"     = "51-200",
               "11 - 50"      = "11-50",
               "1 - 10"       = "1-10")
  cleaned <- SIZE_RANGE
  for (raw_label in names(relabel)) {
    cleaned <- gsub(raw_label, relabel[[raw_label]], cleaned, fixed = TRUE)
  }
  trimws(cleaned)
}]

# Overview: number of organisations per size class
table(PDL_Raw[["SIZE_RANGE"]])


    >10000       1-10  1001-5000      11-50    201-500 5001-10000  501-1,000 
      2095    5585399      24537    1126610      78494       3044      28394 
    51-200 
    324853 

## 2.4) Remove Organisations for which Name, Year of Foundation or Industry are blank

In [9]:
# Drop organisations with a missing name, founding year or industry.
# The columns are referenced by name instead of by position (2, 3, 4) so the
# filter keeps targeting the intended fields if the column order changes.
PDL_Raw <- na.omit(PDL_Raw,
                   cols = c("PDL_ORIGINAL_NAME", "YEAR_FOUNDED", "INDUSTRY"))

# Number of remaining organisations
n_rows1 <- nrow(PDL_Raw)
n_rows1

# Number of deleted entries
n_rows0 - n_rows1

# 3) Data Filtering

## 3.1) Include only Organization from Selected Countries

In [10]:
# Countries retained for the analysis (upper-case, matching COUNTRY).
Top_Countries <- c(
  "UNITED STATES", "JAPAN", "UNITED KINGDOM", "GERMANY", "CHINA",
  "CANADA", "NETHERLANDS", "ISRAEL", "SOUTH KOREA", "FRANCE",
  "SWITZERLAND", "AUSTRALIA", "FINLAND", "ITALY", "SWEDEN",
  "INDIA", "BELGIUM", "SINGAPORE", "SPAIN", "DENMARK"
)

# Keep only organisations located in one of the selected countries.
PDL_Raw <- PDL_Raw[COUNTRY %in% Top_Countries]

# Number of remaining organisations
n_rows2 <- nrow(PDL_Raw)
n_rows2

# Number of entries removed by the country filter
n_rows1 - n_rows2

## 3.2) Elimination Duplicates

There are duplicate entries in the dataset. Examples of well-known companies: "IBM", "APPLE". Example of an unknown company: "ELEVATE".

In [11]:
# Show every entry named "IBM" before duplicates are removed.
PDL_Raw[PDL_ORIGINAL_NAME=="IBM",]

PDL_ID,PDL_ORIGINAL_NAME,YEAR_FOUNDED,INDUSTRY,SIZE_RANGE,COUNTRY
5872184,IBM,1911,INFORMATION TECHNOLOGY AND SERVICES,>10000,UNITED STATES
2567154,IBM,1914,INFORMATION TECHNOLOGY AND SERVICES,1-10,FRANCE
5479744,IBM,2003,COMPUTER SOFTWARE,1-10,ISRAEL


In [12]:
# Show every entry named "APPLE" before duplicates are removed.
PDL_Raw[PDL_ORIGINAL_NAME=="APPLE",]

PDL_ID,PDL_ORIGINAL_NAME,YEAR_FOUNDED,INDUSTRY,SIZE_RANGE,COUNTRY
5735407,APPLE,1976,CONSUMER ELECTRONICS,>10000,UNITED STATES
2389734,APPLE,1998,COMPUTER SOFTWARE,"501-1,000",UNITED STATES


In [13]:
# Show every entry named "ELEVATE" before duplicates are removed.
PDL_Raw[PDL_ORIGINAL_NAME=="ELEVATE",]

PDL_ID,PDL_ORIGINAL_NAME,YEAR_FOUNDED,INDUSTRY,SIZE_RANGE,COUNTRY
4629811,ELEVATE,2014,FINANCIAL SERVICES,"501-1,000",UNITED STATES
3382314,ELEVATE,1995,MARKETING AND ADVERTISING,51-200,UNITED KINGDOM
2436542,ELEVATE,2003,MARKETING AND ADVERTISING,11-50,UNITED STATES
2308685,ELEVATE,2005,COMPUTER NETWORKING,11-50,UNITED STATES
4256243,ELEVATE,2010,MANAGEMENT CONSULTING,11-50,SWEDEN
193396,ELEVATE,2005,MEDIA PRODUCTION,1-10,UNITED STATES
1772962,ELEVATE,2017,INFORMATION TECHNOLOGY AND SERVICES,1-10,CANADA
867427,ELEVATE,2018,CONSUMER GOODS,1-10,UNITED STATES
7155184,ELEVATE,2017,"HEALTH, WELLNESS AND FITNESS",1-10,UNITED KINGDOM
4727117,ELEVATE,2010,MARKETING AND ADVERTISING,1-10,UNITED STATES


Normally, within a jurisdiction company names cannot be repeated; however, companies established in different countries can have the same name. Thus, duplicate entries with the same name coming from the same country will be deleted. The organization with the earliest year of foundation will be kept in the database.

In [14]:
# Order the dataset by foundation year (ascending) so that, within each
# name/country pair, the oldest organisation comes first.
# The column is referenced directly inside `[`, which is the data.table idiom;
# the previous form passed a one-column data.table to order().
PDL_Raw <- PDL_Raw[order(YEAR_FOUNDED)]

# Eliminate duplicates: unique() keeps the first row of every
# (name, country) combination, i.e. the earliest-founded entry.
PDL_Raw <- unique(PDL_Raw, by = c("PDL_ORIGINAL_NAME", "COUNTRY"))

# Number of remaining organisations
n_rows3 <- nrow(PDL_Raw)
n_rows3

# Number of organisations removed as duplicates
n_rows2 - n_rows3


In [15]:
# Re-check "IBM" after de-duplication by name and country.
PDL_Raw[PDL_ORIGINAL_NAME=="IBM",]

PDL_ID,PDL_ORIGINAL_NAME,YEAR_FOUNDED,INDUSTRY,SIZE_RANGE,COUNTRY
5872184,IBM,1911,INFORMATION TECHNOLOGY AND SERVICES,>10000,UNITED STATES
2567154,IBM,1914,INFORMATION TECHNOLOGY AND SERVICES,1-10,FRANCE
5479744,IBM,2003,COMPUTER SOFTWARE,1-10,ISRAEL


In [16]:
# Re-check "APPLE" after de-duplication by name and country.
PDL_Raw[PDL_ORIGINAL_NAME=="APPLE",]

PDL_ID,PDL_ORIGINAL_NAME,YEAR_FOUNDED,INDUSTRY,SIZE_RANGE,COUNTRY
5735407,APPLE,1976,CONSUMER ELECTRONICS,>10000,UNITED STATES


In [17]:
# Re-check "ELEVATE" after de-duplication by name and country.
PDL_Raw[PDL_ORIGINAL_NAME=="ELEVATE",]

PDL_ID,PDL_ORIGINAL_NAME,YEAR_FOUNDED,INDUSTRY,SIZE_RANGE,COUNTRY
3382314,ELEVATE,1995,MARKETING AND ADVERTISING,51-200,UNITED KINGDOM
2436542,ELEVATE,2003,MARKETING AND ADVERTISING,11-50,UNITED STATES
4256243,ELEVATE,2010,MANAGEMENT CONSULTING,11-50,SWEDEN
1772962,ELEVATE,2017,INFORMATION TECHNOLOGY AND SERVICES,1-10,CANADA


## 3.3) Remove Legal Designation and Non-Alphanumeric Characters

In [18]:
# Build PDL_MATCH_NAME from the original name in a single pass:
#   1. transliterate UTF-8 text to ASCII,
#   2. drop every character that is not alphanumeric, blank, '.', ',' or '&',
#   3. remove one trailing dot and trim surrounding whitespace.
PDL_Raw[, PDL_MATCH_NAME := PDL_ORIGINAL_NAME %>%
              iconv(., from = "UTF-8", to="ASCII//TRANSLIT") %>%
              gsub("[^[:alnum:][:blank:].,&]","",.) %>%
              gsub("\\.$","",.) %>%
              trimws()]

# Overview: number of distinct cleaned names
length(unique(PDL_Raw[["PDL_MATCH_NAME"]]))

Legal designations were removed on a country basis. 

In [19]:
# United States: strip trailing legal designations from the match name.
# Spelled-out forms are anchored at a word boundary (\\b) so that words which
# merely end in them (e.g. "UNLIMITED") are left intact, and the dots in
# "L.L.C" are escaped so they match literal dots only (unescaped, the pattern
# also matched endings such as "LULAC").
PDL_Raw[COUNTRY == "UNITED STATES", PDL_MATCH_NAME := PDL_MATCH_NAME %>%
              gsub("\\bINCORPORATED$", "", .) %>%
              gsub(" INC$", "", .) %>%
              gsub(",INC$", "", .) %>%
              gsub("\\bLIMITED$", "", .) %>%
              gsub(" LTD$", "", .) %>%
              gsub(",LTD$", "", .) %>%
              gsub(" LLC$", "", .) %>%
              gsub(",LLC$", "", .) %>%
              gsub("L\\.L\\.C$", "", .) %>%
              gsub(" LLP$", "", .) %>%
              gsub(",LLP$", "", .) %>%
              gsub("\\bCORPORATION$", "", .) %>%
              gsub(" CORP$", "", .) %>%
              gsub(",CORP$", "", .) %>%
              gsub(" CO INC$", "", .) %>%
              gsub(" CO LLC$", "", .) %>%
              gsub(" CO$", "", .) %>%
              gsub(",CO$", "", .) %>%
              gsub(" LP$", "", .) %>%
              gsub(",LP$", "", .) %>%
              trimws()]


In [20]:
# China: strip trailing legal designations from the match name.
# - ",CO$" replaces the former bare "CO$", which removed the last two letters
#   of any name ending in "CO"; the other country rules pair " CO$" with ",CO$".
# - Spelled-out forms are anchored at a word boundary (\\b) so that words such
#   as "UNLIMITED" are not truncated.
# The "CO.,LTD"-style patterns keep their unescaped dot on purpose: it also
# covers comma or space variants such as "CO, LTD".
PDL_Raw[COUNTRY == "CHINA", PDL_MATCH_NAME := PDL_MATCH_NAME %>%
              gsub("CO.,LTD$", "", .) %>%
              gsub("CO., LTD$", "", .) %>%
              gsub("CO. LTD$", "", .) %>%
              gsub("CO.LTD$", "", .) %>%
              gsub("\\bCORPORATION$", "", .) %>%
              gsub(" CORP$", "", .) %>%
              gsub(",CORP$", "", .) %>%
              gsub(" CO$", "", .) %>%
              gsub(",CO$", "", .) %>%
              gsub(" CORP LTD$", "", .) %>%
              gsub(",CORP LTD$", "", .) %>%
              gsub("\\bLIMITED$", "", .) %>%
              gsub(" LTD$", "", .) %>%
              gsub(",LTD$", "", .) %>%
              gsub("\\bINCORPORATED$", "", .) %>%
              gsub(" INC$", "", .) %>%
              gsub(",INC$", "", .) %>%
              trimws()]

In [21]:
# United Kingdom: strip trailing legal designations from the match name.
# "LIMITED" is anchored at a word boundary (\\b) so that names ending in
# "UNLIMITED" are not cut down to "UN".
PDL_Raw[COUNTRY == "UNITED KINGDOM", PDL_MATCH_NAME := PDL_MATCH_NAME %>%
              gsub("\\bLIMITED$", "", .) %>%
              gsub(" LTD$", "", .) %>%
              gsub(",LTD$", "", .) %>%
              gsub(" LLC$", "", .) %>%
              gsub(",LLC$", "", .) %>%
              gsub(" PLC$", "", .) %>%
              gsub(",PLC$", "", .) %>%
              gsub(" INC$", "", .) %>%
              gsub(",INC$", "", .) %>%
              trimws()]


In [22]:
# France: strip legal designations from the match name.
# PDL_MATCH_NAME has already been transliterated to ASCII, so the patterns
# must be accent-free: the former accented SARL pattern could never match
# (its ASCII form is covered by " SARL$" / ",SARL$") and the accented
# "societe anonyme" pattern is now written as "SOCIETE ANONYME".
# Dots in "S.A.S" and "S.A" are escaped; unescaped they match any character,
# which truncated ordinary names ending in e.g. "SEA" or "SPA".
PDL_Raw[COUNTRY == "FRANCE", PDL_MATCH_NAME := PDL_MATCH_NAME %>%
              gsub(" SARL$", "", .) %>%
              gsub(",SARL$", "", .) %>%
              gsub(" SASU$", "", .) %>%
              gsub(",SASU$", "", .) %>%
              gsub(" SRL$", "", .) %>%
              gsub(",SRL$", "", .) %>%
              gsub("S\\.A\\.S$", "", .) %>%
              gsub(" SAS$", "", .) %>%
              gsub(",SAS$", "", .) %>%
              gsub("S\\.A$", "", .) %>%
              gsub(" SA$", "", .) %>%
              gsub(",SA$", "", .) %>%
              gsub(" SE$", "", .) %>%
              gsub(",SE$", "", .) %>%
              gsub("\\bSOCIETE ANONYME$", "", .) %>%
              gsub("^SAS ", "", .) %>%
              trimws()]

In [23]:
# Israel: strip trailing legal designations from the match name.
# - " CO LTD$" now runs before " LTD$"; in its former last position it could
#   never match because " LTD" had already been removed, leaving "... CO".
# - "INCOPORATED" was a misspelling of "INCORPORATED".
# - Spelled-out forms are anchored at a word boundary (\\b).
PDL_Raw[COUNTRY == "ISRAEL", PDL_MATCH_NAME := PDL_MATCH_NAME %>%
              gsub(" CO LTD$", "", .) %>%
              gsub("\\bLIMITED$", "", .) %>%
              gsub(" LTD$", "", .) %>%
              gsub(",LTD$", "", .) %>%
              gsub("\\bINCORPORATED$", "", .) %>%
              gsub(" INC$", "", .) %>%
              gsub(",INC$", "", .) %>%
              trimws()]

In [24]:
# Canada: strip trailing legal designations from the match name.
# Spelled-out forms are anchored at a word boundary (\\b) so that words which
# merely end in them (e.g. "UNLIMITED") are not truncated.
PDL_Raw[COUNTRY == "CANADA", PDL_MATCH_NAME := PDL_MATCH_NAME %>%
              gsub("\\bINCORPORATED$", "", .) %>%
              gsub(" INC$", "", .) %>%
              gsub(",INC$", "", .) %>%
              gsub("\\bCORPORATION$", "", .) %>%
              gsub(" CORP$", "", .) %>%
              gsub(",CORP$", "", .) %>%
              gsub("\\bLIMITED$", "", .) %>%
              gsub(" LTD$", "", .) %>%
              gsub(",LTD$", "", .) %>%
              gsub(" ULC$", "", .) %>%
              gsub(",ULC$", "", .) %>%
              gsub(" LP$", "", .) %>%
              gsub(",LP$", "", .) %>%
              trimws()]


In [25]:
# Japan: strip legal designations from the match name.
# - The dot in "K.K" is escaped; unescaped it matched any three-character
#   ending of the form K?K.
# - Spelled-out forms are anchored at a word boundary (\\b).
# The "CO.,LTD"-style patterns keep their unescaped dot on purpose: it also
# covers comma or space variants such as "CO, LTD".
PDL_Raw[COUNTRY == "JAPAN", PDL_MATCH_NAME := PDL_MATCH_NAME %>%
              gsub("CO.,LTD$", "", .) %>%
              gsub("CO., LTD$", "", .) %>%
              gsub("CO. LTD$", "", .) %>%
              gsub("CO.LTD$", "", .) %>%
              gsub("\\bCORPORATION$", "", .) %>%
              gsub(" CORP$", "", .) %>%
              gsub(",CORP$", "", .) %>%
              gsub("CO INC$", "", .) %>%
              gsub(" CO$", "", .) %>%
              gsub(",CO$", "", .) %>%
              gsub("\\bLIMITED$", "", .) %>%
              gsub(" LTD$", "", .) %>%
              gsub(",LTD$", "", .) %>%
              gsub("\\bINCORPORATED$", "", .) %>%
              gsub(" INC$", "", .) %>%
              gsub(",INC$", "", .) %>%
              gsub("K\\.K$", "", .) %>%
              gsub(" KK$", "", .) %>%
              gsub(",KK$", "", .) %>%
              gsub("^CO LTD", "", .) %>%
              trimws()]

In [26]:
# India: strip trailing legal designations from the match name.
# The suffix patterns are applied one after another in the listed order
# (longer "PRIVATE"/"PVT" forms before the bare "PVT"/"INC"/"LTD" endings),
# then the result is trimmed.
PDL_Raw[COUNTRY == "INDIA", PDL_MATCH_NAME := {
  stripped <- PDL_MATCH_NAME
  for (suffix in c("PRIVATE LIMITED$", "PRIVATE LTD$",
                   "PVT.LTD$", "PVT. LTD$", "PVT LTD$",
                   " PVT$", ",PVT$",
                   " INC$", ",INC$",
                   " LTD$", ",LTD$")) {
    stripped <- gsub(suffix, "", stripped)
  }
  trimws(stripped)
}]

In [27]:
# Germany: strip trailing legal designations from the match name.
# "GMBH & CO KG" now also matches the spelling with a dot after "CO"
# ("GMBH & CO. KG"); dots survive the earlier character clean-up, so that
# form was previously left untouched.
PDL_Raw[COUNTRY == "GERMANY", PDL_MATCH_NAME := PDL_MATCH_NAME %>%
              gsub(" SLU GMBH$", "", .) %>%
              gsub("GMBH & CO\\.? KG$", "", .) %>%
              gsub("GMBH$", "", .) %>%
              gsub("MBH$", "", .) %>%
              gsub(" AG$", "", .) %>%
              gsub(",AG$", "", .) %>%
              gsub(" SE$", "", .) %>%
              gsub(",SE$", "", .) %>%
              trimws()]

In [28]:
# Singapore: strip trailing legal designations from the match name.
# - " CORP LTD$" now runs before " LTD$"; in its former last position it
#   could never match because " LTD" had already been removed.
# - "LIMITED" is anchored at a word boundary (\\b) so that "UNLIMITED" is not
#   truncated ("PRIVATE LIMITED" is still handled by its own rule first).
PDL_Raw[COUNTRY == "SINGAPORE", PDL_MATCH_NAME := PDL_MATCH_NAME %>%
              gsub("PRIVATE LIMITED$", "", .) %>%
              gsub("PTE. LTD$", "", .) %>%
              gsub("PTE LTD$", "", .) %>%
              gsub(" CORP LTD$", "", .) %>%
              gsub("\\bLIMITED$", "", .) %>%
              gsub(" LTD$", "", .) %>%
              gsub(",LTD$", "", .) %>%
              trimws()]

In [29]:
# Australia: strip trailing legal designations from the match name.
# - " CROP$" was a misspelling of " CORP$".
# - "(PTY)$" was a regex group, i.e. an unanchored "PTY$" (parentheses have
#   already been removed from the names); it is now "\\bPTY$" so that words
#   such as "EMPTY" are not truncated.
# - Spelled-out forms are anchored at a word boundary (\\b).
PDL_Raw[COUNTRY == "AUSTRALIA", PDL_MATCH_NAME := PDL_MATCH_NAME %>%
              gsub("PTY. LTD$", "", .) %>%
              gsub("PTY.LTD$", "", .) %>%
              gsub("PTY LTD$", "", .) %>%
              gsub("\\bPTY$", "", .) %>%
              gsub("PTY LIMITED$", "", .) %>%
              gsub("\\bLIMITED$", "", .) %>%
              gsub(" LTD$", "", .) %>%
              gsub(",LTD$", "", .) %>%
              gsub("\\bCORPORATION$", "", .) %>%
              gsub(" CORP$", "", .) %>%
              gsub(" CO$", "", .) %>%
              gsub(",CO$", "", .) %>%
              trimws()]

In [30]:
# Sweden: strip trailing legal designations from the match name.
# Parentheses were removed when PDL_MATCH_NAME was built, so "(PUBL)" appears
# as a plain trailing " PUBL"; the former pattern required a literal "(" and
# therefore never matched. Removing " PUBL" first lets " AB$" match next.
PDL_Raw[COUNTRY == "SWEDEN", PDL_MATCH_NAME := PDL_MATCH_NAME %>%
              gsub(" PUBL$", "", .) %>%
              gsub("SE AB$", "", .) %>%
              gsub(" AB$", "", .) %>%
              gsub(",AB$", "", .) %>%
              gsub("AB $", "", .) %>%
              gsub(" CORP$", "", .) %>%
              gsub(",CORP$", "", .) %>%
              trimws()]

In [31]:
# Spain: strip trailing legal designations from the match name.
# Dots in the abbreviations are escaped so they match literal dots only;
# unescaped, "S.L$" also removed ordinary endings such as "SOL" and "S.A$"
# removed endings such as "SEA" or "SPA".
PDL_Raw[COUNTRY == "SPAIN", PDL_MATCH_NAME := PDL_MATCH_NAME %>%
              gsub(" SL$", "", .) %>%
              gsub(",SL$", "", .) %>%
              gsub("S\\.L$", "", .) %>%
              gsub(", S\\.L$", "", .) %>%
              gsub(" SA$", "", .) %>%
              gsub(",SA$", "", .) %>%
              gsub("S\\.A$", "", .) %>%
              gsub(", S\\.A$", "", .) %>%
              gsub("S\\.L\\.U$", "", .) %>%
              gsub(" SLU$", "", .) %>%
              gsub(",SLU$", "", .) %>%
              trimws()]

In [32]:
# Switzerland: strip trailing legal designations from the match name.
# PDL_MATCH_NAME has already been transliterated to ASCII, so the former
# accented SARL pattern could never match; it is replaced by the ASCII forms
# " SARL$" and ",SARL$". The dot in "S.A" is escaped so that ordinary endings
# such as "SEA" or "SPA" are no longer removed.
PDL_Raw[COUNTRY == "SWITZERLAND", PDL_MATCH_NAME := PDL_MATCH_NAME %>%
              gsub(" SARL$", "", .) %>%
              gsub(",SARL$", "", .) %>%
              gsub(" AG$", "", .) %>%
              gsub(",AG$", "", .) %>%
              gsub("GMBH$", "", .) %>%
              gsub("S\\.A$", "", .) %>%
              gsub(" SA$", "", .) %>%
              gsub(",SA$", "", .) %>%
              gsub(" LTD$", "", .) %>%
              gsub(",LTD$", "", .) %>%
              trimws()]

In [33]:
# Belgium: strip trailing legal designations from the match name.
# Dots in the dotted abbreviations are escaped so they match literal dots
# only; unescaped, e.g. " S.A$" also removed a final word such as " SEA".
PDL_Raw[COUNTRY == "BELGIUM", PDL_MATCH_NAME := PDL_MATCH_NAME %>%
              gsub(" NVSA$", "", .) %>%
              gsub(" NV$", "", .) %>%
              gsub(" N\\.V$", "", .) %>%
              gsub(" BVBA$", "", .) %>%
              gsub(" BV$", "", .) %>%
              gsub(" B\\.V$", "", .) %>%
              gsub(" CVBA$", "", .) %>%
              gsub(" SA$", "", .) %>%
              gsub(" S\\.A$", "", .) %>%
              gsub(" SPRL$", "", .) %>%
              gsub(" S\\.P\\.R\\.L$", "", .) %>%
              gsub(" ASBL$", "", .) %>%
              gsub(" VZW$", "", .) %>%
              gsub(" V\\.Z\\.W$", "", .) %>%
              gsub(" SPA$", "", .) %>%
              gsub(" S\\.P\\.A$", "", .) %>%
              trimws()]

In [34]:
# Netherlands: strip trailing legal designations from the match name.
# The country filter previously read "NETHERLAND", which never equals the
# value "NETHERLANDS" used in Top_Countries, so this step was a silent no-op.
# Dots in "N.V" / "B.V" are escaped so they match literal dots only.
PDL_Raw[COUNTRY == "NETHERLANDS", PDL_MATCH_NAME := PDL_MATCH_NAME %>%
              gsub("N\\.V$", "", .) %>%
              gsub(" NV$", "", .) %>%
              gsub(",NV$", "", .) %>%
              gsub("B\\.V$", "", .) %>%
              gsub(" BV$", "", .) %>%
              gsub(",BV$", "", .) %>%
              trimws()]

In [35]:
# South Korea: strip trailing legal designations from the match name.
# - "CORP" and the spelled-out forms are anchored at a word boundary (\\b) so
#   that words which merely end in them are not truncated.
# - ",INC$" replaces ", INC$", which could never match because " INC$" runs
#   immediately before it; the comma-without-space form was not covered.
# The "CO.,LTD"-style patterns keep their unescaped dot on purpose: it also
# covers comma or space variants such as "CO, LTD".
PDL_Raw[COUNTRY == "SOUTH KOREA", PDL_MATCH_NAME := PDL_MATCH_NAME %>%
              gsub("CO.,LTD$", "", .) %>%
              gsub("CO., LTD$", "", .) %>%
              gsub("CO. LTD$", "", .) %>%
              gsub("CO.LTD$", "", .) %>%
              gsub("\\bCORPORATION$", "", .) %>%
              gsub("\\bCORP$", "", .) %>%
              gsub(" CO$", "", .) %>%
              gsub(",CO$", "", .) %>%
              gsub("\\bLIMITED$", "", .) %>%
              gsub(" LTD$", "", .) %>%
              gsub(",LTD$", "", .) %>%
              gsub("\\bINCORPORATED$", "", .) %>%
              gsub(" INC$", "", .) %>%
              gsub(",INC$", "", .) %>%
              trimws()]

In [36]:
# Italy: strip trailing legal designations from the match name.
# Dots in "S.P.A" and "S.R.L" are escaped so they match literal dots only;
# unescaped, each dot matched any character (e.g. "S.P.A$" matched "SEPIA").
PDL_Raw[COUNTRY == "ITALY", PDL_MATCH_NAME := PDL_MATCH_NAME %>%
              gsub("S\\.P\\.A$", "", .) %>%
              gsub(" SPA$", "", .) %>%
              gsub(",SPA$", "", .) %>%
              gsub("S\\.R\\.L$", "", .) %>%
              gsub(" SRL$", "", .) %>%
              gsub(",SRL$", "", .) %>%
              trimws()]


In [37]:
# Finland: strip trailing legal designations from the match name.
# - "OYJ", "OY AB" and "OY" are anchored at a word boundary (\\b); unanchored,
#   "OY$" also truncated ordinary words such as "TOY" or "JOY".
# - ",LTD$" replaces ", LTD$", which could never match after " LTD$".
# - The second, duplicated " CORP$" is now ",CORP$", matching the
#   space/comma pairs used for the other suffixes.
PDL_Raw[COUNTRY == "FINLAND", PDL_MATCH_NAME := PDL_MATCH_NAME %>%
              gsub("\\bOYJ$", "", .) %>%
              gsub("\\bOY AB$", "", .) %>%
              gsub("\\bOY$", "", .) %>%
              gsub(" INC$", "", .) %>%
              gsub(",INC$", "", .) %>%
              gsub(" LTD$", "", .) %>%
              gsub(",LTD$", "", .) %>%
              gsub(" CORP$", "", .) %>%
              gsub(",CORP$", "", .) %>%
              trimws()]

In [38]:
# Denmark: strip trailing legal designations from the match name.
# The dot in " A.S" is escaped so it matches a literal dot only; unescaped it
# also removed a final word such as " ABS" or " ADS".
PDL_Raw[COUNTRY == "DENMARK", PDL_MATCH_NAME := PDL_MATCH_NAME %>%
              gsub(" APS$", "", .) %>%
              gsub(" AS$", "", .) %>%
              gsub(" A\\.S$", "", .) %>%
              trimws()]

In [39]:
# Remove a trailing dot or comma left behind by the suffix removal, and any
# leading dots or comma.
# The leading-dot rule is now "^\\.+" (one or more literal dots). The former
# patterns "^\\...." and "^\\.." contained unescaped dots and therefore also
# deleted the two or three characters following a leading dot (for example,
# "...ABC" became "BC").
PDL_Raw[, PDL_MATCH_NAME := PDL_MATCH_NAME %>%
              gsub("\\.$", "", .) %>%
              gsub(",$", "", .) %>%
              gsub("^\\.+", "", .) %>%
              gsub("^,", "", .) %>%
              trimws()]

There are still duplicates, as in the example below with "VESTA". However, it is difficult to determine which entry is correct. It is therefore preferable to err on the conservative side and keep them all.

In [42]:
# Show every organisation whose cleaned match name is "VESTA".
PDL_Raw[PDL_MATCH_NAME=="VESTA",]

PDL_ID,PDL_ORIGINAL_NAME,YEAR_FOUNDED,INDUSTRY,SIZE_RANGE,COUNTRY,PDL_MATCH_NAME
4010381,VESTA INC.,1972,MEDICAL DEVICES,51-200,UNITED STATES,VESTA
4178973,VESTA VZW,1975,HOSPITAL & HEALTH CARE,11-50,BELGIUM,VESTA
5589323,VESTA S.R.L.,1975,CONSTRUCTION,1-10,ITALY,VESTA
2753399,"VESTA, INC",1983,MENTAL HEALTH CARE,51-200,UNITED STATES,VESTA
1040963,VESTA CORPORATION,1995,FINANCIAL SERVICES,201-500,UNITED STATES,VESTA
1274879,VESTA,2010,CONSTRUCTION,1-10,UNITED STATES,VESTA
1281511,VESTA SRL,2013,RENEWABLES & ENVIRONMENT,1-10,ITALY,VESTA
552637,VESTA,2017,DESIGN,1-10,UNITED KINGDOM,VESTA
6091337,VESTA GMBH,2017,WHOLESALE,1-10,GERMANY,VESTA


## 3.4) Selection of Industries

The number of entries is excessively large for the Convoluted Fuzzy Match Algorithm, so the number of organizations needs to be reduced. A series of industries is removed under the assumption that they are less likely to be related to the development of Artificial Intelligence.

In [43]:
# Industries assumed to be unrelated to the development of Artificial
# Intelligence; organisations in these industries are removed.
Drop_Industries <- c("GOVERNMENT ADMINISTRATION","NEWSPAPERS","MILITARY",
                      "NON-PROFIT ORGANIZATION MANAGEMENT","EDUCATION MANAGEMENT","LEGAL SERVICES",
                      "LAW ENFORCEMENT","RELIGIOUS INSTITUTIONS","ARCHITECTURE & PLANNING","LIBRARIES",
                      "LAW PRACTICE","MUSEUMS AND INSTITUTIONS","PRIMARY/SECONDARY EDUCATION",
                      "FINE ART","PUBLIC SAFETY","NONPROFIT ORGANIZATION MANAGEMENT","GOVERNMENT RELATIONS",
                      "CIVIC & SOCIAL ORGANIZATION","PERFORMING ARTS","RECREATIONAL FACILITIES AND SERVICES",
                      "MENTAL HEALTH CARE","MUSIC","PHILANTHROPY","INDIVIDUAL & FAMILY SERVICES",
                      "FUND-RAISING","LEGISLATIVE OFFICE","JUDICIARY","ENTERTAINMENT","EVENTS SERVICES",
                      "INTERNATIONAL AFFAIRS","POLITICAL ORGANIZATION","PUBLIC POLICY","RESTAURANTS",
                      "PROFESSIONAL TRAINING & COACHING","INTERNATIONAL TRADE AND DEVELOPMENT",
                      "EXECUTIVE OFFICE","GAMBLING & CASINOS","SPORTS","ARTS AND CRAFTS","VETERINARY",
                      "SUPERMARKETS","PHOTOGRAPHY","ALTERNATIVE MEDICINE","MOTION PICTURES AND FILM",
                      "THINK TANKS","ALTERNATIVE DISPUTE RESOLUTION", "HIGHER EDUCATION","CONSTRUCTION",
                      "FURNITURE","WINE AND SPIRITS", "MANAGEMENT CONSULTING", "FOOD & BEVERAGES", "DESIGN",
                      "PROGRAM DEVELOPMENT", "MEDICAL CARE", "DAIRY", "FACILITIES SERVICE",
                      "GLASS, CERAMICS & CONCRETE", "IMPORT AND EXPORT", "LUXURY GOODS & JEWELRY", "MARITIME",
                      "OUTSOURCING/OFFSHORING", "PAPER & FOREST PRODUCTS", "PUBLIC RELATIONS AND COMMUNICATIONS",
                      "RANCHING", "SHIPBUILDING", "TOBACCO", "WAREHOUSING", "WHOLESALE")

# A label that does not occur in INDUSTRY drops nothing and would otherwise
# go unnoticed (e.g. because of a spelling difference), so report such labels.
# NOTE(review): "FACILITIES SERVICE" and "MEDICAL CARE" look like candidates
# for a spelling mismatch -- confirm against the values in the data.
unmatched_industries <- Drop_Industries[!(Drop_Industries %in% PDL_Raw$INDUSTRY)]
if (length(unmatched_industries) > 0) {
  warning("Drop_Industries entries not found in INDUSTRY: ",
          paste(unmatched_industries, collapse = "; "),
          call. = FALSE)
}

PDL_Raw <- PDL_Raw[!(INDUSTRY %in% Drop_Industries),]

# Number of remaining organisations
n_rows4 <- nrow(PDL_Raw)
n_rows4

# Number of organisations dropped by the industry filter
n_rows3 - n_rows4


# 4) Add Column with Key Word for Fuzzy Match

In [44]:
# Store the first word of the cleaned match name as the key word used by the
# fuzzy-match step, then preview the first values.
PDL_Raw[, PDL_Key_Word := stringr::word(PDL_MATCH_NAME, start = 1)]
head(PDL_Raw[["PDL_Key_Word"]])

# 5) OUTPUT

In [45]:
# Write the filtered dataset, including the header row, to the working-data
# folder as a CSV file.
fwrite(PDL_Raw,"F:/Thesis/Working_Data/Final\\Industrial_Dataset.csv", col.names = TRUE)