# Text Data Preprocessing Steps

1. **Feature Scaling**: Checked which column needs feature scaling (Company.Size) and scaled it
   
2. **Split Experience**: Split Experience into Min Experience and Max Experience
   
3. **Extracted Sector from JSON**: Extracted Sector as a useful feature from Company JSON Datatype
   
4. **One hot encoding**: One-hot encoded relevant columns ("Qualifications", "Work.Type", "Preference", "Sector") and mapped corresponding their corresponding numerical and categorical values below (under variable name **mapping_dict**) 

<br/><br/>

Other Possible Preprocessing Steps: 
- Text Preprocessing
- Extracting Month From Date

In [1]:
df <- read.csv("sampled_job_descriptions.csv", sep=",", header=TRUE, fill=TRUE)
library(tidyverse)
library(lubridate)
library(caret) # For scaling
library(jsonlite)
library(readr)
library(dplyr)
library(stringr)

"package 'tidyverse' was built under R version 4.3.3"
"package 'ggplot2' was built under R version 4.3.3"
"package 'tibble' was built under R version 4.3.3"
"package 'tidyr' was built under R version 4.3.3"
"package 'readr' was built under R version 4.3.3"
"package 'purrr' was built under R version 4.3.3"
"package 'dplyr' was built under R version 4.3.3"
"package 'stringr' was built under R version 4.3.3"
"package 'forcats' was built under R version 4.3.3"
"package 'lubridate' was built under R version 4.3.3"
── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.4     [32m✔[39m [34mreadr    [39m 2.1.5
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.1
[32m✔[39m [34mggplot2  [39m 3.5.0     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mlubridate[39m 1.9.3     [32m✔[39m [34mtidyr    [39m 1.3.1
[32m✔[39m [34mpurrr    [39m 1.0.2     
── [1mConflicts[22m ──────────

In [2]:
# Example data
data <- data.frame(Salary.Range = c("$12K-$128K", "$40K-$90K", "$80K-$150K"))

# Convert Salary.Range to character type if necessary
data <- data %>%
  mutate(Salary.Range = as.character(Salary.Range)) %>%
  # Split the Salary.Range column by "-"
  mutate(Salary.Range = strsplit(Salary.Range, "-")) %>%
  # Extract numeric values for minRange and maxRange
  mutate(minRange = extract_numeric(sapply(Salary.Range, `[`, 1)) * 1000,
         maxRange = extract_numeric(sapply(Salary.Range, `[`, 2)) * 1000)


# Print the modified data frame
print(data)

extract_numeric() is deprecated: please use readr::parse_number() instead

extract_numeric() is deprecated: please use readr::parse_number() instead



  Salary.Range minRange maxRange
1  $12K, $128K    12000   128000
2   $40K, $90K    40000    90000
3  $80K, $150K    80000   150000


In [3]:
# Function to extract numeric values
extract_numeric <- function(x) {
  # Extract numeric part and remove non-numeric characters
  numeric_part <- gsub("\\D", "", x)
  # Convert to numeric (handling empty strings)
  ifelse(numeric_part == "", NA, as.numeric(numeric_part))
}

# Split the "Salary.Range" column by "-"
df_new <- df %>%
  mutate(Salary.Range = strsplit(as.character(Salary.Range), "-")) %>%
  # Extract minimum and maximum values
  mutate(
    minRange = extract_numeric(sapply(Salary.Range, `[`, 1)) * 1000,
    maxRange = extract_numeric(sapply(Salary.Range, `[`, 2)) * 1000
  )

# Print the modified data frame
print(head(df_new, 5))

     Experience Qualifications Salary.Range          Country Work.Type
1 4 to 11 Years          M.Com  $61K, $128K   Macedonia, FYR Temporary
2 3 to 10 Years         M.Tech  $65K, $129K        Greenland    Intern
3 0 to 15 Years         M.Tech  $56K, $114K         Kiribati Full-Time
4 2 to 12 Years            PhD  $57K, $113K Papua New Guinea    Intern
5  2 to 9 Years          B.Com  $57K, $110K          Georgia Temporary
  Company.Size Job.Posting.Date Preference           Job.Title
1        90997       2023-02-08     Female  Landscape Designer
2       123218       2023-05-10     Female Structural Engineer
3        62356       2023-04-11       Male  Physical Therapist
4        61041       2023-07-13       Both           Architect
5        84565       2023-08-10       Both      Office Manager
                              Role   Job.Portal
1 Sustainable Landscape Specialist ZipRecruiter
2            Construction Engineer     FlexJobs
3     Geriatric Physical Therapist  SimplyHired
4   

In [4]:
missing_profiles <- sum(is.na(df_new$Company_Profile))

# Print the result
if (missing_profiles > 0) {
  print(paste("There are", missing_profiles, "missing company profiles."))
} else {
  print("There are no missing company profiles.")
}

[1] "There are no missing company profiles."


In [5]:
# Check if each column in the data frame is numeric
numerical_columns <- sapply(df, is.numeric)

# Extract the names of the numerical columns
numerical_column_names <- names(numerical_columns[numerical_columns])

# Filter the data frame to only include the numerical columns
numerical_data <- df[numerical_column_names]

# Check the range of values for each numerical feature
feature_ranges <- sapply(numerical_data, function(x) c(min = min(x, na.rm = TRUE), max = max(x, na.rm = TRUE)))

# Check if the range of values for any feature is significantly different
threshold <- 15

# Check if the range of values for any feature is greater than the threshold
features_to_scale <- names(which(apply(feature_ranges, 2, function(x) diff(x) > threshold)))

features_to_scale


In [6]:
preproc <- preProcess(df_new[, "Company.Size", drop = FALSE], method = c("center", "scale"))
df_new$Company_Size_Scaled <- predict(preproc, df_new[, "Company.Size", drop = FALSE])$`Company.Size`

# df_new

In [7]:
# Create an empty vector to store the names of categorical columns
categorical_columns <- c()

# Iterate over each column in the data frame
for (col in names(df_new)) {
  # Check if the column is categorical
  if (is.factor(df_new[[col]]) || is.character(df_new[[col]])) {
    categorical_columns <- c(categorical_columns, col)
  }
}

cat("Categorical columns in df_new:", "\n")
  print(categorical_columns)


categorical_head <- list()

# Iterate over each categorical column in the data frame
for (col in categorical_columns) {
  # Convert the column to character if it's a factor
  if (is.factor(df_new[[col]])) {
    df_new[[col]] <- as.character(df_new[[col]])
  }
  
  # Print the name of the column
  cat( col, ":", "\n")
  
  # Print the head of the column
  cat(head(df_new[[col]]), "\n\n")
}

Categorical columns in df_new: 
 [1] "Experience"       "Qualifications"   "Country"          "Work.Type"       
 [5] "Job.Posting.Date" "Preference"       "Job.Title"        "Role"            
 [9] "Job.Portal"       "Benefits"         "skills"           "Company.Profile" 
Experience : 
4 to 11 Years 3 to 10 Years 0 to 15 Years 2 to 12 Years 2 to 9 Years 4 to 9 Years 

Qualifications : 
M.Com M.Tech M.Tech PhD B.Com PhD 

Country : 
Macedonia, FYR Greenland Kiribati Papua New Guinea Georgia New Zealand 

Work.Type : 
Temporary Intern Full-Time Intern Temporary Intern 

Job.Posting.Date : 
2023-02-08 2023-05-10 2023-04-11 2023-07-13 2023-08-10 2023-01-19 

Preference : 
Female Female Male Both Both Female 

Job.Title : 
Landscape Designer Structural Engineer Physical Therapist Architect Office Manager Environmental Engineer 

Role : 
Sustainable Landscape Specialist Construction Engineer Geriatric Physical Therapist Sustainable Design Specialist Office Coordinator Sustainability Specia

In [8]:
# Splitting the string
extracted_values <- str_match(df_new$Experience, "(\\d+) to (\\d+) Years")

# Extracting min and max experience
min_experience <- as.numeric(extracted_values[, 2])
max_experience <- as.numeric(extracted_values[, 3])

# Creating new columns for minimum and maximum experience
df_new$Min_Experience <- min_experience
df_new$Max_Experience <- max_experience

# Converting Job.Posting.Date to date format
df_new$Job.Posting.Date <- as.Date(df_new$Job.Posting.Date)



In [9]:
json_string <- '{"Sector":"Oil & Gas","Industry":"Oil and Gas","City":"London","State":"N/A","Zip":"N/A","Website":"www.bp.com","Ticker":"BP","CEO":"Bernard Looney"}'

# Parse the JSON string and extract the "Sector"
parsed <- fromJSON(json_string, simplifyVector = FALSE)
sector <- parsed$Sector

# Print the extracted "Sector"
print(sector)

[1] "Oil & Gas"


In [10]:
n <- nrow(df_new)
sector_values <- vector("character", n) # Pre-allocate a vector to store the extracted sectors

for (i in 1:n) {
  json_string <- df_new$Company.Profile[i]
  
  # Try to parse JSON and extract the 'Sector'
  tryCatch({
    parsed <- fromJSON(json_string, simplifyVector = FALSE)
    if (!is.null(parsed$Sector)) {
      sector_values[i] <- parsed$Sector
    } else {
      sector_values[i] <- NA # Assign NA if 'Sector' is not present
    }
  }, error = function(e) {
    sector_values[i] <- NA # Assign NA in case of parsing errors
  })
}

df_new$Sector <- sector_values # Assign the vector of sectors back to the data frame

# Check the result
head(df_new$Sector)

In [11]:
unique_encoded_qualifications <- unique(df_new$Qualifications)
unique_encoded_work_type <- unique(df_new$Work.Type)
unique_encoded_preference <- unique(df_new$Preference)
unique_encoded_sector <- unique(df_new$Sector)

# unique_encoded_qualifications
# unique_encoded_work_type
# unique_encoded_preference
# unique_encoded_sector

In [12]:
columns_to_encode <- c("Qualifications", "Work.Type", "Preference", "Sector")

for (col in columns_to_encode) {
  df_new[[col]] <- as.integer(factor(df_new[[col]]))
}


In [13]:
numerical_unique_encoded_qualifications <- unique(df_new$Qualifications)
numerical_unique_encoded_work_type <- unique(df_new$Work.Type)
numerical_unique_encoded_preference <- unique(df_new$Preference)
numerical_unique_encoded_sector <- unique(df_new$Sector)

# numerical_unique_encoded_qualifications
# numerical_unique_encoded_work_type
# numerical_unique_encoded_preference
# numerical_unique_encoded_sector

In [14]:
mapping_dict <- list(
  Qualifications = data.frame(Categorical = unique_encoded_qualifications,
                              Numerical = numerical_unique_encoded_qualifications),
  Work.Type = data.frame(Categorical = unique_encoded_work_type,
                         Numerical = numerical_unique_encoded_work_type),
  Preference = data.frame(Categorical = unique_encoded_preference,
                          Numerical = numerical_unique_encoded_preference),
  Sector = data.frame(Categorical = unique_encoded_sector,
                      Numerical = numerical_unique_encoded_sector)
)

# Display the dictionary
print(mapping_dict)

$Qualifications
   Categorical Numerical
1        M.Com         6
2       M.Tech         7
3          PhD        10
4        B.Com         1
5          MBA         8
6          BBA         4
7          BCA         5
8          MCA         9
9       B.Tech         2
10          BA         3

$Work.Type
  Categorical Numerical
1   Temporary         5
2      Intern         3
3   Full-Time         2
4    Contract         1
5   Part-Time         4

$Preference
  Categorical Numerical
1      Female         2
2        Male         3
3        Both         1

$Sector
                                 Categorical Numerical
1                                  Oil & Gas       137
2                                  Utilities       202
3                      Aerospace and Defense         5
4                                 Automotive        17
5                                     Retail       157
6                                 Healthcare        93
7                                   Airlines      

In [15]:
head(df_new, 10)

Unnamed: 0_level_0,Experience,Qualifications,Salary.Range,Country,Work.Type,Company.Size,Job.Posting.Date,Preference,Job.Title,Role,Job.Portal,Benefits,skills,Company.Profile,minRange,maxRange,Company_Size_Scaled,Min_Experience,Max_Experience,Sector
Unnamed: 0_level_1,<chr>,<int>,<list>,<chr>,<int>,<int>,<date>,<int>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<int>
1,4 to 11 Years,6,"$61K , $128K","Macedonia, FYR",5,90997,2023-02-08,2,Landscape Designer,Sustainable Landscape Specialist,ZipRecruiter,"{'Transportation Benefits, Professional Development, Bonuses and Incentive Programs, Profit-Sharing, Employee Discounts'}",Sustainable landscape design Native plant selection Water conservation Soil health Green building practices LEED certification,"{""Sector"":""Oil & Gas"",""Industry"":""Oil and Gas"",""City"":""London"",""State"":""N/A"",""Zip"":""N/A"",""Website"":""www.bp.com"",""Ticker"":""BP"",""CEO"":""Bernard Looney""}",61000,128000,0.4895857,4,11,137
2,3 to 10 Years,7,"$65K , $129K",Greenland,3,123218,2023-05-10,2,Structural Engineer,Construction Engineer,FlexJobs,"{'Health Insurance, Retirement Plans, Flexible Work Arrangements, Employee Assistance Programs (EAP), Bonuses and Incentive Programs'}",Construction management Structural design AutoCAD proficiency Project management Site inspection,"{""Sector"":""Utilities"",""Industry"":""Utilities: Gas and Electric"",""City"":""King of Prussia"",""State"":""Pennsylvania"",""Zip"":""19406"",""Website"":""www.ugicorp.com"",""Ticker"":""UGI"",""CEO"":""Roger Perreault""}",65000,129000,1.4016207,3,10,202
3,0 to 15 Years,7,"$56K , $114K",Kiribati,2,62356,2023-04-11,3,Physical Therapist,Geriatric Physical Therapist,SimplyHired,"{'Casual Dress Code, Social and Recreational Activities, Employee Referral Programs, Health and Wellness Facilities, Life and Disability Insurance'}",Geriatric rehabilitation Fall prevention Elderly patient care,"{""Sector"":""Aerospace and Defense"",""Industry"":""Aerospace & Defense"",""City"":""Reston"",""State"":""Virginia"",""Zip"":""20190"",""Website"":""www.gd.com"",""Ticker"":""GD"",""CEO"":""Phebe N. Novakovic""}",56000,114000,-0.3211153,0,15,5
4,2 to 12 Years,10,"$57K , $113K",Papua New Guinea,3,61041,2023-07-13,1,Architect,Sustainable Design Specialist,Jobs2Careers,"{'Life and Disability Insurance, Stock Options or Equity Grants, Employee Recognition Programs, Health Insurance, Social and Recreational Activities'}",Sustainable design principles Energy efficiency LEED certification Green building materials Environmental impact assessment,"{""Sector"":""Automotive"",""Industry"":""Automotive Retailing, Services"",""City"":""Medford"",""State"":""Oregon"",""Zip"":""97501"",""Website"":""www.lithiadriveway.com"",""Ticker"":""LAD"",""CEO"":""Bryan B. Deboer""}",57000,113000,-0.3583372,2,12,17
5,2 to 9 Years,1,"$57K , $110K",Georgia,5,84565,2023-08-10,1,Office Manager,Office Coordinator,Idealist,"{'Childcare Assistance, Paid Time Off (PTO), Relocation Assistance, Flexible Work Arrangements, Professional Development'}",Office administration Calendar management Reception duties Communication skills Record keeping Office supplies management,"{""Sector"":""Retail"",""Industry"":""Automotive Retailing, Services"",""City"":""Richmond"",""State"":""Virginia"",""Zip"":""23238"",""Website"":""www.carmax.com"",""Ticker"":""KMX"",""CEO"":""William D. Nash""}",57000,110000,0.307524,2,9,157
6,4 to 9 Years,10,"$57K, $81K",New Zealand,3,99068,2023-01-19,2,Environmental Engineer,Sustainability Specialist,Indeed,"{'Employee Referral Programs, Financial Counseling, Health and Wellness Facilities, Casual Dress Code, Flexible Spending Accounts (FSAs)'}",Sustainability practices Environmental management systems Green certifications,"{""Sector"":""Healthcare"",""Industry"":""Healthcare"",""City"":""Sydney"",""State"":""NSW"",""Zip"":""2000"",""Website"":""https://www.sonichealthcare.com/"",""Ticker"":""SHL"",""CEO"":""Dr. Colin Goldschmidt""}",57000,81000,0.7180402,4,9,93
7,3 to 10 Years,8,"$62K , $128K",USA,1,26842,2023-06-09,3,Research Scientist,Senior Researcher,Internships.com,"{'Life and Disability Insurance, Stock Options or Equity Grants, Employee Recognition Programs, Health Insurance, Social and Recreational Activities'}","Research methodologies and data analysis Literature review and synthesis Statistical analysis software (e.g., SPSS, R) Research proposal development Strong communication and reporting skills","{""Sector"":""Airlines"",""Industry"":""Airlines"",""City"":""Dallas"",""State"":""Texas"",""Zip"":""75235"",""Website"":""www.southwest.com"",""Ticker"":""LUV"",""CEO"":""Robert E. Jordan""}",62000,128000,-1.3263608,3,10,10
8,2 to 11 Years,4,"$57K , $119K",Brazil,3,21673,2023-05-10,2,Architect,Project Architect,Monster,"{'Health Insurance, Retirement Plans, Paid Time Off (PTO), Flexible Work Arrangements, Employee Assistance Programs (EAP)'}",Architectural project management Construction documents Building systems Design coordination Client communication,"{""Sector"":""Telecommunications"",""Industry"":""Telecommunications"",""City"":""Beijing"",""State"":""Beijing"",""Zip"":""100033"",""Website"":""https://www.hk.chinamobile.com/en/"",""Ticker"":""600029.SS"",""CEO"":""Li Huiliang""}",57000,119000,-1.4726725,2,11,184
9,5 to 14 Years,5,"$62K, $86K",Mongolia,5,24518,2023-02-14,3,Financial Controller,Finance Manager,USAJOBS,"{'Tuition Reimbursement, Stock Options or Equity Grants, Parental Leave, Wellness Programs, Childcare Assistance'}","Financial management Budgeting Financial analysis Financial reporting Risk management Financial software (e.g., Excel, QuickBooks) Strategic planning","{""Sector"":""Agriculture"",""Industry"":""Construction and Farm Machinery"",""City"":""Duluth"",""State"":""Georgia"",""Zip"":""30096"",""Website"":""www.agcocorp.com"",""Ticker"":""AGCO"",""CEO"":""Eric P. Hansotia""}",62000,86000,-1.3921431,5,14,8
10,2 to 15 Years,9,"$59K , $105K",Austria,3,13604,2023-06-08,1,Sales Representative,Inside Sales Representative,Stack Overflow Jobs,"{'Flexible Spending Accounts (FSAs), Relocation Assistance, Legal Assistance, Employee Recognition Programs, Financial Counseling'}","Sales prospecting and lead generation Sales presentation and communication CRM software (e.g., Salesforce) Sales negotiation and closing techniques Product knowledge Relationship building","{""Sector"":""Automotive"",""Industry"":""Motor Vehicles & Parts"",""City"":""Bellevue"",""State"":""Washington"",""Zip"":""98004"",""Website"":""www.paccar.com"",""Ticker"":""PCAR"",""CEO"":""Preston Feight""}",59000,105000,-1.7010705,2,15,17


In [16]:
# Convert the date column to Date format
df_new$Job.Posting.Date <- as.Date(df_new$Job.Posting.Date)

# Extract the month from the date column
df_new$Job.Posting.Month <- month(df_new$Job.Posting.Date)

In [17]:
head(df_new)

Unnamed: 0_level_0,Experience,Qualifications,Salary.Range,Country,Work.Type,Company.Size,Job.Posting.Date,Preference,Job.Title,Role,⋯,Benefits,skills,Company.Profile,minRange,maxRange,Company_Size_Scaled,Min_Experience,Max_Experience,Sector,Job.Posting.Month
Unnamed: 0_level_1,<chr>,<int>,<list>,<chr>,<int>,<int>,<date>,<int>,<chr>,<chr>,⋯,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<dbl>
1,4 to 11 Years,6,"$61K , $128K","Macedonia, FYR",5,90997,2023-02-08,2,Landscape Designer,Sustainable Landscape Specialist,⋯,"{'Transportation Benefits, Professional Development, Bonuses and Incentive Programs, Profit-Sharing, Employee Discounts'}",Sustainable landscape design Native plant selection Water conservation Soil health Green building practices LEED certification,"{""Sector"":""Oil & Gas"",""Industry"":""Oil and Gas"",""City"":""London"",""State"":""N/A"",""Zip"":""N/A"",""Website"":""www.bp.com"",""Ticker"":""BP"",""CEO"":""Bernard Looney""}",61000,128000,0.4895857,4,11,137,2
2,3 to 10 Years,7,"$65K , $129K",Greenland,3,123218,2023-05-10,2,Structural Engineer,Construction Engineer,⋯,"{'Health Insurance, Retirement Plans, Flexible Work Arrangements, Employee Assistance Programs (EAP), Bonuses and Incentive Programs'}",Construction management Structural design AutoCAD proficiency Project management Site inspection,"{""Sector"":""Utilities"",""Industry"":""Utilities: Gas and Electric"",""City"":""King of Prussia"",""State"":""Pennsylvania"",""Zip"":""19406"",""Website"":""www.ugicorp.com"",""Ticker"":""UGI"",""CEO"":""Roger Perreault""}",65000,129000,1.4016207,3,10,202,5
3,0 to 15 Years,7,"$56K , $114K",Kiribati,2,62356,2023-04-11,3,Physical Therapist,Geriatric Physical Therapist,⋯,"{'Casual Dress Code, Social and Recreational Activities, Employee Referral Programs, Health and Wellness Facilities, Life and Disability Insurance'}",Geriatric rehabilitation Fall prevention Elderly patient care,"{""Sector"":""Aerospace and Defense"",""Industry"":""Aerospace & Defense"",""City"":""Reston"",""State"":""Virginia"",""Zip"":""20190"",""Website"":""www.gd.com"",""Ticker"":""GD"",""CEO"":""Phebe N. Novakovic""}",56000,114000,-0.3211153,0,15,5,4
4,2 to 12 Years,10,"$57K , $113K",Papua New Guinea,3,61041,2023-07-13,1,Architect,Sustainable Design Specialist,⋯,"{'Life and Disability Insurance, Stock Options or Equity Grants, Employee Recognition Programs, Health Insurance, Social and Recreational Activities'}",Sustainable design principles Energy efficiency LEED certification Green building materials Environmental impact assessment,"{""Sector"":""Automotive"",""Industry"":""Automotive Retailing, Services"",""City"":""Medford"",""State"":""Oregon"",""Zip"":""97501"",""Website"":""www.lithiadriveway.com"",""Ticker"":""LAD"",""CEO"":""Bryan B. Deboer""}",57000,113000,-0.3583372,2,12,17,7
5,2 to 9 Years,1,"$57K , $110K",Georgia,5,84565,2023-08-10,1,Office Manager,Office Coordinator,⋯,"{'Childcare Assistance, Paid Time Off (PTO), Relocation Assistance, Flexible Work Arrangements, Professional Development'}",Office administration Calendar management Reception duties Communication skills Record keeping Office supplies management,"{""Sector"":""Retail"",""Industry"":""Automotive Retailing, Services"",""City"":""Richmond"",""State"":""Virginia"",""Zip"":""23238"",""Website"":""www.carmax.com"",""Ticker"":""KMX"",""CEO"":""William D. Nash""}",57000,110000,0.307524,2,9,157,8
6,4 to 9 Years,10,"$57K, $81K",New Zealand,3,99068,2023-01-19,2,Environmental Engineer,Sustainability Specialist,⋯,"{'Employee Referral Programs, Financial Counseling, Health and Wellness Facilities, Casual Dress Code, Flexible Spending Accounts (FSAs)'}",Sustainability practices Environmental management systems Green certifications,"{""Sector"":""Healthcare"",""Industry"":""Healthcare"",""City"":""Sydney"",""State"":""NSW"",""Zip"":""2000"",""Website"":""https://www.sonichealthcare.com/"",""Ticker"":""SHL"",""CEO"":""Dr. Colin Goldschmidt""}",57000,81000,0.7180402,4,9,93,1


In [18]:
separate_skills <- function(text) {
  # Regular expression to find whitespace followed by uppercase letters
  pattern <- "(\\s+)(?=[A-Z])"
  # Replace matching whitespace with comma
  replaced_text <- gsub(pattern, ",", text, perl = TRUE)
  return(replaced_text)
}

# Example usage:
text <- df_new$skills
df_new$skills <- separate_skills(text)
head(df_new)

Unnamed: 0_level_0,Experience,Qualifications,Salary.Range,Country,Work.Type,Company.Size,Job.Posting.Date,Preference,Job.Title,Role,⋯,Benefits,skills,Company.Profile,minRange,maxRange,Company_Size_Scaled,Min_Experience,Max_Experience,Sector,Job.Posting.Month
Unnamed: 0_level_1,<chr>,<int>,<list>,<chr>,<int>,<int>,<date>,<int>,<chr>,<chr>,⋯,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<dbl>
1,4 to 11 Years,6,"$61K , $128K","Macedonia, FYR",5,90997,2023-02-08,2,Landscape Designer,Sustainable Landscape Specialist,⋯,"{'Transportation Benefits, Professional Development, Bonuses and Incentive Programs, Profit-Sharing, Employee Discounts'}","Sustainable landscape design,Native plant selection,Water conservation,Soil health,Green building practices,LEED certification","{""Sector"":""Oil & Gas"",""Industry"":""Oil and Gas"",""City"":""London"",""State"":""N/A"",""Zip"":""N/A"",""Website"":""www.bp.com"",""Ticker"":""BP"",""CEO"":""Bernard Looney""}",61000,128000,0.4895857,4,11,137,2
2,3 to 10 Years,7,"$65K , $129K",Greenland,3,123218,2023-05-10,2,Structural Engineer,Construction Engineer,⋯,"{'Health Insurance, Retirement Plans, Flexible Work Arrangements, Employee Assistance Programs (EAP), Bonuses and Incentive Programs'}","Construction management,Structural design,AutoCAD proficiency,Project management,Site inspection","{""Sector"":""Utilities"",""Industry"":""Utilities: Gas and Electric"",""City"":""King of Prussia"",""State"":""Pennsylvania"",""Zip"":""19406"",""Website"":""www.ugicorp.com"",""Ticker"":""UGI"",""CEO"":""Roger Perreault""}",65000,129000,1.4016207,3,10,202,5
3,0 to 15 Years,7,"$56K , $114K",Kiribati,2,62356,2023-04-11,3,Physical Therapist,Geriatric Physical Therapist,⋯,"{'Casual Dress Code, Social and Recreational Activities, Employee Referral Programs, Health and Wellness Facilities, Life and Disability Insurance'}","Geriatric rehabilitation,Fall prevention,Elderly patient care","{""Sector"":""Aerospace and Defense"",""Industry"":""Aerospace & Defense"",""City"":""Reston"",""State"":""Virginia"",""Zip"":""20190"",""Website"":""www.gd.com"",""Ticker"":""GD"",""CEO"":""Phebe N. Novakovic""}",56000,114000,-0.3211153,0,15,5,4
4,2 to 12 Years,10,"$57K , $113K",Papua New Guinea,3,61041,2023-07-13,1,Architect,Sustainable Design Specialist,⋯,"{'Life and Disability Insurance, Stock Options or Equity Grants, Employee Recognition Programs, Health Insurance, Social and Recreational Activities'}","Sustainable design principles,Energy efficiency,LEED certification,Green building materials,Environmental impact assessment","{""Sector"":""Automotive"",""Industry"":""Automotive Retailing, Services"",""City"":""Medford"",""State"":""Oregon"",""Zip"":""97501"",""Website"":""www.lithiadriveway.com"",""Ticker"":""LAD"",""CEO"":""Bryan B. Deboer""}",57000,113000,-0.3583372,2,12,17,7
5,2 to 9 Years,1,"$57K , $110K",Georgia,5,84565,2023-08-10,1,Office Manager,Office Coordinator,⋯,"{'Childcare Assistance, Paid Time Off (PTO), Relocation Assistance, Flexible Work Arrangements, Professional Development'}","Office administration,Calendar management,Reception duties,Communication skills,Record keeping,Office supplies management","{""Sector"":""Retail"",""Industry"":""Automotive Retailing, Services"",""City"":""Richmond"",""State"":""Virginia"",""Zip"":""23238"",""Website"":""www.carmax.com"",""Ticker"":""KMX"",""CEO"":""William D. Nash""}",57000,110000,0.307524,2,9,157,8
6,4 to 9 Years,10,"$57K, $81K",New Zealand,3,99068,2023-01-19,2,Environmental Engineer,Sustainability Specialist,⋯,"{'Employee Referral Programs, Financial Counseling, Health and Wellness Facilities, Casual Dress Code, Flexible Spending Accounts (FSAs)'}","Sustainability practices,Environmental management systems,Green certifications","{""Sector"":""Healthcare"",""Industry"":""Healthcare"",""City"":""Sydney"",""State"":""NSW"",""Zip"":""2000"",""Website"":""https://www.sonichealthcare.com/"",""Ticker"":""SHL"",""CEO"":""Dr. Colin Goldschmidt""}",57000,81000,0.7180402,4,9,93,1


In [19]:
# Convert the 'skills' column to character format
df_new$skills <- as.character(df_new$skills)

# Split the comma-separated values into a list
skills_list <- strsplit(df_new$skills, ",")

df_new$skills <- skills_list
head(df_new)

Unnamed: 0_level_0,Experience,Qualifications,Salary.Range,Country,Work.Type,Company.Size,Job.Posting.Date,Preference,Job.Title,Role,⋯,Benefits,skills,Company.Profile,minRange,maxRange,Company_Size_Scaled,Min_Experience,Max_Experience,Sector,Job.Posting.Month
Unnamed: 0_level_1,<chr>,<int>,<list>,<chr>,<int>,<int>,<date>,<int>,<chr>,<chr>,⋯,<chr>,<list>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<dbl>
1,4 to 11 Years,6,"$61K , $128K","Macedonia, FYR",5,90997,2023-02-08,2,Landscape Designer,Sustainable Landscape Specialist,⋯,"{'Transportation Benefits, Professional Development, Bonuses and Incentive Programs, Profit-Sharing, Employee Discounts'}","Sustainable landscape design, Native plant selection , Water conservation , Soil health , Green building practices , LEED certification","{""Sector"":""Oil & Gas"",""Industry"":""Oil and Gas"",""City"":""London"",""State"":""N/A"",""Zip"":""N/A"",""Website"":""www.bp.com"",""Ticker"":""BP"",""CEO"":""Bernard Looney""}",61000,128000,0.4895857,4,11,137,2
2,3 to 10 Years,7,"$65K , $129K",Greenland,3,123218,2023-05-10,2,Structural Engineer,Construction Engineer,⋯,"{'Health Insurance, Retirement Plans, Flexible Work Arrangements, Employee Assistance Programs (EAP), Bonuses and Incentive Programs'}","Construction management, Structural design , AutoCAD proficiency , Project management , Site inspection","{""Sector"":""Utilities"",""Industry"":""Utilities: Gas and Electric"",""City"":""King of Prussia"",""State"":""Pennsylvania"",""Zip"":""19406"",""Website"":""www.ugicorp.com"",""Ticker"":""UGI"",""CEO"":""Roger Perreault""}",65000,129000,1.4016207,3,10,202,5
3,0 to 15 Years,7,"$56K , $114K",Kiribati,2,62356,2023-04-11,3,Physical Therapist,Geriatric Physical Therapist,⋯,"{'Casual Dress Code, Social and Recreational Activities, Employee Referral Programs, Health and Wellness Facilities, Life and Disability Insurance'}","Geriatric rehabilitation, Fall prevention , Elderly patient care","{""Sector"":""Aerospace and Defense"",""Industry"":""Aerospace & Defense"",""City"":""Reston"",""State"":""Virginia"",""Zip"":""20190"",""Website"":""www.gd.com"",""Ticker"":""GD"",""CEO"":""Phebe N. Novakovic""}",56000,114000,-0.3211153,0,15,5,4
4,2 to 12 Years,10,"$57K , $113K",Papua New Guinea,3,61041,2023-07-13,1,Architect,Sustainable Design Specialist,⋯,"{'Life and Disability Insurance, Stock Options or Equity Grants, Employee Recognition Programs, Health Insurance, Social and Recreational Activities'}","Sustainable design principles , Energy efficiency , LEED certification , Green building materials , Environmental impact assessment","{""Sector"":""Automotive"",""Industry"":""Automotive Retailing, Services"",""City"":""Medford"",""State"":""Oregon"",""Zip"":""97501"",""Website"":""www.lithiadriveway.com"",""Ticker"":""LAD"",""CEO"":""Bryan B. Deboer""}",57000,113000,-0.3583372,2,12,17,7
5,2 to 9 Years,1,"$57K , $110K",Georgia,5,84565,2023-08-10,1,Office Manager,Office Coordinator,⋯,"{'Childcare Assistance, Paid Time Off (PTO), Relocation Assistance, Flexible Work Arrangements, Professional Development'}","Office administration , Calendar management , Reception duties , Communication skills , Record keeping , Office supplies management","{""Sector"":""Retail"",""Industry"":""Automotive Retailing, Services"",""City"":""Richmond"",""State"":""Virginia"",""Zip"":""23238"",""Website"":""www.carmax.com"",""Ticker"":""KMX"",""CEO"":""William D. Nash""}",57000,110000,0.307524,2,9,157,8
6,4 to 9 Years,10,"$57K, $81K",New Zealand,3,99068,2023-01-19,2,Environmental Engineer,Sustainability Specialist,⋯,"{'Employee Referral Programs, Financial Counseling, Health and Wellness Facilities, Casual Dress Code, Flexible Spending Accounts (FSAs)'}","Sustainability practices , Environmental management systems, Green certifications","{""Sector"":""Healthcare"",""Industry"":""Healthcare"",""City"":""Sydney"",""State"":""NSW"",""Zip"":""2000"",""Website"":""https://www.sonichealthcare.com/"",""Ticker"":""SHL"",""CEO"":""Dr. Colin Goldschmidt""}",57000,81000,0.7180402,4,9,93,1


In [20]:
# Remove curly brackets and single quotes
df_new$Benefits <- as.character(df_new$Benefits)
df_new$Benefits <- gsub("^\\{|\\}|'", "", df_new$Benefits)
df_new$Benefits <- gsub(",\\s+", ",", df_new$Benefits)

print(df_new$Benefits[1])

# Split the comma-separated values into a list
Benefits_list <- strsplit(df_new$Benefits, ",")

df_new$Benefits <- Benefits_list
head(df_new)


[1] "Transportation Benefits,Professional Development,Bonuses and Incentive Programs,Profit-Sharing,Employee Discounts"


Unnamed: 0_level_0,Experience,Qualifications,Salary.Range,Country,Work.Type,Company.Size,Job.Posting.Date,Preference,Job.Title,Role,⋯,Benefits,skills,Company.Profile,minRange,maxRange,Company_Size_Scaled,Min_Experience,Max_Experience,Sector,Job.Posting.Month
Unnamed: 0_level_1,<chr>,<int>,<list>,<chr>,<int>,<int>,<date>,<int>,<chr>,<chr>,⋯,<list>,<list>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<dbl>
1,4 to 11 Years,6,"$61K , $128K","Macedonia, FYR",5,90997,2023-02-08,2,Landscape Designer,Sustainable Landscape Specialist,⋯,"Transportation Benefits , Professional Development , Bonuses and Incentive Programs, Profit-Sharing , Employee Discounts","Sustainable landscape design, Native plant selection , Water conservation , Soil health , Green building practices , LEED certification","{""Sector"":""Oil & Gas"",""Industry"":""Oil and Gas"",""City"":""London"",""State"":""N/A"",""Zip"":""N/A"",""Website"":""www.bp.com"",""Ticker"":""BP"",""CEO"":""Bernard Looney""}",61000,128000,0.4895857,4,11,137,2
2,3 to 10 Years,7,"$65K , $129K",Greenland,3,123218,2023-05-10,2,Structural Engineer,Construction Engineer,⋯,"Health Insurance , Retirement Plans , Flexible Work Arrangements , Employee Assistance Programs (EAP), Bonuses and Incentive Programs","Construction management, Structural design , AutoCAD proficiency , Project management , Site inspection","{""Sector"":""Utilities"",""Industry"":""Utilities: Gas and Electric"",""City"":""King of Prussia"",""State"":""Pennsylvania"",""Zip"":""19406"",""Website"":""www.ugicorp.com"",""Ticker"":""UGI"",""CEO"":""Roger Perreault""}",65000,129000,1.4016207,3,10,202,5
3,0 to 15 Years,7,"$56K , $114K",Kiribati,2,62356,2023-04-11,3,Physical Therapist,Geriatric Physical Therapist,⋯,"Casual Dress Code , Social and Recreational Activities, Employee Referral Programs , Health and Wellness Facilities , Life and Disability Insurance","Geriatric rehabilitation, Fall prevention , Elderly patient care","{""Sector"":""Aerospace and Defense"",""Industry"":""Aerospace & Defense"",""City"":""Reston"",""State"":""Virginia"",""Zip"":""20190"",""Website"":""www.gd.com"",""Ticker"":""GD"",""CEO"":""Phebe N. Novakovic""}",56000,114000,-0.3211153,0,15,5,4
4,2 to 12 Years,10,"$57K , $113K",Papua New Guinea,3,61041,2023-07-13,1,Architect,Sustainable Design Specialist,⋯,"Life and Disability Insurance , Stock Options or Equity Grants , Employee Recognition Programs , Health Insurance , Social and Recreational Activities","Sustainable design principles , Energy efficiency , LEED certification , Green building materials , Environmental impact assessment","{""Sector"":""Automotive"",""Industry"":""Automotive Retailing, Services"",""City"":""Medford"",""State"":""Oregon"",""Zip"":""97501"",""Website"":""www.lithiadriveway.com"",""Ticker"":""LAD"",""CEO"":""Bryan B. Deboer""}",57000,113000,-0.3583372,2,12,17,7
5,2 to 9 Years,1,"$57K , $110K",Georgia,5,84565,2023-08-10,1,Office Manager,Office Coordinator,⋯,"Childcare Assistance , Paid Time Off (PTO) , Relocation Assistance , Flexible Work Arrangements, Professional Development","Office administration , Calendar management , Reception duties , Communication skills , Record keeping , Office supplies management","{""Sector"":""Retail"",""Industry"":""Automotive Retailing, Services"",""City"":""Richmond"",""State"":""Virginia"",""Zip"":""23238"",""Website"":""www.carmax.com"",""Ticker"":""KMX"",""CEO"":""William D. Nash""}",57000,110000,0.307524,2,9,157,8
6,4 to 9 Years,10,"$57K, $81K",New Zealand,3,99068,2023-01-19,2,Environmental Engineer,Sustainability Specialist,⋯,"Employee Referral Programs , Financial Counseling , Health and Wellness Facilities , Casual Dress Code , Flexible Spending Accounts (FSAs)","Sustainability practices , Environmental management systems, Green certifications","{""Sector"":""Healthcare"",""Industry"":""Healthcare"",""City"":""Sydney"",""State"":""NSW"",""Zip"":""2000"",""Website"":""https://www.sonichealthcare.com/"",""Ticker"":""SHL"",""CEO"":""Dr. Colin Goldschmidt""}",57000,81000,0.7180402,4,9,93,1


In [21]:

print(df_new$Benefits[1])

[[1]]
[1] "Transportation Benefits"        "Professional Development"      
[3] "Bonuses and Incentive Programs" "Profit-Sharing"                
[5] "Employee Discounts"            

