In [3]:
df <- read.csv("sampled_job_descriptions.csv", sep=",", header=TRUE, fill=TRUE)
install.packages("tm", repos="http://R-Forge.R-project.org")
library(tidyverse)
library(lubridate)
library(caret) # For scaling
library(jsonlite)
library(readr)
library(dplyr)
library(stringr)

"dependency 'slam' is not available"

package 'tm' successfully unpacked and MD5 sums checked

The downloaded binary packages are in
	C:\Users\Ng\AppData\Local\Temp\RtmpewEYwv\downloaded_packages


In [4]:
# Example data
data <- data.frame(Salary.Range = c("$12K-$128K", "$40K-$90K", "$80K-$150K"))

# Convert Salary.Range to character type if necessary
data <- data %>%
  mutate(Salary.Range = as.character(Salary.Range)) %>%
  # Split the Salary.Range column by "-"
  mutate(Salary.Range = strsplit(Salary.Range, "-")) %>%
  # Extract numeric values for minRange and maxRange
  mutate(minRange = extract_numeric(sapply(Salary.Range, `[`, 1)) * 1000,
         maxRange = extract_numeric(sapply(Salary.Range, `[`, 2)) * 1000)


# Print the modified data frame
print(data)

extract_numeric() is deprecated: please use readr::parse_number() instead
extract_numeric() is deprecated: please use readr::parse_number() instead


  Salary.Range minRange maxRange
1  $12K, $128K    12000   128000
2   $40K, $90K    40000    90000
3  $80K, $150K    80000   150000


In [5]:
# Function to extract numeric values
extract_numeric <- function(x) {
  # Extract numeric part and remove non-numeric characters
  numeric_part <- gsub("\\D", "", x)
  # Convert to numeric (handling empty strings)
  ifelse(numeric_part == "", NA, as.numeric(numeric_part))
}

# Split the "Salary.Range" column by "-"
df_new <- df %>%
  mutate(Salary.Range = strsplit(as.character(Salary.Range), "-")) %>%
  # Extract minimum and maximum values
  mutate(
    minRange = extract_numeric(sapply(Salary.Range, `[`, 1)) * 1000,
    maxRange = extract_numeric(sapply(Salary.Range, `[`, 2)) * 1000
  )

# Print the modified data frame
print(head(df_new, 5))

     Experience Qualifications Salary.Range          Country Work.Type
1 4 to 11 Years          M.Com  $61K, $128K   Macedonia, FYR Temporary
2 3 to 10 Years         M.Tech  $65K, $129K        Greenland    Intern
3 0 to 15 Years         M.Tech  $56K, $114K         Kiribati Full-Time
4 2 to 12 Years            PhD  $57K, $113K Papua New Guinea    Intern
5  2 to 9 Years          B.Com  $57K, $110K          Georgia Temporary
  Company.Size Job.Posting.Date Preference           Job.Title
1        90997       2023-02-08     Female  Landscape Designer
2       123218       2023-05-10     Female Structural Engineer
3        62356       2023-04-11       Male  Physical Therapist
4        61041       2023-07-13       Both           Architect
5        84565       2023-08-10       Both      Office Manager
                              Role   Job.Portal
1 Sustainable Landscape Specialist ZipRecruiter
2            Construction Engineer     FlexJobs
3     Geriatric Physical Therapist  SimplyHired
4   

In [6]:
missing_profiles <- sum(is.na(df_new$Company_Profile))

# Print the result
if (missing_profiles > 0) {
  print(paste("There are", missing_profiles, "missing company profiles."))
} else {
  print("There are no missing company profiles.")
}

[1] "There are no missing company profiles."


In [7]:
# Check if each column in the data frame is numeric
numerical_columns <- sapply(df, is.numeric)

# Extract the names of the numerical columns
numerical_column_names <- names(numerical_columns[numerical_columns])

# Filter the data frame to only include the numerical columns
numerical_data <- df[numerical_column_names]

# Check the range of values for each numerical feature
feature_ranges <- sapply(numerical_data, function(x) c(min = min(x, na.rm = TRUE), max = max(x, na.rm = TRUE)))

# Check if the range of values for any feature is significantly different
threshold <- 15

# Check if the range of values for any feature is greater than the threshold
features_to_scale <- names(which(apply(feature_ranges, 2, function(x) diff(x) > threshold)))

features_to_scale


In [8]:
preproc <- preProcess(df_new[, "Company.Size", drop = FALSE], method = c("center", "scale"))
df_new$Company_Size_Scaled <- predict(preproc, df_new[, "Company.Size", drop = FALSE])$`Company.Size`

# df_new

In [9]:
# Create an empty vector to store the names of categorical columns
categorical_columns <- c()

# Iterate over each column in the data frame
for (col in names(df_new)) {
  # Check if the column is categorical
  if (is.factor(df_new[[col]]) || is.character(df_new[[col]])) {
    categorical_columns <- c(categorical_columns, col)
  }
}

cat("Categorical columns in df_new:", "\n")
  print(categorical_columns)


categorical_head <- list()

# Iterate over each categorical column in the data frame
for (col in categorical_columns) {
  # Convert the column to character if it's a factor
  if (is.factor(df_new[[col]])) {
    df_new[[col]] <- as.character(df_new[[col]])
  }
  
  # Print the name of the column
  cat( col, ":", "\n")
  
  # Print the head of the column
  cat(head(df_new[[col]]), "\n\n")
}

Categorical columns in df_new: 
 [1] "Experience"       "Qualifications"   "Country"          "Work.Type"       
 [5] "Job.Posting.Date" "Preference"       "Job.Title"        "Role"            
 [9] "Job.Portal"       "Benefits"         "skills"           "Company.Profile" 


Experience : 
4 to 11 Years 3 to 10 Years 0 to 15 Years 2 to 12 Years 2 to 9 Years 4 to 9 Years 

Qualifications : 
M.Com M.Tech M.Tech PhD B.Com PhD 

Country : 
Macedonia, FYR Greenland Kiribati Papua New Guinea Georgia New Zealand 

Work.Type : 
Temporary Intern Full-Time Intern Temporary Intern 

Job.Posting.Date : 
2023-02-08 2023-05-10 2023-04-11 2023-07-13 2023-08-10 2023-01-19 

Preference : 
Female Female Male Both Both Female 

Job.Title : 
Landscape Designer Structural Engineer Physical Therapist Architect Office Manager Environmental Engineer 

Role : 
Sustainable Landscape Specialist Construction Engineer Geriatric Physical Therapist Sustainable Design Specialist Office Coordinator Sustainability Specialist 

Job.Portal : 
ZipRecruiter FlexJobs SimplyHired Jobs2Careers Idealist Indeed 

Benefits : 
{'Transportation Benefits, Professional Development, Bonuses and Incentive Programs, Profit-Sharing, Employee Discounts'} {'Health Insurance, Retirement Plans, Flexible Work Arr

In [12]:




# Splitting the string
extracted_values <- str_match(df_new$Experience, "(\\d+) to (\\d+) Years")

# Extracting min and max experience
min_experience <- as.numeric(extracted_values[2])
max_experience <- as.numeric(extracted_values[3])

# Creating new columns for minimum and maximum experience
df_new$Min_Experience <- min_experience
df_new$Max_Experience <- max_experience

# Converting Job.Posting.Date to date format
df_new$Job.Posting.Date <- as.Date(df_new$Job.Posting.Date)

# Printing the head of the modified data frame
head(df_new)

Experience,Qualifications,Salary.Range,Country,Work.Type,Company.Size,Job.Posting.Date,Preference,Job.Title,Role,Job.Portal,Benefits,skills,Company.Profile,minRange,maxRange,Company_Size_Scaled,Min_Experience,Max_Experience
4 to 11 Years,M.Com,"$61K , $128K","Macedonia, FYR",Temporary,90997,2023-02-08,Female,Landscape Designer,Sustainable Landscape Specialist,ZipRecruiter,"{'Transportation Benefits, Professional Development, Bonuses and Incentive Programs, Profit-Sharing, Employee Discounts'}",Sustainable landscape design Native plant selection Water conservation Soil health Green building practices LEED certification,"{""Sector"":""Oil & Gas"",""Industry"":""Oil and Gas"",""City"":""London"",""State"":""N/A"",""Zip"":""N/A"",""Website"":""www.bp.com"",""Ticker"":""BP"",""CEO"":""Bernard Looney""}",61000,128000,0.4895857,4,
3 to 10 Years,M.Tech,"$65K , $129K",Greenland,Intern,123218,2023-05-10,Female,Structural Engineer,Construction Engineer,FlexJobs,"{'Health Insurance, Retirement Plans, Flexible Work Arrangements, Employee Assistance Programs (EAP), Bonuses and Incentive Programs'}",Construction management Structural design AutoCAD proficiency Project management Site inspection,"{""Sector"":""Utilities"",""Industry"":""Utilities: Gas and Electric"",""City"":""King of Prussia"",""State"":""Pennsylvania"",""Zip"":""19406"",""Website"":""www.ugicorp.com"",""Ticker"":""UGI"",""CEO"":""Roger Perreault""}",65000,129000,1.4016207,4,
0 to 15 Years,M.Tech,"$56K , $114K",Kiribati,Full-Time,62356,2023-04-11,Male,Physical Therapist,Geriatric Physical Therapist,SimplyHired,"{'Casual Dress Code, Social and Recreational Activities, Employee Referral Programs, Health and Wellness Facilities, Life and Disability Insurance'}",Geriatric rehabilitation Fall prevention Elderly patient care,"{""Sector"":""Aerospace and Defense"",""Industry"":""Aerospace & Defense"",""City"":""Reston"",""State"":""Virginia"",""Zip"":""20190"",""Website"":""www.gd.com"",""Ticker"":""GD"",""CEO"":""Phebe N. Novakovic""}",56000,114000,-0.3211153,4,
2 to 12 Years,PhD,"$57K , $113K",Papua New Guinea,Intern,61041,2023-07-13,Both,Architect,Sustainable Design Specialist,Jobs2Careers,"{'Life and Disability Insurance, Stock Options or Equity Grants, Employee Recognition Programs, Health Insurance, Social and Recreational Activities'}",Sustainable design principles Energy efficiency LEED certification Green building materials Environmental impact assessment,"{""Sector"":""Automotive"",""Industry"":""Automotive Retailing, Services"",""City"":""Medford"",""State"":""Oregon"",""Zip"":""97501"",""Website"":""www.lithiadriveway.com"",""Ticker"":""LAD"",""CEO"":""Bryan B. Deboer""}",57000,113000,-0.3583372,4,
2 to 9 Years,B.Com,"$57K , $110K",Georgia,Temporary,84565,2023-08-10,Both,Office Manager,Office Coordinator,Idealist,"{'Childcare Assistance, Paid Time Off (PTO), Relocation Assistance, Flexible Work Arrangements, Professional Development'}",Office administration Calendar management Reception duties Communication skills Record keeping Office supplies management,"{""Sector"":""Retail"",""Industry"":""Automotive Retailing, Services"",""City"":""Richmond"",""State"":""Virginia"",""Zip"":""23238"",""Website"":""www.carmax.com"",""Ticker"":""KMX"",""CEO"":""William D. Nash""}",57000,110000,0.307524,4,
4 to 9 Years,PhD,"$57K, $81K",New Zealand,Intern,99068,2023-01-19,Female,Environmental Engineer,Sustainability Specialist,Indeed,"{'Employee Referral Programs, Financial Counseling, Health and Wellness Facilities, Casual Dress Code, Flexible Spending Accounts (FSAs)'}",Sustainability practices Environmental management systems Green certifications,"{""Sector"":""Healthcare"",""Industry"":""Healthcare"",""City"":""Sydney"",""State"":""NSW"",""Zip"":""2000"",""Website"":""https://www.sonichealthcare.com/"",""Ticker"":""SHL"",""CEO"":""Dr. Colin Goldschmidt""}",57000,81000,0.7180402,4,


In [14]:
# Sample data
experience <- "4 to 11 Years"

# Splitting the string
extracted_values <- str_match(experience, "(\\d+) to (\\d+) Years")

# Extracting min and max experience
min_experience <- as.numeric(extracted_values[2])
max_experience <- as.numeric(extracted_values[3])
# Printing the results
print(min_experience)
print(max_experience)

[1] 4
[1] 11


In [None]:
print(max_experience)

     [1] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
    [25] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
    [49] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
    [73] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
    [97] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
   [121] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
   [145] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
   [169] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
   [193] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
   [217] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
   [241] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
   [265] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
   [289] NA NA NA NA NA NA N