## Introduction

## Methods and Results

In [1]:
library(broom)
library(repr)
library(infer)
library(gridExtra)
library(faraway)
library(mltools)
library(leaps)
library(dplyr)
library(glmnet)
library(cowplot)
library(modelr)
library(tidyverse)
library(caret)

# Read the raw salary records into `salaries`.
# The path is relative, so it is resolved against the current working directory.
salaries <- read_csv("data/ds_salaries.csv")


Attaching package: ‘dplyr’


The following object is masked from ‘package:gridExtra’:

    combine


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union


Loading required package: Matrix

Loaded glmnet 4.1-8


Attaching package: ‘modelr’


The following objects are masked from ‘package:mltools’:

    mse, rmse


The following object is masked from ‘package:broom’:

    bootstrap


── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ forcats   1.0.0     ✔ readr     2.1.5
✔ ggplot2   3.5.1     ✔ stringr   1.5.1
✔ lubridate 1.9.3     ✔ tibble    3.2.1
✔ purrr     1.0.2     ✔ tidyr     1.3.1
── Conflicts ────────────────────────────────────────── tidyverse_conflicts()

### EDA

### Methods


In [5]:
# Drop the columns that the later steps do not use
preprocessed_salaries <- salaries %>%
  select(-all_of(c(
    "...1", "salary_currency", "work_year", "salary", "employee_residence"
  )))

In [6]:
# Feature engineering on company_location and job_title.
# Lookup vectors of two-letter country codes, one vector per region; they are
# used to collapse company_location into a coarse continent label.

# North America
no_am <- c("CA", "US", "MX", "HN")

# South America
so_am <- c("CL", "CO", "BR")

# Europe
eur <- c(
    "SI", "PL", "NL", "MT", "MO", "LU", "MD", "IT", "UA", "IE", "HU", "HR",
    "GR", "GB", "FR", "ES", "EE", "DK", "DE", "AT", "BE", "CH", "CZ", "RO",
    "PT"
)

# Asia
asia <- c(
    "MY", "VN", "SG", "RU", "JP", "CN", "IN", "PK", "IL", "IQ", "IR", "TR",
    "AE"
)

# Africa
afr <- c("KE", "DZ", "NG")

# Oceania
oce <- c("NZ", "AU", "AS")

In [7]:
# `cont` will hold the continent group of company_location.
# Create it as an empty-string column for now.
preprocessed_salaries <- mutate(preprocessed_salaries, cont = "")

# Display the first row to confirm the new column was added
preprocessed_salaries[1, ]

experience_level,employment_type,job_title,salary_in_usd,remote_ratio,company_location,company_size,cont
<chr>,<chr>,<chr>,<dbl>,<dbl>,<chr>,<chr>,<chr>
MI,FT,Data Scientist,79833,0,DE,L,


In [9]:
# Label every row with the continent group of its company_location.
# One vectorised case_when() replaces the row-by-row loop over a hard-coded
# 1:607, so the labelling always covers exactly the rows that are present.
# The branches are checked in the same order as before; a location that is in
# none of the lookup vectors keeps its current `cont` value.
preprocessed_salaries <- preprocessed_salaries %>%
    mutate(
        cont = case_when(
            company_location %in% no_am ~ "no_am",
            company_location %in% so_am ~ "so_am",
            company_location %in% eur ~ "eur",
            company_location %in% asia ~ "asia",
            company_location %in% afr ~ "afr",
            company_location %in% oce ~ "oce",
            .default = cont
        )
    )

# Spot-check the mapping on the first ten rows
head(select(preprocessed_salaries, company_location, cont), 10)

company_location,cont
<chr>,<chr>
DE,eur
JP,asia
GB,eur
HN,no_am
US,no_am
US,no_am
US,no_am
HU,eur
US,no_am
NZ,oce


In [14]:
# Tally how many rows there are for each job title
counts <- count(preprocessed_salaries, job_title, name = "counts")

counts

job_title,counts
<chr>,<int>
3D Computer Vision Researcher,1
AI Scientist,7
Analytics Engineer,4
Applied Data Scientist,5
Applied Machine Learning Scientist,4
BI Data Analyst,6
Big Data Architect,1
Big Data Engineer,8
Business Data Analyst,5
Cloud Data Engineer,2


In [35]:
# field will contain the job_title groupings
preprocessed_salaries <- preprocessed_salaries %>%
    mutate(field = "")

preprocessed_salaries[1, ]

experience_level,employment_type,job_title,salary_in_usd,remote_ratio,company_location,company_size,cont,field
<chr>,<chr>,<chr>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>
MI,FT,Data Scientist,79833,0,DE,L,eur,


In [38]:
# Group each job_title into a broader field by substring match.
# One vectorised case_when() replaces the loop over a hard-coded 1:607, so
# every row is classified regardless of how many rows the data has.
# Order matters: the first matching pattern wins, so a title such as
# "Machine Learning Scientist" is only reached by the Machine Learning branch
# because it does not contain "Data Scien". Titles matching no pattern
# (and missing titles) fall through to "Other".
preprocessed_salaries <- preprocessed_salaries %>%
    mutate(
        field = case_when(
            str_detect(job_title, "Data Scien") ~ "Data Science",
            str_detect(job_title, "Data Analy") ~ "Data Analysis",
            str_detect(job_title, "Machine Learning|ML") ~ "Machine Learning",
            str_detect(job_title, "Data Engineer") ~ "Data Engineering",
            .default = "Other"
        )
    )

# Spot-check the grouping on the first twenty rows
head(select(preprocessed_salaries, job_title, field), 20)

job_title,field
<chr>,<chr>
Data Scientist,Data Science
Machine Learning Scientist,Machine Learning
Big Data Engineer,Data Engineering
Product Data Analyst,Data Analysis
Machine Learning Engineer,Machine Learning
Data Analyst,Data Analysis
Lead Data Scientist,Data Science
Data Scientist,Data Science
Business Data Analyst,Data Analysis
Lead Data Engineer,Data Engineering


In [41]:
# Remove job_title and company_location
feat_salaries <- preprocessed_salaries %>%
    select(-job_title, -company_location)

# Convert all string parameters to factors
feat_salaries$company_size <- as.factor(feat_salaries$company_size)
feat_salaries$cont <- as.factor(feat_salaries$cont)
feat_salaries$field <- as.factor(feat_salaries$field)
head(feat_salaries)

experience_level,employment_type,salary_in_usd,remote_ratio,company_size,cont,field
<chr>,<chr>,<dbl>,<dbl>,<fct>,<fct>,<fct>
MI,FT,79833,0,L,eur,Data Science
SE,FT,260000,0,S,asia,Machine Learning
SE,FT,109024,50,M,eur,Data Engineering
MI,FT,20000,0,S,no_am,Data Analysis
SE,FT,150000,50,L,no_am,Machine Learning
EN,FT,72000,100,L,no_am,Data Analysis


In [6]:
# forward selection

In [7]:
# model comparison

## Discussion

## References