# Data Science Job Postings on Glassdoor (Cleaned Dataset)

### Importing Dataset and Data Exploration

In [1]:
library(dplyr)

ERROR: Error: package or namespace load failed for 'dplyr' in loadNamespace(j <- i[[1L]], c(lib.loc, .libPaths()), versionCheck = vI[[j]]):
 there is no package called 'pillar'


In [None]:
# Load the job-postings CSV; the file's first row supplies the column names.
df <- read.csv(file = "Cleaned_DS_Jobs.csv", header = TRUE)

In [None]:
# Remove seven columns from the data frame, then display what remains.
df <- select(
  df,
  -c(
    "Salary.Estimate", "Job.Description", "Company.Name", "Industry",
    "Location", "Headquarters", "same_state"
  )
)

df


In [None]:
# Keep only rows with a known, non-zero Rating, then preview the first row.
# Subsetting with a bare `df$Rating != 0` would turn every row whose Rating
# is NA into a row consisting entirely of NA, so missing ratings are excluded
# explicitly instead.
# NOTE(review): 0 looks like a "no rating" placeholder -- confirm against the
# data source.
has_rating <- !is.na(df$Rating) & df$Rating != 0
df <- df[has_rating, ]

head(df, 1)

In [None]:
# Show the number of rows and columns currently in df.
dim(df)

In [None]:
# Print per-column summary statistics for df.
summary(df)

In [None]:
# Plot how avg_salary is distributed, requesting roughly 20 bins.
hist(
  df$avg_salary,
  breaks = 20,
  main = "Salary Distribution",
  xlab = "Average Salary",
  col = "blue"
)

In [None]:
# Store the natural log of avg_salary in a new column and plot its
# distribution with the same binning request as the raw-salary histogram.
df[["ln_avg_salary"]] <- log(df$avg_salary)
hist(
  df$ln_avg_salary,
  breaks = 20,
  main = "Log-transformed Average Salary Distribution",
  xlab = "Ln(Average Salary)",
  col = "blue"
)

In [None]:
# Display the structure of df: each column's name, type and first values.
str(df)


# Transforming Size to avg_employees

In [None]:
# Derive numeric employee-count bounds from the textual Size column.
# The parsing presumes values shaped like "<min> to <max> employees", plus an
# open-ended top bracket written as "<n>+ employees" -- TODO confirm against
# unique(df$Size).
size_range <- gsub(" employees", "", df$Size, fixed = TRUE)

# Strip the "+" of an open-ended bracket so both bounds parse as <n>; left in
# place, as.numeric() would fail and the largest bracket would be recorded
# with a minimum and maximum of 0.
size_range <- sub("+", "", size_range, fixed = TRUE)

# Lower bound: everything before the first space.
# Upper bound: everything after "to ". A value without " to " therefore
# yields the same number for both bounds.
# Non-numeric labels are expected to become NA here, so the "NAs introduced
# by coercion" warning is silenced deliberately for these two calls only.
df$min_employees <- suppressWarnings(as.numeric(sub(" .*", "", size_range)))
df$max_employees <- suppressWarnings(as.numeric(sub(".*to ", "", size_range)))

# Sizes that could not be parsed (special cases like "Unknown") get 0 for
# both bounds, so their average below is also 0.
df$min_employees[is.na(df$min_employees)] <- 0
df$max_employees[is.na(df$max_employees)] <- 0

# Midpoint of the bracket.
df$avg_employees <- (df$min_employees + df$max_employees) / 2

In [None]:
# Overwrite every average of exactly 0 with 10000, then preview the first row.
# NOTE(review): a 0 average can arise when neither bound could be parsed, so
# sizes that are genuinely unknown also end up as 10000 -- confirm that this
# is intended.
df$avg_employees <- replace(df$avg_employees, df$avg_employees == 0, 10000)

head(df, 1)

# Label Encoding (One-Hot Alternative Commented Out)

In [None]:
# List the distinct values found in the Type.of.ownership and Revenue columns.
unique(df$Type.of.ownership)
unique(df$Revenue)

In [None]:
# Columns whose values are converted to integer codes.
columns_to_encode <- c(
  "Type.of.ownership", "Sector", "Revenue",
  "job_state", "job_simp", "seniority"
)

# For each listed column, add a "<column>_encoded" column holding the integer
# code of each value's factor level. The source columns are left in place.
# NOTE(review): this is label encoding, not one-hot encoding. The codes follow
# factor()'s default level ordering and carry no numeric meaning of their own.
encoded_names <- paste0(columns_to_encode, "_encoded")
df[encoded_names] <- lapply(
  df[columns_to_encode],
  function(values) as.integer(factor(values))
)

# # One-hot encode the 'Type.of.ownership' column as an example
# df <- cbind(df, model.matrix(~ Type.of.ownership - 1, data = df))
# df <- cbind(df, model.matrix(~ Sector - 1, data = df))
# df <- cbind(df, model.matrix(~ Revenue - 1, data = df))
# df <- cbind(df, model.matrix(~ job_state - 1, data = df))
# df <- cbind(df, model.matrix(~ job_simp - 1, data = df))
# df <- cbind(df, model.matrix(~ seniority - 1, data = df))

In [None]:
# Keep only the numeric (integer or double) columns of df.
# vapply() always returns a logical vector, whereas sapply() returns an empty
# list for a zero-column data frame, which is not a valid column index.
numeric_df <- df[vapply(df, is.numeric, logical(1))]

In [None]:
library(ggplot2)
library(reshape2)

In [None]:
# Correlation between every pair of columns in numeric_df; each pair is
# computed from the rows where both values are present.
cor_matrix <- cor(x = numeric_df, use = "pairwise.complete.obs")

# Reshape the square matrix into long form for plotting: one row per cell.
cor_melted <- melt(data = cor_matrix)

In [None]:
# Correlation heatmap: one tile per variable pair, coloured and labelled by
# its correlation value.
ggplot(data = cor_melted, mapping = aes(x = Var1, y = Var2, fill = value)) +
  # White outlines separate neighbouring tiles.
  geom_tile(color = "white") +
  # Write the value, formatted to two decimals, on each tile.
  geom_text(mapping = aes(label = sprintf("%.2f", value)), vjust = 1) +
  # Diverging palette centred on a correlation of zero.
  scale_fill_gradient2(
    low = "blue",
    mid = "white",
    high = "red",
    midpoint = 0
  ) +
  theme_minimal() +
  # Tilt the x-axis labels so they stay readable, and drop axis titles and
  # gridlines.
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )
