---
title: Assignment 04
author:
  - name: Bhargavi Manyala
    affiliations:
      - id: bu
        name: Boston University
        city: Boston
        state: MA
number-sections: true
date: today
date-modified: today
date-format: long
format:
  html:
    theme: cerulean
    toc: true
    toc-depth: 2
engine: jupyter
jupyter: assignment-04-kernel
execute:
  echo: true
  eval: true
  output: true
  freeze: auto
---

# Load the Dataset

In [22]:
from pyspark.sql import SparkSession
import pandas as pd
import plotly.express as px
import plotly.io as pio
import numpy as np
from pyspark.sql.functions import col, pow
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.sql.functions import col, when, expr, trim, regexp_replace
from pyspark.sql.functions import col, sum
import matplotlib.pyplot as plt


# Seed NumPy's global random generator so NumPy-based randomness is repeatable.
np.random.seed(42)

# Default Plotly renderer string combining the notebook, connected-notebook
# and VS Code renderers.
pio.renderers.default = "notebook+notebook_connected+vscode"

# Start (or reuse) the Spark session used for the whole analysis.
spark = SparkSession.builder.appName("LightcastData").getOrCreate()

# CSV reader settings: the first row holds column names, column types are
# inferred from the data, quoted fields may span several lines, and a
# double quote is the escape character inside quoted fields.
csv_options = (
    ("header", "true"),
    ("inferSchema", "true"),
    ("multiLine", "true"),
    ("escape", "\""),
)

reader = spark.read
for option_name, option_value in csv_options:
    reader = reader.option(option_name, option_value)

# Load the raw Lightcast job postings.
df = reader.csv("data/lightcast_job_postings.csv")

# Register the raw data as the `job_postings` view for Spark SQL queries.
df.createOrReplaceTempView("job_postings")
#df.show(5)


                                                                                

# Feature Engineering

### Select the columns needed for analysis

In [23]:
# Columns carried into the exploratory analysis, grouped by role.
eda_cols = [
    # Numeric fields
    "SALARY", "MIN_YEARS_EXPERIENCE", "DURATION",
    # Flag fields
    "COMPANY_IS_STAFFING", "IS_INTERNSHIP",
    # Categorical fields
    "STATE_NAME", "REMOTE_TYPE_NAME", "EMPLOYMENT_TYPE_NAME",
    "MIN_EDULEVELS_NAME",
]

# Drop every row that has a null in any of the selected columns, then keep
# only those columns (in the order listed above).
df_eda = df.dropna(subset=eda_cols).select(eda_cols)






### Clean categorical columns 


In [24]:

# Recode the categorical columns of `df_eda` into short, consistent labels.
#
# Each column is rewritten with a when/otherwise chain: known raw labels are
# mapped to a short label, nulls get a default, and any other value is kept
# unchanged. The isNull() branches are defensive fallbacks: the earlier
# dropna(subset=eda_cols) already removes rows with nulls in these columns,
# so they only fire if that filter is relaxed.

# Clean Remote Type Name

df_eda = df_eda.withColumn(
    "REMOTE_TYPE_NAME",
    when(col("REMOTE_TYPE_NAME") == "Remote", "Remote")
    .when(col("REMOTE_TYPE_NAME") == "[None]", "Undefined")
    .when(col("REMOTE_TYPE_NAME") == "Not Remote", "On Premise")
    .when(col("REMOTE_TYPE_NAME") == "Hybrid Remote", "Hybrid")
    .when(col("REMOTE_TYPE_NAME").isNull(), "On Premise")
    .otherwise(col("REMOTE_TYPE_NAME"))
)

# Clean Employment Type Name
#
# The raw part-time label contains a "less than or equal" sign whose exact
# characters depend on how the CSV bytes were decoded (a correctly decoded
# sign vs. a mis-decoded multi-character sequence). Matching it with an
# anchored pattern instead of a hard-coded literal keeps the recode working
# in both cases. "Part-time / full-time" has no parenthesis, so it cannot
# match this pattern.

df_eda = df_eda.withColumn(
    "EMPLOYMENT_TYPE_NAME",
    when(col("EMPLOYMENT_TYPE_NAME") == "Part-time / full-time", "Flexible")
    .when(
        col("EMPLOYMENT_TYPE_NAME").rlike(r"^Part-time \(.* 32 hours\)$"),
        "Parttime",
    )
    .when(col("EMPLOYMENT_TYPE_NAME") == "Full-time (> 32 hours)", "Fulltime")
    .when(col("EMPLOYMENT_TYPE_NAME").isNull(), "Fulltime")
    .otherwise(col("EMPLOYMENT_TYPE_NAME"))
)


# Clean Education level Name

df_eda = df_eda.withColumn(
    "MIN_EDULEVELS_NAME",
    when(col("MIN_EDULEVELS_NAME") == "Bachelor's degree", "Bachelor's")
    .when(col("MIN_EDULEVELS_NAME") == "Ph.D. or professional degree", "PhD")
    .when(col("MIN_EDULEVELS_NAME") == "High school or GED", "High School")
    .when(col("MIN_EDULEVELS_NAME") == "Master's degree", "Master's")
    .when(col("MIN_EDULEVELS_NAME") == "No Education Listed", "None")
    .when(col("MIN_EDULEVELS_NAME") == "Associate degree", "Associate")
    .when(col("MIN_EDULEVELS_NAME").isNull(), "None")
    .otherwise(col("MIN_EDULEVELS_NAME"))
)

# Clean Company Staffing: a missing flag is treated as False.

df_eda = df_eda.withColumn(
    "COMPANY_IS_STAFFING",
    when(col("COMPANY_IS_STAFFING").isNull(), False)
    .otherwise(col("COMPANY_IS_STAFFING"))
)

# Clean Internship: a missing flag is treated as False.

df_eda = df_eda.withColumn(
    "IS_INTERNSHIP",
    when(col("IS_INTERNSHIP").isNull(), False)
    .otherwise(col("IS_INTERNSHIP"))
)

# Categorical columns whose cleaned values are inspected below.
categorical_cols = [
    "REMOTE_TYPE_NAME",
    "EMPLOYMENT_TYPE_NAME",
    "MIN_EDULEVELS_NAME",
    "COMPANY_IS_STAFFING",
    "IS_INTERNSHIP",
]

# Print a header for each column followed by its distinct values, shown
# without truncation, as a quick check of the recoding.
for colname in categorical_cols:
    header = f"\n---- {colname} ----"
    print(header)
    distinct_values = df_eda.select(colname).distinct()
    distinct_values.show(truncate=False)




---- REMOTE_TYPE_NAME ----


                                                                                

+----------------+
|REMOTE_TYPE_NAME|
+----------------+
|Remote          |
|On Premise      |
|Hybrid          |
|Undefined       |
+----------------+


---- EMPLOYMENT_TYPE_NAME ----


                                                                                

+--------------------+
|EMPLOYMENT_TYPE_NAME|
+--------------------+
|Flexible            |
|Fulltime            |
|Parttime            |
+--------------------+


---- MIN_EDULEVELS_NAME ----


                                                                                

+------------------+
|MIN_EDULEVELS_NAME|
+------------------+
|High School       |
|None              |
|Associate         |
|PhD               |
|Bachelor's        |
|Master's          |
+------------------+


---- COMPANY_IS_STAFFING ----


                                                                                

+-------------------+
|COMPANY_IS_STAFFING|
+-------------------+
|true               |
|false              |
+-------------------+


---- IS_INTERNSHIP ----


[Stage 206:>                                                        (0 + 1) / 1]

+-------------+
|IS_INTERNSHIP|
+-------------+
|true         |
|false        |
+-------------+



                                                                                