In [23]:
# adding required packages
import findspark
import pprint
import matplotlib.pyplot as plt

In [24]:
#To make pyspark importable as a regular library
findspark.init()

In [25]:
#importing pyspark related package
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark.sql.functions import col, when, avg, round, rank
from pyspark.sql.window import Window
from pyspark.sql.types import ArrayType, StringType

In [26]:
import pyspark.sql.functions as f

In [27]:
%run /notebook/dataproduct/assesment_nyc_job_posting/utils/spark_session.ipynb

Spark Session Imported


In [28]:
# Local System Configuration
# Total Memory = 16GB
# Total Cores = 10

def get_spark_conf():
    # Configure Spark settings
    spark_conf = SparkConf()
    spark_conf.set("spark.executor.instances", "4") # 4 instance per node

    # Set the number of executor cores
    spark_conf.set("spark.executor.cores", "1")  # Use 1 cores per executor

    # Set the executor memory
    spark_conf.set("spark.executor.memory", "1g")  # Use 1GB memory per executor

    # Set the driver memory
    spark_conf.set("spark.driver.memory", "2g")    # Use 2GB memory for the driver
    
    return spark_conf

In [29]:
#setting spark conf before creating spark session
spark_conf = get_spark_conf()

# Create a SparkSession with the configured settings
spark = SparkSession.builder.config(conf=spark_conf).appName("MySparkApp").getOrCreate()

# Listing all the spark conf
spark.sparkContext.getConf().getAll()

# setting spark conf for analysis
spark.conf.set('spark.sql.repl.eagerEval.enabled',True)

#reading dataset
df = spark.read.csv("/dataset/nyc-jobs.csv", header=True, inferSchema=True, escape='"')

# reducing the shuffle partition to 4 
# reason 1 data size is very less
# reason 2 to use all the availble cores
spark.conf.set('spark.sql.shuffle.partitions',4)

In [31]:
df.select('Minimum Qual Requirements').show(truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [34]:
# Analysis 3: Correlation between higher degree and salary
degree_salary_correlation = df.select("Minimum Qual Requirements", "Salary Range From", "Salary Range To").na.drop().withColumn(
    "HasHigherDegree",
    when(f.lower(col("Minimum Qual Requirements")).contains("master") | f.lower(col("Minimum Qual Requirements")).contains("ph.d."), 1).otherwise(0)
).withColumn(
    "AvgSalary",
    (col("Salary Range From") + col("Salary Range To")) / 2
).select("HasHigherDegree", "AvgSalary").stat.corr("HasHigherDegree", "AvgSalary")

In [33]:
degree_salary_correlation

0.11692069998655688

In [35]:
degree_salary_correlation

0.26861355953871074

In [22]:
spark.stop()