In [24]:
# adding required packages
import findspark
import pprint
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf

In [25]:
findspark.init()

In [26]:
# Local System Configuration
# Total Memory = 16GB
# Total Cores = 10

# Configure Spark settings
spark_conf = SparkConf()
spark_conf.set("spark.executor.instances", "4") # 2 instance per noce

# Set the number of executor cores
spark_conf.set("spark.executor.cores", "2")  # Use 2 cores per executor

# Set the executor memory
spark_conf.set("spark.executor.memory", "4g")  # Use 4GB memory per executor

# Set the driver memory
spark_conf.set("spark.driver.memory", "2g")    # Use 2GB memory for the driver

<pyspark.conf.SparkConf at 0x404ab07438>

In [27]:


# Create a SparkSession with the configured settings
spark = SparkSession.builder.config(conf=spark_conf).appName("MySparkApp").getOrCreate()


### Getting up current config

In [28]:
spark.sparkContext.getConf().getAll()

[('spark.executor.instances', '4'),
 ('spark.master', 'spark://master:7077'),
 ('spark.executor.memory', '4g'),
 ('spark.executor.id', 'driver'),
 ('spark.driver.host', 'b3ec55ddebbb'),
 ('spark.executor.cores', '2'),
 ('spark.rdd.compress', 'True'),
 ('spark.app.id', 'app-20230824161737-0000'),
 ('spark.driver.memory', '2g'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.app.name', 'MySparkApp'),
 ('spark.submit.deployMode', 'client'),
 ('spark.driver.port', '44173'),
 ('spark.ui.showConsoleProgress', 'true')]

In [29]:
# setting spark conf for analysis
spark.conf.set('spark.sql.repl.eagerEval.enabled',True)

### Read data

In [30]:
df = spark.read.csv("/dataset/nyc-jobs.csv", header=True, inferSchema=True)

In [31]:
#sample records
display(df.limit(10))

Job ID,Agency,Posting Type,# Of Positions,Business Title,Civil Service Title,Title Code No,Level,Job Category,Full-Time/Part-Time indicator,Salary Range From,Salary Range To,Salary Frequency,Work Location,Division/Work Unit,Job Description,Minimum Qual Requirements,Preferred Skills,Additional Information,To Apply,Hours/Shift,Work Location 1,Recruitment Contact,Residency Requirement,Posting Date,Post Until,Posting Updated,Process Date
87990,DEPARTMENT OF BUS...,Internal,1,Account Manager,CONTRACT REVIEWER...,40563,1,,,42405.0,65485.0,Annual,110 William St. N Y,Strategy & Analytics,Division of Econo...,"""1.	A baccalaurea...",all candidates m...,â€¢	Excellent int...,Salary range for ...,,,,,New York City res...,2011-06-24T00:00:...,,2011-06-24T00:00:...
97899,DEPARTMENT OF BUS...,Internal,1,EXECUTIVE DIRECTO...,ADMINISTRATIVE BU...,10009,M3,,F,60740.0,162014.0,Annual,110 William St. N Y,Tech Talent Pipeline,The New York City...,"""1. A baccalaurea...",,,In addition to ap...,,,,New York City res...,2012-01-26T00:00:...,,2012-01-26T00:00:...,2019-12-17T00:00:...
132292,NYC HOUSING AUTHO...,External,52,Maintenance Worke...,MAINTENANCE WORKER,90698,0,Maintenance & Ope...,F,51907.68,54580.32,Annual,Heating Mgt-Opera...,Management Servic...,Under direct supe...,"""1. Three years o...",mechanical,or construction ...,may be substitut...,all candidates m...,1. A High School...,1. A Motor Vehic...,"""Click the """"Appl...",,,,NYCHA has no resi...
132292,NYC HOUSING AUTHO...,Internal,52,Maintenance Worke...,MAINTENANCE WORKER,90698,0,Maintenance & Ope...,F,51907.68,54580.32,Annual,Heating Mgt-Opera...,Management Servic...,Under direct supe...,"""1. Three years o...",mechanical,or construction ...,may be substitut...,all candidates m...,1. A High School...,1. A Motor Vehic...,"""Click the """"Appl...",,,,NYCHA has no resi...
133921,NYC HOUSING AUTHO...,Internal,50,Temporary Painter,PAINTER,91830,0,Maintenance & Ope...,F,35.0,35.0,Hourly,DMP-Contract & An...,Dept of Managemen...,Responsibilities ...,1. Five years of ...,,SPECIAL NOTE: ...,"""Click the """"Appl...",,,,NYCHA has no resi...,2014-01-09T00:00:...,,2014-01-08T00:00:...,2019-12-17T00:00:...
133921,NYC HOUSING AUTHO...,External,50,Temporary Painter,PAINTER,91830,0,Maintenance & Ope...,F,35.0,35.0,Hourly,DMP-Contract & An...,Dept of Managemen...,Responsibilities ...,1. Five years of ...,,SPECIAL NOTE: ...,"""Click the """"Appl...",,,,NYCHA has no resi...,2014-01-09T00:00:...,,2014-01-08T00:00:...,2019-12-17T00:00:...
137433,DEPT OF HEALTH/ME...,Internal,1,Contract Analyst,PROCUREMENT ANALYST,12158,3,"Finance, Accounti...",F,50598.0,85053.0,Annual,42-09 28th Street,HIV Administration,** OPEN TO PERMAN...,"""1. A baccalaurea...",individuals must...,after meeting th...,either one year ...,at least one yea...,or spent perform...,Strong analytical...,,Apply online with...,,42-09 28th Street...,
138531,DEPT OF ENVIRONME...,Internal,1,Associate Chemist,ASSOCIATE CHEMIST,21822,2,Health Public Saf...,F,50623.0,75083.0,Annual,96-05 Horace Hard...,DWOC Labs-Lefrak,Working in the Di...,Qualification Req...,In order to apply...,,"""Click the """"Appl...",35 Hours per week...,96-05 Horace Hard...,,New York City res...,2013-12-20T00:00:...,,2014-07-25T00:00:...,2019-12-17T00:00:...
151131,NYC HOUSING AUTHO...,External,1,Cost Estimating M...,ADMINISTRATIVE ST...,1002D,0,"Engineering, Arch...",F,90000.0,110000.0,Annual,CP Cap Plan-Techn...,Capital Planning ...,Reporting to the ...,"""1. A master's de...",including the 18...,managerial,administrative o...,as described in ...,1. Five years of...,SPECIAL INSTRUCTI...,"""Click the """"Appl...",,,,NYCHA has no resi...
152738,LAW DEPARTMENT,Internal,1,Office Manager,CLERICAL ASSOCIATE,10251,3,Clerical & Admini...,F,30683.0,49707.0,Annual,"100 Church St., N.Y.",Appeals,Performs essentia...,Qualification Req...,Experience with L...,Candidates must b...,"""Please click the...",Monday through Fr...,,,New York City res...,2014-06-26T00:00:...,,2014-06-26T00:00:...,2019-12-17T00:00:...


### Data Exploration

In [32]:
#getting the schema
df.printSchema()

root
 |-- Job ID: integer (nullable = true)
 |-- Agency: string (nullable = true)
 |-- Posting Type: string (nullable = true)
 |-- # Of Positions: integer (nullable = true)
 |-- Business Title: string (nullable = true)
 |-- Civil Service Title: string (nullable = true)
 |-- Title Code No: string (nullable = true)
 |-- Level: string (nullable = true)
 |-- Job Category: string (nullable = true)
 |-- Full-Time/Part-Time indicator: string (nullable = true)
 |-- Salary Range From: double (nullable = true)
 |-- Salary Range To: double (nullable = true)
 |-- Salary Frequency: string (nullable = true)
 |-- Work Location: string (nullable = true)
 |-- Division/Work Unit: string (nullable = true)
 |-- Job Description: string (nullable = true)
 |-- Minimum Qual Requirements: string (nullable = true)
 |-- Preferred Skills: string (nullable = true)
 |-- Additional Information: string (nullable = true)
 |-- To Apply: string (nullable = true)
 |-- Hours/Shift: string (nullable = true)
 |-- Work Locat

In [33]:
# dataset count
df.count()

2946

In [34]:
# Listing the columns based on its type
col_type_dict = {}

for col_name, col_type in df.dtypes:
    if col_type in col_type_dict.keys():
        col_type_dict[col_type].append(col_name)
    else:
        col_type_dict[col_type] = [col_name]


for col_type, col_list in col_type_dict.items():
    if col_type.lower() == 'string':
        print(f"Character columns: {col_list}")
    elif col_type.lower() == 'int':
        print(f"Numerical columns: {col_list}")
    elif col_type.lower() == 'double':
        print(f"Decimal columns: {col_list}")
    else:
        print(f"Other Type {col_type} columns: {col_list}")
    print('============================================================================================================')
    

Numerical columns: ['Job ID', '# Of Positions']
Character columns: ['Agency', 'Posting Type', 'Business Title', 'Civil Service Title', 'Title Code No', 'Level', 'Job Category', 'Full-Time/Part-Time indicator', 'Salary Frequency', 'Work Location', 'Division/Work Unit', 'Job Description', 'Minimum Qual Requirements', 'Preferred Skills', 'Additional Information', 'To Apply', 'Hours/Shift', 'Work Location 1', 'Recruitment Contact', 'Residency Requirement', 'Posting Date', 'Post Until', 'Posting Updated', 'Process Date']
Decimal columns: ['Salary Range From', 'Salary Range To']


In [35]:
# Identify categorical columns
categorical_columns = []

for col_name in col_type_dict['string']:
    distinct_count = df.select(col_name).distinct().count()
    if distinct_count < df.count() * 0.3:  # Adjust threshold as needed
        categorical_columns.append(col_name)

print("Categorical columns:", categorical_columns)

Categorical columns: ['Agency', 'Posting Type', 'Civil Service Title', 'Title Code No', 'Level', 'Job Category', 'Full-Time/Part-Time indicator', 'Salary Frequency', 'Work Location', 'Division/Work Unit', 'Minimum Qual Requirements', 'Preferred Skills', 'Additional Information', 'To Apply', 'Hours/Shift', 'Work Location 1', 'Recruitment Contact', 'Residency Requirement', 'Posting Date', 'Post Until', 'Posting Updated', 'Process Date']
