In [1]:
# adding required packages
import findspark
import pprint
import matplotlib.pyplot as plt

In [2]:
# Make pyspark importable as a regular library.
# This has to run before the cell that imports from pyspark.
findspark.init()

In [14]:
#importing pyspark related package
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark.sql.functions import col, when, avg, round, rank, isnan
from pyspark.sql.window import Window

In [28]:
%run /notebook/dataproduct/assesment_nyc_job_posting/lib/data_profiling_transform.ipynb

Data profile Function Imported


In [6]:
%run /notebook/dataproduct/assesment_nyc_job_posting/lib/kpi_transform.ipynb

KPI Transformations Imported


In [7]:
%run /notebook/dataproduct/assesment_nyc_job_posting/utils/spark_session.ipynb

Spark Session Imported


In [8]:
# Local System Configuration
# Total Memory = 16GB
# Total Cores = 10

def get_spark_conf():
    """Build the SparkConf used to create this notebook's Spark session.

    Returns:
        SparkConf: a new configuration object with the executor count,
        executor cores, executor memory and driver memory set (all values
        are passed as strings).
    """
    # Resource settings, applied in the order listed.
    resource_settings = {
        "spark.executor.instances": "4",  # number of executors requested
        "spark.executor.cores": "2",      # cores per executor
        "spark.executor.memory": "1g",    # memory per executor
        "spark.driver.memory": "2g",      # memory for the driver
    }

    conf = SparkConf()
    for key, value in resource_settings.items():
        conf.set(key, value)

    return conf

In [9]:
# # Listing the columns based on its type
# def get_col_type_dict(df):
#     col_type_dict = {}

#     for col_name, col_type in df.dtypes:
#         if col_type in col_type_dict.keys():
#             col_type_dict[col_type].append(col_name)
#         else:
#             col_type_dict[col_type] = [col_name]

#     return col_type_dict

In [21]:
# Spark SQL type names, as reported by ``df.dtypes``, that are treated as numeric.
_NUMERIC_DTYPES = frozenset({"tinyint", "smallint", "int", "bigint", "float", "double"})


def _is_numeric_dtype(col_type):
    """Return True when a ``df.dtypes`` type string names a numeric column type."""
    # Decimal columns carry precision/scale in the name, e.g. "decimal(10,2)".
    return col_type in _NUMERIC_DTYPES or col_type.startswith("decimal")


def data_profile(df):
    """Print a profiling report for a Spark DataFrame.

    The report consists of the schema, a 10-row preview, the total record
    count, missing-value counts, summary statistics for every numeric
    column, and the top values of each column that
    ``profile_categorical_column`` flags as categorical.

    Args:
        df: the Spark DataFrame to profile.

    Returns:
        None. Everything is emitted through ``print``/``display``/``show``.
    """
    print("Starting Data Profiling")
    print("Schema of the dataset")

    # Getting the schema
    df.printSchema()

    # Display the first few rows of the DataFrame
    display(df.limit(10))

    # Getting counts
    total_count = df.count()
    print(f"Total Records: {total_count}")

    print("Getting missing value counts")
    missing_value_counts = calculate_missing_value_counts(df)
    display(missing_value_counts)

    print("Getting numerical status")
    # Cover every numeric Spark type, not only int/double/float: schema
    # inference can produce bigint (long) or decimal columns, which would
    # otherwise be silently left out of the summary statistics.
    numerical_columns = [
        col_name for col_name, col_type in df.dtypes if _is_numeric_dtype(col_type)
    ]
    summary_stats = calculate_summary_stats(df, numerical_columns)
    display(summary_stats)

    print("Getting categorical columns")
    # Most of the time a categorical column is a string, but for this analysis
    # any column type may qualify. The decision (including any distinct-value
    # threshold) is made inside profile_categorical_column.
    for col_name in df.columns:
        is_categorical_cond, distinct_values, top_values = profile_categorical_column(df, col_name)
        if is_categorical_cond:
            print(f"Column: {col_name}")
            print(f"Distinct Values: {distinct_values}")
            print("Top Values:")
            top_values.show(truncate=False)



In [11]:
# Multipliers used to convert a salary quoted per period into an annual figure.
_HOURS_PER_YEAR = 2080  # assuming 2080 work hours per year
_WEEKS_PER_YEAR = 52
_WORKDAYS_PER_YEAR = 260  # assuming 5 workdays per week


def _annualized_salary(salary_column):
    """Build a column expression converting ``salary_column`` to an annual amount.

    The multiplier is chosen from the row's "Salary Frequency" value
    (Hourly, Weekly or Daily); any other frequency leaves the amount as is.
    """
    frequency = col("Salary Frequency")
    amount = col(salary_column)
    return (
        when(frequency == "Hourly", amount * _HOURS_PER_YEAR)
        .when(frequency == "Weekly", amount * _WEEKS_PER_YEAR)
        .when(frequency == "Daily", amount * _WORKDAYS_PER_YEAR)
        .otherwise(amount)
    )


def show_kpi(df):
    """Compute and print the KPI tables for the job-postings DataFrame.

    Adds "Annual Salary From" and "Annual Salary To" columns so that every
    salary is expressed at the same (annual) frequency, then shows the
    result of each KPI transformation.

    Args:
        df: Spark DataFrame holding the "Salary Frequency",
            "Salary Range From" and "Salary Range To" columns, plus whatever
            columns the KPI helper functions read.

    Returns:
        None. Each KPI DataFrame is printed with ``show``.
    """
    # Salary range is linked with salary frequency.
    # Any salary-based metric needs all salaries at the same frequency.
    print("Preparing Data for KPI")
    kpi_df = (
        df.withColumn("Annual Salary From", _annualized_salary("Salary Range From"))
        .withColumn("Annual Salary To", _annualized_salary("Salary Range To"))
    )

    print("Getting KPIs")
    print("Top 10 jobs posting per category")
    category_counts = get_top10_job_posting_per_cat(kpi_df)
    category_counts.show(truncate=False)

    print("The salary distribution per job category")
    salary_distribution = get_sal_dist_per_cat(kpi_df)
    salary_distribution.show(truncate=False)

    print("The job posting having the highest salary per agency")
    max_sal_per_agency_df = get_highes_sal_per_cat(kpi_df)
    max_sal_per_agency_df.show(truncate=False)

    # KPI 5: What is the job postings' average salary per agency for the last 2 years?
    print("The average salary per agency for the last 2 years")
    last_n_year = 2
    avg_salary_df = get_avg_sal_per_agency_last_n_year(kpi_df, last_n_year)
    avg_salary_df.show(truncate=False)

    # TODO: KPI 6 (What are the highest paid skills in the US market?) is not implemented yet.

In [12]:
def main():
    """Run the NYC job-posting assessment: set up Spark, load the CSV, profile it.

    Creates the Spark session from ``get_spark_conf``, prints the effective
    Spark configuration, reads the job-postings CSV with schema inference and
    passes the resulting DataFrame to ``data_profile``.

    Returns:
        None.
    """
    job_name = 'nyc_assesment'

    # Setting spark conf before creating spark session
    spark_conf = get_spark_conf()

    # Create a SparkSession with the configured settings
    spark = get_spark_session(spark_conf, job_name)

    # Listing all the spark conf. Inside a function a bare expression is
    # discarded, so the list has to be printed explicitly.
    pprint.pprint(spark.sparkContext.getConf().getAll())

    # Setting spark conf for analysis
    spark.conf.set('spark.sql.repl.eagerEval.enabled', True)

    # Reading dataset.
    # The escape character was added after data profiling the data.
    df = spark.read.csv("/dataset/nyc-jobs.csv", header=True, inferSchema=True, escape='"')

    # Reducing the shuffle partitions to 4:
    # reason 1: the data size is very small
    # reason 2: to use all the available cores
    spark.conf.set('spark.sql.shuffle.partitions', 4)

    # Creating data profile
    data_profile(df)

    # Preparing data for KPI
    # TODO: needs to be implemented

    # Writing prepared and cleaned data
    # TODO: needs to be implemented

    # Showing KPIs
    #show_kpi(df)

In [29]:
# Run the pipeline: Spark session setup, CSV load and data profiling.
main()

Starting Data Profiling
Schema of the dataset
root
 |-- Job ID: integer (nullable = true)
 |-- Agency: string (nullable = true)
 |-- Posting Type: string (nullable = true)
 |-- # Of Positions: integer (nullable = true)
 |-- Business Title: string (nullable = true)
 |-- Civil Service Title: string (nullable = true)
 |-- Title Code No: string (nullable = true)
 |-- Level: string (nullable = true)
 |-- Job Category: string (nullable = true)
 |-- Full-Time/Part-Time indicator: string (nullable = true)
 |-- Salary Range From: double (nullable = true)
 |-- Salary Range To: double (nullable = true)
 |-- Salary Frequency: string (nullable = true)
 |-- Work Location: string (nullable = true)
 |-- Division/Work Unit: string (nullable = true)
 |-- Job Description: string (nullable = true)
 |-- Minimum Qual Requirements: string (nullable = true)
 |-- Preferred Skills: string (nullable = true)
 |-- Additional Information: string (nullable = true)
 |-- To Apply: string (nullable = true)
 |-- Hours/S

Job ID,Agency,Posting Type,# Of Positions,Business Title,Civil Service Title,Title Code No,Level,Job Category,Full-Time/Part-Time indicator,Salary Range From,Salary Range To,Salary Frequency,Work Location,Division/Work Unit,Job Description,Minimum Qual Requirements,Preferred Skills,Additional Information,To Apply,Hours/Shift,Work Location 1,Recruitment Contact,Residency Requirement,Posting Date,Post Until,Posting Updated,Process Date
87990,DEPARTMENT OF BUS...,Internal,1,Account Manager,CONTRACT REVIEWER...,40563,1,,,42405.0,65485.0,Annual,110 William St. N Y,Strategy & Analytics,Division of Econo...,1.	A baccalaureat...,â€¢	Excellent int...,Salary range for ...,,,,,New York City res...,2011-06-24 00:00:00,,2011-06-24 00:00:00,2019-12-17 00:00:00
97899,DEPARTMENT OF BUS...,Internal,1,EXECUTIVE DIRECTO...,ADMINISTRATIVE BU...,10009,M3,,F,60740.0,162014.0,Annual,110 William St. N Y,Tech Talent Pipeline,The New York City...,1. A baccalaureat...,,,In addition to ap...,,,,New York City res...,2012-01-26 00:00:00,,2012-01-26 00:00:00,2019-12-17 00:00:00
132292,NYC HOUSING AUTHO...,External,52,Maintenance Worke...,MAINTENANCE WORKER,90698,0,Maintenance & Ope...,F,51907.68,54580.32,Annual,Heating Mgt-Opera...,Management Servic...,Under direct supe...,1. Three years of...,1. A High School...,1. A Motor Vehic...,"Click the ""Apply ...",,,,NYCHA has no resi...,2013-10-24 00:00:00,,2013-12-12 00:00:00,2019-12-17 00:00:00
132292,NYC HOUSING AUTHO...,Internal,52,Maintenance Worke...,MAINTENANCE WORKER,90698,0,Maintenance & Ope...,F,51907.68,54580.32,Annual,Heating Mgt-Opera...,Management Servic...,Under direct supe...,1. Three years of...,1. A High School...,1. A Motor Vehic...,"Click the ""Apply ...",,,,NYCHA has no resi...,2013-10-24 00:00:00,,2013-12-12 00:00:00,2019-12-17 00:00:00
133921,NYC HOUSING AUTHO...,Internal,50,Temporary Painter,PAINTER,91830,0,Maintenance & Ope...,F,35.0,35.0,Hourly,DMP-Contract & An...,Dept of Managemen...,Responsibilities ...,1. Five years of ...,,SPECIAL NOTE: ...,"Click the ""Apply ...",,,,NYCHA has no resi...,2014-01-09 00:00:00,,2014-01-08 00:00:00,2019-12-17 00:00:00
133921,NYC HOUSING AUTHO...,External,50,Temporary Painter,PAINTER,91830,0,Maintenance & Ope...,F,35.0,35.0,Hourly,DMP-Contract & An...,Dept of Managemen...,Responsibilities ...,1. Five years of ...,,SPECIAL NOTE: ...,"Click the ""Apply ...",,,,NYCHA has no resi...,2014-01-09 00:00:00,,2014-01-08 00:00:00,2019-12-17 00:00:00
137433,DEPT OF HEALTH/ME...,Internal,1,Contract Analyst,PROCUREMENT ANALYST,12158,3,"Finance, Accounti...",F,50598.0,85053.0,Annual,42-09 28th Street,HIV Administration,** OPEN TO PERMAN...,1. A baccalaureat...,Strong analytical...,,Apply online with...,,42-09 28th Street...,,New York City res...,2013-12-09 00:00:00,,2013-12-09 00:00:00,2019-12-17 00:00:00
138531,DEPT OF ENVIRONME...,Internal,1,Associate Chemist,ASSOCIATE CHEMIST,21822,2,Health Public Saf...,F,50623.0,75083.0,Annual,96-05 Horace Hard...,DWOC Labs-Lefrak,Working in the Di...,Qualification Req...,In order to apply...,,"Click the ""Apply ...",35 Hours per week...,96-05 Horace Hard...,,New York City res...,2013-12-20 00:00:00,,2014-07-25 00:00:00,2019-12-17 00:00:00
151131,NYC HOUSING AUTHO...,External,1,Cost Estimating M...,ADMINISTRATIVE ST...,1002D,0,"Engineering, Arch...",F,90000.0,110000.0,Annual,CP Cap Plan-Techn...,Capital Planning ...,Reporting to the ...,1. A master's deg...,1. Five years of...,SPECIAL INSTRUCTI...,"Click the ""Apply ...",,,,NYCHA has no resi...,2014-06-25 00:00:00,,2014-07-07 00:00:00,2019-12-17 00:00:00
152738,LAW DEPARTMENT,Internal,1,Office Manager,CLERICAL ASSOCIATE,10251,3,Clerical & Admini...,F,30683.0,49707.0,Annual,"100 Church St., N.Y.",Appeals,Performs essentia...,Qualification Req...,Experience with L...,Candidates must b...,Please click the ...,Monday through Fr...,,,New York City res...,2014-06-26 00:00:00,,2014-06-26 00:00:00,2019-12-17 00:00:00


Getting categorical columns
Column: Posting Type
Distinct Values: 2
Top Values:
+------------+-----+
|Posting Type|count|
+------------+-----+
|Internal    |1684 |
|External    |1262 |
+------------+-----+

Column: Full-Time/Part-Time indicator
Distinct Values: 3
Top Values:
+-----------------------------+-----+
|Full-Time/Part-Time indicator|count|
+-----------------------------+-----+
|F                            |2625 |
|null                         |195  |
|P                            |126  |
+-----------------------------+-----+

Column: Salary Frequency
Distinct Values: 3
Top Values:
+----------------+-----+
|Salary Frequency|count|
+----------------+-----+
|Annual          |2712 |
|Hourly          |195  |
|Daily           |39   |
+----------------+-----+

Column: Recruitment Contact
Distinct Values: 1
Top Values:
+-------------------+-----+
|Recruitment Contact|count|
+-------------------+-----+
|null               |2946 |
+-------------------+-----+

Column: Process Date
Dist