In [51]:
# adding required packages
import findspark
import pprint
import matplotlib.pyplot as plt

In [52]:
# Make pyspark importable as a regular library in this kernel.
# This has to run before the `pyspark` imports in the next cell.
findspark.init()

In [53]:
#importing pyspark related package
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark.sql.functions import col, when, avg, round, rank, isnan, current_timestamp, date_sub
from pyspark.sql.window import Window

In [54]:
%run /notebook/dataproduct/assesment_nyc_job_posting/lib/data_profiling_transform.ipynb

Data profile Function Imported


In [55]:
%run /notebook/dataproduct/assesment_nyc_job_posting/lib/kpi_transform.ipynb

KPI Transformations Imported


In [56]:
%run /notebook/dataproduct/assesment_nyc_job_posting/utils/spark_session.ipynb

Spark Session Imported


In [57]:
# Local System Configuration
# Total Memory = 16GB
# Total Cores = 10

def get_spark_conf():
    """Return a SparkConf pre-populated with this notebook's resource settings.

    The following keys are set, in this order:

    * ``spark.executor.instances`` = ``"4"``
    * ``spark.executor.cores``     = ``"2"``
    * ``spark.executor.memory``    = ``"1g"``
    * ``spark.driver.memory``      = ``"2g"``

    Returns:
        SparkConf: a freshly created configuration carrying the settings above.
    """
    resource_settings = (
        ("spark.executor.instances", "4"),  # number of executors requested
        ("spark.executor.cores", "2"),      # cores per executor
        ("spark.executor.memory", "1g"),    # memory per executor
        ("spark.driver.memory", "2g"),      # memory for the driver
    )

    conf = SparkConf()
    for key, value in resource_settings:
        conf.set(key, value)
    return conf

In [58]:
# # Listing the columns based on its type
# def get_col_type_dict(df):
#     col_type_dict = {}

#     for col_name, col_type in df.dtypes:
#         if col_type in col_type_dict.keys():
#             col_type_dict[col_type].append(col_name)
#         else:
#             col_type_dict[col_type] = [col_name]

#     return col_type_dict

In [59]:
def data_profile(df):
    """Print a profiling report for a Spark DataFrame.

    The report consists of, in order: the schema, a 10-row preview, the total
    record count, per-column missing-value counts, summary statistics for the
    numeric columns, and the distinct/top values of every column that
    ``profile_categorical_column`` flags as categorical.

    Args:
        df: Spark DataFrame to profile.

    Returns:
        None. Everything is emitted through ``print`` / ``display`` / ``show``.
    """
    print("Starting Data Profiling")
    print("Schema of the dataset")

    # Getting the schema
    df.printSchema()

    # Display the first few rows of the DataFrame
    display(df.limit(10))

    # Getting counts
    total_count = df.count()
    print(f"Total Records: {total_count}")

    print("Getting missing value counts")
    missing_value_counts = calculate_missing_value_counts(df)
    missing_value_counts.show(vertical=True)

    print("Getting numerical status")
    # `df.dtypes` reports Spark's short type names. Cover every numeric type,
    # not only int/double/float, so bigint/smallint/tinyint and decimal(p,s)
    # columns are not silently left out of the summary.
    numeric_dtypes = ("tinyint", "smallint", "int", "bigint", "float", "double")
    numerical_columns = [
        col_name
        for col_name, col_type in df.dtypes
        if col_type in numeric_dtypes or col_type.startswith("decimal")
    ]
    summary_stats = calculate_summary_stats(df, numerical_columns)
    display(summary_stats)

    print("Getting categorical columns")
    # Most of the time a categorical column is a string, but for this analysis
    # any column type may be categorical, so every column is checked.
    # NOTE(review): the distinct-value threshold is decided inside
    # profile_categorical_column; nothing is passed from here - confirm the
    # helper's default matches the intended cut-off.
    for col_name in df.columns:
        is_categorical_cond, distinct_values, top_values = profile_categorical_column(df, col_name)
        if is_categorical_cond:
            print(f"Column: {col_name}")
            print(f"Distinct Values: {distinct_values}")
            print("Top Values:")
            top_values.show(truncate=False)



In [60]:
def show_kpi(df):
    """Derive annualised salary columns and print each KPI table.

    Salary ranges are expressed per "Salary Frequency", so both bounds are
    first converted to a yearly figure ("Annual Salary From" / "Annual Salary
    To") before the KPI helpers are run on the enriched frame.

    Args:
        df: Spark DataFrame with "Salary Frequency", "Salary Range From" and
            "Salary Range To" columns.

    Returns:
        None. Each KPI result is printed with ``show(truncate=False)``.
    """
    print("Preparing Data for KPI")

    frequency = col("Salary Frequency")

    def annualized(bound_column):
        # Scale one salary bound to a yearly amount based on the pay frequency.
        amount = col(bound_column)
        return (
            when(frequency == "Hourly", amount * 2080)   # assuming 2080 work hours per year
            .when(frequency == "Weekly", amount * 52)
            .when(frequency == "Daily", amount * 260)    # assuming 5 workdays per week
            .otherwise(amount)                           # any other frequency is used as-is
        )

    kpi_df = (
        df.withColumn("Annual Salary From", annualized("Salary Range From"))
          .withColumn("Annual Salary To", annualized("Salary Range To"))
    )

    print("Getting KPIs")

    print("Top 10 jobs posting per category")
    get_top10_job_posting_per_cat(kpi_df).show(truncate=False)

    print("The salary distribution per job category")
    get_sal_dist_per_cat(kpi_df).show(truncate=False)

    print("The job posting having the highest salary per agency")
    get_highes_sal_per_cat(kpi_df).show(truncate=False)

    # KPI 5: average job-posting salary per agency over the last 2 years
    print("The average salary per agency for the last 2 years")
    lookback_years = 2
    get_avg_sal_per_agency_last_n_year(kpi_df, lookback_years).show(truncate=False)

    # KPI 6 (not implemented yet): what are the highest paid skills in the US market?

In [61]:
def main(dataset_path="/dataset/nyc-jobs.csv"):
    """Run the NYC job-posting assessment: load, profile, and show KPIs.

    Args:
        dataset_path: Location of the NYC jobs CSV. Defaults to the path this
            notebook has always read from.

    Returns:
        None.

    Raises:
        Whatever the profiling / KPI steps raise; the Spark session is stopped
        before the exception propagates.
    """
    job_name = 'nyc_assesment'

    # Setting spark conf before creating spark session
    spark_conf = get_spark_conf()

    # Create a SparkSession with the configured settings
    spark = get_spark_session(spark_conf, job_name)

    try:
        # Listing all the spark conf. A bare expression inside a function is
        # never displayed, so print it explicitly.
        pprint.pprint(spark.sparkContext.getConf().getAll())

        # Setting spark conf for analysis
        spark.conf.set('spark.sql.repl.eagerEval.enabled', True)

        # Reading dataset.
        # The escape character was added after data profiling showed quoted
        # fields containing embedded double quotes.
        df = spark.read.csv(dataset_path, header=True, inferSchema=True, escape='"')

        # Reducing the shuffle partitions to 4:
        #   reason 1 - the data size is very small
        #   reason 2 - to use all the available cores
        spark.conf.set('spark.sql.shuffle.partitions', 4)

        # Creating data profile
        data_profile(df)

        # TODO: prepare/clean data for KPI
        # TODO: write prepared and cleaned data

        # Showing KPIs
        show_kpi(df)
    finally:
        # Always release the Spark session, even when a step above fails.
        spark.stop()

In [62]:
# Run the whole pipeline: build the session, profile the dataset, print the KPIs.
main()

Starting Data Profiling
Schema of the dataset
root
 |-- Job ID: integer (nullable = true)
 |-- Agency: string (nullable = true)
 |-- Posting Type: string (nullable = true)
 |-- # Of Positions: integer (nullable = true)
 |-- Business Title: string (nullable = true)
 |-- Civil Service Title: string (nullable = true)
 |-- Title Code No: string (nullable = true)
 |-- Level: string (nullable = true)
 |-- Job Category: string (nullable = true)
 |-- Full-Time/Part-Time indicator: string (nullable = true)
 |-- Salary Range From: double (nullable = true)
 |-- Salary Range To: double (nullable = true)
 |-- Salary Frequency: string (nullable = true)
 |-- Work Location: string (nullable = true)
 |-- Division/Work Unit: string (nullable = true)
 |-- Job Description: string (nullable = true)
 |-- Minimum Qual Requirements: string (nullable = true)
 |-- Preferred Skills: string (nullable = true)
 |-- Additional Information: string (nullable = true)
 |-- To Apply: string (nullable = true)
 |-- Hours/S

Job ID,Agency,Posting Type,# Of Positions,Business Title,Civil Service Title,Title Code No,Level,Job Category,Full-Time/Part-Time indicator,Salary Range From,Salary Range To,Salary Frequency,Work Location,Division/Work Unit,Job Description,Minimum Qual Requirements,Preferred Skills,Additional Information,To Apply,Hours/Shift,Work Location 1,Recruitment Contact,Residency Requirement,Posting Date,Post Until,Posting Updated,Process Date
381492,NYC HOUSING AUTHO...,Internal,2,Community Associate,COMMUNITY ASSOCIATE,56057,0,"Policy, Research ...",,52000.0,61936.0,Annual,Visual Assessment...,Lead Hazard Contr...,The New York City...,Qualification Req...,1. Experience wi...,NYCHA employees a...,"Click the ""Apply ...",,,,NYCHA has no resi...,2019-05-10 00:00:00,,2019-07-05 00:00:00,2019-12-17 00:00:00
381492,NYC HOUSING AUTHO...,External,2,Community Associate,COMMUNITY ASSOCIATE,56057,0,"Policy, Research ...",,52000.0,61936.0,Annual,Visual Assessment...,Lead Hazard Contr...,The New York City...,Qualification Req...,1. Experience wi...,NYCHA employees a...,"Click the ""Apply ...",,,,NYCHA has no resi...,2019-05-10 00:00:00,,2019-07-05 00:00:00,2019-12-17 00:00:00
381752,DEPT OF ENVIRONME...,Internal,1,Associate Public ...,ASSOCIATE PUBLIC ...,31220,1,Health Public Saf...,F,58677.0,91199.0,Annual,96-05 Horace Hard...,Environmental Hea...,The NYC Departmen...,1. A baccalaureat...,â€¢ Excellent int...,Appointments are ...,"Click ""Apply Now""...",40 hours per week...,Owls Head Wastewa...,,New York City res...,2019-01-29 00:00:00,,2019-10-07 00:00:00,2019-12-17 00:00:00
381752,DEPT OF ENVIRONME...,External,1,Associate Public ...,ASSOCIATE PUBLIC ...,31220,1,Health Public Saf...,F,58677.0,91199.0,Annual,96-05 Horace Hard...,Environmental Hea...,The NYC Departmen...,1. A baccalaureat...,â€¢ Excellent int...,Appointments are ...,"Click ""Apply Now""...",40 hours per week...,Owls Head Wastewa...,,New York City res...,2019-01-29 00:00:00,,2019-10-07 00:00:00,2019-12-17 00:00:00
381848,NYC HOUSING AUTHO...,Internal,1,"Director, Lead Ha...",DIRECTOR OF CONTR...,80287,M4,"Public Safety, In...",,78574.0,202744.0,Annual,Lead Hazard - Off...,Lead Hazard Contr...,Please read this ...,A Baccalaureate d...,â€¢ Integrity â€“...,NYCHA employees a...,"Click the ""Apply ...",,,,NYCHA has no resi...,2019-01-29 00:00:00,,2019-04-24 00:00:00,2019-12-17 00:00:00
381848,NYC HOUSING AUTHO...,External,1,"Director, Lead Ha...",DIRECTOR OF CONTR...,80287,M4,"Public Safety, In...",,78574.0,202744.0,Annual,Lead Hazard - Off...,Lead Hazard Contr...,Please read this ...,A Baccalaureate d...,â€¢ Integrity â€“...,NYCHA employees a...,"Click the ""Apply ...",,,,NYCHA has no resi...,2019-01-29 00:00:00,,2019-04-24 00:00:00,2019-12-17 00:00:00
381982,ADMIN FOR CHILDRE...,Internal,1,Director of Security,DIRECTOR OF SECUR...,70822,M1,"Public Safety, In...",F,56990.0,115000.0,Annual,150 William Stree...,Facilities (Admin),The New York City...,1. A baccalaureat...,The preferred can...,Section 424-A of ...,"Click on the ""App...",,,,New York City res...,2019-02-05 00:00:00,,2019-12-10 00:00:00,2019-12-17 00:00:00
381982,ADMIN FOR CHILDRE...,External,1,Director of Security,DIRECTOR OF SECUR...,70822,M1,"Public Safety, In...",F,56990.0,115000.0,Annual,150 William Stree...,Facilities (Admin),The New York City...,1. A baccalaureat...,The preferred can...,Section 424-A of ...,"Click on the ""App...",,,,New York City res...,2019-02-05 00:00:00,,2019-12-10 00:00:00,2019-12-17 00:00:00
382050,DEPT OF ENVIRONME...,Internal,10,City Seasonal Aide,CITY SEASONAL AIDE,91406,0,Administration & ...,F,15.0,18.77,Hourly,59-17 Junction Bl...,Dept of Environme...,The NYC Departmen...,While there are n...,1.	Office Automat...,Appointments are ...,To apply click th...,35 hour week,59-17 Junction Bl...,,New York City res...,2019-02-08 00:00:00,,2019-12-11 00:00:00,2019-12-17 00:00:00
382050,DEPT OF ENVIRONME...,External,10,City Seasonal Aide,CITY SEASONAL AIDE,91406,0,Administration & ...,F,15.0,18.77,Hourly,59-17 Junction Bl...,Dept of Environme...,The NYC Departmen...,While there are n...,1.	Office Automat...,Appointments are ...,To apply click th...,35 hour week,59-17 Junction Bl...,,New York City res...,2019-02-08 00:00:00,,2019-12-11 00:00:00,2019-12-17 00:00:00


Total Records: 2946
Getting missing value counts
-RECORD 0-------------------------------------
 Job ID_missing                        | 0    
 Agency_missing                        | 0    
 Posting Type_missing                  | 0    
 # Of Positions_missing                | 0    
 Business Title_missing                | 0    
 Civil Service Title_missing           | 0    
 Title Code No_missing                 | 0    
 Level_missing                         | 0    
 Job Category_missing                  | 2    
 Full-Time/Part-Time indicator_missing | 195  
 Salary Range From_missing             | 0    
 Salary Range To_missing               | 0    
 Salary Frequency_missing              | 0    
 Work Location_missing                 | 0    
 Division/Work Unit_missing            | 0    
 Job Description_missing               | 0    
 Minimum Qual Requirements_missing     | 20   
 Preferred Skills_missing              | 393  
 Additional Information_missing        | 1092 
 To Apply_m

summary,Job ID,# Of Positions,Salary Range From,Salary Range To
mean,384821.5631364562,2.4959266802444,58904.13979385608,85535.71162739306
stddev,53075.33897715408,9.281312826466838,26986.57593579136,42871.31345366744
min,87990.0,1.0,0.0,10.36
max,426238.0,200.0,218587.0,234402.0


Getting categorical columns
Column: Posting Type
Distinct Values: 2
Top Values:
+------------+-----+
|Posting Type|count|
+------------+-----+
|Internal    |1684 |
|External    |1262 |
+------------+-----+

Column: Full-Time/Part-Time indicator
Distinct Values: 3
Top Values:
+-----------------------------+-----+
|Full-Time/Part-Time indicator|count|
+-----------------------------+-----+
|F                            |2625 |
|null                         |195  |
|P                            |126  |
+-----------------------------+-----+

Column: Salary Frequency
Distinct Values: 3
Top Values:
+----------------+-----+
|Salary Frequency|count|
+----------------+-----+
|Annual          |2712 |
|Hourly          |195  |
|Daily           |39   |
+----------------+-----+

Column: Recruitment Contact
Distinct Values: 1
Top Values:
+-------------------+-----+
|Recruitment Contact|count|
+-------------------+-----+
|null               |2946 |
+-------------------+-----+

Column: Process Date
Dist

+------+----------------------+--------------------+
|Agency|Avg_Annual_Salary_From|Avg_Annual_Salary_To|
+------+----------------------+--------------------+
+------+----------------------+--------------------+

