In [2]:
# adding required packages
import findspark
import pprint
import matplotlib.pyplot as plt

In [3]:
# Unittest related packages
from unittest import mock

In [4]:
findspark.init()

In [5]:
#importing pyspark related package
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
import pyspark.sql.functions as f
from pyspark.sql.window import Window

In [7]:
# Local System Configuration
# Total Memory = 16GB
# Total Cores = 10

def get_spark_conf():
    # Configure Spark settings
    spark_conf = SparkConf()
    spark_conf.set("spark.executor.instances", "4") # 4 instance per node

    # Set the number of executor cores
    spark_conf.set("spark.executor.cores", "1")  # Use 1 cores per executor

    # Set the executor memory
    spark_conf.set("spark.executor.memory", "1g")  # Use 1GB memory per executor

    # Set the driver memory
    spark_conf.set("spark.driver.memory", "2g")    # Use 2GB memory for the driver
    
    return spark_conf

In [6]:
def calculate_missing_value_counts(df):
    missing_value_counts = df.select([(f.count(f.when(f.col(c).contains('None') |
                                  f.col(c).contains('NULL') |
                                  (f.col(c) == '') |
                                  f.col(c).isNull() |
                                  f.isnan(c), c)).alias(c + "_missing"))
                       for c in df.columns])
    return missing_value_counts

In [7]:
# Calculate basic statistics for numerical columns
def calculate_summary_stats(df, numerical_columns):
    summary_stats = df.select(*numerical_columns).summary("mean", "stddev", "min", "max")
    return summary_stats

In [8]:
def profile_categorical_column(df, col_name):
    distinct_values = df.select(col_name).distinct().count()
    top_values = df.groupBy(col_name).count().orderBy(f.col("count").desc()).limit(5)
    return distinct_values, top_values

In [9]:
def get_categorical_column(df, threshold=0.3):
    categorical_columns = []
    for col_name in df.columns:
        distinct_count = df.select(col_name).distinct().count()
        if distinct_count < df.count() * threshold:  # Adjust threshold as needed
            categorical_columns.append(col_name)

    return categorical_columns

In [10]:
# # Listing the columns based on its type
# def get_col_type_dict(df):
#     col_type_dict = {}

#     for col_name, col_type in df.dtypes:
#         if col_type in col_type_dict.keys():
#             col_type_dict[col_type].append(col_name)
#         else:
#             col_type_dict[col_type] = [col_name]

#     return col_type_dict

In [75]:
def data_profile(df):
    
    print("Starting Data Profiling")
    print("Schema of the dataset")
    
    #getting the schema
    df.printSchema()
    
    ## Display the first few rows of the DataFrame
    display(df.limit(10))
    
    # Getting counts
    total_count = df.count()
    print(f"Total Records: {total_count}")
    
    print("Getting missing value counts")
    missing_value_counts = calculate_missing_value_counts(df)
    display(missing_value_counts)
    
    print("Getting numerical status")
    numerical_columns = [col_name for col_name, col_type in df.dtypes if col_type in ["int", "double", "float"]]
    summary_stats = calculate_summary_stats(df, numerical_columns)
    display(summary_stats)
    
    print("Getting categorical columns") 
    # Most of the time categorical column should be string type but for this analysis -
    # assuming that categorical columns can be any type.
    categorical_columns = get_categorical_column(df, 0.01)
    for col_name in categorical_columns:
        distinct_values, top_values = profile_categorical_column(df, col_name)
        print(f"Column: {col_name}")
        print(f"Distinct Values: {distinct_values}")
        print("Top Values:")
        top_values.show(truncate=False)
        


In [None]:
def show_kpi(df):        
    # Salary range is linked with salary frequency. 
    # To get any metrics which depends on salary we need to have salary in same frequency.    
    print("Preparing Data for KPI")
    kpi_pre_df = df.withColumn("Annual Salary From",
                       f.when(f.col("Salary Frequency") == "Hourly", f.col("Salary Range From") * 2080)  # Assuming 2080 work hours per year
                       .when(f.col("Salary Frequency") == "Weekly", f.col("Salary Range From") * 52)
                       .when(f.col("Salary Frequency") == "Daily", f.col("Salary Range From") * 260)  # Assuming 5 workdays per week
                       .otherwise(f.col("Salary Range From")))
    
    kpi_df = kpi_pre_df.withColumn("Annual Salary To",
                       f.when(f.col("Salary Frequency") == "Hourly", f.col("Salary Range To") * 2080)
                       .when(f.col("Salary Frequency") == "Weekly", f.col("Salary Range To") * 52)
                       .when(f.col("Salary Frequency") == "Daily", f.col("Salary Range To") * 260)
                       .otherwise(f.col("Salary Range To")))
    
    
    print("Getting KPIs")
    # KPI 1: Number of job postings per category (Top 10)
    print("Top 10 jobs posting per category")
    category_counts = kpi_df.groupBy("Job Category").count().orderBy(f.col("count").desc()).limit(10)
    category_counts.show(truncate=False)
    
    # KPI 2: Whats the salary distribution per job category?
    print("The salary distribution per job category")
    salary_distribution = kpi_df.groupBy("Job Category").agg(
    f.round(f.avg("Annual Salary From"),2).alias("AvgSalaryFrom"),
    f.round(f.avg("Annual Salary To"),2).alias("AvgSalaryTo")).orderBy("Job Category")
    salary_distribution.show(truncate=False)
    
    #KPI 4: Whats the job posting having the highest salary per agency?
    # Calculate the maximum salary for each agency
    print("The job posting having the highest salary per agency")
    max_sal_per_agency_window_spec = Window.partitionBy("Agency").orderBy(f.col("Annual Salary To").desc())
    max_sal_per_agency_df = kpi_df.withColumn("rank", f.rank().over(max_sal_per_agency_window_spec)).filter(f.col("rank") == 1)
    # Select the relevant columns for the result
    max_sal_per_agency_df = max_sal_per_agency_df.select("Agency", "Business Title", f.round(f.col("Annual Salary To"),2)).distinct() # Assuming for "Anual Salary To" rounding to 2 decimal required.
    max_sal_per_agency_df.show(truncate=False)
    
    KPI5: Whats the job positings average salary per agency for the last 2 years
    Calculate the date 2 years ago from the current date
    two_years_ago = f.date_sub(f.current_timestamp(), 365 * 5)
    last_2_job_posting_df = kpi_df.select('Agency','Annual Salary From','Annual Salary To').filter(f.col('Posting Date') >= two_years_ago)
    avg_salary_df = last_2_job_posting_df.groupBy("Agency").agg(
                                                        f.round(f.avg(f.col("Annual Salary From")),2).alias("Avg_Annual_Salary_From"),
                                                        f.round(f.avg(f.col("Annual Salary To")),2).alias("Avg_Annual_Salary_To"))
    avg_salary_df.show(truncate=False)

    # KPI6 6: What are the highest paid skills in the US market?

In [76]:
def main():
    
    #setting spark conf before creating spark session
    spark_conf = get_spark_conf()
    
    # Create a SparkSession with the configured settings
    spark = SparkSession.builder.config(conf=spark_conf).appName("MySparkApp").getOrCreate()
    
    # Listing all the spark conf
    spark.sparkContext.getConf().getAll()
    
    # setting spark conf for analysis
    spark.conf.set('spark.sql.repl.eagerEval.enabled',True)
    
    #reading dataset
    # adding escape charater after data profiling the data 
    df = spark.read.csv("/dataset/nyc-jobs.csv", header=True, inferSchema=True, escape='"')
    
    # reducing the shuffle partition to 4 
    # reason 1 data size is very less
    # reason 2 to use all the availble cores
    spark.conf.set('spark.sql.shuffle.partitions',4)
    
    # Creating data profile
    data_profile(df)
    
    # Prepreparing data for KPI
    # Need to implement
    
    # Writing prepared and cleaned data 
    # Needt to implement
    
    # Showing KPIs
    show_kpi(df)

In [77]:
main()

Starting Data Profiling
Schema of the dataset
root
 |-- Job ID: integer (nullable = true)
 |-- Agency: string (nullable = true)
 |-- Posting Type: string (nullable = true)
 |-- # Of Positions: integer (nullable = true)
 |-- Business Title: string (nullable = true)
 |-- Civil Service Title: string (nullable = true)
 |-- Title Code No: string (nullable = true)
 |-- Level: string (nullable = true)
 |-- Job Category: string (nullable = true)
 |-- Full-Time/Part-Time indicator: string (nullable = true)
 |-- Salary Range From: double (nullable = true)
 |-- Salary Range To: double (nullable = true)
 |-- Salary Frequency: string (nullable = true)
 |-- Work Location: string (nullable = true)
 |-- Division/Work Unit: string (nullable = true)
 |-- Job Description: string (nullable = true)
 |-- Minimum Qual Requirements: string (nullable = true)
 |-- Preferred Skills: string (nullable = true)
 |-- Additional Information: string (nullable = true)
 |-- To Apply: string (nullable = true)
 |-- Hours/S

Job ID,Agency,Posting Type,# Of Positions,Business Title,Civil Service Title,Title Code No,Level,Job Category,Full-Time/Part-Time indicator,Salary Range From,Salary Range To,Salary Frequency,Work Location,Division/Work Unit,Job Description,Minimum Qual Requirements,Preferred Skills,Additional Information,To Apply,Hours/Shift,Work Location 1,Recruitment Contact,Residency Requirement,Posting Date,Post Until,Posting Updated,Process Date
87990,DEPARTMENT OF BUS...,Internal,1,Account Manager,CONTRACT REVIEWER...,40563,1,,,42405.0,65485.0,Annual,110 William St. N Y,Strategy & Analytics,Division of Econo...,1.	A baccalaureat...,â€¢	Excellent int...,Salary range for ...,,,,,New York City res...,2011-06-24 00:00:00,,2011-06-24 00:00:00,2019-12-17 00:00:00
97899,DEPARTMENT OF BUS...,Internal,1,EXECUTIVE DIRECTO...,ADMINISTRATIVE BU...,10009,M3,,F,60740.0,162014.0,Annual,110 William St. N Y,Tech Talent Pipeline,The New York City...,1. A baccalaureat...,,,In addition to ap...,,,,New York City res...,2012-01-26 00:00:00,,2012-01-26 00:00:00,2019-12-17 00:00:00
132292,NYC HOUSING AUTHO...,External,52,Maintenance Worke...,MAINTENANCE WORKER,90698,0,Maintenance & Ope...,F,51907.68,54580.32,Annual,Heating Mgt-Opera...,Management Servic...,Under direct supe...,1. Three years of...,1. A High School...,1. A Motor Vehic...,"Click the ""Apply ...",,,,NYCHA has no resi...,2013-10-24 00:00:00,,2013-12-12 00:00:00,2019-12-17 00:00:00
132292,NYC HOUSING AUTHO...,Internal,52,Maintenance Worke...,MAINTENANCE WORKER,90698,0,Maintenance & Ope...,F,51907.68,54580.32,Annual,Heating Mgt-Opera...,Management Servic...,Under direct supe...,1. Three years of...,1. A High School...,1. A Motor Vehic...,"Click the ""Apply ...",,,,NYCHA has no resi...,2013-10-24 00:00:00,,2013-12-12 00:00:00,2019-12-17 00:00:00
133921,NYC HOUSING AUTHO...,Internal,50,Temporary Painter,PAINTER,91830,0,Maintenance & Ope...,F,35.0,35.0,Hourly,DMP-Contract & An...,Dept of Managemen...,Responsibilities ...,1. Five years of ...,,SPECIAL NOTE: ...,"Click the ""Apply ...",,,,NYCHA has no resi...,2014-01-09 00:00:00,,2014-01-08 00:00:00,2019-12-17 00:00:00
133921,NYC HOUSING AUTHO...,External,50,Temporary Painter,PAINTER,91830,0,Maintenance & Ope...,F,35.0,35.0,Hourly,DMP-Contract & An...,Dept of Managemen...,Responsibilities ...,1. Five years of ...,,SPECIAL NOTE: ...,"Click the ""Apply ...",,,,NYCHA has no resi...,2014-01-09 00:00:00,,2014-01-08 00:00:00,2019-12-17 00:00:00
137433,DEPT OF HEALTH/ME...,Internal,1,Contract Analyst,PROCUREMENT ANALYST,12158,3,"Finance, Accounti...",F,50598.0,85053.0,Annual,42-09 28th Street,HIV Administration,** OPEN TO PERMAN...,1. A baccalaureat...,Strong analytical...,,Apply online with...,,42-09 28th Street...,,New York City res...,2013-12-09 00:00:00,,2013-12-09 00:00:00,2019-12-17 00:00:00
138531,DEPT OF ENVIRONME...,Internal,1,Associate Chemist,ASSOCIATE CHEMIST,21822,2,Health Public Saf...,F,50623.0,75083.0,Annual,96-05 Horace Hard...,DWOC Labs-Lefrak,Working in the Di...,Qualification Req...,In order to apply...,,"Click the ""Apply ...",35 Hours per week...,96-05 Horace Hard...,,New York City res...,2013-12-20 00:00:00,,2014-07-25 00:00:00,2019-12-17 00:00:00
151131,NYC HOUSING AUTHO...,External,1,Cost Estimating M...,ADMINISTRATIVE ST...,1002D,0,"Engineering, Arch...",F,90000.0,110000.0,Annual,CP Cap Plan-Techn...,Capital Planning ...,Reporting to the ...,1. A master's deg...,1. Five years of...,SPECIAL INSTRUCTI...,"Click the ""Apply ...",,,,NYCHA has no resi...,2014-06-25 00:00:00,,2014-07-07 00:00:00,2019-12-17 00:00:00
152738,LAW DEPARTMENT,Internal,1,Office Manager,CLERICAL ASSOCIATE,10251,3,Clerical & Admini...,F,30683.0,49707.0,Annual,"100 Church St., N.Y.",Appeals,Performs essentia...,Qualification Req...,Experience with L...,Candidates must b...,Please click the ...,Monday through Fr...,,,New York City res...,2014-06-26 00:00:00,,2014-06-26 00:00:00,2019-12-17 00:00:00


Preparing Data for KPI
+------------------------------+----------------------+--------------------+
|Agency                        |Avg_Annual_Salary_From|Avg_Annual_Salary_To|
+------------------------------+----------------------+--------------------+
|OFFICE OF MANAGEMENT & BUDGET |66914.0               |78685.65            |
|HRA/DEPT OF SOCIAL SERVICES   |63191.0               |85499.77            |
|DEPARTMENT OF FINANCE         |68758.75              |90428.92            |
|FIRE DEPARTMENT               |79965.98              |93000.73            |
|HOUSING PRESERVATION & DVLPMNT|74597.21              |85607.55            |
|DEPT. OF HOMELESS SERVICES    |47637.75              |64476.5             |
|TEACHERS RETIREMENT SYSTEM    |62397.0               |75760.0             |
|DEPARTMENT OF BUILDINGS       |45581.47              |55310.6             |
|DEPT OF HEALTH/MENTAL HYGIENE |63529.6               |83479.43            |
|OFF OF PAYROLL ADMINISTRATION |49148.83             

In [8]:
#setting spark conf before creating spark session
spark_conf = get_spark_conf()

# Create a SparkSession with the configured settings
spark = SparkSession.builder.config(conf=spark_conf).appName("MySparkApp").getOrCreate()

# Listing all the spark conf
spark.sparkContext.getConf().getAll()

# setting spark conf for analysis
spark.conf.set('spark.sql.repl.eagerEval.enabled',True)

#reading dataset
df = spark.read.csv("/dataset/nyc-jobs.csv", header=True, inferSchema=True, escape='"')

# reducing the shuffle partition to 4 
# reason 1 data size is very less
# reason 2 to use all the availble cores
spark.conf.set('spark.sql.shuffle.partitions',4)

In [9]:
df.printSchema()

root
 |-- Job ID: integer (nullable = true)
 |-- Agency: string (nullable = true)
 |-- Posting Type: string (nullable = true)
 |-- # Of Positions: integer (nullable = true)
 |-- Business Title: string (nullable = true)
 |-- Civil Service Title: string (nullable = true)
 |-- Title Code No: string (nullable = true)
 |-- Level: string (nullable = true)
 |-- Job Category: string (nullable = true)
 |-- Full-Time/Part-Time indicator: string (nullable = true)
 |-- Salary Range From: double (nullable = true)
 |-- Salary Range To: double (nullable = true)
 |-- Salary Frequency: string (nullable = true)
 |-- Work Location: string (nullable = true)
 |-- Division/Work Unit: string (nullable = true)
 |-- Job Description: string (nullable = true)
 |-- Minimum Qual Requirements: string (nullable = true)
 |-- Preferred Skills: string (nullable = true)
 |-- Additional Information: string (nullable = true)
 |-- To Apply: string (nullable = true)
 |-- Hours/Shift: string (nullable = true)
 |-- Work Locat

In [10]:
df.limit(10)

Job ID,Agency,Posting Type,# Of Positions,Business Title,Civil Service Title,Title Code No,Level,Job Category,Full-Time/Part-Time indicator,Salary Range From,Salary Range To,Salary Frequency,Work Location,Division/Work Unit,Job Description,Minimum Qual Requirements,Preferred Skills,Additional Information,To Apply,Hours/Shift,Work Location 1,Recruitment Contact,Residency Requirement,Posting Date,Post Until,Posting Updated,Process Date
381492,NYC HOUSING AUTHO...,Internal,2,Community Associate,COMMUNITY ASSOCIATE,56057,0,"Policy, Research ...",,52000.0,61936.0,Annual,Visual Assessment...,Lead Hazard Contr...,The New York City...,Qualification Req...,1. Experience wi...,NYCHA employees a...,"Click the ""Apply ...",,,,NYCHA has no resi...,2019-05-10 00:00:00,,2019-07-05 00:00:00,2019-12-17 00:00:00
381492,NYC HOUSING AUTHO...,External,2,Community Associate,COMMUNITY ASSOCIATE,56057,0,"Policy, Research ...",,52000.0,61936.0,Annual,Visual Assessment...,Lead Hazard Contr...,The New York City...,Qualification Req...,1. Experience wi...,NYCHA employees a...,"Click the ""Apply ...",,,,NYCHA has no resi...,2019-05-10 00:00:00,,2019-07-05 00:00:00,2019-12-17 00:00:00
381752,DEPT OF ENVIRONME...,Internal,1,Associate Public ...,ASSOCIATE PUBLIC ...,31220,1,Health Public Saf...,F,58677.0,91199.0,Annual,96-05 Horace Hard...,Environmental Hea...,The NYC Departmen...,1. A baccalaureat...,â€¢ Excellent int...,Appointments are ...,"Click ""Apply Now""...",40 hours per week...,Owls Head Wastewa...,,New York City res...,2019-01-29 00:00:00,,2019-10-07 00:00:00,2019-12-17 00:00:00
381752,DEPT OF ENVIRONME...,External,1,Associate Public ...,ASSOCIATE PUBLIC ...,31220,1,Health Public Saf...,F,58677.0,91199.0,Annual,96-05 Horace Hard...,Environmental Hea...,The NYC Departmen...,1. A baccalaureat...,â€¢ Excellent int...,Appointments are ...,"Click ""Apply Now""...",40 hours per week...,Owls Head Wastewa...,,New York City res...,2019-01-29 00:00:00,,2019-10-07 00:00:00,2019-12-17 00:00:00
381848,NYC HOUSING AUTHO...,Internal,1,"Director, Lead Ha...",DIRECTOR OF CONTR...,80287,M4,"Public Safety, In...",,78574.0,202744.0,Annual,Lead Hazard - Off...,Lead Hazard Contr...,Please read this ...,A Baccalaureate d...,â€¢ Integrity â€“...,NYCHA employees a...,"Click the ""Apply ...",,,,NYCHA has no resi...,2019-01-29 00:00:00,,2019-04-24 00:00:00,2019-12-17 00:00:00
381848,NYC HOUSING AUTHO...,External,1,"Director, Lead Ha...",DIRECTOR OF CONTR...,80287,M4,"Public Safety, In...",,78574.0,202744.0,Annual,Lead Hazard - Off...,Lead Hazard Contr...,Please read this ...,A Baccalaureate d...,â€¢ Integrity â€“...,NYCHA employees a...,"Click the ""Apply ...",,,,NYCHA has no resi...,2019-01-29 00:00:00,,2019-04-24 00:00:00,2019-12-17 00:00:00
381982,ADMIN FOR CHILDRE...,Internal,1,Director of Security,DIRECTOR OF SECUR...,70822,M1,"Public Safety, In...",F,56990.0,115000.0,Annual,150 William Stree...,Facilities (Admin),The New York City...,1. A baccalaureat...,The preferred can...,Section 424-A of ...,"Click on the ""App...",,,,New York City res...,2019-02-05 00:00:00,,2019-12-10 00:00:00,2019-12-17 00:00:00
381982,ADMIN FOR CHILDRE...,External,1,Director of Security,DIRECTOR OF SECUR...,70822,M1,"Public Safety, In...",F,56990.0,115000.0,Annual,150 William Stree...,Facilities (Admin),The New York City...,1. A baccalaureat...,The preferred can...,Section 424-A of ...,"Click on the ""App...",,,,New York City res...,2019-02-05 00:00:00,,2019-12-10 00:00:00,2019-12-17 00:00:00
382050,DEPT OF ENVIRONME...,Internal,10,City Seasonal Aide,CITY SEASONAL AIDE,91406,0,Administration & ...,F,15.0,18.77,Hourly,59-17 Junction Bl...,Dept of Environme...,The NYC Departmen...,While there are n...,1.	Office Automat...,Appointments are ...,To apply click th...,35 hour week,59-17 Junction Bl...,,New York City res...,2019-02-08 00:00:00,,2019-12-11 00:00:00,2019-12-17 00:00:00
382050,DEPT OF ENVIRONME...,External,10,City Seasonal Aide,CITY SEASONAL AIDE,91406,0,Administration & ...,F,15.0,18.77,Hourly,59-17 Junction Bl...,Dept of Environme...,The NYC Departmen...,While there are n...,1.	Office Automat...,Appointments are ...,To apply click th...,35 hour week,59-17 Junction Bl...,,New York City res...,2019-02-08 00:00:00,,2019-12-11 00:00:00,2019-12-17 00:00:00


In [11]:
df.show(truncate=False)

+------+------------------------------+------------+--------------+----------------------------------------------------+------------------------------+-------------+-----+------------------------------------------------------------------------------+-----------------------------+-----------------+---------------+----------------+------------------------------+------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------