In [2]:
# adding required packages
import findspark
import pprint
import matplotlib.pyplot as plt

In [None]:
pip install --upgrade pip

In [None]:
 pip install ydata-profiling

In [3]:
from ydata_profiling import ProfileReport

In [4]:
# Make pyspark importable as a regular library.
# Run this cell before the pyspark imports in the next cell.
findspark.init()

In [5]:
#importing pyspark related package
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark.sql.functions import col, when, avg, round, rank, isnan, current_timestamp, date_sub
from pyspark.sql.window import Window

In [6]:
# Execute the data-profiling helper notebook in this kernel's namespace.
# NOTE(review): presumably this is where calculate_missing_value_counts,
# calculate_summary_stats and profile_categorical_column come from -- confirm.
%run /notebook/dataproduct/assesment_nyc_job_posting/lib/data_profiling_transform.ipynb

Data profile Function Imported


In [7]:
# Execute the KPI helper notebook in this kernel's namespace.
# NOTE(review): presumably this defines the get_* KPI functions called from
# show_kpi further down -- confirm against that notebook.
%run /notebook/dataproduct/assesment_nyc_job_posting/lib/kpi_transform.ipynb

KPI Transformations Imported


In [8]:
# Execute the Spark-session utility notebook in this kernel's namespace.
# NOTE(review): presumably this defines get_spark_session, which is called
# when the session is created below -- confirm against that notebook.
%run /notebook/dataproduct/assesment_nyc_job_posting/utils/spark_session.ipynb

Spark Session Imported


In [9]:
def get_spark_conf():
    """Build the SparkConf used to create this notebook's Spark session.

    Returns:
        SparkConf: a new configuration object with the executor and driver
        resource settings listed below applied.
    """
    resource_settings = [
        ("spark.executor.instances", "4"),  # number of executors requested
        ("spark.executor.cores", "2"),      # cores per executor
        ("spark.executor.memory", "1g"),    # memory per executor
        ("spark.driver.memory", "2g"),      # memory for the driver
    ]
    # setAll applies each (key, value) pair in order and returns the conf itself.
    return SparkConf().setAll(resource_settings)

## Creating Spark Session

In [10]:
job_name = 'nyc_assesment'

# Build the Spark configuration before the session is created.
spark_conf = get_spark_conf()

# Create a SparkSession with the configured settings.
spark = get_spark_session(spark_conf, job_name)

# List the Spark configuration of the running context.
# A bare expression in the middle of a cell is evaluated and then discarded,
# so the listing has to be printed explicitly to show up in the output.
pprint.pprint(spark.sparkContext.getConf().getAll())

# Enable eager evaluation so DataFrames are rendered when displayed in the notebook.
spark.conf.set('spark.sql.repl.eagerEval.enabled', True)

## Reading Dataset

In [11]:
# Load the NYC jobs CSV: first row is the header and column types are inferred.
# The double quote is set as the escape character; this was added after data
# profiling showed it was needed.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .option("escape", '"')
    .csv("/dataset/nyc-jobs.csv")
)

In [None]:
# Reduce the number of shuffle partitions to 4.
# Reason 1: the data size is very small.
# Reason 2: to use all the available cores.
# Reason 3: the data would fit in a single partition, but 4 partitions are
#           used to demonstrate data shuffling.
spark.conf.set('spark.sql.shuffle.partitions',4)

## Data Profiling

In [None]:
# Print the schema (column names and inferred types) of the loaded DataFrame.
df.printSchema()

In [None]:
# Display the first 10 rows of the DataFrame.
display(df.limit(10))

In [None]:
# Count the rows currently in df (the raw load when cells are run top to bottom).
total_count = df.count()
print(f"Total Records: {total_count}")

In [None]:
# Drop duplicate rows and report how many rows remain.
# NOTE: df is rebound here, so every later cell works on the de-duplicated data.
df = df.distinct()
dist_tot_cnt = df.count()
print(f"Total Records after removing duplicates: {dist_tot_cnt}")

In [None]:
print("Getting missing value counts")
# calculate_missing_value_counts is not defined in this notebook.
# NOTE(review): presumably it comes from the %run'd data_profiling_transform
# notebook -- confirm its return shape there.
missing_value_counts = calculate_missing_value_counts(df)
display(missing_value_counts)

In [None]:
print("Getting numerical status")
# df.dtypes reports each column type as a simple string. Match every numeric
# type rather than only int/double/float, so that bigint, smallint, tinyint and
# decimal(p,s) columns are not silently left out of the summary.
numeric_type_names = {"tinyint", "smallint", "int", "bigint", "float", "double"}
numerical_columns = [
    col_name
    for col_name, col_type in df.dtypes
    if col_type in numeric_type_names or col_type.startswith("decimal")
]
# NOTE(review): calculate_summary_stats is defined outside this notebook;
# confirm it handles decimal columns if the dataset ever contains them.
summary_stats = calculate_summary_stats(df, numerical_columns)
display(summary_stats)

In [None]:
print("Getting categorical columns")
# Most of the time categorical columns are string typed, but for this analysis
# a column of any type may be treated as categorical.
# NOTE(review): distinct_threshold is not passed to profile_categorical_column
# below; presumably the helper reads it as a notebook global -- confirm, and
# pass it explicitly if the helper accepts it.
distinct_threshold=10
for col_name in df.columns:
    is_categorical_cond, distinct_values, top_values = profile_categorical_column(df, col_name)
    if not is_categorical_cond:
        continue

    print(f"Column: {col_name}")
    print(f"Distinct Values: {distinct_values}")
    print("Top Values:")
    top_values.show(truncate=False)

    # Loop-local name on purpose: `pandas_df` is used later in the notebook for
    # the full dataset and must not be left holding one column's top values.
    top_values_pd = top_values.toPandas()

    # Create the axes explicitly and hand them to pandas. Without ax=, pandas
    # opens its own figure, which leaves an empty 8x6 figure behind and draws
    # the pie at the default size instead.
    fig, ax = plt.subplots(figsize=(8, 6))
    top_values_pd.plot.pie(
        y="count",
        labels=top_values_pd[col_name],
        autopct="%1.1f%%",
        startangle=140,
        ax=ax,
    )
    ax.axis("equal")
    ax.set_title(f"{col_name} Distribution")
    plt.show()
    # Release the figure so figures do not accumulate across loop iterations.
    plt.close(fig)


### Using ydata_profiling to create the data profile based on pandas dataframe

In [14]:
# Convert the whole Spark DataFrame to a pandas DataFrame; the profiling report
# in the next cell is built from this pandas copy.
pandas_df = df.toPandas()

In [23]:
# Build the ydata_profiling report object from the pandas copy of the data.
profile = ProfileReport(pandas_df, title="Profiling Report New")

In [24]:
# Render the profiling report as notebook widgets.
profile.to_widgets()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render widgets:   0%|          | 0/1 [00:00<?, ?it/s]

VBox(children=(Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(valu…

In [None]:
def show_kpi(df):        
    # Salary range is linked with salary frequency. 
    # To get any metrics which depends on salary we need to have salary in same frequency.    
    print("Preparing Data for KPI")
    kpi_pre_df = df.withColumn("Annual Salary From",
                       when(col("Salary Frequency") == "Hourly", col("Salary Range From") * 2080)  # Assuming 2080 work hours per year
                       .when(col("Salary Frequency") == "Weekly", col("Salary Range From") * 52)
                       .when(col("Salary Frequency") == "Daily", col("Salary Range From") * 260)  # Assuming 5 workdays per week
                       .otherwise(col("Salary Range From")))
    
    kpi_df = kpi_pre_df.withColumn("Annual Salary To",
                       when(col("Salary Frequency") == "Hourly", col("Salary Range To") * 2080)
                       .when(col("Salary Frequency") == "Weekly", col("Salary Range To") * 52)
                       .when(col("Salary Frequency") == "Daily", col("Salary Range To") * 260)
                       .otherwise(col("Salary Range To")))
    
    
    print("Getting KPIs")
    print("Top 10 jobs posting per category")
    category_counts = get_top10_job_posting_per_cat(kpi_df)
    category_counts.show(truncate=False)
    
    print("The salary distribution per job category")
    salary_distribution = get_sal_dist_per_cat(kpi_df)
    salary_distribution.show(truncate=False)
    
    print("The job posting having the highest salary per agency")
    max_sal_per_agency_df = get_highes_sal_per_cat(kpi_df)
    max_sal_per_agency_df.show(truncate=False)
    
    #KPI5: Whats the job positings average salary per agency for the last 2 years
    print("The average salary per agency for the last 2 years")
    last_n_year = 2
    avg_salary_df = get_avg_sal_per_agency_last_n_year(kpi_df,last_n_year)
    avg_salary_df.show(truncate=False)

    # KPI6 6: What are the highest paid skills in the US market?