In [1]:
# adding required packages
import findspark
import pprint
import matplotlib.pyplot as plt

In [2]:
# Unittest related packages
from unittest import mock

In [3]:
findspark.init()

In [4]:
#importing pyspark related package
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
import pyspark.sql.functions as f

In [5]:
# Local System Configuration
# Total Memory = 16GB
# Total Cores = 10

def get_spark_conf():
    # Configure Spark settings
    spark_conf = SparkConf()
    spark_conf.set("spark.executor.instances", "4") # 2 instance per noce

    # Set the number of executor cores
    spark_conf.set("spark.executor.cores", "2")  # Use 2 cores per executor

    # Set the executor memory
    spark_conf.set("spark.executor.memory", "4g")  # Use 4GB memory per executor

    # Set the driver memory
    spark_conf.set("spark.driver.memory", "2g")    # Use 2GB memory for the driver
    
    return spark_conf

In [6]:
def calculate_missing_value_counts(df):
    missing_value_counts = df.select([(f.count(f.when(f.col(c).contains('None') |
                                  f.col(c).contains('NULL') |
                                  (f.col(c) == '') |
                                  f.col(c).isNull() |
                                  f.isnan(c), c)).alias(c + "_missing"))
                       for c in df.columns])
    return missing_value_counts

In [7]:
# Calculate basic statistics for numerical columns
def calculate_summary_stats(df, numerical_columns):
    summary_stats = df.select(*numerical_columns).summary("mean", "stddev", "min", "max")
    return summary_stats

In [8]:
def profile_categorical_column(df, col_name):
    distinct_values = df.select(col_name).distinct().count()
    top_values = df.groupBy(col_name).count().orderBy(f.col("count").desc()).limit(5)
    return distinct_values, top_values

In [9]:
def get_categorical_column(df, threshold=0.3):
    categorical_columns = []
    for col_name in df.columns:
        distinct_count = df.select(col_name).distinct().count()
        if distinct_count < df.count() * threshold:  # Adjust threshold as needed
            categorical_columns.append(col_name)

    return categorical_columns

In [10]:
# # Listing the columns based on its type
# def get_col_type_dict(df):
#     col_type_dict = {}

#     for col_name, col_type in df.dtypes:
#         if col_type in col_type_dict.keys():
#             col_type_dict[col_type].append(col_name)
#         else:
#             col_type_dict[col_type] = [col_name]

#     return col_type_dict

In [11]:
def data_profile(df):
    
    print("Starting Data Profiling")
    print("Schema of the dataset")
    
    #getting the schema
    df.printSchema()
    
#     ## Display the first few rows of the DataFrame
#     display(df.limit(5))
    
#     # Getting counts
#     total_count = df.count()
#     print(f"Total Records: {total_count}")
    
    
#     print("Getting missing value counts")
#     missing_value_counts = calculate_missing_value_counts(df)
#     display(missing_value_counts)
    
#     print("Getting numerical status")
#     numerical_columns = [col_name for col_name, col_type in df.dtypes if col_type in ["int", "double", "float"]]
#     summary_stats = calculate_summary_stats(df, numerical_columns)
#     display(summary_stats)
    
#     print("Getting categorical columns")
#     categorical_columns = get_categorical_column(df, 0.01)
#     for col_name in categorical_columns:
#         distinct_values, top_values = profile_categorical_column(df, col_name)
#         print(f"Column: {col_name}")
#         print(f"Distinct Values: {distinct_values}")
#         print("Top Values:")
#         top_values.show(100,False)
        
    print("Getting KPIs")
    # KPI 1: Number of job postings per category (Top 10)
    category_counts = df.groupBy("Job Category").count().orderBy(f.col("count").desc()).limit(10)
    category_counts
    

In [12]:
def main():
    
    #setting spark conf before creating spark session
    spark_conf = get_spark_conf()
    
    # Create a SparkSession with the configured settings
    spark = SparkSession.builder.config(conf=spark_conf).appName("MySparkApp").getOrCreate()
    
    # Listing all the spark conf
    spark.sparkContext.getConf().getAll()
    
    # setting spark conf for analysis
    spark.conf.set('spark.sql.repl.eagerEval.enabled',True)
    
    #reading dataset
    df = spark.read.csv("/dataset/nyc-jobs.csv", header=True, inferSchema=True)
    
    # reducing the shuffle partition to 4 
    # reason 1 data size is very less
    # reason 2 to use all the availble cores
    spark.conf.set('spark.sql.shuffle.partitions',4)
    
    # Creating data profile
    data_profile(df)
    
    spark.stop()
    

In [13]:
main()

Starting Data Profiling
Schema of the dataset
root
 |-- Job ID: integer (nullable = true)
 |-- Agency: string (nullable = true)
 |-- Posting Type: string (nullable = true)
 |-- # Of Positions: integer (nullable = true)
 |-- Business Title: string (nullable = true)
 |-- Civil Service Title: string (nullable = true)
 |-- Title Code No: string (nullable = true)
 |-- Level: string (nullable = true)
 |-- Job Category: string (nullable = true)
 |-- Full-Time/Part-Time indicator: string (nullable = true)
 |-- Salary Range From: double (nullable = true)
 |-- Salary Range To: double (nullable = true)
 |-- Salary Frequency: string (nullable = true)
 |-- Work Location: string (nullable = true)
 |-- Division/Work Unit: string (nullable = true)
 |-- Job Description: string (nullable = true)
 |-- Minimum Qual Requirements: string (nullable = true)
 |-- Preferred Skills: string (nullable = true)
 |-- Additional Information: string (nullable = true)
 |-- To Apply: string (nullable = true)
 |-- Hours/S