In [65]:
# adding required packages
import findspark
import pprint
import matplotlib.pyplot as plt

In [33]:
# Unittest related packages
from unittest import mock

In [2]:
findspark.init()

In [62]:
#importing pyspark related package
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
import pyspark.sql.functions as f

In [115]:
# Local System Configuration
# Total Memory = 16GB
# Total Cores = 10

def get_spark_conf():
# Configure Spark settings
    spark_conf = SparkConf()
    spark_conf.set("spark.executor.instances", "4") # 2 instance per noce

    # Set the number of executor cores
    spark_conf.set("spark.executor.cores", "2")  # Use 2 cores per executor

    # Set the executor memory
    spark_conf.set("spark.executor.memory", "4g")  # Use 4GB memory per executor

    # Set the driver memory
    spark_conf.set("spark.driver.memory", "2g")    # Use 2GB memory for the driver
    
    return spark_conf

In [140]:
def calculate_missing_value_counts(df, total_count):
    missing_value_counts = df.select([(f.lit(total_count).cast('Integer') - f.count(col_name)).alias(col_name + "_missing") for col_name in df.columns])
    return missing_value_counts

In [114]:
# Calculate basic statistics for numerical columns
def calculate_summary_stats(df, numerical_columns):
    summary_stats = df.select(*numerical_columns).summary("mean", "stddev", "min", "max")
    return summary_stats

In [89]:
def profile_categorical_column(df, col_name):
    distinct_values = df.select(col_name).distinct().count()
    top_values = df.groupBy(col_name).count().orderBy(f.col("count").desc()).limit(5)
    return distinct_values, top_values

In [107]:
def get_categorical_column(df, threshold=0.3):
    categorical_columns = []
    col_type_dict = get_col_type_dict(df)
    for col_name in col_type_dict['string']:
        distinct_count = df.select(col_name).distinct().count()
        if distinct_count < df.count() * threshold:  # Adjust threshold as needed
            categorical_columns.append(col_name)

    return categorical_columns

In [108]:
# Listing the columns based on its type
def get_col_type_dict(df):
    col_type_dict = {}

    for col_name, col_type in df.dtypes:
        if col_type in col_type_dict.keys():
            col_type_dict[col_type].append(col_name)
        else:
            col_type_dict[col_type] = [col_name]

    return col_type_dict

In [146]:
def data_profile(df):
    
    print("Starting Data Profiling")
    print("Schema of the dataset")
    
    #getting the schema
    df.printSchema()
    
    ## Display the first few rows of the DataFrame
    display(df.limit(5))
    
    # Getting counts
    total_count = df.count()
    print(f"Total Records: {total_count}")
    
    
    print("Getting missing value counts")
    missing_value_counts = calculate_missing_value_counts(df, total_count)
    display(missing_value_counts)
    
    print("Getting numerical status")
    numerical_columns = [col_name for col_name, col_type in df.dtypes if col_type in ["int", "double", "float"]]
    summary_stats = calculate_summary_stats(df, numerical_columns)
    display(summary_stats)
    
    print("Getting categorical columns")
    categorical_columns = get_categorical_column(df, 0.01)
    for col_name in categorical_columns:
        distinct_values, top_values = profile_categorical_column(df, col_name)
        print(f"Column: {col_name}")
        print(f"Distinct Values: {distinct_values}")
        print("Top Values:")
        top_values.show(100,False)

In [147]:
def main():
    
    #setting spark conf before creating spark session
    spark_conf = get_spark_conf()
    
    # Create a SparkSession with the configured settings
    spark = SparkSession.builder.config(conf=spark_conf).appName("MySparkApp").getOrCreate()
    
    # Listing all the spark conf
    spark.sparkContext.getConf().getAll()
    
    # setting spark conf for analysis
    spark.conf.set('spark.sql.repl.eagerEval.enabled',True)
    
    #reading dataset
    df = spark.read.csv("/dataset/nyc-jobs.csv", header=True, inferSchema=True)
    
    # reducing the shuffle partition to 4 
    # reason 1 data size is very less
    # reason 2 to use all the availble cores
    spark.conf.set('spark.sql.shuffle.partitions',4)
    
    # Creating data profile
    data_profile(df)
    
    

In [148]:
main()

Starting Data Profiling
Schema of the dataset
root
 |-- Job ID: integer (nullable = true)
 |-- Agency: string (nullable = true)
 |-- Posting Type: string (nullable = true)
 |-- # Of Positions: integer (nullable = true)
 |-- Business Title: string (nullable = true)
 |-- Civil Service Title: string (nullable = true)
 |-- Title Code No: string (nullable = true)
 |-- Level: string (nullable = true)
 |-- Job Category: string (nullable = true)
 |-- Full-Time/Part-Time indicator: string (nullable = true)
 |-- Salary Range From: double (nullable = true)
 |-- Salary Range To: double (nullable = true)
 |-- Salary Frequency: string (nullable = true)
 |-- Work Location: string (nullable = true)
 |-- Division/Work Unit: string (nullable = true)
 |-- Job Description: string (nullable = true)
 |-- Minimum Qual Requirements: string (nullable = true)
 |-- Preferred Skills: string (nullable = true)
 |-- Additional Information: string (nullable = true)
 |-- To Apply: string (nullable = true)
 |-- Hours/S

Job ID,Agency,Posting Type,# Of Positions,Business Title,Civil Service Title,Title Code No,Level,Job Category,Full-Time/Part-Time indicator,Salary Range From,Salary Range To,Salary Frequency,Work Location,Division/Work Unit,Job Description,Minimum Qual Requirements,Preferred Skills,Additional Information,To Apply,Hours/Shift,Work Location 1,Recruitment Contact,Residency Requirement,Posting Date,Post Until,Posting Updated,Process Date
389713,LAW DEPARTMENT,Internal,1,PARALEGAL AIDE - ...,PARALEGAL AIDE,30080,2,Constituent Servi...,F,41939.0,58688.0,Annual,"350 Jay St, Brook...",Family Court: Bro...,The NYC Law Depar...,1. A baccalaureat...,Candidates MUST p...,Due to the curren...,Please click on t...,This is a full ti...,The position is l...,,New York City res...,2019-04-08T00:00:...,,2019-06-11T00:00:...,2019-12-17T00:00:...
389806,DEPT OF INFO TECH...,Internal,1,Information Secur...,CERTIFIED IT ADMI...,13652,4,"Technology, Data ...",F,96020.0,140000.0,Annual,2 Metro Tech,Information Security,WHO WE ARE: The ...,"""Professional/ven...",all candidates m...,as determined by...,The preferred can...,Candidates must p...,*For DoITT Employ...,Day - Due to the ...,"Brooklyn, NY",,New York City Res...,2019-04-09T00:00:...,
389808,DEPT OF INFO TECH...,Internal,1,Information Secur...,IT SECURITY SPECI...,6798,0,"Technology, Data ...",,75000.0,140000.0,Annual,2 Metro Tech,Information Security,WHO WE ARE: The ...,"""1. A baccalaurea...",The preferred can...,,*For DoITT Employ...,Day - Due to the ...,"Brooklyn, NY",,New York City Res...,2019-04-09T00:00:...,,2019-04-25T00:00:...,2019-12-17T00:00:...
389883,DEPARTMENT OF CIT...,External,1,Human Capital Sum...,SUMMER COLLEGE IN...,10234,0,Administration & ...,P,17.5,17.5,Hourly,"120 Broadway, New...",Human Capital,Internship Progra...,As of June of the...,,,Click on â€œApply...,,,,New York City res...,2019-04-09T00:00:...,,2019-05-03T00:00:...,2019-12-17T00:00:...
389883,DEPARTMENT OF CIT...,Internal,1,Human Capital Sum...,SUMMER COLLEGE IN...,10234,0,Administration & ...,P,17.5,17.5,Hourly,"120 Broadway, New...",Human Capital,Internship Progra...,As of June of the...,,,Click on â€œApply...,,,,New York City res...,2019-04-09T00:00:...,,2019-05-03T00:00:...,2019-12-17T00:00:...


Total Records: 2946
Getting missing value counts


Job ID_missing,Agency_missing,Posting Type_missing,# Of Positions_missing,Business Title_missing,Civil Service Title_missing,Title Code No_missing,Level_missing,Job Category_missing,Full-Time/Part-Time indicator_missing,Salary Range From_missing,Salary Range To_missing,Salary Frequency_missing,Work Location_missing,Division/Work Unit_missing,Job Description_missing,Minimum Qual Requirements_missing,Preferred Skills_missing,Additional Information_missing,To Apply_missing,Hours/Shift_missing,Work Location 1_missing,Recruitment Contact_missing,Residency Requirement_missing,Posting Date_missing,Post Until_missing,Posting Updated_missing,Process Date_missing
0,0,0,0,0,0,0,0,2,195,0,0,0,0,0,0,18,259,563,180,1062,1138,1763,678,517,1499,508,425


Getting numerical status


summary,Job ID,# Of Positions,Salary Range From,Salary Range To
mean,384821.5631364562,2.4959266802444,58904.13979385608,85535.71162739304
stddev,53075.33897715395,9.281312826466838,26986.575935791363,42871.31345366745
min,87990.0,1.0,0.0,10.36
max,426238.0,200.0,218587.0,234402.0


Getting categorical columns
Column: Posting Type
Distinct Values: 2
Top Values:
+------------+-----+
|Posting Type|count|
+------------+-----+
|Internal    |1684 |
|External    |1262 |
+------------+-----+

Column: Level
Distinct Values: 14
Top Values:
+-----+-----+
|Level|count|
+-----+-----+
|0    |1112 |
|1    |521  |
|2    |505  |
|3    |299  |
|M1   |161  |
+-----+-----+

Column: Full-Time/Part-Time indicator
Distinct Values: 3
Top Values:
+-----------------------------+-----+
|Full-Time/Part-Time indicator|count|
+-----------------------------+-----+
|F                            |2625 |
|null                         |195  |
|P                            |126  |
+-----------------------------+-----+

Column: Salary Frequency
Distinct Values: 3
Top Values:
+----------------+-----+
|Salary Frequency|count|
+----------------+-----+
|Annual          |2712 |
|Hourly          |195  |
|Daily           |39   |
+----------------+-----+

