Importing libraries

In [1]:
import os
import boto3
import gzip
import json
import pyspark.sql.functions as F
from dotenv import load_dotenv
from io import BytesIO
from pyspark.sql import SparkSession
from sqlalchemy import create_engine, MetaData, Table, Column, Integer, String, Date, Boolean, Text, BigInteger
from datetime import datetime, timedelta
# Load environment variables
load_dotenv(override=True)

True

In [2]:
## print database connection string
print(os.getenv('POSTGRES_CONNECTION_STRING'))

postgresql://airflow:airflow@localhost:5454/job_ads_db


## Preparation

Connecting to S3 bucket

In [3]:
# Creating Boto3 Session
aws_access_key_id = os.getenv('AWS_ACCESS_KEY_ID')
aws_secret_access_key = os.getenv('AWS_SECRET_ACCESS_KEY')
aws_region = os.getenv('AWS_REGION')
aws_bucket_name = os.getenv('AWS_BUCKET_NAME')

session = boto3.Session(
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
    region_name=aws_region
)
print(session)

prefix = 'DE/monthly/'

# Create an S3 client
s3 = boto3.client('s3')


Session(region_name='eu-central-1')


Creating Spark Session

In [4]:
ROOT_DIR = os.path.abspath(os.pardir)
spark = SparkSession.builder \
    .appName("DE-Project") \
    .config("spark.master", "local[*]") \
    .config("spark.executor.memory", "4g") \
    .config("spark.executor.cores", 4) \
    .config("spark.driver.memory", "4g") \
    .config("spark.default.parallelism", 64) \
    .config("spark.sql.shuffle.partitions", 64) \
    .config("spark.memory.offHeap.enabled", True) \
    .config("spark.memory.offHeap.size", "2g") \
    .config("spark.sql.autoBroadcastJoinThreshold", -1) \
    .config("spark.jars", ROOT_DIR+"/postgresql-42.6.0.jar") \
    .config("spark.driver.extraClassPath", ROOT_DIR+"/postgresql-42.6.0.jar") \
    .config("spark.executor.extraClassPath", ROOT_DIR+"/postgresql-42.6.0.jar") \
    .getOrCreate()
# .config("spark.sql.shuffle.partitions", "50") \
# .config("spark.sql.sources.partitionOverwriteMode", "dynamic") \
print(spark)

23/09/14 22:30:32 WARN Utils: Your hostname, Maher-PC resolves to a loopback address: 127.0.1.1; using 192.168.14.9 instead (on interface wlp5s0)
23/09/14 22:30:32 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
23/09/14 22:30:33 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


<pyspark.sql.session.SparkSession object at 0x7f8f3f9ea710>


Connecting to Postgres

In [5]:
connection_string = os.getenv('POSTGRES_CONNECTION_STRING')
engine = create_engine(connection_string, isolation_level="AUTOCOMMIT")
print(engine)
pgconn = engine.connect()
print(pgconn)

Engine(postgresql://airflow:***@localhost:5454/job_ads_db)
<sqlalchemy.engine.base.Connection object at 0x7f8f3f9e8fa0>


Creating a table in Postgres if it doesn't exist

In [6]:
# Create a table if it doesn't exist
metadata = MetaData()
# table = Table('tk_2023_07', metadata)

if pgconn.dialect.has_table(pgconn, 'tk_2023_07'):
  print('Table already exists.')
else:
    print('Creating table...')
    table = Table(os.getenv("POSTGRES_TABLE"), metadata,
                Column('id', Integer, primary_key=True),
                Column('job_id', String(32)),
                Column('posting_count', Integer),
                Column('source_website_count', Integer),
                Column('date', Date),
                Column('sequence_number', BigInteger),
                Column('expiration_date', Date),
                Column('expired', Boolean),
                Column('duration', Integer),
                Column('source_url', String(255)),
                Column('source_website', String(255)),
                Column('source_type', String(2)),
                # Column('duplicate', Boolean),
                # Column('first_posting', Boolean),
                Column('posting_id', String(32)),
                Column('duplicate_on_jobsite', Boolean),
                Column('via_intermediary', Boolean),
                Column('language', String(3)),
                Column('job_title', String(255)),
                Column('profession', String(4)),
                Column('profession_group', String(4)),
                Column('profession_class', String(4)),
                Column('profession_isco_code', String(10)),
                Column('location', String(5)),
                Column('location_name', String(255)),
                Column('location_coordinates', String(30)),
                Column('location_remote_possible', Boolean),
                Column('region', String(2)),
                Column('education_level', String(2)),
                Column('advertiser_name', String(255)),
                Column('advertiser_type', String(2)),
                Column('advertiser_street', String(255)),
                Column('advertiser_postal_code', String(15)),
                Column('advertiser_location', String(255)),
                Column('advertiser_phone', String(255)),
                Column('available_contact_fields', String(100)),
                # Column('organization', Integer),
                Column('organization_name', String(255)),
                Column('organization_industry', String(2)),
                Column('organization_activity', String(10)),
                Column('organization_size', String(2)),
                Column('organization_address', String(255)),
                Column('organization_street_number', String(100)),
                Column('organization_postal_code', String(5)),
                Column('organization_location', String(5)),
                Column('organization_location_name', String(255)),
                Column('organization_region', String(2)),
                Column('contract_type', String(2)),
                Column('working_hours_type', String(1)),
                Column('hours_per_week_from', Integer),
                Column('hours_per_week_to', Integer),
                Column('employment_type', String(1)),
                Column('full_text', Text),
                Column('job_description', Text),
                Column('candidate_description', Text),
                Column('conditions_description', Text),
                # Column('professional_skill_terms', Text),
                Column('soft_skills', Text),
                Column('professional_skills', Text),
                Column('advertiser_house_number', String(15)),
                Column('advertiser_email', String(255)),
                Column('advertiser_website', String(255)),
                Column('advertiser_contact_person', String(255)),
                Column('advertiser_reference_number', String(255)),
                Column('application_description', Text),
                Column('organization_website', String(100)),
                Column('employer_description', Text),
                Column('language_skills', Text),
                Column('it_skills', Text),
                Column('organization_linkedin_id', String(255)),
                Column('organization_national_id', String(25)),
                Column('experience_years_from', Integer),
                Column('salary', Integer),
                Column('salary_from', Integer),
                Column('salary_to', Integer),
                Column('experience_years_to', Integer),
                Column('advertiser_spend', Integer),
                Column('apply_url', String(255)),
                Column('experience_level', String(17)),
                Column('location_postal_code', String(7)),
                Column('profession_kldb_code', String(5)),
                Column('profession_onet_2019_code', String(10)),
                Column('salary_from_rate', String(10)),
                Column('salary_time_scale', String(1)),
                Column('salary_to_rate', String(10))
                )

metadata.create_all(engine)
  
                                     

Table already exists.


## Extract

Outputting the list of files in the bucket

In [29]:
# Get the list of objects in the S3 bucket
response = s3.list_objects_v2(Bucket=aws_bucket_name, Prefix=prefix, Delimiter='/')
print(response)


{'ResponseMetadata': {'RequestId': 'PDC8SBZ03KK5K60A', 'HostId': 'fUJd8SzZt4B6tBi3Y7d8mNIoirTSaPdSgDs/BTr/aa7CSbFovCxQA4kY2gY2hdcxnfZqDcgf00o=', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amz-id-2': 'fUJd8SzZt4B6tBi3Y7d8mNIoirTSaPdSgDs/BTr/aa7CSbFovCxQA4kY2gY2hdcxnfZqDcgf00o=', 'x-amz-request-id': 'PDC8SBZ03KK5K60A', 'date': 'Thu, 14 Sep 2023 20:47:55 GMT', 'x-amz-bucket-region': 'eu-central-1', 'content-type': 'application/xml', 'transfer-encoding': 'chunked', 'server': 'AmazonS3'}, 'RetryAttempts': 0}, 'IsTruncated': False, 'Name': 'jobfeed-data-feeds', 'Prefix': 'DE/monthly/', 'Delimiter': '/', 'MaxKeys': 1000, 'CommonPrefixes': [{'Prefix': 'DE/monthly/2020-06/'}, {'Prefix': 'DE/monthly/2020-07/'}, {'Prefix': 'DE/monthly/2020-08/'}, {'Prefix': 'DE/monthly/2020-09/'}, {'Prefix': 'DE/monthly/2020-10/'}, {'Prefix': 'DE/monthly/2020-11/'}, {'Prefix': 'DE/monthly/2020-12/'}, {'Prefix': 'DE/monthly/2021-01/'}, {'Prefix': 'DE/monthly/2021-02/'}, {'Prefix': 'DE/monthly/2021-03/'}, {'Prefix':

Downloading the files from the bucket

In [30]:
#Number of Months to download
months = 1
# Number of files per month to download
files_per_month = 1
# current project directory parent path
ROOT_DIR = os.path.abspath(os.pardir)

# Get the list of subfolders in the S3 bucket
subfolders = [obj['Prefix'] for obj in response['CommonPrefixes']]
# Get the last N subfolders - N = months of data to download
subfolders = subfolders[-months:]


filesToLoadInDF = []
# Download files from each subfolder
for subfolder in subfolders:
    # Get the list of files in the subfolder
    response = s3.list_objects_v2(Bucket=aws_bucket_name, Prefix=subfolder)
    # Get the file paths
    files = [obj['Key'] for obj in response['Contents'] if obj['Key'].endswith('.jsonl.gz')]
    # Only get the first N files
    files = files[:files_per_month]

    # filesToLoadInDF = [filesToLoadInDF.append(f) for f in files]

    # Create the folder in your local machine
    folder = ROOT_DIR + "/data/raw/" + aws_bucket_name + "/" + subfolder
    if not os.path.exists(folder):
        os.makedirs(folder)
    # Download and extract each file
    for file in files:
        filename = file.rsplit("/", 1)[1]
        print('Downloading file {}...'.format(filename))
        print(subfolder + filename)
        print(folder + filename)

        # Check if the file already exists
        localExtractedFilePath = os.path.join(folder + filename[:-3])
        if not os.path.exists(localExtractedFilePath):
            # Download and Save the file
            s3.download_file(Filename=folder + filename, Bucket=aws_bucket_name, Key=subfolder + filename)

            locaFilePath = os.path.join(folder + filename)
            print(localExtractedFilePath)
            filesToLoadInDF.append(localExtractedFilePath)
            # Extract the data from the gzipped file
            with gzip.open(locaFilePath, 'rb') as gz_file, open(localExtractedFilePath, 'wb') as extract_file:
                extract_file.write(gz_file.read())

            # Delete the gzipped file
            os.remove(locaFilePath)
        else:
            filesToLoadInDF.append(localExtractedFilePath)
            print('File already exists. Skipping...')

Downloading file jobs.0.jsonl.gz...
DE/monthly/2023-08/jobs.0.jsonl.gz
/home/maher/GithubProjects/Bigdata-Processing-pipeline/data/raw/jobfeed-data-feeds/DE/monthly/2023-08/jobs.0.jsonl.gz
File already exists. Skipping...


Reading the files into a Spark Dataframe

In [31]:
print(filesToLoadInDF)

['/home/maher/GithubProjects/Bigdata-Processing-pipeline/data/raw/jobfeed-data-feeds/DE/monthly/2023-08/jobs.0.jsonl']


In [32]:
df = spark.read.json(filesToLoadInDF)  # Use the extracted file paths here
df.show(truncate=False)



+--------------------------+-----------------------------------+-----------------------+-------------------+---------------------------------------------+---------------------------------------------------------------+----------------------+---------------------------+----------------+--------------------------+------------------------+-----------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

                                                                                

In [33]:
df.printSchema()

root
 |-- advertiser_contact_person: string (nullable = true)
 |-- advertiser_email: string (nullable = true)
 |-- advertiser_house_number: string (nullable = true)
 |-- advertiser_location: string (nullable = true)
 |-- advertiser_name: string (nullable = true)
 |-- advertiser_phone: string (nullable = true)
 |-- advertiser_postal_code: string (nullable = true)
 |-- advertiser_reference_number: string (nullable = true)
 |-- advertiser_spend: long (nullable = true)
 |-- advertiser_street: string (nullable = true)
 |-- advertiser_type: struct (nullable = true)
 |    |-- label: string (nullable = true)
 |    |-- value: long (nullable = true)
 |-- advertiser_website: string (nullable = true)
 |-- application_description: string (nullable = true)
 |-- apply_url: string (nullable = true)
 |-- available_contact_fields: string (nullable = true)
 |-- candidate_description: string (nullable = true)
 |-- conditions_description: string (nullable = true)
 |-- contract_type: struct (nullable = true

In [34]:
df.count()

                                                                                

100000

Extract Last 6 months Data from Postgres into Spark Dataframe

In [54]:
today = datetime.now()
end_date = today.replace(day=1) - timedelta(days=1)
start_date = end_date - timedelta(days=30)  # 180 days for six months
# Adjust the start_date to the first day of the month
start_date = start_date.replace(day=1)

start_date_str = start_date.strftime("%Y-%m-%d")
print(start_date_str)
end_date_str = end_date.strftime("%Y-%m-%d")
print(end_date_str)

where_clause = f"(SELECT * FROM tk_2023_07 WHERE date >= '{start_date_str}' AND date <= '{end_date_str}') as tk"
print(where_clause)
postgres_data = spark.read.format("jdbc") \
    .option("url", os.getenv('POSTGRES_CONNECTION_JDBC_STRING')) \
    .option("dbtable", where_clause) \
    .option("user", os.getenv('POSTGRES_USER')) \
    .option("password", os.getenv('POSTGRES_PASSWORD')) \
    .option("driver", "org.postgresql.Driver") \
    .load()

postgres_data.show(truncate=False)

2023-08-01
2023-08-31
(SELECT * FROM tk_2023_07 WHERE date >= '2023-08-01' AND date <= '2023-08-31') as tk


[Stage 136:>                                                        (0 + 1) / 1]

+-------+--------------------------------+-------------+--------------------+----------+---------------+---------------+-------+--------+---------------------------------------------------------------------------------------------------------------------------------------------------+----------------------+-----------+--------------------------------+--------------------+----------------+--------+-------------------------------------------------------------------------------+----------+----------------+----------------+--------------------+--------+----------------+-------------------------+------------------------+------+---------------+-------------------------------------------------+---------------+-----------------+----------------------+-------------------+------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------+------------------------------------

                                                                                

In [55]:
postgres_data_count = postgres_data.count()
postgres_data_count

                                                                                

300000

In [56]:
df.show(truncate=False)

+--------------------------+-----------------------------------+-----------------------+-------------------+---------------------------------------------+---------------------------------------------------------------+----------------------+---------------------------+----------------+--------------------------+------------------------+-----------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

Compare the two dataframes and find the differences

In [57]:
# Find difference between the two dataframes when postgres_data count is greater than 0
if postgres_data_count > 0:
    changes_df = df.join(postgres_data, df.posting_id == postgres_data.posting_id, how='left_anti')
    changes_df.show(truncate=False)
else:
    changes_df = df
    changes_df.show(truncate=False)



+-------------------------+----------------+-----------------------+-------------------+---------------+----------------+----------------------+---------------------------+----------------+-----------------+---------------+------------------+-----------------------+---------+------------------------+---------------------+----------------------+-------------+----+--------------------+--------+---------------+--------------------+---------------+----------------+---------------------+-------------------+---------------+-------+---------+-------------------+-----------------+---------+---------------+------+---------+--------+---------------+--------+--------------------+-------------+--------------------+------------------------+---------------------+--------------------+---------------------+------------------------+---------------------+--------------------------+-----------------+------------------------+------------------------+-------------------+-----------------+------------------

                                                                                

In [58]:
# Order by 'date'
changes_df = changes_df.orderBy('date')

In [59]:
changes_df.show(truncate=False)



+-------------------------+----------------+-----------------------+-------------------+---------------+----------------+----------------------+---------------------------+----------------+-----------------+---------------+------------------+-----------------------+---------+------------------------+---------------------+----------------------+-------------+----+--------------------+--------+---------------+--------------------+---------------+----------------+---------------------+-------------------+---------------+-------+---------+-------------------+-----------------+---------+---------------+------+---------+--------+---------------+--------+--------------------+-------------+--------------------+------------------------+---------------------+--------------------+---------------------+------------------------+---------------------+--------------------------+-----------------+------------------------+------------------------+-------------------+-----------------+------------------

                                                                                

In [60]:
changes_df.count()

                                                                                

0

## Transform

Change Column Types

In [42]:
import pyspark.sql.types as T

# Change the data types of the columns.
df_types_fixed= changes_df.withColumn("job_id", changes_df.job_id.cast(T.StringType())) \
    .withColumn("posting_count", changes_df.posting_count.cast(T.LongType())) \
    .withColumn("source_website_count", changes_df.source_website_count.cast(T.LongType())) \
    .withColumn("date", changes_df.date.cast(T.DateType())) \
    .withColumn("expiration_date", changes_df.expiration_date.cast(T.DateType())) \
    .withColumn("duration", changes_df.duration.cast(T.LongType())) \
    .withColumn("salary", changes_df.salary.cast(T.LongType())) \
    .withColumn("salary_from", changes_df.salary_from.cast(T.LongType())) \
    .withColumn("salary_to", changes_df.salary_to.cast(T.LongType())) \
    .withColumn("experience_years_from", changes_df.experience_years_from.cast(T.LongType())) \
    .withColumn("experience_years_to", changes_df.experience_years_to.cast(T.LongType())) \
    .withColumn("hours_per_week_from", changes_df.hours_per_week_from.cast(T.LongType())) \
    .withColumn("hours_per_week_to", changes_df.hours_per_week_to.cast(T.LongType())) \
    .withColumn('working_hours_type', changes_df.working_hours_type.value.cast(T.IntegerType())) \
    .withColumn('advertiser_type', changes_df.advertiser_type.value.cast(T.StringType())) \
    .withColumn('contract_type', changes_df.contract_type.value.cast(T.StringType())) \
    .withColumn('education_level', changes_df.education_level.value.cast(T.StringType())) \
    .withColumn('employment_type', changes_df.employment_type.value.cast(T.StringType())) \
    .withColumn('experience_level', changes_df.experience_level.value.cast(T.StringType())) \
    .withColumn("it_skills", F.concat_ws(",", changes_df.it_skills.value)) \
    .withColumn("language_skills", F.concat_ws(",", changes_df.language_skills.value)) \
    .withColumn('organization_activity', changes_df.organization_activity.value.cast(T.StringType())) \
    .withColumn('organization_industry', changes_df.organization_industry.value.cast(T.StringType())) \
    .withColumn('organization_region', changes_df.organization_region.value.cast(T.StringType())) \
    .withColumn('organization_size', changes_df.organization_size.value.cast(T.StringType())) \
    .withColumn('profession', changes_df.profession.value.cast(T.StringType())) \
    .withColumn('profession_class', changes_df.profession_class.value.cast(T.StringType())) \
    .withColumn('profession_group', changes_df.profession_group.value.cast(T.StringType())) \
    .withColumn('profession_isco_code', changes_df.profession_isco_code.value.cast(T.StringType())) \
    .withColumn('profession_kldb_code', changes_df.profession_kldb_code.value.cast(T.StringType())) \
    .withColumn('profession_onet_2019_code', changes_df.profession_onet_2019_code.value.cast(T.StringType())) \
    .withColumn("professional_skills", F.concat_ws(",", changes_df.professional_skills.value)) \
    .withColumn('region', changes_df.region.value.cast(T.StringType())) \
    .withColumn('organization_national_id', F.substring(changes_df.organization_national_id, 1, 25)) \
    .withColumn("soft_skills", F.concat_ws(",", changes_df.soft_skills.value)) \
    .withColumn('source_type', changes_df.source_type.value.cast(T.StringType())) \
    .withColumn('advertiser_email', F.concat_ws(",", F.slice(F.split(changes_df.advertiser_email, ','), 1, 5))) \
    .withColumn('advertiser_website', F.concat_ws(",", F.slice(F.split(changes_df.advertiser_website, ','), 1, 5))) \
    .withColumn('advertiser_contact_person', F.substring(changes_df.advertiser_contact_person, 1, 255)) \
    .withColumn('advertiser_reference_number', F.substring(changes_df.advertiser_reference_number, 1, 255)) \
    .withColumn('organization_website', F.substring(changes_df.organization_website, 1, 100)) \
    .withColumn('organization_linkedin_id', F.substring(changes_df.organization_linkedin_id, 1, 255)) \
    .withColumn('apply_url', F.substring(changes_df.apply_url, 1, 255)) \
    .withColumn('source_url', F.substring(changes_df.source_url, 1, 255)) \
    .withColumn('source_website', F.substring(changes_df.source_website, 1, 255)) \
    .withColumn('advertiser_name', F.substring(changes_df.advertiser_name, 1, 255)) \
    .withColumn('advertiser_phone', F.concat_ws(",", F.slice(F.split(changes_df.advertiser_phone, ','), 1, 10))) \
 

df_types_fixed.printSchema()

root
 |-- advertiser_contact_person: string (nullable = true)
 |-- advertiser_email: string (nullable = false)
 |-- advertiser_house_number: string (nullable = true)
 |-- advertiser_location: string (nullable = true)
 |-- advertiser_name: string (nullable = true)
 |-- advertiser_phone: string (nullable = false)
 |-- advertiser_postal_code: string (nullable = true)
 |-- advertiser_reference_number: string (nullable = true)
 |-- advertiser_spend: long (nullable = true)
 |-- advertiser_street: string (nullable = true)
 |-- advertiser_type: string (nullable = true)
 |-- advertiser_website: string (nullable = false)
 |-- application_description: string (nullable = true)
 |-- apply_url: string (nullable = true)
 |-- available_contact_fields: string (nullable = true)
 |-- candidate_description: string (nullable = true)
 |-- conditions_description: string (nullable = true)
 |-- contract_type: string (nullable = true)
 |-- date: date (nullable = true)
 |-- duplicate_on_jobsite: boolean (nullabl

In [43]:
df_types_fixed.show(truncate=False) 



+-------------------------+-------------------------------+-----------------------+--------------------+--------------------------------------------------------------------------------------------------------+--------------------------+----------------------+---------------------------------+----------------+------------------------+---------------+---------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------

                                                                                

Check for duplicates

In [22]:
# Drop duplicates
# df = df.dropDuplicates()
# Drop duplicates in posting_id column and keep the latest one
df_types_fixed = df_types_fixed.dropDuplicates(['posting_id'])

In [23]:
df_types_fixed.count()

                                                                                

179818

Replacing empty string with None

In [24]:
# Replacing empty string with None
df_types_fixed = df_types_fixed.withColumn("it_skills", F.when(df_types_fixed.it_skills == "", None).otherwise(df_types_fixed.it_skills)) \
    .withColumn("language_skills", F.when(df_types_fixed.language_skills == "", None).otherwise(df_types_fixed.language_skills)) \
    .withColumn("professional_skills", F.when(df_types_fixed.professional_skills == "", None).otherwise(df_types_fixed.professional_skills)) \
    .withColumn("soft_skills", F.when(df_types_fixed.soft_skills == "", None).otherwise(df_types_fixed.soft_skills)) \

df_types_fixed = df_types_fixed.replace('', None)
df_types_fixed.show()



+-------------------------+--------------------+-----------------------+-------------------+--------------------+--------------------+----------------------+---------------------------+----------------+--------------------+---------------+--------------------+-----------------------+--------------------+------------------------+---------------------+----------------------+-------------+----------+--------------------+--------+---------------+--------------------+---------------+-----------------+---------------------+-------------------+---------------+-------+--------------------+-------------------+-----------------+--------------------+--------------------+--------------------+--------------------+--------+--------------------+--------+--------------------+--------------------+--------------------+------------------------+---------------------+--------------------+---------------------+------------------------+---------------------+--------------------------+--------------------+---

                                                                                

Check for null values

In [25]:
from pyspark.sql.functions import isnan, when, count, col


# Calculate the null counts for each column
df_nullCount = df.select([count(when(col(c).isNull(), c)).cast('integer').alias(c) for c in df.columns])

# Melt (transpose) the DataFrame
columns = df.columns
exprs = [f"stack({len(columns)}, " + ", ".join([f"'{c}', {c}" for c in columns]) + f") as (Labels, Null_Counts)"]
df_nullCount_transpose = df_nullCount.selectExpr(*exprs)

# Order the transposed DataFrame by Null_Counts in descending order
df_nullCount_transpose = df_nullCount_transpose.orderBy('Null_Counts', ascending=False)

# Show the result DataFrame
df_nullCount_transpose.show()



+--------------------+-----------+
|              Labels|Null_Counts|
+--------------------+-----------+
| experience_years_to|     296917|
|experience_years_...|     283171|
|organization_nati...|     278028|
| hours_per_week_from|     250414|
|   hours_per_week_to|     250414|
|    experience_level|     238748|
|         salary_from|     232911|
|              salary|     231936|
|           salary_to|     231936|
|organization_link...|     231453|
|    salary_from_rate|     223762|
|      salary_to_rate|     223658|
|   salary_time_scale|     222615|
|advertiser_refere...|     209214|
|           it_skills|     198644|
|     language_skills|     198537|
|            duration|     175753|
|     expiration_date|     175753|
|location_postal_code|     174353|
|advertiser_house_...|     167656|
+--------------------+-----------+


                                                                                

Calculate the null percentages for each column

In [26]:
from pyspark.sql.functions import expr

# Calculate the total number of rows
rows_count = df.count()
# Calculate the null percentages for each column
df_null_percentages = df_nullCount_transpose.withColumn(
    "Null_Percentages", expr(f"(Null_Counts / {rows_count}) * 100")
).select("Labels", "Null_Percentages")

# Order the DataFrame by Null_Percentages in descending order
df_null_percentages = df_null_percentages.orderBy("Null_Percentages", ascending=False)

# Show the result DataFrame
df_null_percentages.show()




+--------------------+------------------+
|              Labels|  Null_Percentages|
+--------------------+------------------+
| experience_years_to| 98.97233333333332|
|experience_years_...| 94.39033333333333|
|organization_nati...|            92.676|
| hours_per_week_from| 83.47133333333333|
|   hours_per_week_to| 83.47133333333333|
|    experience_level| 79.58266666666667|
|         salary_from|            77.637|
|              salary|            77.312|
|           salary_to|            77.312|
|organization_link...|            77.151|
|    salary_from_rate| 74.58733333333333|
|      salary_to_rate| 74.55266666666667|
|   salary_time_scale|            74.205|
|advertiser_refere...|            69.738|
|           it_skills| 66.21466666666666|
|     language_skills|            66.179|
|            duration| 58.58433333333334|
|     expiration_date| 58.58433333333334|
|location_postal_code|58.117666666666665|
|advertiser_house_...| 55.88533333333333|
+--------------------+------------

                                                                                

## Load

Loading the data into Postgres

Print the final dataframe column names

In [27]:
df_final = df_types_fixed
print(len(df_final.columns))
df_final.columns

80


['advertiser_contact_person',
 'advertiser_email',
 'advertiser_house_number',
 'advertiser_location',
 'advertiser_name',
 'advertiser_phone',
 'advertiser_postal_code',
 'advertiser_reference_number',
 'advertiser_spend',
 'advertiser_street',
 'advertiser_type',
 'advertiser_website',
 'application_description',
 'apply_url',
 'available_contact_fields',
 'candidate_description',
 'conditions_description',
 'contract_type',
 'date',
 'duplicate_on_jobsite',
 'duration',
 'education_level',
 'employer_description',
 'employment_type',
 'experience_level',
 'experience_years_from',
 'experience_years_to',
 'expiration_date',
 'expired',
 'full_text',
 'hours_per_week_from',
 'hours_per_week_to',
 'it_skills',
 'job_description',
 'job_id',
 'job_title',
 'language',
 'language_skills',
 'location',
 'location_coordinates',
 'location_name',
 'location_postal_code',
 'location_remote_possible',
 'organization_activity',
 'organization_address',
 'organization_industry',
 'organization_

Write the data into postgres database

In [28]:
# from pyspark.sql import DataFrameWriter
# Write the data to a table in Postgres
df_final.write.format("jdbc") \
    .option("url", os.getenv('POSTGRES_CONNECTION_JDBC_STRING')) \
    .option("dbtable", os.getenv('POSTGRES_TABLE')) \
    .option("user", os.getenv('POSTGRES_USER')) \
    .option("password", os.getenv('POSTGRES_PASSWORD')) \
    .option("driver", "org.postgresql.Driver") \
    .mode("append") \
    .save()