Importing libraries

In [10]:
import gzip
import json
import os
import shutil
from io import BytesIO
from sqlite3 import Row

import boto3
import psycopg2
from dotenv import load_dotenv
from pyspark.sql import SparkSession

# Load environment variables via python-dotenv so that the os.getenv(...)
# lookups in the cells below (AWS and Postgres settings) can see them.
# NOTE(review): presumably read from a local .env file — confirm its location.
load_dotenv()

True

Connecting to S3 bucket

In [17]:
# AWS settings are read from environment variables (loaded by load_dotenv above).
aws_access_key_id = os.getenv('AWS_ACCESS_KEY_ID')
aws_secret_access_key = os.getenv('AWS_SECRET_ACCESS_KEY')
aws_region = os.getenv('AWS_REGION')
aws_bucket_name = os.getenv('AWS_BUCKET_NAME')

# Explicit Boto3 session carrying the credentials and region read above.
session = boto3.Session(
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
    region_name=aws_region
)
print(session)

# Key prefix under which the monthly subfolders are listed.
prefix = 'DE/monthly/'

# Create the S3 client FROM the session so it actually uses the credentials and
# region configured above. boto3.client('s3') would build the client from
# boto3's default session and silently ignore `session`.
s3 = session.client('s3')


Session(region_name='eu-central-1')


Listing the monthly subfolders in the bucket

In [18]:
# List what sits directly under `prefix`. With Delimiter='/', S3 groups deeper
# keys into 'CommonPrefixes' entries, which the download cell reads as subfolders.
response = s3.list_objects_v2(
    Bucket=aws_bucket_name,
    Prefix=prefix,
    Delimiter='/',
)
print(response)


{'ResponseMetadata': {'RequestId': 'REJFE7F2K553GJ4T', 'HostId': 'VoIj3aplrDsKwSqDfnuXJCyeK4jlCoWUtrZKIgO6Yi2BptcBfuyt1H/RwPZ+rN4/eRGlzQjOk/8=', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amz-id-2': 'VoIj3aplrDsKwSqDfnuXJCyeK4jlCoWUtrZKIgO6Yi2BptcBfuyt1H/RwPZ+rN4/eRGlzQjOk/8=', 'x-amz-request-id': 'REJFE7F2K553GJ4T', 'date': 'Fri, 07 Jul 2023 01:51:57 GMT', 'x-amz-bucket-region': 'eu-central-1', 'content-type': 'application/xml', 'transfer-encoding': 'chunked', 'server': 'AmazonS3'}, 'RetryAttempts': 1}, 'IsTruncated': False, 'Name': 'jobfeed-data-feeds', 'Prefix': 'DE/monthly/', 'Delimiter': '/', 'MaxKeys': 1000, 'CommonPrefixes': [{'Prefix': 'DE/monthly/2020-06/'}, {'Prefix': 'DE/monthly/2020-07/'}, {'Prefix': 'DE/monthly/2020-08/'}, {'Prefix': 'DE/monthly/2020-09/'}, {'Prefix': 'DE/monthly/2020-10/'}, {'Prefix': 'DE/monthly/2020-11/'}, {'Prefix': 'DE/monthly/2020-12/'}, {'Prefix': 'DE/monthly/2021-01/'}, {'Prefix': 'DE/monthly/2021-02/'}, {'Prefix': 'DE/monthly/2021-03/'}, {'Prefix':

Downloading the files from the bucket

In [None]:
# Number of months to download (the most recent N monthly subfolders)
months = 1
# Number of files per month to download
files_per_month = 1
# Parent of the current working directory; files are stored under <ROOT_DIR>/data/raw/
ROOT_DIR = os.path.abspath(os.pardir)


# Monthly subfolder prefixes taken from the bucket-level listing (`response`)
subfolders = [obj['Prefix'] for obj in response['CommonPrefixes']]
# Keep only the last N subfolders. months <= 0 is guarded explicitly because
# subfolders[-0:] would select EVERY subfolder instead of none.
subfolders = subfolders[-months:] if months > 0 else []


# Local paths of the extracted .jsonl files; consumed by the Spark cell
filesToLoadInDF = []
for subfolder in subfolders:
    # Use a dedicated name here: overwriting the bucket-level `response` would
    # make this cell fail on 'CommonPrefixes' when it is re-run on its own.
    subfolder_response = s3.list_objects_v2(Bucket=aws_bucket_name, Prefix=subfolder)
    # 'Contents' is absent from the response when the prefix holds no objects
    files = [
        obj['Key']
        for obj in subfolder_response.get('Contents', [])
        if obj['Key'].endswith('.jsonl.gz')
    ]
    # Only keep the first N files
    files = files[:files_per_month]

    # Local target folder mirrors the bucket name and subfolder prefix
    folder = os.path.join(ROOT_DIR, "data", "raw", aws_bucket_name, subfolder)
    os.makedirs(folder, exist_ok=True)

    for file in files:
        filename = file.rsplit("/", 1)[1]
        localFilePath = os.path.join(folder, filename)
        # Same path without the trailing '.gz'
        localExtractedFilePath = localFilePath[:-3]

        print('Downloading file {}...'.format(filename))
        print(file)
        print(localFilePath)
        print(localExtractedFilePath)

        # Download using the exact key returned by the listing; rebuilding it as
        # subfolder + filename would be wrong for keys nested below the subfolder.
        s3.download_file(Filename=localFilePath, Bucket=aws_bucket_name, Key=file)

        # Stream-decompress in chunks instead of reading the whole file into memory
        with gzip.open(localFilePath, 'rb') as gz_file, open(localExtractedFilePath, 'wb') as extract_file:
            shutil.copyfileobj(gz_file, extract_file)

        # Register the file only after it was extracted successfully
        filesToLoadInDF.append(localExtractedFilePath)

Downloading file jobs.0.jsonl.gz...
DE/monthly/2023-06/jobs.0.jsonl.gz
/home/vboxuser/Documents/PycharmProjects/Bigdata-Processing-pipeline/data/raw/jobfeed-data-feeds/DE/monthly/2023-06/jobs.0.jsonl.gz
/home/vboxuser/Documents/PycharmProjects/Bigdata-Processing-pipeline/data/raw/jobfeed-data-feeds/DE/monthly/2023-06/jobs.0.jsonl


Creating the Spark Session

In [None]:
# Build (or reuse) a local SparkSession for this project.
spark = (
    SparkSession.builder
    .master("local")
    .appName("DE-Project")
    .getOrCreate()
)

# Load every extracted file collected in filesToLoadInDF into one DataFrame
# and display the first rows.
print(filesToLoadInDF)
df = spark.read.json(filesToLoadInDF)
df.show()

Postgres Connection

In [12]:
# Postgres connection.
# psycopg2 is imported in the notebook's import cell rather than here, so all
# dependencies are declared in one place.
# Connection settings are read from environment variables; os.getenv returns
# None for any variable that is not set.
pgconn = psycopg2.connect(
    host=os.getenv('POSTGRES_HOST'),
    database=os.getenv('POSTGRES_DB'),
    user=os.getenv('POSTGRES_USER'),
    password=os.getenv('POSTGRES_PASSWORD'),
    port=os.getenv('POSTGRES_PORT')
)
# Display the connection object as the cell output
pgconn

<connection object at 0x7f6120b802c0; dsn: 'user=airflow password=xxx dbname=airflow host=localhost port=5432', closed: 0>