## Bronze Layer: Ingesting Data to Databricks

In [0]:
import json
import os

import requests
from pyspark.sql import SparkSession

# Bronze step: query the SerpApi Google Jobs endpoint once per role, tag every
# returned posting with the role that found it, and store the raw postings in
# the managed Delta table "bronze_jobs_raw".

# Obtain the Spark session used by the bronze step.
spark = SparkSession.builder.appName("JobETL_Bronze").getOrCreate()

# Read the API key from the environment instead of hardcoding it in the notebook.
# A key that has been committed to a notebook should be treated as leaked and rotated.
API_KEY = os.environ.get("SERPAPI_API_KEY")
if not API_KEY:
    raise RuntimeError("Set the SERPAPI_API_KEY environment variable before running this notebook.")

roles = ["Data Engineer", "Python Developer", "ETL Developer", "Spark Engineer", "Data Analyst"]
location = "India"
all_jobs = []

# Run one search per job role.
for role in roles:
    params = {
        "engine": "google_jobs",
        "q": role,  # search query: the role name, e.g. "Data Engineer"
        "location": location,
        "api_key": API_KEY,
    }
    # A timeout keeps a stalled connection from hanging the notebook; raise_for_status
    # surfaces HTTP errors (bad key, quota exceeded) instead of silently yielding no jobs.
    res = requests.get("https://serpapi.com/search.json", params=params, timeout=30)
    res.raise_for_status()
    print("✅Reading 🔴LIVE Data from Google Jobs API")

    # A response without "jobs_results" is treated as zero postings for this role.
    jobs = res.json().get("jobs_results", [])

    # Record which search produced each posting.
    for job in jobs:
        job["search_role"] = role

    # Extend once per role. Doing this inside the loop above would add the whole
    # result list once per posting and duplicate every row.
    all_jobs.extend(jobs)

# createDataFrame cannot infer a schema from an empty list, so fail with a clear
# message rather than a schema-inference error (and never overwrite bronze with nothing).
if not all_jobs:
    raise ValueError("No job postings were returned by the API; bronze table was not updated.")

# Convert the Python list of dicts directly into a Spark DataFrame.
bronze_df = spark.createDataFrame(all_jobs)

# Save as a managed Delta table (instead of a DBFS path, which the free edition lacks).
bronze_df.write.format("delta").mode("overwrite").saveAsTable("bronze_jobs_raw")
print("✅Data Written to Bronze Layer")

In [0]:
# display(bronze_df)

## Silver Layer: Cleaning and Structuring the Data

In [0]:
from pyspark.sql.functions import *

# Obtain the Spark session used by the silver step.
spark = SparkSession.builder.appName("JobETL_Silver").getOrCreate()

# Load the raw postings saved by the bronze step.
print("✅Reading from Bronze Layer -------> Silver Layer")
df = spark.read.table("bronze_jobs_raw")

# Columns carried into the silver layer; posted_at is lifted out of detected_extensions.
silver_columns = [
    "title",
    "company_name",
    "location",
    "description",
    "job_id",
    "detected_extensions.posted_at as posted_at",
    "search_role",
]

# A row is kept only when all of these fields are present.
required_columns = ["title", "company_name", "location"]

silver_df = df.selectExpr(*silver_columns).na.drop(subset=required_columns)

In [0]:
# Preview the selected silver rows before they are normalised and written.
display(silver_df)

In [0]:
# Normalise company names (upper-cased, surrounding whitespace removed) and
# discard rows that have no posted_at value.
normalised_company = trim(upper(col("company_name")))
silver_df = (
    silver_df
    .withColumn("company_name", normalised_company)
    .na.drop(subset=["posted_at"])
)

# Append to the managed silver table, letting Delta merge any new columns.
# NOTE(review): append mode adds these rows again on every run — confirm that is intended.
silver_writer = silver_df.write.format("delta").option("mergeSchema", "true").mode("append")
silver_writer.saveAsTable("silver_jobs")
print("✅ Data Written to Silver Layer!")
display(silver_df)

## Golden Layer: Generate KPI Tables

In [0]:
# Method 1 is best because it uses PySpark, so there is distributed processing
from pyspark.sql.functions import *
from pyspark.sql import SparkSession

# Gold step: copy the silver postings into gold_jobs and compute the KPI aggregates.

# Obtain the Spark session used by the gold step.
spark = SparkSession.builder.appName("JobETL_Gold").getOrCreate()

# Read the cleaned postings from the silver table.
print("✅Reading from Silver Layer -------> Golden Layer")
df = spark.read.table("silver_jobs")

# gold_jobs is a full copy of silver_jobs, so replace it on every run. Appending
# the whole silver table each time would store every earlier row again and
# inflate any count later computed from gold_jobs.
(
    df.write.format("delta")
    .option("overwriteSchema", "true")  # allow the replaced table to take the current schema
    .mode("overwrite")
    .saveAsTable("gold_jobs")
)

# KPI 1 - Top companies: postings per company, most postings first.
# Not displayed here; call display(top_companies) to view it.
top_companies = df.groupBy("Company_name").count().orderBy(col("count").desc())

# KPI 2 - Jobs by city: postings per location, most postings first.
display(df.groupBy('Location').agg(count('*').alias('job_count')).orderBy(col('job_count').desc()))

In [0]:
# -- There will be no distributed processing
from pyspark.sql.functions import *
from pyspark.sql import SparkSession

# Obtain the Spark session used by the gold step.
spark = SparkSession.builder.appName("JobETL_Gold").getOrCreate()

# Read the cleaned postings from the silver table.
df = spark.read.table("silver_jobs")

# JobAggregatorTable mirrors the full silver table, so replace it on each run.
# Appending the whole table every time would duplicate all rows written earlier.
df.write.mode("overwrite").saveAsTable("JobAggregatorTable")
print("✅Data Successfully Written to Table ---> JobAggregatorTable")


In [0]:
%sql
-- Show every row stored in JobAggregatorTable.
select * from JobAggregatorTable

Databricks visualization. Run in Databricks to view.

In [0]:
# Status message printed once the gold-layer cells above have run.
print("😍Congratulations!!!!! Full ETL Job Aggregator Project is Completed and LIVE!!!!")

In [0]:
%sql
-- List the recorded write history of JobAggregatorTable.
describe history JobAggregatorTable

In [0]:
%sql
-- Show every row stored in JobAggregatorTable (same query as the earlier %sql cell).
select * from JobAggregatorTable

# Data Analytics Part: Visualisation of Business Scenarios

In [0]:
import matplotlib.pyplot as plt

In [0]:
# Obtain the Spark session for the visualisation cells (getOrCreate reuses an active session if one exists).
spark = SparkSession.builder.appName("VisualsFromGold").getOrCreate()

In [0]:
# Load the gold_jobs table and keep only the rows that have a posted_at value.
gold_jobs_df = spark.read.table("gold_jobs")
df_companies = gold_jobs_df.na.drop(subset=["posted_at"])
display(df_companies)

In [0]:
# Collect the first 10 rows, ordered by posted_at ascending, into a pandas DataFrame.
# NOTE(review): if posted_at is free text from the API this ordering is lexicographic — confirm.
ordered_by_posted_at = df_companies.orderBy("posted_at", ascending=True)
pdf = ordered_by_posted_at.limit(10).toPandas()
display(pdf)

In [0]:
# Plot the ten companies with the most job postings.
#
# The bar lengths must be posting counts. Plotting the raw title column draws
# job-title strings on an axis labelled "Number of jobs", and reversing only the
# company labels pairs each bar with the wrong company.
top_companies_pdf = (
    df_companies.groupBy("company_name")
    .count()
    .orderBy("count", ascending=False)
    .limit(10)
    .toPandas()
)

plt.figure(figsize=(10,5))
# Reverse both series together so the largest bar is drawn at the top.
plt.barh(
    top_companies_pdf['company_name'].iloc[::-1],
    top_companies_pdf['count'].iloc[::-1],
    color='skyblue',
)
plt.xlabel('Number of jobs')
plt.ylabel('Top Companies Hiring')
plt.title('Top Companies Hiring')
plt.grid(True, axis='x', linestyle='--', alpha=0.5)
plt.tight_layout()
plt.show()

In [0]:
# Job demand by city plot: number of job postings per location.
df_cities = spark.read.table("gold_jobs")
df_cities = df_cities.dropna(subset=["posted_at"])

# Aggregate in Spark and collect only one row per location. Collecting every raw
# row and plotting the title column would draw job-title strings, not counts.
pdf_city = (
    df_cities.groupBy("location")
    .count()
    .orderBy("count", ascending=False)
    .toPandas()
)

# Plot, reversing both series together so the busiest city is drawn at the top.
plt.figure(figsize=(10,6))
plt.barh(pdf_city['location'].iloc[::-1], pdf_city['count'].iloc[::-1], color='lightgreen')
plt.xlabel("Job Posting")
plt.title("Job Demand by City")
plt.grid(True, axis='x', linestyle='--', alpha=0.5)
plt.tight_layout()
plt.show()