In [None]:
# KPI 1: Number of job postings per category (Top 10)
def get_top10_job_posting_per_cat(kpi_df):
    """Return the ten job categories with the most postings.

    Args:
        kpi_df: DataFrame that contains a "Job Category" column.

    Returns:
        A DataFrame with the columns "Job Category" and "count", sorted by
        "count" in descending order and truncated to ten rows.
    """
    postings_per_category = kpi_df.groupBy("Job Category").count()
    busiest_first = postings_per_category.orderBy(col("count").desc())
    return busiest_first.limit(10)

In [None]:
# KPI 2: What's the salary distribution per job category?
def get_sal_dist_per_cat(kpi_df):
    """Return the average salary range for every job category.

    Args:
        kpi_df: DataFrame that contains the columns "Job Category",
            "Annual Salary From" and "Annual Salary To".

    Returns:
        A DataFrame with one row per "Job Category" holding "AvgSalaryFrom"
        and "AvgSalaryTo" (both rounded to 2 decimals), ordered by
        "Job Category".
    """
    # NOTE(review): `round` and `avg` are assumed to be the Spark SQL column
    # functions imported elsewhere in the notebook, not Python builtins.
    avg_from = round(avg("Annual Salary From"), 2).alias("AvgSalaryFrom")
    avg_to = round(avg("Annual Salary To"), 2).alias("AvgSalaryTo")

    per_category = kpi_df.groupBy("Job Category").agg(avg_from, avg_to)
    return per_category.orderBy("Job Category")

In [None]:
# KPI 4: Which job posting has the highest salary per agency?
def get_highes_sal_per_cat(kpi_df):
    """Return the best-paid job posting(s) of each agency.

    Despite the ``_per_cat`` suffix, rows are partitioned by "Agency".
    Within each agency the postings are ranked by "Annual Salary To" in
    descending order and only rank 1 is kept, so postings tied for the top
    salary are all retained. Identical result rows are collapsed with
    ``distinct()``.

    Args:
        kpi_df: DataFrame that contains the columns "Agency",
            "Business Title" and "Annual Salary To".

    Returns:
        A DataFrame with "Agency", "Business Title" and the top
        "Annual Salary To" rounded to 2 decimals. The rounded column is not
        aliased, so it carries the name Spark generates for the expression.
    """
    salary_to = col("Annual Salary To")

    # Highest "Annual Salary To" first inside every agency partition.
    top_first_within_agency = Window.partitionBy("Agency").orderBy(salary_to.desc())

    top_paid = (
        kpi_df
        .withColumn("rank", rank().over(top_first_within_agency))
        .filter(col("rank") == 1)
    )

    # Keep only the reporting columns; the salary is rounded to 2 decimals.
    return top_paid.select("Agency", "Business Title", round(salary_to, 2)).distinct()

In [None]:
# KPI 5: What is the average job-posting salary per agency for the last n years?
def get_avg_sal_per_agency_last_n_year(kpi_df, last_n_year):
    """Return the average salary range per agency for recent postings.

    Only postings whose "Posting Date" falls within the last ``last_n_year``
    years (counted back from the current date) are considered.

    Args:
        kpi_df: DataFrame that contains the columns "Agency",
            "Posting Date", "Annual Salary From" and "Annual Salary To".
            NOTE(review): "Posting Date" is assumed to be a date/timestamp
            column comparable with a Spark date — confirm against the loader.
        last_n_year: Number of years to look back. A year is approximated
            as 365 days, so leap days are not accounted for.

    Returns:
        A DataFrame with one row per "Agency" holding
        "Avg_Annual_Salary_From" and "Avg_Annual_Salary_To", both rounded
        to 2 decimals.
    """
    # Cut-off date: today minus n * 365 days.
    n_years_ago = f.date_sub(f.current_timestamp(), 365 * last_n_year)

    # Fix: the original compared against the undefined name `two_years_ago`,
    # leaving `n_years_ago` (and therefore `last_n_year`) unused.
    # The filter also runs before the projection, because "Posting Date" is
    # not among the selected columns.
    last_n_job_posting_df = (
        kpi_df
        .filter(col('Posting Date') >= n_years_ago)
        .select('Agency', 'Annual Salary From', 'Annual Salary To')
    )

    avg_salary_df = last_n_job_posting_df.groupBy("Agency").agg(
        round(avg(col("Annual Salary From")), 2).alias("Avg_Annual_Salary_From"),
        round(avg(col("Annual Salary To")), 2).alias("Avg_Annual_Salary_To"),
    )
    return avg_salary_df

In [None]:
# Confirmation message printed when this cell runs — presumably so a notebook
# that loads this file can see the KPI helpers were defined (TODO confirm).
print("KPI Transformations Imported")