In [1]:
# KPI 1: Number of job postings per category (Top 10)
def get_top10_job_posting_per_cat(kpi_df):
    """Return the ten job categories with the largest total of positions.

    Rows are grouped by "Job Category", "# Of Positions" is summed into a
    column named "No_Of_Position", and only the ten largest totals are kept.

    Args:
        kpi_df: DataFrame exposing "Job Category" and "# Of Positions".

    Returns:
        DataFrame with "Job Category" and "No_Of_Position", sorted by
        "No_Of_Position" in descending order and limited to ten rows.
    """
    # `sum` has to be the column aggregate brought into scope elsewhere;
    # Python's builtin `sum` cannot take a column name or be aliased.
    positions_total = sum("# Of Positions").alias("No_Of_Position")

    totals_by_category = kpi_df.groupBy("Job Category").agg(positions_total)
    largest_first = totals_by_category.sort(desc("No_Of_Position"))
    return largest_first.limit(10)

In [None]:
# KPI 2: What is the salary distribution per job category?
def get_sal_dist_per_cat(kpi_df):
    """Summarise the average salary band of every job category.

    Args:
        kpi_df: DataFrame exposing "Job Category", "Annual Salary From" and
            "Annual Salary To".

    Returns:
        DataFrame with one row per "Job Category" holding "AvgSalaryFrom" and
        "AvgSalaryTo" (each average rounded to two decimals), ordered by
        "Job Category".
    """
    # Build the two aggregate expressions up front so the grouping reads cleanly.
    lower_bound_avg = round(avg("Annual Salary From"), 2).alias("AvgSalaryFrom")
    upper_bound_avg = round(avg("Annual Salary To"), 2).alias("AvgSalaryTo")

    per_category = kpi_df.groupBy("Job Category").agg(lower_bound_avg, upper_bound_avg)
    return per_category.orderBy("Job Category")

In [None]:
# KPI 3: Is there any correlation between a higher degree and the salary?
def get_corr_higher_degree_salary(kpi_df):
    """Correlate a "higher degree" flag with the midpoint salary of a posting.

    A posting is flagged (HasHigherDegree = 1) when the lower-cased
    "Minimum Qual Requirements" text contains "master" or "ph.d."; every other
    posting gets 0. The salary used is the midpoint of "Annual Salary From" and
    "Annual Salary To". Rows with a null in any of the three input columns are
    dropped before the flag and the midpoint are computed.

    Args:
        kpi_df: DataFrame exposing "Minimum Qual Requirements",
            "Annual Salary From" and "Annual Salary To".

    Returns:
        The value produced by ``stat.corr("HasHigherDegree", "AvgSalary")``.
    """
    # NOTE(review): only the literal substrings "master" and "ph.d." count as a
    # higher degree; spellings such as "phd" or "doctorate" are not matched —
    # confirm this is the intended definition.
    requirements_text = f.lower(col("Minimum Qual Requirements"))
    mentions_higher_degree = (
        requirements_text.contains("master") | requirements_text.contains("ph.d.")
    )
    salary_midpoint = (col("Annual Salary From") + col("Annual Salary To")) / 2

    complete_rows = kpi_df.select(
        "Minimum Qual Requirements", "Annual Salary From", "Annual Salary To"
    ).na.drop()

    flagged_rows = complete_rows.withColumn(
        "HasHigherDegree", when(mentions_higher_degree, 1).otherwise(0)
    ).withColumn("AvgSalary", salary_midpoint)

    correlation_input = flagged_rows.select("HasHigherDegree", "AvgSalary")
    return correlation_input.stat.corr("HasHigherDegree", "AvgSalary")

In [2]:
# KPI 4: Which job posting has the highest salary per agency?
def get_highes_sal_per_cat(kpi_df):
    """Return, for every agency, the posting(s) with the highest "Annual Salary To".

    Despite the "per_cat" suffix in the name, the partitioning key is "Agency".

    Args:
        kpi_df: DataFrame exposing "Agency", "Business Title" and
            "Annual Salary To".

    Returns:
        DataFrame with "Agency", "Business Title" and "Highest Salary"
        ("Annual Salary To" rounded to two decimals), de-duplicated.
    """
    upper_salary = col("Annual Salary To")

    # Within each agency, order postings from the highest upper salary down.
    highest_first_within_agency = Window.partitionBy("Agency").orderBy(upper_salary.desc())

    # rank() gives every tied posting the same rank, so an agency can contribute
    # more than one row when several postings share its top salary.
    ranked_postings = kpi_df.withColumn("rank", rank().over(highest_first_within_agency))
    top_postings = ranked_postings.filter(col("rank") == 1)

    # distinct() collapses postings that repeat the same agency/title/salary triple.
    return top_postings.select(
        "Agency",
        "Business Title",
        round(upper_salary, 2).alias('Highest Salary'),
    ).distinct()

SyntaxError: invalid syntax (<ipython-input-2-fcda45d6d31a>, line 13)

In [None]:
# KPI 5: What is the average job-posting salary per agency over the last n years?
def get_avg_sal_per_agency_last_n_year(kpi_df, last_n_year):
    """Average the salary bounds per agency over postings from the last n years.

    Args:
        kpi_df: DataFrame exposing "Agency", "Posting Date",
            "Annual Salary From" and "Annual Salary To".
        last_n_year: Whole number of years to look back from the current date.

    Returns:
        DataFrame with one row per "Agency" holding "Avg_Annual_Salary_From"
        and "Avg_Annual_Salary_To", each rounded to two decimals. Only postings
        whose "Posting Date" is on or after the cutoff are included.
    """
    # Step back in calendar months so the cutoff lands on the same day n years
    # ago; a fixed 365-day year drifts by one day for each leap year covered.
    # `f` is the same functions alias the other KPI helpers in this file use.
    cutoff_date = f.add_months(f.current_date(), -12 * last_n_year)

    # Filter while "Posting Date" is still part of the frame, then project down
    # to the columns the aggregation needs.
    recent_postings_df = kpi_df.filter(col('Posting Date') >= cutoff_date).select(
        'Agency', 'Annual Salary From', 'Annual Salary To'
    )

    return recent_postings_df.groupBy("Agency").agg(
        round(avg(col("Annual Salary From")), 2).alias("Avg_Annual_Salary_From"),
        round(avg(col("Annual Salary To")), 2).alias("Avg_Annual_Salary_To"),
    )

In [None]:
# KPI 6: What are the highest-paid skills in the US market?
def top_highest_paid_job_postings(kpi_df, sample_size=20):
    """Count the skill tokens appearing in the highest-paid job postings.

    Postings whose "Preferred Skills_prepared" value is empty or equal to
    'error name' are ignored. The remaining postings are ordered by
    "Salary Range To" (highest first), the first ``sample_size`` are kept, and
    their prepared skill text is split on whitespace into individual tokens.

    Args:
        kpi_df: DataFrame exposing "Preferred Skills_prepared" and
            "Salary Range To".
        sample_size: Number of top-paid postings to sample (default 20).

    Returns:
        DataFrame with columns "skills" and "count", ordered by "count" in
        descending order.
    """
    prepared_skills = col("Preferred Skills_prepared")

    usable_postings_df = kpi_df.filter(
        (prepared_skills != 'error name') & (prepared_skills != '')
    )

    # NOTE(review): the other KPIs rank on "Annual Salary To"; confirm that the
    # raw "Salary Range To" column is the intended ordering key here.
    top_paid_skills_df = (
        usable_postings_df
        .orderBy(col('Salary Range To').desc())
        .limit(sample_size)
        .select('Preferred Skills_prepared')
    )

    # Split on runs of whitespace and discard empty tokens: splitting on a
    # single space turns repeated, leading or trailing spaces into '' entries
    # that would otherwise be counted as a skill.
    skill_tokens_df = top_paid_skills_df.select(
        f.explode(f.split(prepared_skills, r"\s+")).alias("skills")
    ).filter(col("skills") != '')

    return skill_tokens_df.groupBy('skills').count().orderBy(col('count').desc())

In [None]:
# Marker message printed after the KPI function definitions above have run.
print("KPI Transformations Imported")