# Create a Spark session and load CSV data #

In [None]:
from pyspark.sql import SparkSession

import os

# Start a Spark session, or reuse the one already running in this kernel
spark = SparkSession.builder.getOrCreate()

# Location of the Stack Overflow survey response CSV. Set the SO_SURVEY_CSV
# environment variable to run the notebook on another machine; when it is not
# set, the original hard-coded local path is used.
SURVEY_CSV_PATH = os.environ.get(
    "SO_SURVEY_CSV",
    "C:\\Users\\Phil.Austin\\OneDrive - Telefonica Tech UK Limited\\survey_results_public.csv",
)

# Load the survey responses into a DataFrame: the first row supplies the column
# names and column types are inferred from the data
soDf = spark.read.csv(SURVEY_CSV_PATH, header=True, inferSchema=True)

# Preview the DataFrame (show() prints the first 20 rows by default)
soDf.show()

In [None]:
# Expose the survey DataFrame to Spark SQL under the view name "so_data"
soDf.createOrReplaceTempView("so_data")

# Number of responses for each distinct Age value, largest group first
responsesByAgeSql = """
                    SELECT 
                        Age, 
                        COUNT(*) AS NumberOfResponses 
                    FROM so_data 
                    GROUP BY Age 
                    ORDER BY COUNT(*) DESC
                """
grpDf = spark.sql(responsesByAgeSql)

grpDf.show()

# Salaries are in different currencies #

## Which you can see in Data Wrangler... ##


In [None]:
# Read the currency conversion table: the first row is the header and the
# column types are inferred from the values
currencyConvDf = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("average_csv_2024-3.csv")
)

currencyConvDf.show()


# Add a Currency Code column to the SO dataframe #

In [None]:
from pyspark.sql import functions as F

# Derive 'CurrencyCode' from the first three characters of 'Currency'
# (substring positions are 1-based)
currencyCode = F.substring("Currency", 1, 3)
soDf = soDf.withColumn("CurrencyCode", currencyCode)

# Eyeball the new column next to its source, skipping rows whose Currency is
# the literal string 'NA'
(
    soDf
    .where(F.col("Currency") != "NA")
    .select("Currency", "CurrencyCode")
    .show()
)

# Convert Compensation into GBP #

In [None]:
from pyspark.sql import functions as F
from pyspark.sql.types import DecimalType

# Column of the currency table that holds the GBP value of one unit of a currency
RATE_COL = "Sterling value of Currency Unit £"

# Keep exactly one rate per currency code. If the same code appears on several
# rows of the conversion table, joining on it would duplicate every matching
# survey response and skew the medians computed later.
# NOTE(review): assumes repeated codes carry the same rate — confirm against the file.
ratesDf = (
    currencyConvDf
    .select(currencyConvDf["Currency Code"], currencyConvDf[RATE_COL])
    .dropDuplicates(["Currency Code"])
)

# Compensation already paid in sterling converts at exactly 1, whether or not
# the conversion table contains a GBP row; every other currency uses its table rate.
gbpRate = (
    F.when(soDf["CurrencyCode"] == "GBP", F.lit(1.0))
    .otherwise(ratesDf[RATE_COL])
)

# Join soDf to the rates where soDf.CurrencyCode = rates."Currency Code".
# A left join is used so GBP rows survive even without a matching rate row;
# rows that end up with no rate at all are then dropped, which is what an
# inner join would have discarded.
conversionDf = (
    soDf.join(
        ratesDf,
        soDf["CurrencyCode"] == ratesDf["Currency Code"],
        "left",
    )
    .select(
        soDf["Country"],
        soDf["CompTotal"],
        soDf["LanguageHaveWorkedWith"],
        gbpRate.alias(RATE_COL),
    )
    .filter(F.col(RATE_COL).isNotNull())
)

# Convert total compensation into GBP
conversionDf = conversionDf.withColumn(
    "CompTotalInGBP",
    F.col("CompTotal") * F.col(RATE_COL),
)

# Split the LanguageHaveWorkedWith column on the ';' character and explode it,
# giving one row per (response, language) pair.
# NOTE(review): a response whose LanguageHaveWorkedWith is the literal 'NA'
# becomes a single row with Language == 'NA' — confirm whether to exclude it.
explodedDf = conversionDf.withColumn(
    "Language",
    F.explode(F.split(F.col("LanguageHaveWorkedWith"), ";")),
)

explodedDf.show()


# And the winner is... #

In [10]:

# Keep only rows with a usable compensation figure (present and non-zero)
hasCompensation = (
    F.col("CompTotalInGBP").isNotNull()
    & (F.col("CompTotalInGBP") != 0)
)
filteredDf = explodedDf.filter(hasCompensation)

# Median GBP compensation per language, highest first, keeping the top 20
medianColumn = "Median Compensation In GBP"
aggregatedDf = (
    filteredDf
    .groupBy("Language")
    .agg(F.median("CompTotalInGBP").alias(medianColumn))
    .orderBy(F.col(medianColumn).desc())
    .limit(20)
)




In [None]:
# Print the ranking currently held in aggregatedDf
aggregatedDf.show()

In [None]:
# Rank languages by median GBP compensation for UK respondents only: keep rows
# with a present, non-zero compensation whose Country is the United Kingdom
ukCountryName = "United Kingdom of Great Britain and Northern Ireland"
ukWithCompensation = (
    F.col("CompTotalInGBP").isNotNull()
    & (F.col("CompTotalInGBP") != 0)
    & (F.col("Country") == ukCountryName)
)
filteredDf = explodedDf.filter(ukWithCompensation)

# Median GBP compensation per language, highest first, keeping the top 20
ukMedianByLanguage = filteredDf.groupBy("Language").agg(
    F.median("CompTotalInGBP").alias("Median Compensation In GBP")
)
aggregatedDf = ukMedianByLanguage.orderBy(
    F.desc("Median Compensation In GBP")
).limit(20)

aggregatedDf.show()