In [0]:
from pyspark.sql import SparkSession,DataFrame
from pyspark.sql.functions import when, col
import pandas as pd
def extract_spark():
    """Load the grad-students CSV from GitHub into the ``grade_student_delta`` table.

    The CSV is read over HTTP with pandas, converted to a Spark DataFrame and
    saved as a Delta table. Any existing ``grade_student_delta`` table is
    dropped first so the append cannot hit a schema mismatch with an older
    version of the table.

    Returns:
        str: The literal path ``"data/grad-students.csv"``. This function does
        not write a file at that path; the value is simply returned as-is.
    """
    source_url = "https://raw.githubusercontent.com/fivethirtyeight/data/refs/heads/master/college-majors/grad-students.csv"
    local_csv_path = "data/grad-students.csv"

    # Reuse the running session if there is one, otherwise start it.
    session = SparkSession.builder.appName("grade_student").getOrCreate()

    # pandas fetches the remote CSV; Spark then takes over the in-memory frame.
    grad_pdf = pd.read_csv(source_url)
    grad_sdf = session.createDataFrame(grad_pdf)

    # Drop the previous table so the append below always starts from scratch
    # and never conflicts with a stale schema.
    session.sql("DROP TABLE IF EXISTS grade_student_delta")
    delta_writer = grad_sdf.write.format("delta").mode("append")
    delta_writer.saveAsTable("grade_student_delta")

    return local_csv_path
  

In [0]:
# Run the extract step; the cell displays the path string that extract_spark() returns.
extract_spark()

'data/grad-students.csv'

In [0]:
def data_transform(table="grade_student_delta"):
    """Tag each row of ``table`` with a STEM grouping and print a preview.

    A ``STEM_major`` column is derived from ``Major_category``:
    ``"core_STEM"`` for the core STEM categories, ``"other_STEM"`` for the
    STEM-adjacent ones, and ``"Other"`` for everything else.

    The labelled DataFrame is only shown; it is not written back to ``table``.

    Args:
        table: Name of the Spark table to read.

    Returns:
        list: Column names of the labelled DataFrame (the original columns
        followed by ``STEM_major``), not the DataFrame itself.
    """
    session = SparkSession.builder.appName("grade_student").getOrCreate()

    # Major_category values that count as core STEM.
    core_categories = [
        'Engineering',
        'Computers & Mathematics',
        'Biology & Life Science',
        'Physical Sciences',
    ]
    # Major_category values that count as STEM-adjacent.
    other_categories = [
        'Agriculture & Natural Resources',
        'Health',
        'Interdisciplinary',
    ]

    source_df = session.table(table)

    # Build the label expression once; the first matching branch wins.
    stem_label = (
        when(col("Major_category").isin(core_categories), "core_STEM")
        .when(col("Major_category").isin(other_categories), "other_STEM")
        .otherwise("Other")
    )
    labelled_df = source_df.withColumn("STEM_major", stem_label)

    # Print a preview of the labelled rows.
    labelled_df.show()

    return labelled_df.columns

In [0]:
# Reuse (or create) the Spark session for this cell.
spark = SparkSession.builder.appName("grade_student").getOrCreate()
# Prints the labelled preview via show(); the returned column list is discarded here.
data_transform(table="grade_student_delta")
# NOTE(review): data_transform does not write its result back to the table, so this
# query reads the stored grade_student_delta table unchanged — confirm that is intended.
spark.sql("SELECT * FROM grade_student_delta LIMIT 2").show()

+----------+--------------------+--------------------+----------+----------------+-------------+-------------------------+---------------+----------------------+-----------+--------+--------+-------------+----------------+----------------------------+------------------+-------------------------+--------------+-----------+-----------+-----------+------------+----------+
|Major_code|               Major|      Major_category|Grad_total|Grad_sample_size|Grad_employed|Grad_full_time_year_round|Grad_unemployed|Grad_unemployment_rate|Grad_median|Grad_P25|Grad_P75|Nongrad_total|Nongrad_employed|Nongrad_full_time_year_round|Nongrad_unemployed|Nongrad_unemployment_rate|Nongrad_median|Nongrad_P25|Nongrad_P75| Grad_share|Grad_premium|STEM_major|
+----------+--------------------+--------------------+----------+----------------+-------------+-------------------------+---------------+----------------------+-----------+--------+--------+-------------+----------------+----------------------------+-----

In [0]:
def load_sql(table="grade_student_delta"):
    """Aggregate employment totals per major category from ``table``.

    Sums the graduate and non-graduate employed, unemployed and total counts
    for each ``Major_category``, keeps the categories whose combined employed
    total exceeds 10000, and orders them by that combined total, largest first.

    Args:
        table: Name of the Spark table to query. It is interpolated verbatim
            into the SQL text, so pass only trusted table identifiers.

    Returns:
        The Spark DataFrame with the aggregated rows, or ``None`` when the
        query returns no rows or raises (a message is printed in both cases).
    """
    spark = SparkSession.builder.appName("grade_student").getOrCreate()
    
    try:
        # Run the SQL query
        query_result = spark.sql(f"""
            SELECT 
                Major_category,
                SUM(Nongrad_employed) AS Total_Nongrad_employed,
                SUM(Grad_employed) AS Total_Grad_employed,
                SUM(Grad_unemployed) AS Total_Grad_unemployed,
                SUM(Nongrad_unemployed) AS Total_Nongrad_unemployed,
                SUM(Grad_total) AS Total_Grad_total,
                SUM(Nongrad_total) AS Total_Nongrad_total
            FROM {table}
            GROUP BY Major_category
            HAVING Total_Grad_employed + Total_Nongrad_employed > 10000
            ORDER BY Total_Grad_employed + Total_Nongrad_employed DESC
        """)

        # Emptiness check: take(1) fetches at most one row instead of
        # counting every row of the result just to compare against zero.
        if query_result.take(1):
            return query_result

        print("No results found for the query.")
        return None

    except Exception as e:
        # Best-effort by design: report the failure and signal it with None
        # so the notebook keeps running; callers must handle None.
        print(f"Error executing SQL query: {e}")
        return None

In [0]:
# load_sql returns None when the query fails or yields no rows (it prints the
# reason itself), so guard before calling .show() to avoid an AttributeError.
query_result = load_sql(table="grade_student_delta")
if query_result is not None:
    query_result.show()

+--------------------+----------------------+-------------------+---------------------+------------------------+----------------+-------------------+
|      Major_category|Total_Nongrad_employed|Total_Grad_employed|Total_Grad_unemployed|Total_Nongrad_unemployed|Total_Grad_total|Total_Nongrad_total|
+--------------------+----------------------+-------------------+---------------------+------------------------+----------------+-------------------+
|            Business|               7123852|            2124495|               101994|                  393222|         2718897|            9345634|
|           Education|               2659824|            2437166|                66938|                  111875|         3945300|            4488291|
|Humanities & Libe...|               2289696|            1986572|                85033|                  154239|         2825975|            3448921|
|         Engineering|               2483802|            1634563|                65073|             