In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('Student Health Data').getOrCreate()


In [3]:
df=spark.read.csv('Student_Health_Data.csv',inferSchema=True,header=True)

In [8]:
df.createOrReplaceTempView('student_data')

In [12]:
import time
start_time = time.time()
g1=spark.sql("""
    SELECT 
        AVG(Stress_Level_Biosensor) AS avg_biosensor_stress,
        AVG(Stress_Level_Self_Report) AS avg_self_reported_stress,
        AVG(CASE 
            WHEN Physical_Activity = 'High' THEN 3
            WHEN Physical_Activity = 'Moderate' THEN 2
            ELSE 1 END) AS avg_physical_activity,
        AVG(CASE 
            WHEN Sleep_Quality = 'Good' THEN 3
            WHEN Sleep_Quality = 'Moderate' THEN 2
            ELSE 1 END) AS avg_sleep_quality
    FROM student_data
""")


g1.show()
end_time = time.time()
processing_time = end_time - start_time
print(f"Processing Time: {processing_time:.2f} seconds")
g1.write.csv('Goal1',header=True)

+--------------------+------------------------+---------------------+-----------------+
|avg_biosensor_stress|avg_self_reported_stress|avg_physical_activity|avg_sleep_quality|
+--------------------+------------------------+---------------------+-----------------+
|   5.483909405211377|       5.361600567638264|                1.905|            2.263|
+--------------------+------------------------+---------------------+-----------------+

Processing Time: 1.59 seconds


In [14]:
start_time = time.time()
g2=spark.sql("""
    SELECT Health_Risk_Level, COUNT(*) AS count
    FROM student_data
    GROUP BY Health_Risk_Level
""")


g2.show()
end_time = time.time()
processing_time = end_time - start_time
print(f"Processing Time: {processing_time:.2f} seconds")
g2.write.csv('Goal2',header=True)

+-----------------+-----+
|Health_Risk_Level|count|
+-----------------+-----+
|             High|  138|
|              Low|  190|
|         Moderate|  672|
+-----------------+-----+

Processing Time: 0.77 seconds


In [15]:
start_time = time.time()
g3=spark.sql("""
    SELECT 
        Health_Risk_Level,
        AVG(Stress_Level_Biosensor) AS avg_biosensor_stress,
        AVG(Stress_Level_Self_Report) AS avg_self_reported_stress,
        AVG(CASE 
            WHEN Physical_Activity = 'High' THEN 3
            WHEN Physical_Activity = 'Moderate' THEN 2
            ELSE 1 END) AS avg_physical_activity,
        AVG(CASE 
            WHEN Sleep_Quality = 'Good' THEN 3
            WHEN Sleep_Quality = 'Moderate' THEN 2
            ELSE 1 END) AS avg_sleep_quality
    FROM student_data
    GROUP BY Health_Risk_Level
""")


g3.show()
end_time = time.time()
processing_time = end_time - start_time
print(f"Processing Time: {processing_time:.2f} seconds")
g3.write.csv('Goal3',header=True)

+-----------------+--------------------+------------------------+---------------------+------------------+
|Health_Risk_Level|avg_biosensor_stress|avg_self_reported_stress|avg_physical_activity| avg_sleep_quality|
+-----------------+--------------------+------------------------+---------------------+------------------+
|             High|   7.513085120395608|       7.191850494337831|   2.3043478260869565|1.8840579710144927|
|              Low|   2.999760089834212|      2.9657585555641286|    1.868421052631579|2.3684210526315788|
|         Moderate|     5.7695673237028|       5.663141478961978|   1.8333333333333333|2.3110119047619047|
+-----------------+--------------------+------------------------+---------------------+------------------+

Processing Time: 1.47 seconds


In [16]:
start_time = time.time()
g4=spark.sql("""
    SELECT 
        Mood,
        AVG(Stress_Level_Biosensor) AS avg_biosensor_stress,
        AVG(Stress_Level_Self_Report) AS avg_self_reported_stress,
        AVG(CASE 
            WHEN Sleep_Quality = 'Good' THEN 3
            WHEN Sleep_Quality = 'Moderate' THEN 2
            ELSE 1 END) AS avg_sleep_quality
    FROM student_data
    GROUP BY Mood
""")


g4.show()
end_time = time.time()
processing_time = end_time - start_time
print(f"Processing Time: {processing_time:.2f} seconds")
g4.write.csv('Goal4',header=True)

+--------+--------------------+------------------------+------------------+
|    Mood|avg_biosensor_stress|avg_self_reported_stress| avg_sleep_quality|
+--------+--------------------+------------------------+------------------+
|Stressed|   5.527092675450389|        5.35771862832382|2.3118279569892475|
| Neutral|   5.462382357385251|       5.274956134294667| 2.255421686746988|
|   Happy|   5.486169146021864|       5.453529090320141|2.2481203007518795|
+--------+--------------------+------------------------+------------------+

Processing Time: 1.15 seconds


In [17]:
start_time = time.time()
g5=spark.sql("""
    SELECT 
        AVG(Study_Hours) AS avg_study_hours,
        AVG(Project_Hours) AS avg_project_hours,
        AVG(Stress_Level_Biosensor) AS avg_biosensor_stress,
        AVG(Stress_Level_Self_Report) AS avg_self_reported_stress
    FROM student_data
""")


g5.show()
end_time = time.time()
processing_time = end_time - start_time
print(f"Processing Time: {processing_time:.2f} seconds")
g5.write.csv('Goal5',header=True)

+------------------+------------------+--------------------+------------------------+
|   avg_study_hours| avg_project_hours|avg_biosensor_stress|avg_self_reported_stress|
+------------------+------------------+--------------------+------------------------+
|30.227037157426228|14.887368383808754|   5.483909405211377|       5.361600567638264|
+------------------+------------------+--------------------+------------------------+

Processing Time: 0.70 seconds


In [18]:
start_time = time.time()
g5=spark.sql("""
    SELECT 
        AVG(Study_Hours) AS avg_study_hours,
        AVG(Project_Hours) AS avg_project_hours,
        AVG(Stress_Level_Biosensor) AS avg_biosensor_stress,
        AVG(Stress_Level_Self_Report) AS avg_self_reported_stress
    FROM student_data
""")


g5.show()
end_time = time.time()
processing_time = end_time - start_time
print(f"Processing Time: {processing_time:.2f} seconds")
g5.write.csv('Goal5',header=True)

+------------------+------------------+--------------------+------------------------+
|   avg_study_hours| avg_project_hours|avg_biosensor_stress|avg_self_reported_stress|
+------------------+------------------+--------------------+------------------------+
|30.227037157426228|14.887368383808754|   5.483909405211377|       5.361600567638264|
+------------------+------------------+--------------------+------------------------+

Processing Time: 0.54 seconds


AnalysisException: [PATH_ALREADY_EXISTS] Path file:/C:/Users/s562045/Downloads/Goal5 already exists. Set mode as "overwrite" to overwrite the existing path.