In [2]:
import psycopg2
import pandas as pd
from dotenv import load_dotenv
import os

# Load environment variables
load_dotenv()

# Database connection
conn = psycopg2.connect(
    host=os.getenv('DB_HOST'),
    port=os.getenv('DB_PORT'),
    user=os.getenv('DB_USER'),
    password=os.getenv('DB_PASSWORD'),
    database=os.getenv('DB_NAME')
)

print("âœ… Successfully connected to student_performance_db database!")

âœ… Successfully connected to student_performance_db database!


<img src="exam_result.jpeg" alt="exam_result image" width="49%"/> <img src="examB.png" alt="Lower exam grade image" width="49%"/>

In today's fast-paced and competitive educational environment, understanding the factors that influence student success is more important than ever. Just like the transport system in a bustling city like London must adapt to serve its residents, schools and educators must adapt to meet the needs of students. In this project, we will take a deep dive into a dataset containing rich details about various aspects of student life, such as hours studied, sleep patterns, attendance, and more, to uncover what truly impacts exam performance.

The dataset we'll be working with includes a wide range of factors influencing student performance. By analyzing this data, we'll be able to identify key drivers of success and provide insights that could help students, teachers, and policymakers make informed decisions. The table we'll use for this project is called `student_performance` and includes the following data:

| Column                   | Definition                                                      | Data type             |
|--------------------------|-----------------------------------------------------------------|-----------------------|
| `attendance`              | Percentage of classes attended                                  |     `float`               |
| `extracurricular_activities` | Participation in extracurricular activities                   |     `varchar` (Yes, No)    |
| `sleep_hours`             | Average number of hours of sleep per night                      |     `float`               |
| `tutoring_sessions`       | Number of tutoring sessions attended per month                  |     `integer`             |
| `teacher_quality`         | Quality of the teachers                                         |     `varchar` (Low, Medium, High) |
| `exam_score`              | Final exam score                                                |     `float`               |

You will execute SQL queries to answer three questions, as listed in the instructions.


In [7]:
# avg_exam_score_by_study_and_extracurricular
query = """
SELECT
    hours_studied,
    AVG(exam_score) AS avg_exam_score
	
FROM student_performance
	
WHERE
    hours_studied > 10
    AND extracurricular_activities = 'Yes'
    AND exam_score IS NOT NULL
	
GROUP BY hours_studied
ORDER BY hours_studied DESC;
"""

df_result1 = pd.read_sql_query(query, conn)
print("\nðŸ“Š Average Exam Score by Study Hours (>10h) with Extracurricular Activities:")
df_result1


ðŸ“Š Average Exam Score by Study Hours (>10h) with Extracurricular Activities:


  df_result1 = pd.read_sql_query(query, conn)


Unnamed: 0,hours_studied,avg_exam_score
0,43,78.0
1,39,75.0
2,38,73.5
3,37,73.0
4,36,70.428571
5,35,72.3125
6,34,71.1875
7,33,70.333333
8,32,71.325
9,31,70.553191


In [8]:
# avg_exam_score_by_hours_studied_range
query = """
WITH buckets AS (
    SELECT
        CASE
            WHEN hours_studied BETWEEN 1 AND 5   THEN '1-5 hours'
            WHEN hours_studied BETWEEN 6 AND 10  THEN '6-10 hours'
            WHEN hours_studied BETWEEN 11 AND 15 THEN '11-15 hours'
            WHEN hours_studied >= 16             THEN '16+ hours'
            ELSE '0 or NULL'
        END AS hours_studied_range,
        exam_score
    FROM student_performance
    WHERE exam_score IS NOT NULL
)
	
	
SELECT
    hours_studied_range,
    AVG(exam_score) AS avg_exam_score
	
FROM buckets
	
WHERE hours_studied_range <> '0 or NULL'
GROUP BY hours_studied_range
ORDER BY avg_exam_score DESC;
"""

df_result2 = pd.read_sql_query(query, conn)
print("\nðŸ“Š Average Exam Score by Study Hours Range:")
df_result2


ðŸ“Š Average Exam Score by Study Hours Range:


  df_result2 = pd.read_sql_query(query, conn)


Unnamed: 0,hours_studied_range,avg_exam_score
0,16+ hours,67.923363
1,11-15 hours,65.204386
2,6-10 hours,64.22549
3,1-5 hours,62.627119


In [9]:
# student_exam_ranking
query = """
SELECT
    attendance,
    hours_studied,
    sleep_hours,
    tutoring_sessions,
    exam_score,
    DENSE_RANK() OVER (ORDER BY exam_score DESC) AS exam_rank
	
FROM student_performance
	
WHERE exam_score IS NOT NULL
ORDER BY exam_rank ASC
LIMIT 30;
"""

df_result3 = pd.read_sql_query(query, conn)
print("\nðŸ“Š Top 30 Students by Exam Score (with ranking):")
df_result3


ðŸ“Š Top 30 Students by Exam Score (with ranking):


  df_result3 = pd.read_sql_query(query, conn)


Unnamed: 0,attendance,hours_studied,sleep_hours,tutoring_sessions,exam_score,exam_rank
0,98.0,27,6.0,5,101.0,1
1,89.0,18,4.0,3,100.0,2
2,90.0,14,8.0,4,99.0,3
3,83.0,23,4.0,1,99.0,3
4,96.0,28,4.0,1,98.0,4
5,83.0,16,8.0,2,98.0,4
6,90.0,28,9.0,0,98.0,4
7,83.0,15,7.0,2,97.0,5
8,74.0,21,6.0,1,97.0,5
9,99.0,25,7.0,0,97.0,5
