# Student Habits vs Academic Performance

Dataset from https://www.kaggle.com/datasets/jayaantanaath/student-habits-vs-academic-performance

In [58]:
# Import packages
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [59]:
# Load the dataset; the relative path is resolved against the notebook's working directory.
df = pd.read_csv("student_habits_performance.csv")
# Preview the first five rows as a sanity check on the parse.
df.head()

Unnamed: 0,student_id,age,gender,study_hours_per_day,social_media_hours,netflix_hours,part_time_job,attendance_percentage,sleep_hours,diet_quality,exercise_frequency,parental_education_level,internet_quality,mental_health_rating,extracurricular_participation,exam_score
0,S1000,23,Female,0.0,1.2,1.1,No,85.0,8.0,Fair,6,Master,Average,8,Yes,56.2
1,S1001,20,Female,6.9,2.8,2.3,No,97.3,4.6,Good,6,High School,Average,8,No,100.0
2,S1002,21,Male,1.4,3.1,1.3,No,94.8,8.0,Poor,1,High School,Poor,1,No,34.3
3,S1003,23,Female,1.0,3.9,1.0,No,71.0,9.2,Poor,4,Master,Good,1,Yes,26.8
4,S1004,19,Female,5.0,4.4,0.5,No,90.9,4.9,Fair,3,Master,Good,1,No,66.4


### Basic Info and Shape

In [60]:
# Column names, non-null counts, and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 16 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   student_id                     1000 non-null   object 
 1   age                            1000 non-null   int64  
 2   gender                         1000 non-null   object 
 3   study_hours_per_day            1000 non-null   float64
 4   social_media_hours             1000 non-null   float64
 5   netflix_hours                  1000 non-null   float64
 6   part_time_job                  1000 non-null   object 
 7   attendance_percentage          1000 non-null   float64
 8   sleep_hours                    1000 non-null   float64
 9   diet_quality                   1000 non-null   object 
 10  exercise_frequency             1000 non-null   int64  
 11  parental_education_level       909 non-null    object 
 12  internet_quality               1000 non-null   ob

In [81]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

Unnamed: 0,age,study_hours_per_day,social_media_hours,netflix_hours,attendance_percentage,sleep_hours,exercise_frequency,mental_health_rating,exam_score
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,20.498,3.5501,2.5055,1.8197,84.1317,6.4701,3.042,5.438,69.6015
std,2.3081,1.46889,1.172422,1.075118,9.399246,1.226377,2.025423,2.847501,16.888564
min,17.0,0.0,0.0,0.0,56.0,3.2,0.0,1.0,18.4
25%,18.75,2.6,1.7,1.0,78.0,5.6,1.0,3.0,58.475
50%,20.0,3.5,2.5,1.8,84.4,6.5,3.0,5.0,70.5
75%,23.0,4.5,3.3,2.525,91.025,7.3,5.0,8.0,81.325
max,24.0,8.3,7.2,5.4,100.0,10.0,6.0,10.0,100.0


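The summary statistics show no obvious outliers or inconsistencies: all minima and maxima fall within plausible ranges (e.g., attendance between 56% and 100%, exam scores between 18.4 and 100).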

In [62]:
# Dimensions of the frame, reported as (rows, columns)
print(f"Shape: {df.shape}")

Shape: (1000, 16)


### Cleaning the Data

##### Missing values

In [63]:
# Count missing values per column
df.isna().sum()

student_id                        0
age                               0
gender                            0
study_hours_per_day               0
social_media_hours                0
netflix_hours                     0
part_time_job                     0
attendance_percentage             0
sleep_hours                       0
diet_quality                      0
exercise_frequency                0
parental_education_level         91
internet_quality                  0
mental_health_rating              0
extracurricular_participation     0
exam_score                        0
dtype: int64

In [64]:
# We will ignore the parental_education_level data in our analysis
# (it is the only column with missing values: 91 of 1000 rows).
# errors="ignore" keeps this cell re-runnable: without it, executing the cell a
# second time raises KeyError because the column has already been dropped.
# NOTE(review): pd.read_csv treats the literal string "None" as NaN by default.
# If the raw CSV uses "None" as a real category (no formal education), these
# values are not truly missing -- confirm against the raw file before dropping.
df = df.drop(columns=["parental_education_level"], errors="ignore")
df.shape

(1000, 15)

##### Duplicates

In [65]:
# Count rows that are exact duplicates of an earlier row
n_duplicate_rows = df.duplicated().sum()
print(n_duplicate_rows)

0


In [66]:
# Count rows whose student_id already appeared in an earlier row
n_duplicate_ids = df.duplicated(subset="student_id").sum()
print(n_duplicate_ids)

0


##### Data types

In [67]:
# Inspect the current dtypes to decide which columns need converting
# (this cell only displays them; the conversions happen in the following cells)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 15 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   student_id                     1000 non-null   object 
 1   age                            1000 non-null   int64  
 2   gender                         1000 non-null   object 
 3   study_hours_per_day            1000 non-null   float64
 4   social_media_hours             1000 non-null   float64
 5   netflix_hours                  1000 non-null   float64
 6   part_time_job                  1000 non-null   object 
 7   attendance_percentage          1000 non-null   float64
 8   sleep_hours                    1000 non-null   float64
 9   diet_quality                   1000 non-null   object 
 10  exercise_frequency             1000 non-null   int64  
 11  internet_quality               1000 non-null   object 
 12  mental_health_rating           1000 non-null   in

In [68]:
# Cast the low-cardinality text columns (gender, diet_quality, internet_quality)
# from object dtype to pandas' category dtype in a single astype call.
# NOTE(review): diet_quality and internet_quality look ordinal, but a plain
# "category" cast yields unordered, alphabetically sorted categories --
# consider an ordered CategoricalDtype if order matters later.
categorical_cols = ["gender", "diet_quality", "internet_quality"]
df = df.astype({name: "category" for name in categorical_cols})

In [80]:
# List the distinct levels of every category-typed column
category_columns = df.select_dtypes(include="category").columns
for name in category_columns:
    # Header line, the categories index, then a blank separator line.
    print(f"Categories in '{name}':", df[name].cat.categories, "", sep="\n")

Categories in 'gender':
Index(['Female', 'Male', 'Other'], dtype='object')

Categories in 'diet_quality':
Index(['Fair', 'Good', 'Poor'], dtype='object')

Categories in 'internet_quality':
Index(['Average', 'Good', 'Poor'], dtype='object')



In [70]:
# Convert 'part_time_job' and 'extracurricular_participation' from object (Yes/No) to boolean (True/False)
# (Using .map() to explicitly define the mapping rather than .astype(bool) which won't work correctly on strings)
yes_no_to_bool = {"Yes": True, "No": False}
for col in ["part_time_job", "extracurricular_participation"]:
    # Skip columns that are already boolean so the cell can be re-run safely:
    # mapping True/False through the Yes/No dict would turn every value into NaN.
    if df[col].dtype == bool:
        continue
    converted = df[col].map(yes_no_to_bool)
    # Labels missing from the mapping become NaN; fail loudly instead of
    # silently carrying them into the analysis.
    if converted.isna().any():
        unexpected = df.loc[converted.isna(), col].unique()
        raise ValueError(f"Unexpected values in '{col}': {unexpected}")
    df[col] = converted.astype(bool)

In [78]:
# Confirm both flag columns are now boolean and hold only True/False
for flag_col in ("part_time_job", "extracurricular_participation"):
    flag_values = df[flag_col]
    print(flag_values.dtype, flag_values.unique())

bool [False  True]
bool [ True False]
