In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [3]:
# Load the raw student habits/performance table; the relative path is resolved
# against the notebook's working directory.
data = pd.read_csv("../data/student_habits_performance.csv")
# Preview the first five rows as the cell's output.
data.head()

Unnamed: 0,student_id,age,gender,study_hours_per_day,social_media_hours,netflix_hours,part_time_job,attendance_percentage,sleep_hours,diet_quality,exercise_frequency,parental_education_level,internet_quality,mental_health_rating,extracurricular_participation,exam_score
0,S1000,23,Female,0.0,1.2,1.1,No,85.0,8.0,Fair,6,Master,Average,8,Yes,56.2
1,S1001,20,Female,6.9,2.8,2.3,No,97.3,4.6,Good,6,High School,Average,8,No,100.0
2,S1002,21,Male,1.4,3.1,1.3,No,94.8,8.0,Poor,1,High School,Poor,1,No,34.3
3,S1003,23,Female,1.0,3.9,1.0,No,71.0,9.2,Poor,4,Master,Good,1,Yes,26.8
4,S1004,19,Female,5.0,4.4,0.5,No,90.9,4.9,Fair,3,Master,Good,1,No,66.4


In [4]:
# Count rows that exactly repeat an earlier row, drop them if there are any,
# and report which of the two cases applied.
duplicate_count = data.duplicated().sum()
if duplicate_count == 0:
    print("No duplicate rows found.")
else:
    data = data.drop_duplicates()
    print(f"Removed {duplicate_count} duplicate rows.")

No duplicate rows found.


In [5]:
# Column dtypes and non-null counts, to spot columns with missing values.
data.info()

<class 'pandas.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 16 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   student_id                     1000 non-null   str    
 1   age                            1000 non-null   int64  
 2   gender                         1000 non-null   str    
 3   study_hours_per_day            1000 non-null   float64
 4   social_media_hours             1000 non-null   float64
 5   netflix_hours                  1000 non-null   float64
 6   part_time_job                  1000 non-null   str    
 7   attendance_percentage          1000 non-null   float64
 8   sleep_hours                    1000 non-null   float64
 9   diet_quality                   1000 non-null   str    
 10  exercise_frequency             1000 non-null   int64  
 11  parental_education_level       909 non-null    str    
 12  internet_quality               1000 non-null   str    
 13  

In [6]:
# Per-column count of missing values.
data.isnull().sum()

student_id                        0
age                               0
gender                            0
study_hours_per_day               0
social_media_hours                0
netflix_hours                     0
part_time_job                     0
attendance_percentage             0
sleep_hours                       0
diet_quality                      0
exercise_frequency                0
parental_education_level         91
internet_quality                  0
mental_health_rating              0
extracurricular_participation     0
exam_score                        0
dtype: int64

In [None]:
# Missing parental education is kept as its own 'Unknown' category instead of
# dropping the affected rows.
if data['parental_education_level'].notnull().all():
    print("No missing values in 'parental_education_level'.")
else:
    data['parental_education_level'] = data['parental_education_level'].fillna('Unknown')
    print("Filled missing values in 'parental_education_level' with 'Unknown'.")

Filled missing values in 'parental_education_level' with 'Unknown'.


In [9]:
# Re-inspect the non-null counts after filling 'parental_education_level'.
data.info()

<class 'pandas.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 16 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   student_id                     1000 non-null   str    
 1   age                            1000 non-null   int64  
 2   gender                         1000 non-null   str    
 3   study_hours_per_day            1000 non-null   float64
 4   social_media_hours             1000 non-null   float64
 5   netflix_hours                  1000 non-null   float64
 6   part_time_job                  1000 non-null   str    
 7   attendance_percentage          1000 non-null   float64
 8   sleep_hours                    1000 non-null   float64
 9   diet_quality                   1000 non-null   str    
 10  exercise_frequency             1000 non-null   int64  
 11  parental_education_level       1000 non-null   str    
 12  internet_quality               1000 non-null   str    
 13  

In [10]:
# Category frequencies, including the 'Unknown' placeholder used for entries
# that were missing.
data['parental_education_level'].value_counts()

parental_education_level
High School    392
Bachelor       350
Master         167
Unknown         91
Name: count, dtype: int64

In [11]:
# Transposed preview: each column becomes a row, so a wide frame stays readable.
data.head().T

Unnamed: 0,0,1,2,3,4
student_id,S1000,S1001,S1002,S1003,S1004
age,23,20,21,23,19
gender,Female,Female,Male,Female,Female
study_hours_per_day,0.0,6.9,1.4,1.0,5.0
social_media_hours,1.2,2.8,3.1,3.9,4.4
netflix_hours,1.1,2.3,1.3,1.0,0.5
part_time_job,No,No,No,No,No
attendance_percentage,85.0,97.3,94.8,71.0,90.9
sleep_hours,8.0,4.6,8.0,9.2,4.9
diet_quality,Fair,Good,Poor,Poor,Fair


In [14]:
# Names of every numeric column, kept as a plain list so they can be iterated
# over column by column.
numeric_columns = data.select_dtypes(include="number").columns.to_list()
print("Numeric columns in the dataset:", numeric_columns)

Numeric columns in the dataset: ['age', 'study_hours_per_day', 'social_media_hours', 'netflix_hours', 'attendance_percentage', 'sleep_hours', 'exercise_frequency', 'mental_health_rating', 'exam_score']


In [19]:
def find_outliers(data, col, factor=1.5):
    """Return the rows of ``data`` whose ``col`` value lies outside the IQR fences.

    The fences are ``Q1 - factor * IQR`` (lower) and ``Q3 + factor * IQR``
    (upper), where ``Q1``/``Q3`` are the 25th/75th percentiles of ``data[col]``
    and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the column to inspect. It is not modified.
    col : hashable
        Label of the column whose values are tested.
    factor : float, default 1.5
        Multiplier applied to the IQR when placing the fences. The default
        reproduces the previously hard-coded 1.5 rule; a larger value flags
        fewer rows.

    Returns
    -------
    tuple
        ``(outliers, lower_bound, upper_bound)`` where ``outliers`` is the
        subset of ``data`` (all columns, original index labels) falling
        strictly below ``lower_bound`` or strictly above ``upper_bound``.
    """
    values = data[col]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr
    # Strict comparisons: a value sitting exactly on a fence is kept, and a
    # missing value compares False on both sides so it is never flagged.
    outliers = data[(values < lower_bound) | (values > upper_bound)]
    return outliers, lower_bound, upper_bound

# One summary record per numeric column, collected as plain dicts and turned
# into a DataFrame once at the end.
outlier_summary = []


for col in numeric_columns:
    outliers, lower, upper = find_outliers(data, col)
    outlier_summary.append({
        'Column': col,
        "Outlier Count": outliers.shape[0],  # number of rows outside the bounds
        'Lower Bound': lower,
        'Upper Bound': upper,
        # Observed extremes, for comparison against the computed bounds.
        "min_value": data[col].min(),
        "max_value": data[col].max()
    })

outlier_df = pd.DataFrame(outlier_summary)
# Bare last expression so the summary table is displayed as the cell output.
outlier_df

Unnamed: 0,Column,Outlier Count,Lower Bound,Upper Bound,min_value,max_value
0,age,0,12.375,29.375,17.0,24.0
1,study_hours_per_day,7,-0.25,7.35,0.0,8.3
2,social_media_hours,5,-0.7,5.7,0.0,7.2
3,netflix_hours,4,-1.2875,4.8125,0.0,5.4
4,attendance_percentage,3,58.4625,110.5625,56.0,100.0
5,sleep_hours,2,3.05,9.85,3.2,10.0
6,exercise_frequency,0,-5.0,11.0,0.0,6.0
7,mental_health_rating,0,-4.5,15.5,1.0,10.0
8,exam_score,2,24.2,115.6,18.4,100.0


In [22]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import logging


# Module-level logger used by clean_data for its progress messages.
# NOTE(review): no handler/level configuration is visible in this cell; under
# Python's default WARNING threshold the logger.info messages would not be
# shown — confirm whether logging is configured elsewhere.
logger = logging.getLogger(__name__)
# NOTE(review): this re-reads the raw CSV and rebinds `data`, so changes made to
# `data` by earlier cells are no longer reflected in this name from here on.
data = pd.read_csv("../data/student_habits_performance.csv")

def clean_data(data: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of ``data``; the input frame is left untouched.

    Steps, each reported through the module-level ``logger``:

    1. Drop rows that exactly duplicate an earlier row (original index labels
       of the surviving rows are kept).
    2. Replace missing values in ``'parental_education_level'`` with the
       string ``'Unknown'``.
    3. Count any missing values left anywhere in the frame and log a warning
       if there are some.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to clean. Must contain a ``'parental_education_level'`` column.

    Returns
    -------
    pandas.DataFrame
        The cleaned copy.

    Raises
    ------
    KeyError
        If ``'parental_education_level'`` is not a column of ``data``.
    """
    # Work on a copy so the column assignment below never reaches the caller's frame.
    cleaned_data = data.copy()

    # Remove duplicates
    duplicate_count = cleaned_data.duplicated().sum()
    if duplicate_count > 0:
        cleaned_data = cleaned_data.drop_duplicates()
        # Arguments are passed separately so the message is only formatted
        # when a handler actually emits the record.
        logger.info("Removed %s duplicate rows.", duplicate_count)
    else:
        logger.info("No duplicate rows found.")

    # Handle missing values
    if cleaned_data['parental_education_level'].isnull().any():
        cleaned_data['parental_education_level'] = cleaned_data['parental_education_level'].fillna('Unknown')
        logger.info("Filled missing values in 'parental_education_level' with 'Unknown'.")
    else:
        logger.info("No missing values in 'parental_education_level'.")

    # Anything still missing is in a column this function does not fill.
    remaining_nulls = cleaned_data.isnull().sum().sum()
    if remaining_nulls > 0:
        logger.warning("There are still %s missing values in the dataset.", remaining_nulls)
    else:
        logger.info("No missing values remain in the dataset.")

    return cleaned_data


if __name__ == "__main__":
    # Smoke-check of the cleaning step on the freshly loaded frame.
    cleaned_data = clean_data(data)
    # DataFrame.info() prints its report itself and returns None, so it is
    # called bare instead of being wrapped in print() (which added a stray "None").
    cleaned_data.info()
    # Count duplicates on the raw frame here; this cell never defines a
    # `duplicate_count` variable of its own, so reading one relied on state
    # left behind by another cell.
    print("Duplicate Count:", data.duplicated().sum())
    print("Remaining Missing Values:", cleaned_data.isnull().sum().sum())
    print(cleaned_data.shape)
    print(cleaned_data['parental_education_level'].value_counts())



<class 'pandas.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 16 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   student_id                     1000 non-null   str    
 1   age                            1000 non-null   int64  
 2   gender                         1000 non-null   str    
 3   study_hours_per_day            1000 non-null   float64
 4   social_media_hours             1000 non-null   float64
 5   netflix_hours                  1000 non-null   float64
 6   part_time_job                  1000 non-null   str    
 7   attendance_percentage          1000 non-null   float64
 8   sleep_hours                    1000 non-null   float64
 9   diet_quality                   1000 non-null   str    
 10  exercise_frequency             1000 non-null   int64  
 11  parental_education_level       1000 non-null   str    
 12  internet_quality               1000 non-null   str    
 13  