In [2]:
import os
import sys
import pandas as pd
import numpy as np
from pathlib import Path
import pytest

# Get project root (one level up from notebooks).
# NOTE: this relies on the kernel's working directory being the notebooks folder.
project_root = Path.cwd().parent

# Register the root on sys.path only once, so re-running this cell does not
# keep appending duplicate entries.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

from src.pipelines.preprocessing import MLDataLoader

import matplotlib.pyplot as plt
import seaborn

In [2]:
# Initialize loader
# The CSV location is resolved from project_root instead of a machine-specific
# absolute path, so the notebook runs on any checkout of the repository.
# NOTE(review): assumes project_root is the repository root that contains
# data/raw/ — confirm against the actual layout.
data_path = str(project_root / "data" / "raw" / "Exam_Score_Prediction.csv")

data_loader = MLDataLoader(data_path)

In [3]:
# Load the raw data through the loader created from data_path
df_raw = data_loader.load_data()

Loaded 20000 registers


In [4]:
# Summarize the raw frame: column names, non-null counts and dtypes
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   student_id        20000 non-null  int64  
 1   age               20000 non-null  int64  
 2   gender            20000 non-null  object 
 3   course            20000 non-null  object 
 4   study_hours       20000 non-null  float64
 5   class_attendance  20000 non-null  float64
 6   internet_access   20000 non-null  object 
 7   sleep_hours       20000 non-null  float64
 8   sleep_quality     20000 non-null  object 
 9   study_method      20000 non-null  object 
 10  facility_rating   20000 non-null  object 
 11  exam_difficulty   20000 non-null  object 
 12  exam_score        20000 non-null  float64
dtypes: float64(4), int64(2), object(7)
memory usage: 2.0+ MB


In [5]:
# Preview the first 10 rows of the raw frame
df_raw.head(10)

Unnamed: 0,student_id,age,gender,course,study_hours,class_attendance,internet_access,sleep_hours,sleep_quality,study_method,facility_rating,exam_difficulty,exam_score
0,1,17,male,diploma,2.78,92.9,yes,7.4,poor,coaching,low,hard,58.9
1,2,23,other,bca,3.37,64.8,yes,4.6,average,online videos,medium,moderate,54.8
2,3,22,male,b.sc,7.88,76.8,yes,8.5,poor,coaching,high,moderate,90.3
3,4,20,other,diploma,0.67,48.4,yes,5.8,average,online videos,low,moderate,29.7
4,5,20,female,diploma,0.89,71.6,yes,9.8,poor,coaching,low,moderate,43.7
5,6,23,male,b.tech,3.48,65.4,yes,4.2,good,mixed,low,moderate,58.2
6,7,17,female,b.tech,1.35,69.0,yes,7.4,average,online videos,high,hard,53.7
7,8,22,male,b.sc,5.48,51.1,yes,8.2,poor,self-study,low,moderate,47.3
8,9,18,other,bca,2.89,92.0,yes,6.6,poor,self-study,low,easy,44.9
9,10,17,male,bba,6.77,44.8,yes,9.8,average,group study,high,moderate,77.7


In [6]:
# Count how many rows fall under each exam_difficulty value
df_raw['exam_difficulty'].value_counts()

exam_difficulty
moderate    9878
easy        6141
hard        3981
Name: count, dtype: int64

In [7]:
# Remove high cardinality columns; 10 is presumably the cardinality threshold
# (TODO confirm against MLDataLoader.remove_high_cardinality).
# NOTE(review): the .info() output for df_remove_hc still lists all 13 columns,
# student_id included, so nothing appears to have been dropped on this dataset —
# confirm which columns the threshold is applied to.
df_remove_hc = data_loader.remove_high_cardinality(df_raw, 10)

In [9]:
# Keep only the student_id and exam_score columns for a quick look
df_eda = df_remove_hc.loc[:, ["student_id", "exam_score"]]
df_eda

Unnamed: 0,student_id,exam_score
0,1,58.9
1,2,54.8
2,3,90.3
3,4,29.7
4,5,43.7
...,...,...
19995,19997,86.5
19996,19998,60.9
19997,19999,64.5
19998,20000,79.0


In [8]:
# Re-check columns, non-null counts and dtypes after the high-cardinality step
df_remove_hc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   student_id        20000 non-null  int64  
 1   age               20000 non-null  int64  
 2   gender            20000 non-null  object 
 3   course            20000 non-null  object 
 4   study_hours       20000 non-null  float64
 5   class_attendance  20000 non-null  float64
 6   internet_access   20000 non-null  object 
 7   sleep_hours       20000 non-null  float64
 8   sleep_quality     20000 non-null  object 
 9   study_method      20000 non-null  object 
 10  facility_rating   20000 non-null  object 
 11  exam_difficulty   20000 non-null  object 
 12  exam_score        20000 non-null  float64
dtypes: float64(4), int64(2), object(7)
memory usage: 2.0+ MB


In [8]:
# Split the frame into features X and target y, with "exam_score" as the target column.
# NOTE(review): student_id is still a column of df_remove_hc at this point; if the
# split keeps every non-target column, X carries that id along — confirm this is intended.
X,y = data_loader.split_features_and_target(df_remove_hc, "exam_score")

In [9]:
# Display the target series (pandas abbreviates the middle rows)
y

0        58.9
1        54.8
2        90.3
3        29.7
4        43.7
         ... 
19995    86.5
19996    60.9
19997    64.5
19998    79.0
19999    71.0
Name: exam_score, Length: 20000, dtype: float64

In [10]:
# Hold out 20% of the rows as a test set, with a fixed seed so the split is reproducible.
# Only train and test sets are produced here; no validation split is made.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [4]:
# test_basic.py
def add(a, b):
    """Return the result of ``a + b``."""
    total = a + b
    return total

def test_add_positive_numbers():
    """Adding two positive numbers gives their sum."""
    assert add(2, 3) == 5

def test_add_negative_numbers():
    """Adding two negative numbers gives their (negative) sum."""
    assert add(-2, -3) == -5

def test_add_mixed():
    """Adding a negative and a positive number gives their sum."""
    assert add(-2, 3) == 1

In [12]:
# Two small toy samples to compare bin by bin, and the number of bins to use
reference = pd.Series([4, 5, 4, 6, 5, 4, 5, 6, 5, 4])
current = pd.Series([3, 4, 5, 4, 5, 3, 5, 6, 6, 5])
n_bins = 10

In [13]:
# Compute n_bins equal-width bin edges over the range of the reference sample.
# np.histogram_bin_edges yields the same edges np.histogram would, without
# computing counts that are thrown away and without binding `_`, the name
# IPython uses for the last cell output.
bin_edges = np.histogram_bin_edges(reference, bins=n_bins)
bin_edges

array([4. , 4.2, 4.4, 4.6, 4.8, 5. , 5.2, 5.4, 5.6, 5.8, 6. ])

In [14]:
# Make the two outer bins open-ended so values outside the reference range are
# still counted: everything below the second edge lands in the first bin and
# everything from the second-to-last edge upward lands in the last bin.
# bin_edges is modified in place, so any earlier display of it is now stale.
bin_edges[[0, -1]] = (-np.inf, np.inf)

In [15]:
# Count values in each bin, using the same edges for both samples so the
# two count arrays line up bin by bin.
# np.histogram returns (counts, edges); the edges are bound to the throwaway name `_`.
# NOTE(review): for these toy samples both count arrays come out identical
# ([4, 0, 0, 0, 0, 4, 0, 0, 0, 2]) because the open-ended first bin holds the
# 3s and the 4s of `current` together — the shift is not visible at this binning.
ref_counts, _ = np.histogram(reference, bins=bin_edges)
curr_counts, _ = np.histogram(current, bins=bin_edges)

In [17]:
# Show both count arrays next to the edges they were binned with.
# bin_edges is named explicitly instead of `_`, whose value depends on whatever
# last assigned or displayed it in the running kernel.
ref_counts, curr_counts, bin_edges

(array([4, 0, 0, 0, 0, 4, 0, 0, 0, 2]),
 array([4, 0, 0, 0, 0, 4, 0, 0, 0, 2]),
 array([-inf,  4.2,  4.4,  4.6,  4.8,  5. ,  5.2,  5.4,  5.6,  5.8,  inf]))