In [5]:
# Libraries
import pandas as pd
import matplotlib.pyplot as plt

In [6]:
# Data loading: read the two raw CSV extracts into DataFrames.
# Paths are relative, so they resolve against the kernel's working directory.
enrol_csv_path = '../data/ENROL_Declared20260105.csv'
declared_csv_path = '../data/Student_Declared20260105.csv'

enrol_data = pd.read_csv(enrol_csv_path)
declared_data = pd.read_csv(declared_csv_path)

In [7]:
# Data sizes: report (rows, columns) for each loaded frame, labelled by source file.
for source_name, frame in (
    ("ENROL_Declared20260105.csv", enrol_data),
    ("Student_Declared20260105.csv", declared_data),
):
    print(f"{source_name} shape: {frame.shape}")

ENROL_Declared20260105.csv shape: (12591, 15)
Student_Declared20260105.csv shape: (1786, 4)


In [8]:
# Null value analysis: count missing values per column in each frame.
enrol_nulls = enrol_data.isna().sum()
declared_nulls = declared_data.isna().sum()

# Only columns with at least one missing value are shown; an empty Series
# in the output means the frame has no nulls at all.
print("Null values in ENROL_Declared20260105.csv:")
print(enrol_nulls.loc[enrol_nulls.gt(0)])

print("\nNull values in Student_Declared20260105.csv:")
print(declared_nulls.loc[declared_nulls.gt(0)])

Null values in ENROL_Declared20260105.csv:
HourStart      496
MinuteStart    496
HourEnd        496
MinuteEnd      496
dtype: int64

Null values in Student_Declared20260105.csv:
Series([], dtype: int64)


In [9]:
# Duplicate records check: count rows that exactly repeat an earlier row
# (all columns compared; the first occurrence is not counted).
enrol_duplicates = enrol_data.duplicated().sum()
declared_duplicates = declared_data.duplicated().sum()

print(
    f"\nNumber of duplicate records in ENROL_Declared20260105.csv: {enrol_duplicates}"
)
print(
    f"Number of duplicate records in Student_Declared20260105.csv: {declared_duplicates}"
)


Number of duplicate records in ENROL_Declared20260105.csv: 0
Number of duplicate records in Student_Declared20260105.csv: 0


In [10]:
# Drop the start/end time and weekday columns, then deduplicate the remaining
# columns and compare row counts before and after.
enrol_data_cols_to_remove = [
    "HourStart", "MinuteStart", "HourEnd", "MinuteEnd",  # start / end time parts
    "MON", "Tues", "WED", "Thurs", "FRI",                # weekday columns
]
enrol_data_grouped = (
    enrol_data
    .drop(columns=enrol_data_cols_to_remove)
    .drop_duplicates()
)
rows_before = len(enrol_data)
rows_after = len(enrol_data_grouped)
print(f"Row count before deduplication: {rows_before}, after deduplication: {rows_after}")


Row count before deduplication: 12591, after deduplication: 11577


In [None]:
# Reduce the enrolment data to one row per distinct
# (STD_INDEX, EnolledSemester, Subject, CatalogNbr) combination, i.e. the unique
# course enrolments per student per semester.
# 'Location' and 'SsrComponent' skipped for simplicity.
student_courses_cols = ['STD_INDEX', 'EnolledSemester', 'Subject', 'CatalogNbr']

# Boolean column mask: keeps the frame's original column order and ignores any
# listed name that is not present in the frame.
is_student_course_col = enrol_data.columns.isin(student_courses_cols)
enrol_data_simple = enrol_data.loc[:, is_student_course_col].drop_duplicates()

print("Enrol data unique student courses shape:", enrol_data_simple.shape)

Enrol data unique student courses shape: (9820, 4)


In [None]:
# Example of course counts per semester: number of rows of enrol_data_simple
# for each (EnolledSemester, Subject, CatalogNbr) combination, in a 'Count' column.
course_key_cols = ['EnolledSemester', 'Subject', 'CatalogNbr']
course_counts = (
    enrol_data_simple
    .groupby(course_key_cols)
    .size()
    .rename('Count')
    .reset_index()
)
# Optional preview (left disabled):
# print("Course counts per semester:")
# print(course_counts.head(25))