In [1]:
import pandas as pd

# Read the student registration table; each row pairs a student id with a
# module code and presentation code (see the columns selected further down).
student_info = pd.read_csv('studentInfo.csv')

# Headline counts, computed up front so the report lines below stay simple.
record_total = len(student_info)
unique_student_total = student_info['id_student'].nunique()
unique_module_total = student_info['code_module'].nunique()

print(f"Total records: {record_total}")
print(f"Total unique students: {unique_student_total}")
print(f"Total unique modules: {unique_module_total}")
print(f"\nFirst few rows:")
# Bare trailing expression so the notebook renders the preview as a table.
student_info.head()

Total records: 32593
Total unique students: 28785
Total unique modules: 7

First few rows:


Unnamed: 0,code_module,code_presentation,id_student,gender,region,highest_education,imd_band,age_band,num_of_prev_attempts,studied_credits,disability,final_result
0,AAA,2013J,11391,M,East Anglian Region,HE Qualification,90-100%,55<=,0,240,N,Pass
1,AAA,2013J,28400,F,Scotland,HE Qualification,20-30%,35-55,0,60,N,Pass
2,AAA,2013J,30268,F,North Western Region,A Level or Equivalent,30-40%,35-55,0,60,Y,Withdrawn
3,AAA,2013J,31604,F,South East Region,A Level or Equivalent,50-60%,35-55,0,60,N,Pass
4,AAA,2013J,32885,F,West Midlands Region,Lower Than A Level,50-60%,0-35,0,60,N,Pass


In [2]:
# Count, per student id, how many distinct module codes they appear under.
# The result is a two-column frame: id_student, num_modules.
students_module_count = (
    student_info
    .groupby('id_student')['code_module']
    .nunique()
    .reset_index(name='num_modules')
)

# Keep only the students who appear under more than one distinct module.
has_several_modules = students_module_count['num_modules'].gt(1)
students_with_multiple_modules = students_module_count[has_several_modules]

multi_module_total = len(students_with_multiple_modules)
single_module_total = len(students_module_count) - multi_module_total

print(f"Number of students with multiple module records: {multi_module_total}")
print(f"Number of students with single module records: {single_module_total}")
print(f"\nStudents with most modules:")
# Trailing expression: the ten rows with the highest module counts.
students_with_multiple_modules.sort_values(by='num_modules', ascending=False).head(10)

Number of students with multiple module records: 2479
Number of students with single module records: 26306

Students with most modules:


Unnamed: 0,id_student,num_modules
10717,557085,3
2290,279883,3
12012,571950,3
8876,537811,3
4350,399863,3
6532,493793,3
12056,572154,3
6887,502004,3
8014,524431,3
10091,551528,3


In [3]:
# Show one worked example of a student with several modules, then report
# summary statistics on the number of modules per student.
#
# The example is only printed when at least one multi-module student exists.
# The summary statistics depend solely on `students_module_count`, so they are
# printed unconditionally instead of being hidden behind the example's guard.
if not students_with_multiple_modules.empty:
    # Take the first multi-module student in frame order. Selecting the column
    # before the row keeps the id in its own column dtype; going through
    # `.iloc[0]` first would build a row Series whose values are coerced to a
    # single common dtype.
    sample_student_id = students_with_multiple_modules['id_student'].iloc[0]

    print(f"Example: Student ID {sample_student_id}")
    print(f"\nAll records for this student:")
    # Restrict to the columns needed to see which modules/presentations the
    # student took and how each one ended.
    is_sample_student = student_info['id_student'] == sample_student_id
    sample_records = student_info.loc[
        is_sample_student,
        ['id_student', 'code_module', 'code_presentation', 'final_result'],
    ]
    print(sample_records)

# Distribution of distinct-module counts across all students.
modules_per_student = students_module_count['num_modules']

print("\n" + "="*50)
print("Summary statistics:")
print(f"- Average number of modules per student: {modules_per_student.mean():.2f}")
print(f"- Max number of modules for a single student: {modules_per_student.max()}")
print(f"- Min number of modules for a single student: {modules_per_student.min()}")

Example: Student ID 29411

All records for this student:
       id_student code_module code_presentation final_result
10598       29411         CCC             2014J    Withdrawn
14399       29411         DDD             2013J         Pass

Summary statistics:
- Average number of modules per student: 1.09
- Max number of modules for a single student: 3
- Min number of modules for a single student: 1
