# StudentProfiler: Student Profiling and Risk Assessment

## Overview

The StudentProfiler microservice analyzes student learning metrics to create comprehensive profiles that help identify:
- **Risk Levels**: Students at risk of failing or withdrawing
- **Engagement Patterns**: How actively students interact with learning materials
- **Performance Clusters**: Groups of students with similar learning behaviors

These profiles are essential for:
- **Early Intervention**: Identifying students who need support
- **Personalized Recommendations**: Tailoring learning paths based on profiles
- **Resource Allocation**: Focusing support efforts where they're most needed

## Pipeline Position

```
01_LMSConnector → 02_PrepaData → 03_StudentProfiler → 04_PathPredictor → 05_RecoBuilder
     (Raw Data)    (Features)      (Profiles)          (ML Model)        (Recommendations)
```

## Rule-Based Profiling Logic

### Risk Level Classification

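Students are classified into three risk levels based on multiple factors. The conditions below overlap (a score under 50 is also under 70), so read them from top to bottom: the first level whose conditions match applies.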

- **HIGH RISK**: 
  - Average score < 50 OR
  - Completion rate < 50% OR
  - Final result is "Withdrawn" or "Fail"

- **MEDIUM RISK**:
  - Average score < 70 OR
  - Completion rate < 75%

- **LOW RISK**:
  - Average score ≥ 70 AND
  - Completion rate ≥ 75% AND
  - Final result is "Pass" or "Distinction"

### Engagement Profile Classification

Based on quantiles of `total_clicks` and `active_days`:

- **HIGH_ENGAGEMENT**: High clicks (≥66th percentile) AND high active days (≥66th percentile)
- **REGULAR**: All remaining students — medium levels of engagement that qualify as neither HIGH_ENGAGEMENT nor LOW_ENGAGEMENT
- **LOW_ENGAGEMENT**: Low clicks (<33rd percentile) OR low active days (<33rd percentile)

### Global Profile

Combines risk level and engagement profile to create 9 possible profiles:
- HIGH_RISK_HIGH_ENGAGEMENT, HIGH_RISK_REGULAR, HIGH_RISK_LOW_ENGAGEMENT
- MEDIUM_RISK_HIGH_ENGAGEMENT, MEDIUM_RISK_REGULAR, MEDIUM_RISK_LOW_ENGAGEMENT
- LOW_RISK_HIGH_ENGAGEMENT, LOW_RISK_REGULAR, LOW_RISK_LOW_ENGAGEMENT

## Clustering (Optional)

The profiler can also use **KMeans clustering** to discover latent student profiles based on:
- Average score
- Completion rate
- Total clicks
- Active days

This unsupervised approach can reveal patterns not captured by rule-based logic.


In [None]:
# Import the StudentProfiler pipeline
from libs.profiler import run_student_profiler_pipeline
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Configure the plotting theme.
# Style names are tried in order of preference; a name that raises OSError is
# skipped. If none of the seaborn names load, fall back to matplotlib's
# 'default' style (applied outside any try, so a failure there still surfaces).
for style_name in ('seaborn-v0_8', 'seaborn'):
    try:
        plt.style.use(style_name)
    except OSError:
        continue
    else:
        break
else:
    plt.style.use('default')
sns.set_palette("husl")


In [None]:
# Build the student profiles by running the full StudentProfiler pipeline.
# Flip the flag to True to enable KMeans clustering (requires scikit-learn).
enable_clustering = False
profiles_df = run_student_profiler_pipeline(use_clustering=enable_clustering)


In [None]:
# Preview the profiles table: leading rows, then its dimensions and columns
preview_rows = profiles_df.head(10)
column_names = list(profiles_df.columns)

print("First 10 rows of student profiles:")
print(preview_rows)
print(f"\nDataFrame shape: {profiles_df.shape}")
print(f"\nColumn names: {column_names}")


In [None]:
# Risk-level breakdown: absolute counts, then each level's share of all rows
banner = "=" * 60
print(banner, "RISK LEVEL DISTRIBUTION", banner, sep="\n")

risk_counts = profiles_df['risk_level'].value_counts()
print(risk_counts)

# Shares are relative to every row in profiles_df, expressed in percent
risk_share_pct = (risk_counts / len(profiles_df) * 100).round(2)
print("\nPercentages:")
print(risk_share_pct)


In [None]:
# Engagement-profile breakdown: absolute counts, then each profile's share
banner = "=" * 60
print(banner, "ENGAGEMENT PROFILE DISTRIBUTION", banner, sep="\n")

engagement_counts = profiles_df['engagement_profile'].value_counts()
print(engagement_counts)

# Shares are relative to every row in profiles_df, expressed in percent
engagement_share_pct = (engagement_counts / len(profiles_df) * 100).round(2)
print("\nPercentages:")
print(engagement_share_pct)


## Visualizations

Let's explore the profiles through visualizations:


In [None]:
# Distribution of avg_score per risk_level: boxplot (left panel) and
# overlaid histograms (right panel)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

risk_order = ['LOW', 'MEDIUM', 'HIGH']
profiles_df_ordered = profiles_df[profiles_df['risk_level'].isin(risk_order)]

# One Series of non-missing scores per risk level, in display order.
# Computed once so both panels are guaranteed to draw the same data.
scores_by_risk = [
    profiles_df_ordered.loc[
        profiles_df_ordered['risk_level'] == risk, 'avg_score'
    ].dropna()
    for risk in risk_order
]

# Boxplot
# Tick labels are applied on the axis rather than through boxplot(labels=...):
# that keyword is deprecated since matplotlib 3.9 (renamed to tick_labels),
# whereas set_xticklabels behaves the same on older and newer releases.
axes[0].boxplot(scores_by_risk)
axes[0].set_xticklabels(risk_order)
axes[0].set_title('Distribution of Average Scores by Risk Level', fontsize=12, fontweight='bold')
axes[0].set_xlabel('Risk Level')
axes[0].set_ylabel('Average Score')
axes[0].grid(True, alpha=0.3)

# Histogram (semi-transparent so overlapping risk levels stay visible)
for risk, data in zip(risk_order, scores_by_risk):
    axes[1].hist(data, alpha=0.6, label=risk, bins=30, edgecolor='black')
axes[1].set_title('Distribution of Average Scores by Risk Level', fontsize=12, fontweight='bold')
axes[1].set_xlabel('Average Score')
axes[1].set_ylabel('Frequency')
axes[1].legend()
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()


In [None]:
# Bar plot of number of students per risk_level
# fill_value=0 keeps a risk level with no students as a zero-height bar.
# Without it reindex() inserts NaN for the missing level and the int(height)
# call in the label loop below raises ValueError.
risk_counts = profiles_df['risk_level'].value_counts().reindex(
    ['LOW', 'MEDIUM', 'HIGH'], fill_value=0)

# Denominator for the percentage labels; clamped to 1 so an empty table is
# labelled 0.0% instead of dividing by zero.
total_students = max(len(profiles_df), 1)

plt.figure(figsize=(10, 6))
bars = plt.bar(risk_counts.index, risk_counts.values, color=['green', 'orange', 'red'],
               edgecolor='black', alpha=0.7)
plt.title('Number of Students per Risk Level', fontsize=14, fontweight='bold')
plt.xlabel('Risk Level')
plt.ylabel('Number of Students')
plt.grid(True, alpha=0.3, axis='y')

# Add value labels on bars: count on the first line, share of all rows below
for bar in bars:
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2., height,
             f'{int(height)}\n({height/total_students*100:.1f}%)',
             ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.show()


In [None]:
# Engagement profile sizes as a bar chart
engagement_counts = profiles_df['engagement_profile'].value_counts()

plt.figure(figsize=(10, 6))
# NOTE(review): colors are assigned in bar order, i.e. the order returned by
# value_counts(), not through a fixed profile -> color mapping.
bars = plt.bar(
    engagement_counts.index,
    engagement_counts.values,
    color=['steelblue', 'lightblue', 'navy'],
    edgecolor='black',
    alpha=0.7,
)
plt.title('Number of Students per Engagement Profile', fontsize=14, fontweight='bold')
plt.xlabel('Engagement Profile')
plt.ylabel('Number of Students')
plt.xticks(rotation=45, ha='right')
plt.grid(True, alpha=0.3, axis='y')

# Annotate each bar with its count and its share of all rows in profiles_df
for rect in bars:
    count = rect.get_height()
    label_x = rect.get_x() + rect.get_width()/2.
    plt.text(label_x, count,
             f'{int(count)}\n({count/len(profiles_df)*100:.1f}%)',
             ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.show()


In [None]:
# Contingency table of risk level (rows) against engagement profile (columns)
crosstab = pd.crosstab(
    profiles_df['risk_level'],
    profiles_df['engagement_profile'],
    margins=True,
    margins_name="Total",
)

# The last row and last column are the "Total" margins: they are left out of
# the heatmap but remain in the table printed underneath.
cell_counts = crosstab.iloc[:-1, :-1]

plt.figure(figsize=(12, 6))
sns.heatmap(
    cell_counts,
    annot=True,
    fmt='d',
    cmap='YlOrRd',
    cbar_kws={'label': 'Number of Students'},
    linewidths=1,
    linecolor='black',
)
plt.title('Risk Level vs Engagement Profile Cross-Tabulation', fontsize=14, fontweight='bold', pad=20)
plt.xlabel('Engagement Profile')
plt.ylabel('Risk Level')
plt.tight_layout()
plt.show()

print("Cross-tabulation table:")
print(crosstab)


In [None]:
# Global profile sizes; skipped entirely when the column is absent
if 'global_profile' in profiles_df.columns:
    global_counts = profiles_df['global_profile'].value_counts()
    # Bars sit at integer positions; the profile names are attached as tick
    # labels afterwards so they can be rotated.
    bar_positions = range(len(global_counts))

    plt.figure(figsize=(14, 6))
    bars = plt.bar(bar_positions, global_counts.values,
                   edgecolor='black', alpha=0.7)
    plt.title('Number of Students per Global Profile', fontsize=14, fontweight='bold')
    plt.xlabel('Global Profile')
    plt.ylabel('Number of Students')
    plt.xticks(bar_positions, global_counts.index, rotation=45, ha='right')
    plt.grid(True, alpha=0.3, axis='y')

    # Write the student count on top of every bar
    for rect in bars:
        count = rect.get_height()
        plt.text(rect.get_x() + rect.get_width()/2., count,
                 f'{int(count)}',
                 ha='center', va='bottom', fontsize=9)

    plt.tight_layout()
    plt.show()

    print("\nGlobal Profile Distribution:")
    print(global_counts)


In [None]:
# Cluster sizes. The 'cluster_label' column is only looked at when present;
# otherwise a hint on how to enable clustering is printed instead.
if 'cluster_label' not in profiles_df.columns:
    print("Clustering not enabled. Set use_clustering=True in run_student_profiler_pipeline() to enable.")
else:
    banner = "=" * 60
    print(banner, "CLUSTER PROFILE DISTRIBUTION", banner, sep="\n")
    # sort_index() orders the bars by cluster label rather than by size
    cluster_counts = profiles_df['cluster_label'].value_counts().sort_index()
    print(cluster_counts)

    plt.figure(figsize=(10, 6))
    # Labels are cast to str so they are drawn as categories
    bars = plt.bar(
        cluster_counts.index.astype(str),
        cluster_counts.values,
        edgecolor='black',
        alpha=0.7,
    )
    plt.title('Number of Students per Cluster', fontsize=14, fontweight='bold')
    plt.xlabel('Cluster Label')
    plt.ylabel('Number of Students')
    plt.grid(True, alpha=0.3, axis='y')

    # Annotate each bar with its count and its share of all rows
    for rect in bars:
        count = rect.get_height()
        label_x = rect.get_x() + rect.get_width()/2.
        plt.text(label_x, count,
                 f'{int(count)}\n({count/len(profiles_df)*100:.1f}%)',
                 ha='center', va='bottom', fontweight='bold')

    plt.tight_layout()
    plt.show()

    # Human-readable cluster names, when that column is present as well
    if 'cluster_profile' in profiles_df.columns:
        print("\nCluster Profile Labels:")
        print(profiles_df['cluster_profile'].value_counts())


In [None]:
# Display the output file path
from pathlib import Path
from libs.utils import get_data_paths

# Resolve the expected CSV location inside the processed-data directory
_, processed_dir = get_data_paths()
output_file = processed_dir / "student_module_profiles.csv"

separator = "=" * 60
print(separator, "OUTPUT FILE LOCATION", separator, sep="\n")
print(f"\n✓ Generated file: {output_file}")

file_present = output_file.exists()
print(f"✓ File exists: {file_present}")
if file_present:
    size_kb = output_file.stat().st_size / 1024
    print(f"✓ File size: {size_kb:.2f} KB")
    # Row/column totals describe the in-memory profiles_df, not the CSV on disk
    print(f"✓ Total rows: {len(profiles_df)}")
    print(f"✓ Total columns: {len(profiles_df.columns)}")
print("\n" + separator)
