In [None]:
# Moodle Data Exploration Notebook

# 1. Import libraries
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns

from src.data_loader import MoodleDataLoader
from src.preprocessing import MoodleDataPreprocessor

# 2. Load data
# Instantiate the project's loader. In the cells shown here it is only
# referenced by the commented-out calls below.
loader = MoodleDataLoader()
# To work with real data, uncomment the calls below. Per the original note the
# loader connects to a database or loads from CSV -- TODO confirm against
# src.data_loader, including what `limit` bounds.
# logs = loader.load_log_data(limit=10000)
# quizzes = loader.load_quiz_data()

# For demo, create sample data
# Seed NumPy's global RNG so the synthetic log is identical on every run.
# The draws below must stay in this order to keep the data reproducible.
np.random.seed(42)

# Hourly timestamps from 2024-01-01 00:00 through 2024-03-31 00:00 inclusive.
# The lowercase 'h' alias is used because the uppercase 'H' alias is
# deprecated since pandas 2.2 and emits a FutureWarning.
# NOTE(review): the end bound is midnight, so 2024-03-31 contributes a single
# hourly slot; use '2024-03-31 23:00' as the end if a full last day is wanted.
dates = pd.date_range('2024-01-01', '2024-03-31', freq='h')

# 5000 synthetic log events, every column drawn independently at random.
sample_logs = pd.DataFrame({
    # Event time, sampled with replacement from the hourly grid above.
    'timestamp': np.random.choice(dates, 5000),
    # Integer ids in 1..100 (randint's upper bound is exclusive).
    'user_id': np.random.randint(1, 101, 5000),
    'course_id': np.random.choice(['MATH101', 'PHYS201', 'CHEM301', 'BIO101'], 5000),
    'action': np.random.choice(['view', 'submit', 'quiz', 'forum'], 5000),
    # Exponentially distributed with scale (mean) 15; unit is not stated here.
    'duration': np.random.exponential(15, 5000)
})

# 3. Basic exploration
# Print the frame's dimensions, then a sequence of titled summaries.
print("Dataset shape:", sample_logs.shape)

# Each entry pairs the heading line with the object printed beneath it.
overview_sections = (
    ("\nFirst few rows:", sample_logs.head()),
    ("\nData types:", sample_logs.dtypes),
    # Per-column count of missing entries.
    ("\nMissing values:", sample_logs.isna().sum()),
)
for section_heading, section_summary in overview_sections:
    print(section_heading)
    print(section_summary)

# 4. Activity analysis
# Count log rows per calendar day (the time-of-day part is dropped).
event_dates = sample_logs['timestamp'].dt.date
daily_activity = sample_logs.groupby(event_dates).size()

# Open the figure first; the Series plot draws onto it and returns the Axes,
# which is then used for labelling instead of the pyplot state functions.
plt.figure(figsize=(12, 6))
trend_ax = daily_activity.plot.line(title='Daily Activity Trend')
trend_ax.set_xlabel('Date')
trend_ax.set_ylabel('Number of Actions')
trend_ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# 5. Course comparison
# One row per course: distinct users, number of logged actions, and mean
# duration. Named aggregation yields the final column names directly.
course_stats = sample_logs.groupby('course_id').agg(
    unique_students=('user_id', 'nunique'),
    total_actions=('action', 'count'),
    avg_duration=('duration', 'mean'),
)

print("\nCourse Statistics:")
print(course_stats)

# 6. Interactive visualization with Plotly
# Hierarchy for the sunburst: courses on the inner ring, action types nested
# inside each course.
sunburst_levels = ['course_id', 'action']
fig = px.sunburst(
    sample_logs,
    path=sunburst_levels,
    title='Activity Distribution by Course and Action Type',
)
fig.show()

# 7. Time pattern analysis
# Derive the hour of day and the weekday name from each timestamp. Both are
# added to sample_logs as new columns.
sample_logs['hour'] = sample_logs['timestamp'].dt.hour
sample_logs['day_of_week'] = sample_logs['timestamp'].dt.day_name()

# Event counts per hour and per weekday. Only categories that actually occur
# in the data appear in these two Series.
hourly_pattern = sample_logs.groupby('hour').size()
weekly_pattern = sample_logs.groupby('day_of_week').size()

# Calendar order for the weekday axis (groupby sorts names alphabetically).
weekday_order = ['Monday', 'Tuesday', 'Wednesday',
                 'Thursday', 'Friday', 'Saturday', 'Sunday']

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

# Reindex onto all 24 hours so an hour with no events is drawn as a zero bar
# instead of silently disappearing from the categorical x axis.
hourly_pattern.reindex(range(24), fill_value=0).plot(
    kind='bar', ax=ax1, title='Hourly Activity Pattern')
ax1.set_xlabel('Hour of Day')
ax1.set_ylabel('Activity Count')

# Same for weekdays: fill_value=0 keeps a missing day as an explicit zero
# rather than NaN (which would also turn the counts into floats).
weekly_pattern.reindex(weekday_order, fill_value=0).plot(
    kind='bar', ax=ax2, title='Weekly Activity Pattern')
ax2.set_xlabel('Day of Week')
ax2.set_ylabel('Activity Count')

plt.tight_layout()
plt.show()

# 8. Student engagement ranking
# One row per user: number of logged actions, summed duration, and how many
# distinct courses they touched, ordered from most to fewest actions.
student_engagement = (
    sample_logs.groupby('user_id')
    .agg(
        total_actions=('action', 'count'),
        total_duration=('duration', 'sum'),
        courses_accessed=('course_id', 'nunique'),
    )
    .sort_values('total_actions', ascending=False)
)

print("\nTop 10 Most Engaged Students:")
print(student_engagement.head(10))

# 9. Save processed data
# The path is relative, so it resolves against the current working directory.
output_path = Path('../data/processed/student_engagement.csv')
# to_csv does not create missing directories; make sure the target folder
# exists so a fresh checkout does not fail with OSError here.
output_path.parent.mkdir(parents=True, exist_ok=True)
# The user_id index is written as the first column (to_csv default).
student_engagement.to_csv(output_path)
print("\nProcessed data saved successfully!")