#### Phase 1: Data Import and Preparation

In [None]:
# Import necessary libraries and packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Open the workbook so its sheet names can be inspected
file_path = "../data/raw/df.xlsx"
data = pd.ExcelFile(file_path)

# Skip the two metadata sheets by name; every other sheet is treated as a course
metadata_sheets = ('List of Courses', 'Data Points Defined')
course_list = [name for name in data.sheet_names if name not in metadata_sheets]

# Print a numbered overview of the course sheets that were found
print(f"There are {len(course_list)} courses in this excel file, which are:")
for position, course in enumerate(course_list, start=1):
    print(f"{position}. {course}")

#### Phase 2: Combine all courses (sheets) into one sheet

In [None]:
# A function that takes an Excel file and a list of sheet names and combines those sheets into one DataFrame

def combine_course_data(file_path, course_list):
    """Read the given sheets of an Excel workbook and stack them into one DataFrame.

    For every sheet, three columns are added before stacking:

    * ``course_identifier`` - the full sheet name.
    * ``course_code`` - the first two space-separated words of the sheet name
      (the whole sheet name when it has fewer than three words).
    * ``term`` - the remaining words joined by spaces (``'unknown'`` when the
      sheet name has fewer than three words).

    Column names are lower-cased and spaces/hyphens are replaced by underscores.

    Args:
        file_path: Path (or anything ``pd.read_excel`` accepts) of the workbook.
        course_list: Iterable of sheet names to load.

    Returns:
        The row-wise concatenation of all sheets that could be loaded, with a
        fresh integer index. An empty DataFrame is returned when no sheet could
        be loaded (including an empty ``course_list``).
    """
    all_courses = []

    for sheet in course_list:
        try:
            df = pd.read_excel(file_path, sheet_name=sheet)

            df['course_identifier'] = sheet

            parts = sheet.split(' ')
            if len(parts) >= 3:
                df['course_code'] = ' '.join(parts[:2])
                df['term'] = ' '.join(parts[2:])
            else:
                df['course_code'] = sheet
                df['term'] = 'unknown'

            df.columns = (
                df.columns.str.lower()
                .str.replace(' ', '_')
                .str.replace('-', '_')
            )

            all_courses.append(df)

        except Exception as e:
            # Best effort: report the failing sheet and carry on with the rest.
            print(f"Error happening {sheet}: {e}")

    # pd.concat raises ValueError on an empty list, so handle that case here.
    if not all_courses:
        return pd.DataFrame()

    return pd.concat(all_courses, ignore_index=True, sort=False)

In [None]:
# Build the combined dataset from the course sheets of the workbook
new_data = combine_course_data(file_path, course_list)
print(f"New dataset structure: {new_data.shape}") # (no. of rows, no. of columns)

# Print a numbered list of the combined dataset's column names
print("\nList of new dataset columns:")
for position, col in enumerate(new_data.columns, start=1):
    print(f"{position}. {col}")
                

#### Phase 3: Data Quality Assessment

- Look for negative and missing values, and detect outliers by checking each column's range (min, max).

#### Phase 4: Analysis

1. Which types of content (video) had the most/least total views? If there is no media type column, classify the videos based on session name patterns instead.


In [None]:
# Question 1: total minutes by media type

def analyze_minutes_by_media_type(df):
    """Summarise delivery and viewing metrics for each media type.

    If ``df`` has no ``media_type`` column, a notice is printed and the column
    is added to ``df`` in place by applying ``classify_media_type`` to the
    ``session`` column.

    Args:
        df: DataFrame with the columns ``minutes_delivered``,
            ``views_and_downloads``, ``unique_viewers`` and
            ``video_duration_minutes`` (plus ``media_type`` or ``session``).

    Returns:
        A DataFrame indexed by media type whose columns are named
        ``<metric>_<statistic>`` (values rounded to 2 decimals), with an extra
        ``engagement_ratio`` column, sorted by ``minutes_delivered_sum`` in
        descending order.
    """
    has_media_type = 'media_type' in df.columns
    if not has_media_type:
        print("Media Type column is not found. Replacing it with session names.")
        # Derive the media type from the session name when it is not provided
        df['media_type'] = df['session'].apply(classify_media_type)

    # Statistics to compute for each metric, per media type
    stats_per_metric = {
        'minutes_delivered': ['sum', 'mean', 'count'],
        'views_and_downloads': ['sum', 'mean'],
        'unique_viewers': ['sum', 'mean'],
        'video_duration_minutes': ['sum', 'mean'],
    }
    summary = df.groupby('media_type').agg(stats_per_metric).round(2)

    # Collapse the two-level (metric, statistic) header into single names
    summary.columns = [f"{metric}_{stat}" for metric, stat in summary.columns]

    # Engagement ratio: total minutes delivered relative to total video duration
    delivered_total = summary['minutes_delivered_sum']
    duration_total = summary['video_duration_minutes_sum']
    summary['engagement_ratio'] = (delivered_total / duration_total).round(2)

    # Media types with the most delivered minutes come first
    return summary.sort_values(by='minutes_delivered_sum', ascending=False)

# Ordered (keywords, label) rules: the first rule with a matching keyword wins.
_MEDIA_TYPE_RULES = (
    (('content',), 'Content Video'),
    (('interview',), 'Interview'),
    (('mod_', 'module'), 'Module Content'),
    (('record_',), 'Recordings'),
)


def classify_media_type(session_name):
    """Map a session name to a media type label using keyword matching.

    Matching is case-insensitive and the rules are checked in order, so a name
    containing both ``content`` and ``interview`` is a ``'Content Video'``.

    Args:
        session_name: The session name. Non-string values (for example the
            NaN/None that a blank cell produces) are classified as ``'Other'``
            instead of raising ``AttributeError``.

    Returns:
        One of ``'Content Video'``, ``'Interview'``, ``'Module Content'``,
        ``'Recordings'`` or ``'Other'``.
    """
    if not isinstance(session_name, str):
        return 'Other'

    lowered = session_name.lower()
    for keywords, label in _MEDIA_TYPE_RULES:
        if any(keyword in lowered for keyword in keywords):
            return label
    return 'Other'

In [None]:
# Run the media type analysis on the combined dataset.
# The bare `results` expression on the last line is what makes the notebook
# display the summary table as the cell output.
results = analyze_minutes_by_media_type(new_data)
results