Jupyter Notebook: Panopto Course Analytics



1. Setup and Import Libraries

In [None]:
# Import required libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

2. Data Import: The Panopto export is an Excel workbook with multiple sheets: two metadata sheets (course list and data definitions) followed by one data sheet per course.

In [None]:
# Read every sheet of the Panopto export in a single pass.
# sheet_name=None makes read_excel return a dict that maps sheet name -> DataFrame.

data_file = '../data/raw/df.xlsx'
data = pd.read_excel(data_file, sheet_name=None)

# Show which sheets the workbook contains
print(data.keys())

3. Course (sheet) Selection and Initial Data Exploration

Course sheets start at index 2 (the third sheet). We then explore the course name, the number of rows and columns in the course dataset, and the column (variable) names.

In [None]:
# Select a specific course (sheet) for analysis

# The first two sheets hold metadata (course list and data definitions),
# so course data starts at position 2, i.e. the third sheet.
FIRST_COURSE_SHEET_INDEX = 2

sheet_names = list(data.keys())
if len(sheet_names) <= FIRST_COURSE_SHEET_INDEX:
    raise IndexError(
        f"Expected at least {FIRST_COURSE_SHEET_INDEX + 1} sheets in '{data_file}', "
        f"found {len(sheet_names)}: {sheet_names}"
    )

course_name = sheet_names[FIRST_COURSE_SHEET_INDEX]

# Every sheet was already parsed into `data` when the workbook was loaded, so reuse
# that frame instead of reading the Excel file a second time. The copy keeps `df`
# independent of the frame stored in `data`.
df = data[course_name].copy()

print(f"Course name: '{course_name}'\n")

print(f"Dataset structure: {df.shape}\n") # structure: (number of rows, number of columns)

print(f"Column names in '{course_name}' course:")
for position, column in enumerate(df.columns, start=1):
    print(f"{position}. {column}")

4. Cleaning Stage: Standardize Column Names

- Rationale: standardized column names are easier to work with programmatically and follow Python naming conventions.

In [None]:
# A function to standardize column names across different course datasets

def standardize_column_names(df):
    """Return a copy of ``df`` whose column labels are normalized to snake_case.

    Each label is converted to a string and then, in order:
      1. stripped of leading/trailing whitespace,
      2. lowercased,
      3. has every run of whitespace and/or hyphens replaced by one underscore,
      4. has every remaining character that is not a letter, digit or underscore removed,
      5. has repeated underscores collapsed and leading/trailing underscores trimmed.

    Parameters
    ----------
    df : pandas.DataFrame
        Course dataset with the column labels as exported.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with normalized column labels; ``df`` itself is not modified.

    Notes
    -----
    Distinct labels can normalize to the same name (e.g. "Media Type" and
    "media-type"); duplicates are not resolved here.
    """
    # Work on a copy so the caller's frame keeps its original labels
    df_clean = df.copy()

    df_clean.columns = (df_clean.columns
                        .astype(str)                               # tolerate non-string labels
                        .str.strip()                               # avoid leading/trailing '_' from padding
                        .str.lower()
                        .str.replace(r'[\s\-]+', '_', regex=True)  # separators -> single underscore
                        .str.replace(r'\W', '', regex=True)        # drop special characters
                        .str.replace(r'_+', '_', regex=True)       # collapse underscores left by removals
                        .str.strip('_')
                        )

    return df_clean

# Apply the column-name standardization to the selected course dataset
df = standardize_column_names(df)

# List the resulting column names, numbered from 1
print(f"New column names in '{course_name}' dataset:")
for position, column in enumerate(df.columns, start=1):
    print(f"{position}. {column}")

5. Data Type Inspection and Conversion

- what info() does
- explain what each column represents 

In [None]:
# Identify data types and basic statistics
# df.info() writes its report straight to the cell output, so it is called as a
# plain statement after the heading instead of being wrapped in print().

print(f"'{course_name}' course basic information:\n")
df.info()

In [None]:
# Print a heading, then leave df.describe() as the cell's last expression so the
# notebook renders the summary statistics as the cell output.
print(f"\nA summary of numerical columns in '{course_name}' course\n")
df.describe()

6. Handling Missing Values

- rationale for removing missing values 

In [None]:
# Flag, per column, whether at least one value is missing (True = column has missing values)
print(f"Missing values in each column:\n\n{df.isna().any()}")

In [None]:
# If the previous cell reported missing values, drop every row that has a
# missing value in any column
df = df.dropna(axis=0, how="any")

7. Data Quality 

- understanding the distribution of data 

In [None]:
# Data quality checks: number of distinct values and frequency table
# for each categorical column

print("Unique values in categorical columns:")

categorical_columns = ['media_type', 'creator']

for position, column in enumerate(categorical_columns, start=1):
    # A column absent from this course's dataset is skipped (its number is still used up)
    if column not in df.columns:
        continue

    distinct_count = df[column].nunique()
    print(f"\n{position}. '{column}' column: {distinct_count} unique values")
    print(df[column].value_counts())