In [1]:

# call __init__ to set notebook seed
import set_notebook_env
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
import statsmodels.formula.api as smf
from scipy.stats import mannwhitneyu

from collections import defaultdict
import numpy as np

import set_notebook_env
# Build the data loader and plotting helpers used throughout this notebook.
# NOTE(review): min_ratings=0 / max_ratings=None presumably disable any
# rating-count filtering — confirm against set_notebook_env.set_env.
loader, plot_utils = set_notebook_env.set_env(
    data_dir="../data/",
    min_ratings=0,
    max_ratings=None,
    drop_inconsistent_gender=True,
)


Using seed: 18787288


In [2]:
# Run the loader's setup step before reading its cleaned frame
# (presumably this is what populates loader.cleaned_df — TODO confirm).
loader.initial_setup()
# Working frame for the analysis; the column listing displayed by this cell
# includes a 'gender_code' column, which later cells filter on.
cleaned_df = loader.add_gender_code(loader.cleaned_df)
# Bare last expression: show the available columns as the cell output.
cleaned_df.columns

Index(['avg_rating', 'avg_difficulty', 'num_ratings', 'pepper',
       'would_take_again_prop', 'num_online_ratings', 'male', 'female',
       'tough_grader', 'good_feedback', 'respected', 'lots_to_read',
       'participation_matters', 'no_skip', 'lots_of_hw', 'inspirational',
       'pop_quizzes', 'accessible', 'papers', 'clear_grading', 'hilarious',
       'test_heavy', 'few_things', 'amazing_lectures', 'caring',
       'extra_credit', 'group_projects', 'lecture_heavy', 'major',
       'university', 'state', 'gender_code'],
      dtype='object')

In [3]:
# Pull the rating and gender-code columns out as plain NumPy arrays
# (row order is preserved, so the two arrays stay aligned).
ratings, gender = (
    cleaned_df[column].to_numpy() for column in ("avg_rating", "gender_code")
)

In [4]:
import numpy as np

# -----------------------------
# Helpers
# -----------------------------
def iqr(x):
    """Return the interquartile range of ``x`` (75th minus 25th percentile).

    Parameters
    ----------
    x : array-like
        Sample values; converted with ``np.asarray``.

    Returns
    -------
    float
        ``Q3 - Q1`` as computed by ``np.quantile`` with its default
        interpolation.
    """
    values = np.asarray(x)
    # One vectorised call yields both quartiles.
    q1, q3 = np.quantile(values, [0.25, 0.75])
    return q3 - q1

def cohens_d(x, y):
    """Cohen's d: standardized mean difference ``mean(x) - mean(y)``.

    The difference in means is divided by the pooled standard deviation,
    built from the unbiased (``ddof=1``) variance of each sample.

    Parameters
    ----------
    x, y : array-like
        The two samples; converted with ``np.asarray``.

    Returns
    -------
    float
        Positive when ``x`` has the larger mean. The result is NaN/inf
        (with a NumPy warning) when the pooled variance is undefined or
        zero, e.g. a single-element sample or constant data.
    """
    a = np.asarray(x)
    b = np.asarray(y)

    # Degrees of freedom weight each sample's variance in the pool.
    dof_a = a.size - 1
    dof_b = b.size - 1
    pooled_var = (dof_a * np.var(a, ddof=1) + dof_b * np.var(b, ddof=1)) / (dof_a + dof_b)

    return (a.mean() - b.mean()) / np.sqrt(pooled_var)

def cliffs_delta(x, y):
    """Cliff's delta: δ = P(X > Y) - P(X < Y) over all (x_i, y_j) pairs.

    Pairs are counted by sorting ``y`` once and binary-searching every
    ``x`` into it, which costs O((nx + ny) log ny) time and O(nx + ny)
    memory. Building the full nx-by-ny difference matrix instead is
    quadratic in both, and becomes prohibitive when this function is called
    once per bootstrap replicate on large samples.

    Parameters
    ----------
    x, y : array-like of numbers
        The two samples; flattened to 1-D.

    Returns
    -------
    float
        Value in [-1, 1]; positive when ``x`` tends to exceed ``y``.
        Ties count toward neither side. A pair involving NaN also counts
        toward neither side but still contributes to the denominator
        ``x.size * y.size``. Returns NaN when either sample is empty.
    """
    x = np.asarray(x).ravel()
    y = np.asarray(y).ravel()

    n_pairs = x.size * y.size
    if n_pairs == 0:
        return np.nan

    # NaN is neither greater nor smaller than anything, so exclude NaNs from
    # the counts only (np.sort would otherwise place them last and make them
    # look like the largest values).
    x_valid = x[~np.isnan(x)]
    y_sorted = np.sort(y[~np.isnan(y)])

    # side="left"  -> number of y strictly below each x   (pairs with x > y)
    # side="right" -> number of y at or below each x; the remainder is
    #                 strictly above                        (pairs with x < y)
    n_pos = np.searchsorted(y_sorted, x_valid, side="left").sum()
    n_neg = (y_sorted.size - np.searchsorted(y_sorted, x_valid, side="right")).sum()

    return (n_pos - n_neg) / n_pairs

def ratio_stat(x, y, stat_fn):
    """Return ``stat_fn(x) / stat_fn(y)`` for a one-sample statistic.

    Parameters
    ----------
    x, y : array-like
        The two samples; each is converted with ``np.asarray`` before being
        handed to ``stat_fn``.
    stat_fn : callable
        Maps a single array to a number (e.g. a variance or an IQR).

    Returns
    -------
    The statistic of ``x`` divided by the statistic of ``y``.
    """
    sample_x, sample_y = np.asarray(x), np.asarray(y)
    numerator = stat_fn(sample_x)
    denominator = stat_fn(sample_y)
    return numerator / denominator

def bootstrap_ci_two_sample(x, y, stat_fn, B=2000, alpha=0.05, seed=0):
    """Percentile-bootstrap confidence interval for a two-sample statistic.

    Each group is resampled independently, with replacement, to its own
    original size; ``stat_fn`` is evaluated on every resampled pair and the
    interval is read off the empirical quantiles of those replicates.

    Parameters
    ----------
    x, y : array-like
        The two samples (e.g. male / female ratings).
    stat_fn : callable
        ``stat_fn(x_sample, y_sample) -> number``.
    B : int, default 2000
        Number of bootstrap replicates.
    alpha : float, default 0.05
        The interval covers the central ``1 - alpha`` of the replicates.
    seed : int, default 0
        Seed for ``np.random.default_rng``, making the result reproducible.

    Returns
    -------
    (point_estimate, (ci_low, ci_high))
        ``point_estimate`` is ``stat_fn`` on the original samples.
    """
    sample_x = np.asarray(x)
    sample_y = np.asarray(y)
    generator = np.random.default_rng(seed)

    estimate = stat_fn(sample_x, sample_y)

    # Arguments are evaluated left to right, so x is always drawn before y
    # within each replicate.
    replicates = np.array(
        [
            stat_fn(
                generator.choice(sample_x, size=sample_x.size, replace=True),
                generator.choice(sample_y, size=sample_y.size, replace=True),
            )
            for _ in range(B)
        ],
        dtype=float,
    )

    lower, upper = np.quantile(replicates, [alpha / 2, 1 - alpha / 2])
    return estimate, (lower, upper)


In [5]:
# Split the non-missing average ratings by gender code; this notebook treats
# gender_code 0 as male and 1 as female.
male, female = (
    cleaned_df.loc[cleaned_df["gender_code"] == code, "avg_rating"].dropna().to_numpy()
    for code in (0, 1)
)

In [6]:

# -----------------------------
# 1) Cohen's d (mean effect)
# -----------------------------
# cohens_d already has the (x, y) signature the bootstrap expects, so it is
# passed as the statistic directly. Sign convention: male minus female.
d_point, d_ci = bootstrap_ci_two_sample(male, female, stat_fn=cohens_d, B=2000, seed=42)


In [None]:

# -----------------------------
# 2) Cliff's delta (rank/location shift)
# -----------------------------
# cliffs_delta already has the (x, y) signature the bootstrap expects, so it
# is passed as the statistic directly. Positive values mean male ratings tend
# to exceed female ratings.
delta_point, delta_ci = bootstrap_ci_two_sample(
    male, female, stat_fn=cliffs_delta, B=2000, seed=42
)


In [None]:
# -----------------------------
# 3) Variance ratio (spread)
# -----------------------------
# Ratio of unbiased (ddof=1) sample variances, Var(male) / Var(female);
# a value of 1 means equal spread.
var_ratio_point, var_ratio_ci = bootstrap_ci_two_sample(
    male,
    female,
    stat_fn=lambda m, f: ratio_stat(m, f, stat_fn=lambda v: np.var(v, ddof=1)),
    B=2000,
    seed=42,
)

In [None]:

# -----------------------------
# 4) IQR ratio (robust spread)
# -----------------------------
# Ratio of interquartile ranges, IQR(male) / IQR(female); a spread measure
# that is less sensitive to extreme ratings than the variance ratio.
iqr_ratio_point, iqr_ratio_ci = bootstrap_ci_two_sample(
    male,
    female,
    stat_fn=lambda m, f: ratio_stat(m, f, iqr),
    B=2000,
    seed=42,
)


In [None]:
# One summary line per effect size: label, point estimate, then its 95% CI.
print(
    *(
        " ".join(map(str, (label, estimate, "95% CI:", interval)))
        for label, estimate, interval in (
            ("Cohen's d (male - female):", d_point, d_ci),
            ("Cliff's delta (male vs female):", delta_point, delta_ci),
            ("Variance ratio Var(male)/Var(female):", var_ratio_point, var_ratio_ci),
            ("IQR ratio IQR(male)/IQR(female):", iqr_ratio_point, iqr_ratio_ci),
        )
    ),
    sep="\n",
)
