Cluster students by skill level.

Inputs:

- gradebook data (formative assessments)
- assignment weight (100 - median grade)

Outputs:

- plot students in latent space
- kmeans clusters

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.mixture import GaussianMixture

%matplotlib inline

In [None]:
# 1. Simulate Data

def simulate_grades(mu, sigma, n_students):
    """Sample ``n_students`` grades from a normal distribution, by rejection.

    Each grade is drawn with ``np.random.normal(mu, sigma)``; a draw that is
    below 0 or above 100 is discarded and redrawn until one is accepted.

    Args:
        mu: mean passed to ``np.random.normal``.
        sigma: standard deviation passed to ``np.random.normal``.
        n_students: number of grades to produce.

    Returns:
        A list with ``n_students`` accepted draws, in the order drawn.
    """
    sampled = []
    for _ in range(n_students):
        while True:
            candidate = np.random.normal(mu, sigma)
            # Accept unless the draw is strictly outside the 0-100 scale.
            if not (candidate < 0 or candidate > 100):
                break
        sampled.append(candidate)
    return sampled


def simulate_gradebook(n_students=20):
    """Build a toy gradebook with three quizzes and a mean-based final grade.

    Quiz scores come from a fixed ten-student template: three students who
    score high only on Quiz 2, three who score low only on Quiz 2, and four
    with mid-range scores on every quiz. The template is repeated (or
    truncated) to ``n_students`` rows, so any class size works; with
    ``n_students=10`` the scores are exactly the template.

    Args:
        n_students: number of rows (students) to generate.

    Returns:
        A DataFrame with columns 'Student ID' (distinct random integers below
        100000), 'Quiz 1', 'Quiz 2', 'Quiz 3' and 'Final Grade' (row-wise mean
        of the three quizzes).

    Raises:
        ValueError: if ``n_students`` exceeds 100000, because IDs are drawn
            without replacement.
    """
    quiz_template = {
        'Quiz 1': [11, 12, 10, 90, 91, 92, 50, 55, 41, 38],
        'Quiz 2': [97, 95, 89, 19, 9, 11, 50, 45, 40, 30],
        'Quiz 3': [5, 25, 8, 80, 75, 90, 50, 65, 43, 40],
    }

    columns = {'Student ID': np.random.choice(100000, n_students, replace=False)}
    for quiz, scores in quiz_template.items():
        # np.resize cycles through the template when n_students > 10 and cuts
        # it short when n_students < 10, keeping every column the same length
        # as the ID column.
        columns[quiz] = np.resize(scores, n_students)

    gb = pd.DataFrame(columns)
    gb['Final Grade'] = gb[list(quiz_template)].mean(axis='columns')
    return gb

gb = simulate_gradebook(10)
display(gb.head())

# Overlay one density curve per score column on a shared 0-100 axis.
# sns.kdeplot replaces the deprecated sns.distplot(..., hist=False); it draws
# the same kind of KDE curve and is available in both old and new seaborn.
for col in ['Quiz 1', 'Quiz 2', 'Quiz 3', 'Final Grade']:
    sns.kdeplot(gb[col], label=col)
plt.legend()
plt.xlim([0, 100])
plt.show()

In [None]:
# 2. Cluster
# Fit k-means on the three quiz columns and store each student's cluster label.
n_clusters = 4
quiz_scores = gb[['Quiz 1', 'Quiz 2', 'Quiz 3']]
kmeans = KMeans(n_clusters)
kmeans.fit(quiz_scores)
gb['kmeans'] = kmeans.labels_

# Heatmap of the cluster centres: one row per cluster, one column per quiz.
centers = kmeans.cluster_centers_
palette = sns.color_palette('RdYlGn_r')
sns.heatmap(centers, cmap=palette)
plt.xlabel('Quiz')
plt.ylabel('kmeans')

In [None]:
# Fit a Gaussian mixture on the quiz columns (100 initialisations) and store
# the predicted component of each student in the 'gaussmix' column.
n_clusters = 3
features = gb[['Quiz 1', 'Quiz 2', 'Quiz 3']]
gm = GaussianMixture(n_clusters, n_init=100)
gm.fit(features)
gb['gaussmix'] = gm.predict(features)

In [None]:
# 3. Visualize Clusters

# Take the cluster ids from the labels that are actually present rather than
# from the global n_clusters: that global is set to 4 by the k-means cell and
# to 3 by the mixture cell, so it can be stale when cells run out of order.
cluster_ids = sorted(gb['gaussmix'].unique())

# One figure per score column, with one histogram per mixture cluster.
for col in ['Quiz 1', 'Quiz 2', 'Quiz 3', 'Final Grade']:
    for cluster_id in cluster_ids:
        scores = gb.loc[gb['gaussmix'] == cluster_id, col]
        # plt.hist replaces the deprecated sns.distplot(..., kde=False);
        # alpha keeps overlapping clusters visible.
        plt.hist(scores, alpha=0.4, label=f'Cluster {cluster_id}')
    plt.xlabel(col)
    plt.legend()
    plt.xlim([0, 100])
    plt.show()

In [None]:
# Project the quiz scores to 2-D with PCA (unsupervised) and LDA (supervised
# by the mixture labels), then scatter each projection coloured by cluster.
X = gb[['Quiz 1', 'Quiz 2', 'Quiz 3']]
y = gb['gaussmix']
target_names = [0, 1, 2]

pca = PCA(n_components=2)
pca.fit(X)
X_r = pca.transform(X)

lda = LDA(n_components=2)
lda.fit(X, y)
X_r2 = lda.transform(X)

# Share of total variance captured by each of the two principal components.
variance_text = str(pca.explained_variance_ratio_)
print('explained variance ratio (first two components): %s' % variance_text)

colors = ['navy', 'turquoise', 'darkorange']
lw = 2

# PCA projection: one scatter series per cluster label.
plt.figure()
for i, (color, target_name) in enumerate(zip(colors, target_names)):
    in_cluster = y == i
    plt.scatter(
        X_r[in_cluster, 0],
        X_r[in_cluster, 1],
        color=color,
        alpha=.8,
        lw=lw,
        label=target_name,
    )
plt.legend(loc='best', shadow=False, scatterpoints=1)
plt.title('PCA')

# LDA projection: same colouring, default marker edge width.
plt.figure()
for i, (color, target_name) in enumerate(zip(colors, target_names)):
    in_cluster = y == i
    plt.scatter(
        X_r2[in_cluster, 0],
        X_r2[in_cluster, 1],
        alpha=.8,
        color=color,
        label=target_name,
    )
plt.legend(loc='best', shadow=False, scatterpoints=1)
plt.title('LDA')

plt.show()

In [None]:
# 4. Explain Clusters
# average scores for each assignment

# 5. Recommend Interventions
# which assignment does each group need to focus on?