# Setup for Supplemental Analysis

In [None]:
import pandas as pd
import numpy as np

import scipy.stats as stats

import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
import matplotlib.patches as mpatches
from matplotlib.transforms import Bbox
from matplotlib_venn import venn2, venn3

import seaborn as sns

from IPython.display import display, Markdown

%matplotlib inline

In [None]:
# Create the output directory used by every savefig/to_csv/to_html call below.
# exist_ok=True replaces the check-then-create pattern, so the cell is safe to
# re-run and cannot fail if the directory appears between check and creation.
import os
os.makedirs('plots', exist_ok=True)

In [None]:
# Each record-level table receives a constant column `one` (1.0 per row) so
# that rows can later be counted by summing it in group-bys and pivot tables.

# Enrollment records
enr = pd.read_csv('data/enrollments_pub.csv', header=0).assign(one=1.)
enr_adv = pd.read_csv('data/enrollments_adv_pub.csv', header=0).assign(one=1.)

# Pre-course surveys; rows that are entirely empty are dropped from the main file
pre = (pd.read_csv('data/pre_survey_pub.csv', header=0)
       .dropna(axis=0, how='all')
       .assign(one=1.))
pre_gender = pd.read_csv('data/pre_survey_gender_pub.csv', header=0).assign(one=1.)

# Post-course surveys; rows that are entirely empty are dropped from the main file
pst = (pd.read_csv('data/pst_survey_pub.csv', header=0)
       .dropna(axis=0, how='all')
       .assign(one=1.))
pst_demo = pd.read_csv('data/pst_survey_demo_pub.csv', header=0).assign(one=1.)

# Question metadata for the pre- and post-surveys, indexed by the first CSV column
pre_q = pd.read_csv('data/pre_survey_question_info.csv', header=0, index_col=0)
pst_q = pd.read_csv('data/pst_survey_question_info.csv', header=0, index_col=0)

# Country counts of pre-survey respondents, indexed by the first CSV column
precountry = pd.read_csv('data/pre_survey_country_counts.csv', header=0, index_col=0)

In [None]:
# Preview the first rows of the pre-survey country-count table as a sanity check.
precountry.head()

In [None]:
# Per-enrollee activity totals, built from the per-item activity columns.
quiz_item_cols = ['activity_q{:d}'.format(k) for k in range(1, 7)]
pga_item_cols = ['activity_pga{:d}'.format(k) for k in range(1, 4)]
vid_week_item_cols = ['activity_vid_week_{:d}'.format(k) for k in range(1, 7)]

enr['activity_quiz_count'] = enr[quiz_item_cols].sum(axis=1)
enr['activity_pga_count'] = enr[pga_item_cols].sum(axis=1)

# Number of weeks whose weekly video value is strictly greater than 1
watched_in_week = enr[vid_week_item_cols] > 1
enr['activity_vid_week_count'] = watched_in_week.sum(axis=1)

In [None]:
# An auditor is flagged as a learner but not as having completed the course.
for frame in (enr, enr_adv):
    frame['activity_auditor'] = frame['activity_learner'] & (~frame['activity_completed'])

# Three-way label per enrollee. The assignments run in this order, so the
# auditor label is written last.
is_completer = enr['activity_completed'] == 1
is_auditor = enr['activity_auditor'] == 1

enr['learner_category'] = 'Non-learner'
enr.loc[is_completer, 'learner_category'] = 'Learner: completer'
enr.loc[is_auditor, 'learner_category'] = 'Learner: auditor'

In [None]:
# Intend to pursue academic job: True when any comma-separated entry of the
# QID363 answer is one of the job categories below.
job_categories = {'Community college',
                  'Research intensive university faculty position',
                  'Undergraduate-focused college or university'}

pre['academic_job'] = pre['QID363'].apply(
    lambda answer: not job_categories.isdisjoint(str(answer).split(',')))

# Intent to be active: the QID366 answer mentions completing most quizzes
# or completing the peer assessments.
mentions_quizzes = pre['QID366'].apply(lambda answer: 'complete most of the quizzes' in str(answer))
mentions_pgas = pre['QID366'].apply(lambda answer: 'complete the peer assessments' in str(answer))
pre['intent_active'] = mentions_quizzes | mentions_pgas

In [None]:
# Coursera courses (linked survey and completion stats)
coursera_courses = ['INTRO Fall 2014', 'INTRO Fall 2015']

pre_c = pre[pre['course_name'].isin(coursera_courses)]
pst_c = pst[pst['course_name'].isin(coursera_courses)]
enr_c = enr[enr['course_name'].isin(coursera_courses)]

# Row counts for all offerings versus the Coursera offerings only
n_enr, n_pre, n_pst = len(enr), len(pre), len(pst)

print('           Enrollments  Pre-surveys  Post-surveys')
print('All INTRO:   {:9d}    {:9d}     {:9d}'.format(n_enr, n_pre, n_pst))
print('COURSERA:    {:9d}    {:9d}     {:9d}'.format(len(enr_c), len(pre_c), len(pst_c)))

# Survey response rates relative to all enrollments
print('\nINTRO: {:5.1f}% of students complete pre-survey'.format(100.*n_pre / n_enr))
print('INTRO: {:5.1f}% of students complete post-survey'.format(100.*n_pst / n_enr))

In [None]:
# all Coursera
# One row per Coursera enrollment, left-joined with that user's pre- and
# post-survey answers. Columns present in both surveys get the suffixes
# _x (pre-survey) and _y (post-survey) from the second merge.
coursera = enr_c.merge(pre_c, how="left", on="unique_user_id")
coursera = coursera.merge(pst_c, how='left', on='unique_user_id')

# 0/1 flags: did this user submit a pre-/post-survey?
# isin() is a hashed membership test; the previous per-row `x in array`
# scanned the whole id array for every enrollment.
coursera['pre'] = coursera['unique_user_id'].isin(pre_c['unique_user_id']).astype(int)
coursera['post'] = coursera['unique_user_id'].isin(pst_c['unique_user_id']).astype(int)

# Prefer the pre-survey answer and fall back to the post-survey answer
coursera['CIRTL'] = coursera.QID355_p_x.combine_first(coursera.QID355_p_y)
coursera['Role'] = coursera.QID334_p_x.combine_first(coursera.QID334_p_y)

## Survey Participant Demographics

In [None]:
def reduce_categories(x, n):
    """Reduce number of categories by collapsing least common into "Other".

    Parameters
    ----------
    x : pandas.Series
        Categorical values; missing values are allowed.
    n : int
        Number of most frequent categories to keep unchanged.

    Returns
    -------
    pandas.Series
        Object-dtype Series on the same index as `x`: the `n` most frequent
        values are kept, every other non-missing value becomes 'Other', and
        missing values stay missing.
    """
    cats = x.value_counts().sort_values(ascending=False).iloc[:n].index.tolist()
    keep = x.isin(cats)

    # Explicit object dtype: the default dtype of an empty Series differs
    # between pandas versions, and this Series holds strings.
    y = pd.Series(index=x.index.copy(), dtype=object)
    y.loc[keep] = x.loc[keep]
    y.loc[~keep & x.notna()] = 'Other'

    return y

### Pre Survey/Enrolled

In [None]:
# Overview table: one column per course, one row per headline count.
# The needed columns are selected before summing so the aggregation does not
# touch unrelated (possibly non-numeric) columns.
enrolled_rows = (enr.groupby('course_name')[['one', 'activity_learner']].sum()
                 .transpose().rename(index={'one': 'Total enrolled', 'activity_learner': 'Learners'}))
pre_rows = (pre.groupby('course_name')[['one']].sum()
            .transpose().rename(index={'one': 'Pre-survey respondents'}))
pst_rows = (pst.groupby('course_name')[['one']].sum()
            .transpose().rename(index={'one': 'Post-survey respondents'}))

t = pd.concat([enrolled_rows, pre_rows, pst_rows], axis=0).astype(int)

# Extra rows: pre-survey response rate and number of countries with at least
# one respondent. pd.concat replaces DataFrame.append (removed in pandas 2.0).
response_rate = (t.loc['Pre-survey respondents'] / t.loc['Total enrolled'])\
    .rename("Pre-survey respondents (% of enrolled)")
n_countries = (precountry > 0).sum().rename('Countries')

t = pd.concat([t,
               response_rate.to_frame().transpose(),
               n_countries.to_frame().transpose()], axis=0)
t['Average'] = t.mean(axis=1)

# Values below 1 are rendered as percentages, everything else as integers
display(t.style.format(lambda x: "{:.0%}".format(x) if float(x) < 1 else int(x)))


t.to_html('plots/overview_table.html', 
              float_format=lambda x: "{:.0%}".format(x) if float(x) < 1 else "{}".format(int(x)), 
                          na_rep="")

In [None]:
# Remove the average column added to the overview table. errors='ignore'
# keeps this cell re-runnable once the column is already gone.
t = t.drop(columns=['Average'], errors='ignore')

# Denominator for every share below: pre-survey respondents per course
n_pre = t.loc['Pre-survey respondents']


def _pre_counts(data, index):
    'Respondent counts per course (columns) for each level of `index` (rows)'
    return pd.pivot_table(data=data, columns='course_name', aggfunc='sum', values='one',
                          index=index)


pre_t = []
pre_t_names = []

# CIRTL Inst: QID355 (rows put in reverse of their sorted order)
pre_t_names.append('CIRTL')
pre_t.append(_pre_counts(pre, 'QID355_p').iloc[[1, 0]] / n_pre)

pre_t_names.append('Role')
pre_t.append(_pre_counts(pre, 'QID334_p')
             .loc[['Doctoral student', 'Post-doctoral researcher', 'Faculty', 'Other']]
             / n_pre)

# Gender comes from its own file but is still divided by the pre-survey counts
pre_t_names.append('Gender')
pre_t.append(_pre_counts(pre_gender, 'QID376_p').loc[['Female', 'Male']] / n_pre)

pre_t_names.append('Academic Job')
pre_t.append(_pre_counts(pre, 'academic_job').iloc[[1, 0]] / n_pre)

pre_t_names.append('MCLC')
pre_t.append(_pre_counts(pre, 'QID367').loc[['Yes', 'No', "I'm not sure"]] / n_pre)

# STEM/SBE field: only the row for respondents whose field is in the list
stemsbe = ['Biological Sciences', 'Physical Sciences', 'Engineering',
           'Social Sciences', 'Mathematical Sciences',
           'Earth, Atmospheric and Oceanic Sciences', 'Computer Science',
           'Learning Sciences', 'Agricultural Sciences']
pre_t_names.append('STEM/SBE')
pre_t.append(_pre_counts(pre, pre['QID356'].isin(stemsbe)).iloc[[1]] / n_pre)

# Top 3 fields, with every other field collapsed into 'Other'
pre_t_names.append('Field')
pre_t.append(_pre_counts(pre, reduce_categories(pre['QID356'], 3))
             .loc[['Biological Sciences', 'Physical Sciences', 'Engineering', 'Other']]
             / n_pre)

In [None]:
# Stack the per-topic share tables under their topic names and add the mean
# across courses as an extra column.
sum_tab_pre = pd.concat(pre_t, axis=0, keys=pre_t_names)
sum_tab_pre['avg'] = sum_tab_pre.mean(axis=1)

display(sum_tab_pre.style.format('{:.1%}'))


def _pre_table_cell(value):
    'Format values below 1 as percentages and anything else as an integer'
    if float(value) < 1:
        return '{:.1%}'.format(value)
    return "{}".format(int(value))


sum_tab_pre.to_html('plots/pre_table.html', float_format=_pre_table_cell, na_rep="")

## Post-survey responses

In [None]:
# Share of post-survey respondents per course for each demographic question.
n_pst = t.loc['Post-survey respondents']


def _pst_counts(data, index):
    'Respondent counts per course (columns) for each level of `index` (rows)'
    return pd.pivot_table(data=data, columns='course_name', aggfunc='sum', values='one',
                          index=index)


# 'Participated in an MCLC?' -> collapse "yes, at my CIRTL institution" and "yes, at..."
mclc_answer = pst['QID389_p'].replace('yes.*', 'yes', regex=True)

pst_t = [
    # CIRTL institution (rows put in reverse of their sorted order)
    _pst_counts(pst, 'QID355_p').iloc[[1, 0]] / n_pst,
    # Role
    _pst_counts(pst, 'QID334_p')
    .loc[['Doctoral student', 'Post-doctoral researcher', 'Faculty', 'Other']] / n_pst,
    # Gender
    _pst_counts(pst_demo, 'QID376_p').loc[['Female', 'Male']] / n_pst,
    # Ethnicity
    _pst_counts(pst_demo, 'QID400_p')
    .loc[['Caucasian (non-Hispanic)', 'Asian/Pacific Islander', 'Other/Multiple']] / n_pst,
    # MCLC participation
    _pst_counts(pst, mclc_answer).loc[['yes', 'no']] / n_pst,
    # QID399_p (labelled 'Citizenship' when the tables are combined)
    _pst_counts(pst_demo, pst_demo['QID399_p']).iloc[[1, 0]] / n_pst,
]

In [None]:
# Stack the post-survey share tables under their topic names and add the mean
# across courses as an extra column.
pst_t_names = ['CIRTL', 'Role', 'Gender', 'Ethnicity', 'MCLC', 'Citizenship']
sum_tab_pst = pd.concat(pst_t, axis=0, keys=pst_t_names)
sum_tab_pst['avg'] = sum_tab_pst.mean(axis=1)


def _pct_or_blank(value):
    'Percentage with one decimal, or an empty string for missing values'
    return "" if pd.isnull(value) else "{:.1%}".format(value)


display(sum_tab_pst.style.format(_pct_or_blank))

In [None]:
# Export the post-survey table as HTML. Work on a copy so sum_tab_pst keeps
# its two-level index; droplevel() removes the outer (topic) level.
pst_out = sum_tab_pst.copy()
pst_out.index = pst_out.index.droplevel()
# Shares are scaled to 0-100 and written with one decimal and a percent sign.
(100*pst_out).to_html('plots/post_table.html', float_format="%.1f%%", na_rep="")

## Course Activity

### Any Activity

In [None]:
# Boolean mask over enrollments: at least one quiz, PGA or video activity.
did_quiz = enr['activity_quiz_count'] > 0
did_pga = enr['activity_pga_count'] > 0
did_video = enr['activity_vid_count'] > 0
activity_any = did_quiz | did_pga | did_video

pct_active = 100.*activity_any.sum() / len(enr)
print('{:.1f}% of those enrolled do at least 1 (quiz|video|pga)'.format(pct_active))

### Quizzes

In [None]:
q_cols = ['activity_q{:d}'.format(i) for i in range(1, 7)]

fig, ax = plt.subplots(ncols=2, figsize=(8, 3))

# Left panel: share of active participants by number of quizzes completed
quiz_count_share = (enr[activity_any]['activity_quiz_count'].value_counts().sort_index()
                    / activity_any.sum())
(1.*quiz_count_share).plot(kind='bar', ax=ax[0], color='C0', rot=0)

ax[0].yaxis.set_major_formatter(FuncFormatter(lambda value, _: '{:.0%}'.format(value)))
ax[0].set_xlabel('Number of quizzes completed')
ax[0].set_ylabel('Percent of participants')
ax[0].set_ylim(0,.6)

# Right panel: column sums of the per-quiz activity columns, labelled by the
# last character of the column name (the quiz number)
attempters_per_quiz = enr[q_cols].sum().rename(index=lambda col: col[-1])
attempters_per_quiz.plot(kind='bar', color='C0', ax=ax[1], rot=0)

ax[1].set_xlabel('Quiz number')
ax[1].set_ylabel('Number of attempters')
ax[1].set_ylim(0,3500)

# Shared look: white background, no top/right/left spines, light horizontal
# grid lines drawn behind the bars, and no tick marks.
for panel in ax:
    panel.set_facecolor("white")
    for side in ("top", "right", "left"):
        panel.spines[side].set_visible(False)
    panel.grid(False)
    panel.set_axisbelow(True)
    panel.grid(which="major", axis="y", color="#cccccc")
    panel.xaxis.set_ticks_position('none')
    panel.yaxis.set_ticks_position('none')

plt.tight_layout(w_pad=4);
plt.savefig("plots/Fig-S1-quizzes.png")
plt.savefig("plots/Fig-S1-quizzes.svg")

In [None]:
# Write the data behind both quiz panels to CSV.
quiz_count_share = (enr[activity_any]['activity_quiz_count'].value_counts().sort_index()
                    / activity_any.sum())
(1.*quiz_count_share).to_csv('plots/num_quizzes.csv')

attempters_per_quiz = enr[q_cols].sum().rename(index=lambda col: col[-1])
attempters_per_quiz.to_csv('plots/quiz_takers.csv')

In [None]:
# Among enrollees with at least two quizzes, flag those who did all six.
two_plus_quizzes = enr.loc[enr['activity_quiz_count'] >= 2, 'activity_quiz_count']
x = two_plus_quizzes == 6

share_all_six = 1.*x.sum() / len(x)
print('Of those who complete >=2 quizzes, {:.1%} complete all 6.'.format(share_all_six))

### Videos

In [None]:
vid_wk_cols = ['activity_vid_week_{:d}'.format(i) for i in range(1, 7)]

# Weekly video value divided by the largest value seen in the same course and
# week. transform() always returns a frame on enr's own index; groupby.apply
# can prepend the group key to the index on newer pandas, which would break
# later index alignment with enr.
activity_vid_week_fraction = \
    enr.groupby('course_name')[vid_wk_cols].transform(lambda x: x/x.max())

In [None]:
cb = sns.color_palette('Blues', 3)

fig, ax = plt.subplots(ncols=2, figsize=(8, 3))

# Left panel: per module, number of enrollees in each band of the weekly video
# fraction, divided by the number of active participants. A fraction of
# exactly 0 lies outside the first bin and is therefore not counted.
bins = [0.001, 0.2, 0.8, 1.01]
labels = ['> 0 and < 20% of videos', '20%-80% of videos', '> 80% of videos']

t = activity_vid_week_fraction.fillna(0)\
    .apply(lambda c: pd.cut(c, bins=bins,
                            labels=labels)\
    .value_counts()).loc[labels]\
    .transpose().rename(index=lambda x: x[-1])
t = t/len(enr[activity_any])

# Stacked bars, lowest band at the bottom
b1 = ax[0].bar(t.index, t[labels[0]], color=cb[0]) 
b2 = ax[0].bar(t.index, t[labels[1]], bottom=t[labels[0]], color=cb[1])
b3 = ax[0].bar(t.index, t[labels[2]], bottom=t[labels[0]]+t[labels[1]], color=cb[2])
   
ax[0].set_xlabel('Module/Week number')
ax[0].set_ylabel('Participants watching a video')

# Legend below the axes, listed top band first to match the stacking order
leg = ax[0].legend([b3[0], b2[0], b1[0]], 
             labels[::-1], 
             title='Percent of module videos watched', loc="lower center", facecolor="white", framealpha=0,
             bbox_to_anchor=(0, -.7, 1, .4), 
             mode="expand", borderaxespad=0.)
leg._legend_box.align = "left"

ax[0].yaxis.set_major_formatter(FuncFormatter(lambda x, _: '{:.0%}'.format(x)))
ax[0].set_ylim([0., 1])

# Right panel: histogram of the percentage of all videos watched, relative to
# the largest video count in the same course, for active participants.
prop_videos=100*(enr.activity_vid_count / enr.groupby('course_name').activity_vid_count.transform('max'))
prop_videos[activity_any].hist(bins=10, density=True, ax=ax[1], color='gray')

ax[1].set_xlim([0., 100.])
ax[1].set_ylim([0., .05])
ax[1].set_xticks(range(0, 101, 20))
ax[1].set_xticklabels(['0%', '20%', '40%', '60%', '80%', '100%'])
# The fixed labels below only make sense at fixed tick positions, so pin the
# ticks first. With bins 10 percentage points wide, a density of 0.01
# corresponds to 10% of participants (assumes the data span 0-100).
ax[1].set_yticks(np.linspace(0., .05, 6))
ax[1].set_yticklabels(['0%', '10%', '20%', '30%', '40%', '50%'])

ax[1].set_xlabel('Percentage of all videos watched')
ax[1].set_ylabel('Percentage of participants')

# Shared look: white background, no top/right/left spines, light horizontal
# grid lines drawn behind the bars.
for panel in ax:
    panel.set_facecolor("white")
    panel.spines["top"].set_visible(False)
    panel.spines["right"].set_visible(False)
    panel.spines["left"].set_visible(False)
    panel.grid(False)
    panel.set_axisbelow(True)
    panel.grid(which="major", axis="y", color="#cccccc")

# The right panel keeps its x tick marks
ax[0].xaxis.set_ticks_position('none') 
ax[0].yaxis.set_ticks_position('none') 
ax[1].yaxis.set_ticks_position('none') 

plt.subplots_adjust(wspace=.4)
plt.savefig("plots/Fig-S2-videos.png", bbox_extra_artists=(leg,), bbox_inches='tight')
plt.savefig("plots/Fig-S2-videos.svg", bbox_extra_artists=(leg,), bbox_inches='tight')

In [None]:
# Share of active participants per band of the fraction of all videos watched.
# pd.cut bins are right-closed by default, so a fraction of exactly 0 falls
# outside the first bin and is not counted in any band.
pd.cut(prop_videos[activity_any]/100, [0, 0.001, 0.1, 0.2, 0.8, 1.01]).value_counts()/len(prop_videos[activity_any])

In [None]:
# output data
# Per-module band shares plotted in the left video panel
t.to_csv("plots/module_video_props.csv", header=True)
    
# Share of active participants per 10-point band of percentage of videos watched
(pd.cut(prop_videos[activity_any], 
        range(0,101,10)).value_counts()/len(prop_videos[activity_any])).to_csv("plots/videos_watched.csv")

### PGAs

In [None]:
fig, ax = plt.subplots(ncols=2, figsize=(8, 3))

# Percentage of active participants by number of PGAs completed (including 0)
display(100.*enr.loc[activity_any, 'activity_pga_count'].value_counts().sort_index() / activity_any.sum())

# Left panel: same shares, restricted to those with at least one PGA
(1.*enr.loc[enr['activity_pga_count'] > 0, 'activity_pga_count'].value_counts().sort_index() / activity_any.sum())\
    .plot(kind='bar', color='C0', ax=ax[0], rot=0)

ax[0].yaxis.set_major_formatter(FuncFormatter(lambda x, _: '{:.0%}'.format(x)))

ax[0].set_xlabel('Number of PGAs completed')
ax[0].set_ylabel('Percent of participants');
ax[0].set_ylim([0,.07])

# Right panel: column sums of the per-PGA activity columns. The target axes
# is passed explicitly; previously the plot landed on whichever axes happened
# to be current.
pga_cols = ['activity_pga{:d}'.format(i) for i in range(1, 4)]
enr[pga_cols].sum().rename(index=lambda x: x[-1]).plot(kind='bar', color='C0', ax=ax[1], rot=0)

ax[1].set_xlabel('PGA number')
ax[1].set_ylabel('Number of attempters')

ax[1].set_ylim([0,1000.])

# Shared look: white background, no top/right/left spines, light horizontal
# grid lines drawn behind the bars, and no tick marks.
for panel in ax:
    panel.set_facecolor("white")
    panel.spines["top"].set_visible(False)
    panel.spines["right"].set_visible(False)
    panel.spines["left"].set_visible(False)
    panel.grid(False)
    panel.set_axisbelow(True)
    panel.grid(which="major", axis="y", color="#cccccc")
    panel.xaxis.set_ticks_position('none')
    panel.yaxis.set_ticks_position('none')

plt.tight_layout(w_pad=4)
plt.savefig("plots/Fig-S3-pgas.png")
plt.savefig("plots/Fig-S3-pgas.svg")

In [None]:
# output data
# Share of active participants by number of PGAs completed (at least one)
(1.*enr.loc[enr['activity_pga_count'] > 0, 
            'activity_pga_count'].value_counts().sort_index() / activity_any.sum()).to_csv("plots/num_pgas.csv")
    
# Column sums of the per-PGA activity columns, labelled by PGA number
(enr[pga_cols].sum().rename(index=lambda x: x[-1])).to_csv("plots/pga_takers.csv")

In [None]:
cb = sns.color_palette('Blues', 2)

def hist_pga_data(data, count_col):
    """Distribution of `count_col` within each `any_pga` group.

    Counts the rows of `data` for every combination of `count_col` (rows) and
    `any_pga` (columns), then divides each column by its total so that every
    column sums to 1. Combinations that do not occur are left missing.
    """
    counts = pd.pivot_table(data=data,
                            index=count_col, columns='any_pga',
                            aggfunc='count', values='one')
    column_totals = counts.sum()
    return counts.div(column_totals, axis='columns')

def hist_pga(data, count_col, ax):
    """Plot conditional histogram of count_col for PGA attempters vs. non-attempters.

    Draws the table returned by hist_pga_data(data, count_col) as grouped bars
    on `ax`, coloured with the notebook-level palette `cb`, and formats the
    y axis as percentages.
    """
    shares = hist_pga_data(data, count_col)
    shares.plot(kind='bar', ax=ax, rot=0, width=.8, color=cb)

    as_percent = FuncFormatter(lambda value, _: '{:.0%}'.format(value))
    ax.set_ylabel('Percent of grouped participants')
    ax.yaxis.set_major_formatter(as_percent)

# TODO
# Work on a copy of enr that carries a 0./1. flag for "attempted any PGA".
enr_tmp = enr.copy()
enr_tmp['any_pga'] = 1.*(enr['activity_pga_count'] > 0)

fig, ax = plt.subplots(ncols=2, figsize=(8, 2.5))

# Both panels are restricted to active participants
active_tmp = enr_tmp[activity_any]
hist_pga(active_tmp, count_col='activity_quiz_count', ax=ax[0])
hist_pga(active_tmp, count_col='activity_vid_week_count', ax=ax[1])

ax[0].set_xlabel('Number of quizzes attempted')
ax[1].set_xlabel('Number of modules watching videos')

# Identical legend below each panel; the labels follow the any_pga column
# order (0. first, then 1.).
pga_legend_labels = ['Did not attempt a PGA', 'Completed at least one PGA']
pga_legend_kw = dict(loc='lower center', title='', facecolor="white", framealpha=0,
                     bbox_to_anchor=(0, -.6, 1, .4),
                     mode="expand", borderaxespad=0.)
l1 = ax[0].legend(pga_legend_labels, **pga_legend_kw)
l2 = ax[1].legend(pga_legend_labels, **pga_legend_kw)

# Shared limits and look: white background, no top/right/left spines, light
# horizontal grid lines drawn behind the bars, and no tick marks.
for panel in ax:
    panel.set_ylim([0,.8])
    panel.set_facecolor("white")
    for side in ("top", "right", "left"):
        panel.spines[side].set_visible(False)
    panel.grid(False)
    panel.set_axisbelow(True)
    panel.grid(which="major", axis="y", color="#cccccc")
    panel.xaxis.set_ticks_position('none')
    panel.yaxis.set_ticks_position('none')


plt.subplots_adjust(wspace=.4)
plt.savefig("plots/Fig-S4-pgas_other.png", bbox_extra_artists=(l1, l2,), bbox_inches='tight')
plt.savefig("plots/Fig-S4-pgas_other.svg", bbox_extra_artists=(l1, l2,), bbox_inches='tight')

In [None]:
# output data
# Quiz-count distribution within each any_pga group, active participants only
hist_pga_data(enr_tmp[activity_any], 
              count_col='activity_quiz_count').to_csv("plots/num_quizzes_pga.csv", header=True)
    
# Video-module-count distribution within each any_pga group, active participants only
hist_pga_data(enr_tmp[activity_any], 
              count_col='activity_vid_week_count').to_csv("plots/num_vid_modules_pga.csv", header=True)

In [None]:
# Read the exported quiz-count table back in to check what was written.
pd.read_csv("plots/num_quizzes_pga.csv")

In [None]:
# Read the exported video-module table back in to check what was written.
pd.read_csv("plots/num_vid_modules_pga.csv")

Likelihood of completing by PGA attempt status

In [None]:
# Counts of active participants by completion status (rows) and whether they
# attempted any PGA (columns).
x = pd.pivot_table(data=enr_tmp[activity_any],
                   index='activity_completed', columns='any_pga',
                   aggfunc='count', values='one')
x

In [None]:
# Ratio of two shares: the completed share among PGA attempters (column 1)
# over the completed share among non-attempters (column 0).
# NOTE(review): x[col][1] assumes key 1 selects the "completed" row; how that
# key resolves depends on the dtype of activity_completed - confirm.
(x[1][1]/x[1].sum())/(x[0][1]/x[0].sum())

## Learners

In [None]:
# Each entry is a user in a module
# Keep only user/module pairs whose weekly video value is greater than 1;
# the kept values are the fractions relative to the per-course weekly maximum.
x = activity_vid_week_fraction.stack()[enr[vid_wk_cols].stack() > 1]

# Share of those pairs whose fraction is strictly above the threshold
thresh = 0.5
print('{:.0%} of those who watch more than 1 video watch {:.0%} of the videos in a module'\
      .format(1.*(x > thresh).sum() / len(x), thresh))

In [None]:
# Among enrollees with at least one PGA, the share who also have at least
# 2 quizzes and at least 3 video modules.
print( '{:.0%} of PGA attempters also meet quiz and video criteria'\
    .format(1.*((enr['activity_pga_count'] > 0) &
                (enr['activity_quiz_count'] >= 2) &
                (enr['activity_vid_week_count'] >= 3)).sum() /
            (enr['activity_pga_count'] > 0).sum()))

In [None]:
# User-id sets for the three activity criteria: at least 3 video modules,
# at least 2 quizzes, at least 1 PGA.
id_vid = set(enr.loc[enr['activity_vid_week_count'] >= 3, 'unique_user_id'])
id_quiz = set(enr.loc[enr['activity_quiz_count'] >= 2, 'unique_user_id'])
id_pga = set(enr.loc[enr['activity_pga_count'] > 0, 'unique_user_id'])

# Three-set Venn diagram of the overlap between the criteria
venn3([id_vid, id_quiz, id_pga], ['>2 video modules', '>1 quiz', '>0 PGAs']);
plt.tight_layout()
plt.savefig("plots/Fig-S5-venn.png")
plt.savefig("plots/Fig-S5-venn.svg")

In [None]:
# output data
# One column of user ids per criterion; shorter columns are padded with
# missing values. NOTE(review): the ids come from sets, so the row order in
# the CSV is presumably not stable between runs - confirm if that matters.
pd.DataFrame([id_vid, id_quiz, id_pga]).transpose()\
    .rename(columns={0:'> 2 video modules',1:'>1 quiz', 2:'>0 PGAs'}).to_csv("plots/learner_venn.csv", 
                                                                             header=True, index=False)

In [None]:
# Share of learners with fewer than 2 quizzes and no PGA, i.e. learners who
# meet neither the quiz nor the PGA threshold.
print('{:.0%} of learners only meet the definition based on watching videos.'\
    .format(1.*(enr['activity_learner'] &
            (enr['activity_quiz_count'] < 2) &
            (enr['activity_pga_count'] == 0)).sum() / enr['activity_learner'].sum()))

In [None]:
# Enrollees with at least two quizzes or at least one PGA. The thresholds now
# match the printed sentence and the quiz/PGA criteria used elsewhere in this
# notebook (>= 2 quizzes, > 0 PGAs); the previous `> 2` / `> 1` selected
# three-plus quizzes or two-plus PGAs.
g1 = enr[(enr['activity_quiz_count'] >= 2) | (enr['activity_pga_count'] >= 1)]
n_g1_completed = g1[g1['activity_completed']==1].shape[0]

print("{:.0%} of people who attempt at least two quizzes or 1 PGA ultimately complete the course; this is {:.0%} of total learners."\
      .format(n_g1_completed/g1.shape[0],
              n_g1_completed/enr[enr['activity_learner']==1].shape[0]))

### Auditors vs. Completers

In [None]:
cb = sns.color_palette('Blues', 2)

def hist_learner_data(data, count_col):
    """Distribution of `count_col` within each `learner_category` group.

    Counts the rows of `data` for every combination of `count_col` (rows) and
    `learner_category` (columns), then divides each column by its total so
    that every column sums to 1. Combinations that do not occur are left
    missing.
    """
    counts = pd.pivot_table(data=data,
                            index=count_col, columns='learner_category',
                            aggfunc='count', values='one')
    column_totals = counts.sum()
    return counts.div(column_totals, axis='columns')

def hist_learner(data, count_col, ax):
    """Plot conditional histogram of count_col for each learner category.

    Draws the table returned by hist_learner_data(data, count_col) as grouped
    bars on `ax`, coloured with the notebook-level palette `cb`, and formats
    the y axis as percentages.
    """
    shares = hist_learner_data(data, count_col)
    shares.plot(kind='bar', ax=ax, rot=0, width=.8, color=cb)

    as_percent = FuncFormatter(lambda value, _: '{:.0%}'.format(value))
    ax.set_ylabel('Percent of grouped participants')
    ax.yaxis.set_major_formatter(as_percent)

fig, ax = plt.subplots(figsize=(4,3))

# Video-module distribution per learner category, restricted to learners
hist_learner(enr[enr['activity_learner']==1], count_col='activity_vid_week_count', ax=ax)

ax.set_yticks([0,.1,.2,.3,.4,.5,.6,.7,.8])
ax.set_xlabel('Number of modules watching videos')


# Opaque white legend without a title or frame line
l1 = ax.legend(title='', framealpha=1, facecolor="white")
l1.get_frame().set_linewidth(0.0)

# White background, no top/right/left spines, light horizontal grid lines
# drawn behind the bars, and no tick marks
ax.set_facecolor("white")
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)
ax.spines["left"].set_visible(False)
ax.grid(False)
ax.set_axisbelow(True)
ax.grid(which="both", axis="y", color="#cccccc")

ax.xaxis.set_ticks_position('none') 
ax.yaxis.set_ticks_position('none') 

plt.tight_layout()
plt.savefig("plots/Fig-S6-learner_comp.png")
plt.savefig("plots/Fig-S6-learner_comp.svg")

In [None]:
# output data
# Video-module distribution per learner category, restricted to learners
hist_learner_data(enr[enr['activity_learner']==1], 
                  count_col='activity_vid_week_count').to_csv("plots/vid_modules_learners.csv", header=True)

## Completers

In [None]:
# how people complete
# Joint distribution of quiz count (rows) and PGA count (columns) among
# completers; normalize='all' makes the whole table sum to 1.
completers = enr[enr['activity_completed']==1]
completers_quiz_by_pga = pd.crosstab(completers['activity_quiz_count'],
                                     completers['activity_pga_count'],
                                     normalize='all')

display(completers_quiz_by_pga.style.format('{:.1%}'))

# output data
with open("plots/completers_quiz_pga.html", "w") as of:
    completers_quiz_by_pga.to_html(of, header=True)

# Course Activity of Survey Respondents

## Intro 

In [None]:
# User-id sets for the Coursera offerings
id_all = set(enr_c['unique_user_id'])

id_pre = set(pre_c['unique_user_id'])
id_pst = set(pst_c['unique_user_id'])

# The masks are built from enr_c itself. They used to be built from enr,
# which only worked because pandas aligned the longer mask to enr_c's index.
id_completers = set(enr_c.loc[enr_c['activity_completed'] == 1, 'unique_user_id'])
id_learners = set(enr_c.loc[enr_c['activity_learner'] == 1, 'unique_user_id'])

id_auditers = set(enr_c.loc[enr_c['activity_auditor'] == 1, 'unique_user_id'])


fig, ax = plt.subplots(ncols=2, figsize=(11, 4.5))


# Left: pre-survey respondents vs. learners vs. completers
venn3([id_pre, id_learners, id_completers],
      ['Pre-surveys', 'Learners', 'Completers'], ax=ax[0]);

# Right: post-survey respondents vs. learners vs. completers
venn3([id_pst, id_learners, id_completers],
      ['Post-survey', 'Learners', 'Completers'], ax=ax[1]);

plt.tight_layout()
plt.savefig("plots/Fig-S7-pre_post_venn.png")
plt.savefig("plots/Fig-S7-pre_post_venn.svg")

In [None]:
# output data
# One column of user ids per set; shorter columns are padded with missing
# values. NOTE(review): the ids come from sets, so the row order in the CSVs
# is presumably not stable between runs - confirm if that matters.
pd.DataFrame([id_pre, id_learners, id_completers]).transpose()\
.rename(columns={0:'pre',1:'learners', 2:'completers'}).to_csv("plots/pre_survey_venn.csv", 
                                                               header=True, index=False)
    
pd.DataFrame([id_pst, id_learners, id_completers]).transpose()\
    .rename(columns={0:'post',1:'learners', 2:'completers'}).to_csv("plots/post_survey_venn.csv", 
                                                                    header=True, index=False)

In [None]:
# Among pre-survey respondents, the share who intended to be active, printed
# first for those who did not become learners (activity_learner == 0) and
# then for those who did (activity_learner == 1).
for learner_flag in (0, 1):
    b1 = coursera[(coursera['activity_learner']==learner_flag) & (coursera['pre']==1)]
    n_intended = b1[b1['intent_active']==1].shape[0]
    print(n_intended/b1.shape[0])

### Learner Demographics

In [None]:
# Learner-category breakdown of the Coursera pre-survey respondents.
pre_respondents = coursera[coursera.pre==1]
grpcount = pre_respondents['learner_category'].value_counts()


def _share_by(index):
    'Share of each learner category (columns) in each level of `index` (rows)'
    return pd.pivot_table(data=pre_respondents, columns='learner_category', aggfunc='count',
                          values='unique_user_id', index=index).div(grpcount)


# The first two rows are named explicitly so they do not depend on what name
# value_counts() gives its result, which differs between pandas versions.
# pd.concat replaces DataFrame.append (removed in pandas 2.0).
t = pd.concat([grpcount.rename('Count').to_frame().transpose(),
               (grpcount/pre_respondents.shape[0])
               .rename('% of Pre-survey Respondents').to_frame().transpose()],
              sort=True)

t = pd.concat([t,
               _share_by('CIRTL').iloc[[1,0]],
               _share_by('Role')
               .loc[['Doctoral student', 'Post-doctoral researcher', 'Faculty', 'Other']],
               _share_by('academic_job').iloc[[1,0]],
               _share_by('QID367').iloc[[2,1,0]]])

# One category label per row, in the order the rows were stacked above
t['Category'] = ['','','CIRTL','CIRTL', 'Role','Role','Role','Role',  
                 'Preparing to teach', 'Preparing to teach','MCLC Intent','MCLC Intent','MCLC Intent']
t.index.name='Value'
t.set_index('Category', append=True, inplace=True)
t = t.reorder_levels(['Category', 'Value'])
#
#sum_tab_pst['avg'] = sum_tab_pst.mean(axis=1)

def f1(x):
    'Format values below 1 as percentages, other values as integers, missing as blank'
    # A category with no respondents in some group gives a missing cell;
    # int(NaN) would raise, so render those as empty.
    if pd.isnull(x):
        return ""
    return "{:.0%}".format(x) if x<1 else "{}".format(int(x))

display(t.style.format(f1))

with open('plots/learner_table.html', 'w') as of:
    t.to_html(of, float_format=f1)

### Motivations for taking the course
#### Fraction ranking "Important" or "Very important"

In [None]:
# Pre-survey columns whose name contains 'QID371', renamed via the 'short'
# column of the pre-survey question table.
pre_intent = pre.filter(axis='columns', like='QID371').rename(columns=pre_q['short'])

# Per item: share of non-missing answers with a value of 2 or more, highest first
((pre_intent >= 2).sum() / pre_intent.notna().sum()).sort_values(ascending=False)

In [None]:
# Write the per-item shares (values of 2 or more among non-missing answers),
# sorted highest first, to HTML without a header row.
with open('plots/motivations_table.html', 'w') as of:
    pd.DataFrame(((pre_intent >= 2).sum() / pre_intent.notna().sum()).sort_values(ascending=False)).to_html(of, header=False)

#### Grouped by engagement level

Plots not included in supplemental

In [None]:
cb = sns.color_palette('Blues_r', 3)
cr = sns.color_palette('Reds', 2)

fig, ax = plt.subplots(figsize=(7, 3))

# Mean QID371_* rating per learner category, one bar group per item, sorted
# by the non-learner mean. numeric_only=True is passed explicitly: the merged
# frame also holds text columns, which mean() refuses to average on pandas 2.x
# and silently dropped on older versions.
coursera.groupby('learner_category').mean(numeric_only=True).filter(axis='columns', like='QID371_')\
    .dropna(axis='columns')\
    .rename(columns=pre_q['short'])\
    .transpose().sort_values(by='Non-learner')\
    .plot(kind='barh', color=[cb[0], cb[1], cr[0]], ax=ax)

ax.legend().set_title('')
ax.set_xlim([0., 3.])
ax.set_xticks(np.arange(0., 3.1, 0.5))
ax.set_xticklabels(['Not at all important', '', 'Moderately important', '',
                    'Important', '', 'Very important']);

#### Grouped by role

Plot not included in supplemental

In [None]:
fig, ax = plt.subplots(figsize=(7, 3))

# Mean QID371_* rating per role, one bar group per item, sorted by the
# post-doctoral mean. numeric_only=True is passed explicitly: the merged frame
# also holds text columns, which mean() refuses to average on pandas 2.x and
# silently dropped on older versions.
coursera.groupby('Role').mean(numeric_only=True).filter(axis='columns', like='QID371_')\
    .dropna(axis='columns')\
    .rename(columns=pre_q['short'])\
    .transpose().sort_values(by='Post-doctoral researcher')\
    .plot(kind='barh', ax=ax, color=sns.color_palette('Blues_r', 4))

ax.legend().set_title('')
ax.set_xlim([0., 3.])
ax.set_xticks(np.arange(0., 3.1, 0.5))
ax.set_xticklabels(['Not at all important', '', 'Moderately important', '',
                    'Important', '', 'Very important']);

### Usefulness of Course Components

In [None]:
# List the post-survey column names for reference.
pst.columns

In [None]:
# NOTE(review): this cell recomputes the pre-survey motivation shares, which
# does not match the surrounding "Usefulness of Course Components" heading -
# confirm whether it is still needed here.
pre_intent = pre.filter(axis='columns', like='QID371').rename(columns=pre_q['short'])

# Per item: share of non-missing answers with a value of 2 or more, highest first
((pre_intent >= 2).sum() / pre_intent.notna().sum()).sort_values(ascending=False)

In [None]:
# Write the per-item motivation shares, sorted highest first, to
# plots/motivations_table.html without a header row (overwriting the file if
# it already exists).
pd.DataFrame(((pre_intent >= 2).sum() / 
              pre_intent.notna().sum()).sort_values(ascending=False))\
.to_html('plots/motivations_table.html', header=False)

In [None]:
fig, ax = plt.subplots(figsize=(8, 3))

# Post-survey helpfulness items and their short labels
q_help = ['QID373_{:d}'.format(i) for i in [11,12,13,14,15,16,17,21,22,9]]
q_text = pst_q.loc[q_help, 'short']

pst_help = pst[q_help + ['course_name']]

pst_help_means = pst_help.groupby('course_name').mean() 

# Overall mean per item; the error bars reach from the lowest to the highest
# per-course mean (row 0: distance below the mean, row 1: distance above).
m = pst_help[q_help].mean()
err = pd.DataFrame({"min":m-pst_help_means.min(), 
                              "max":pst_help_means.max()-m}).transpose().values
m.index=q_text
#display(m.sort_values())
# Hard-coded plotting order (positions into q_help).
# NOTE(review): presumably the ascending order of m - confirm after data changes.
sortedorder = [7,5,1,0,2,8,3,6,4,9]
#display(m[sortedorder])

# .iloc makes the positional reordering explicit; m and q_text carry label
# indexes, and integer keys on those rely on a deprecated positional fallback.
ax.errorbar(x=m.iloc[sortedorder], y=np.arange(10),
            xerr=err[:,sortedorder],
             fmt='o', label='')

ax.set_ylim(-0.5, 9.5)
ax.set_yticks(range(10))
ax.set_yticklabels(q_text.iloc[sortedorder])
ax.tick_params(axis='y', which='major', pad=10)

ax.set_xlim(0,4.1)
ax.set_xticks([0, 0.5,1,1.5,2,2.5,3,3.5,4])
ax.set_xticklabels(['No help','','Little help','','Moderate help','','Much help','','Great help'])

ax.set_facecolor("white")

# No spines; light vertical grid lines only; no tick marks
ax.grid(False)
ax.spines["top"].set_visible(False)
ax.spines["bottom"].set_visible(False)
ax.spines["right"].set_visible(False)
ax.spines["left"].set_visible(False)
ax.grid(which="major", axis="x", color="#cccccc")

ax.xaxis.set_ticks_position('none') 
ax.yaxis.set_ticks_position('none') 

plt.tight_layout()
plt.savefig("plots/Fig-S8-helpful.png")
plt.savefig("plots/Fig-S8-helpful.svg")

In [None]:
# output data
# m is indexed by short label while q_text is indexed by question id. Passing
# q_text as a Series made the DataFrame constructor align on the union of both
# indexes, producing rows of missing values. Its values are passed as a plain
# array instead so every column lines up row by row.
pd.DataFrame({"mean":m.iloc[sortedorder], 
              "category":q_text.iloc[sortedorder].values, 
              "min":m.iloc[sortedorder]-err[0,sortedorder],
              "max":m.iloc[sortedorder]+err[1,sortedorder]}).to_csv("plots/help.csv")

### Learning Indicators

In [None]:
fig, ax = plt.subplots(figsize=(6, 3))

# Post-survey gain items with hand-written short labels (same order as q_gains)
q_gains = ['QID387_{:d}'.format(i) for i in [25,27,28,30,32,34,37]]
#q_text = pst_q.loc[q_gains, 'short']
#q_text = [x.strip() for x in q_text]
q_text = ['Enthusiasm', 
          'Discussion', 
          'Additional Classes', 
          'Additional MOOC', 
          'Understanding', 
          'Implementation', 
          'Seek Help']

pst_gains = pst[q_gains + ['course_name']]

pst_gains_means = pst_gains.groupby('course_name').mean() 

# Overall mean per item; the error bars reach from the lowest to the highest
# per-course mean (row 0: distance below the mean, row 1: distance above).
m = pst_gains[q_gains].mean()
err = pd.DataFrame({"min":m-pst_gains_means.min(), 
                              "max":pst_gains_means.max()-m}).transpose().values
m.index=q_text
#display(m)
#display(m.sort_values())
# Hard-coded plotting order (positions into q_gains).
# NOTE(review): presumably the ascending order of m - confirm after data changes.
sortedorder = [3,0,6,2,4,1,5]
#display(m[sortedorder])

# .iloc makes the positional reordering explicit; m carries a label index,
# and integer keys on it rely on a deprecated positional fallback.
ax.errorbar(x=m.iloc[sortedorder], y=np.arange(7),
            xerr=err[:,sortedorder],
             fmt='o', label='')

ax.set_ylim(-0.5, 6.5)
ax.set_yticks(range(7))
ax.set_yticklabels([q_text[x] for x in sortedorder])
ax.tick_params(axis='y', which='major', pad=10)

ax.set_xlim(0,4.1)
ax.set_xticks([0, 0.5,1,1.5,2,2.5,3,3.5,4])
ax.set_xticklabels(['No gain','','Little gain','','Moderate gain','','Good gain','','Great gain'])

ax.set_facecolor("white")

# No spines; light vertical grid lines only; no tick marks
ax.grid(False)
ax.spines["top"].set_visible(False)
ax.spines["bottom"].set_visible(False)
ax.spines["right"].set_visible(False)
ax.spines["left"].set_visible(False)
ax.grid(which="major", axis="x", color="#cccccc")

ax.xaxis.set_ticks_position('none') 
ax.yaxis.set_ticks_position('none') 

plt.tight_layout()
plt.savefig("plots/Fig-S9-gains.png")
plt.savefig("plots/Fig-S9-gains.svg")

In [None]:
# output data
# m carries string labels, so the integer positions in sortedorder are applied
# with .iloc (plain [] with integers on such a Series is deprecated).
# err[0] is (mean - lowest course mean) and err[1] is (highest course mean -
# mean), so the arithmetic below recovers the per-course minimum and maximum.
pd.DataFrame({"mean": m.iloc[sortedorder],
              "category": [q_text[x] for x in sortedorder],
              "min": m.iloc[sortedorder] - err[0, sortedorder],
              "max": m.iloc[sortedorder] + err[1, sortedorder]}).to_csv("plots/gains.csv")

In [None]:
fig, ax = plt.subplots(figsize=(6, 3))

# Post-survey QID601_* items and the short label recorded for each in pst_q.
q_fam = ['QID601_{:d}'.format(i) for i in [1, 2, 3, 4, 6, 15, 16]]
q_text = pst_q.loc[q_fam, 'short']

pst_fam = pst[q_fam + ['course_name']]

# Mean of every item within each course; these bound the error bars.
pst_fam_means = pst_fam.groupby('course_name').mean()

# Pooled mean over all rows, with asymmetric "errors" that reach down to the
# lowest and up to the highest per-course mean (row 0 = lower, row 1 = upper).
m = pst_fam[q_fam].mean()
err = pd.DataFrame({"min": m - pst_fam_means.min(),
                    "max": pst_fam_means.max() - m}).transpose().values
m.index = q_text
display(m)

# Hard-coded plotting order, expressed as integer positions into q_fam.
# NOTE(review): presumably chosen by inspecting the means shown above --
# re-check whenever the underlying survey data changes.
sortedorder = [3,2,4,1,6,5,0]

# m and q_text are labelled with strings, so the integer positions must go
# through .iloc; plain [] with integers on such a Series is deprecated.
ax.errorbar(x=m.iloc[sortedorder], y=np.arange(7),
            xerr=err[:, sortedorder],
            fmt='o', label='')

ax.set_ylim(-0.5, 6.5)
ax.set_yticks(range(7))
ax.set_yticklabels(q_text.iloc[sortedorder].tolist())
ax.tick_params(axis='y', which='major', pad=10)

# Name the integer points of the response scale; half steps stay unlabeled.
ax.set_xlim(0, 3.1)
ax.set_xticks([0, 0.5, 1, 1.5, 2, 2.5, 3])
ax.set_xticklabels(['Not at all', '', 'Slightly', '', 'Moderately', '', 'Very'])
ax.set_xlabel("Familiarity")

# Minimal styling: white background, no frame, vertical guide lines only.
ax.set_facecolor("white")

ax.grid(False)
for spine in ax.spines.values():
    spine.set_visible(False)
ax.grid(which="major", axis="x", color="#cccccc")

ax.xaxis.set_ticks_position('none')
ax.yaxis.set_ticks_position('none')

plt.tight_layout()
plt.savefig("plots/Fig-S10-familiar_all.png")
plt.savefig("plots/Fig-S10-familiar_all.svg")

In [None]:
# output data
# m and q_text both carry string labels, so the integer positions in
# sortedorder are applied with .iloc (plain [] with integers on such a Series
# is deprecated). The category labels are passed as a plain list so they are
# placed by position rather than aligned on q_text's own index.
# err[0] is (mean - lowest course mean) and err[1] is (highest course mean -
# mean), so the arithmetic below recovers the per-course minimum and maximum.
pd.DataFrame({"mean": m.iloc[sortedorder],
              "category": q_text.iloc[sortedorder].tolist(),
              "min": m.iloc[sortedorder] - err[0, sortedorder],
              "max": m.iloc[sortedorder] + err[1, sortedorder]}).to_csv("plots/familiar_all.csv")

## MCLC

In [None]:
# Rows flagged as having a pre- or a post-survey; every missing value in the
# frame is relabelled 'No response' so it shows up as its own category.
g1 = coursera[(coursera.pre == 1) | (coursera.post == 1)].replace(np.nan, 'No response')

print(g1.shape[0])
g1['QID389_p'] = [x.capitalize() for x in g1['QID389_p'].values]

# Row-normalised cross-tab: every row (including the 'All' margin row) holds
# proportions of that row's total.
t = pd.crosstab(g1['QID367'].rename('Intent'),
                g1['QID389_p'].rename('Self-Reported Participation'),
                margins=True, normalize='index')

# Reorder rows by position and columns by label for presentation.
# NOTE(review): the positional row order assumes five rows in pandas' sorted
# label order -- confirm against the actual QID367 categories.
t = t.iloc[[3, 0, 1, 2, 4]][['Yes', 'No', 'No response']]
display(t.style.format("{:.0%}"))

# All cells are proportions, so export them all as percentages, matching the
# displayed table. (A "percent only if < 1" rule would print an exact 100%
# as "1".)
with open('plots/mclc_crosstab.html', 'w') as of:
    t.to_html(of, float_format="{:.0%}".format)

In [None]:
# 2x2 table of raw counts restricted to explicit Yes/No answers on both
# questions, followed by Fisher's exact test on that table.
yes_no = ['Yes', 'No']
t = pd.crosstab(
    g1['QID367'].rename('Intent'),
    g1['QID389_p'].rename('Self-Reported Participation'),
).loc[yes_no, yes_no]
print(t)

oddsratio, pvalue = stats.fisher_exact(t)
print("OddsR: ", oddsratio, "p-Value:", pvalue)

In [None]:
# overall reported participation
# Each answer's count divided by the number of post-survey rows, so rows with
# a missing answer remain in the denominator.
pst['QID389_p'].value_counts().div(len(pst))

### Reasons for participating in an MCLC

In [None]:
def multiselect_value_counts(s, options=None):
    """Count occurrences of multiselect options.

    Each response is a comma-separated string of the options a respondent
    ticked. A response contributes at most one count per option.

    Parameters
    ----------
    s : iterable of str
        Responses to tally. Missing values (None / NaN) are skipped.
    options : iterable of str, optional
        Options to report. When omitted, the options are collected from every
        non-missing ``pre['QID388']`` answer (the notebook-level pre-survey
        frame), so options nobody in ``s`` selected are still listed with 0.

    Returns
    -------
    pandas.Series
        Integer counts indexed by option.

    Notes
    -----
    Responses are split on ',' and compared token-by-token, so an option whose
    text is contained in another option's text is not over-counted (plain
    substring matching would count it for both).
    """
    if options is None:
        answered = pre.loc[pre['QID388'].notna(), 'QID388']
        options = set(','.join(answered.tolist()).split(','))
    options = list(options)

    tally = dict.fromkeys(options, 0)
    for response in s:
        if pd.isna(response):
            continue
        # set(): a repeated option within one response is counted only once
        for choice in set(str(response).split(',')):
            if choice in tally:
                tally[choice] += 1

    return pd.Series(tally, index=options, dtype='int64')

# Pre-survey respondents who answered 'Yes' to QID367.
mclc = pre['QID367'] == 'Yes'

print('{:.1%} of pre-survey respondents intend to join an MCLC'.format(mclc.sum() / len(pre)))

# Tally the QID388 selections of those respondents once, then express each
# tally as a share of the 'Yes' group.
reason_counts = multiselect_value_counts(pre.loc[mclc, 'QID388'])
t = pd.DataFrame({'Total': reason_counts,
                  'Percent': reason_counts / mclc.sum()})\
    .sort_values(by='Total', ascending=False)
t.index = [x.capitalize() for x in t.index]

# Format per column rather than by magnitude: a "percent only if < 1" rule
# prints a count of 0 as "0%" and a share of exactly 1.0 as "1".
column_formats = {'Total': lambda v: "{}".format(int(v)),
                  'Percent': "{:.0%}".format}

display(t.style.format(column_formats))

with open('plots/mclc_motivation.html', 'w') as of:
    t.to_html(of, formatters=column_formats)