Generate the summary statistics of LMS measures for the whole study sample.

In [97]:
import pandas as pd
import numpy as np
from collections import Counter

In [61]:
# Load the LMS measures from the `full` directory; one indicator column is renamed on load.
df = pd.read_stata("~\\Box Sync\\Clickstream\\data\\full\\LMS_data_updated.dta")
df = df.rename(columns={'has_assign_qtr1': 'has_on_time_assign_share_qtr1'})

# Only the five identifier columns of the final sample file are needed for the join.
sample = pd.read_stata("~\\Box Sync\\Clickstream\\data\\full\\LMS_data_final.dta")
sample = sample.loc[:, ['vccsid', 'strm', 'college', 'course', 'section']]

# Inner join: keep only the LMS rows whose identifiers appear in the final sample.
df = pd.merge(df, sample, how='inner', on=['vccsid', 'strm', 'course', 'college', 'section'])

In [85]:
# Same pipeline as for `df`, but restricted to the sample stored under `first`.
# NOTE(review): the LMS measures are read from the `full` directory while the sample
# file comes from `first` — confirm that mixing the two directories is intended.
df_first = pd.read_stata("~\\Box Sync\\Clickstream\\data\\full\\LMS_data_updated.dta")
df_first = df_first.rename(columns={'has_assign_qtr1': 'has_on_time_assign_share_qtr1'})

# Only the five identifier columns of the sample file are needed for the join.
sample = pd.read_stata("~\\Box Sync\\Clickstream\\data\\first\\LMS_data_final.dta")
sample = sample.loc[:, ['vccsid', 'strm', 'college', 'course', 'section']]

# Inner join: keep only the LMS rows whose identifiers appear in the `first` sample.
df_first = pd.merge(df_first, sample, how='inner', on=['vccsid', 'strm', 'course', 'college', 'section'])

In [63]:
# Measures averaged over all rows: every column of `df` from position 6 onward,
# except the two assignment measures that generate_stats averages conditionally.
predictors = [
    col for col in df.columns[6:]
    if col not in ('assign_sub_cnt_qtr1', 'on_time_assign_share_qtr1')
]
predictors

['avg_depth_post_qtr1',
 'avg_session_len_qrt1',
 'avg_word_tot_qtr1',
 'disc_post_cnt_qtr1',
 'disc_reply_cnt_qtr1',
 'irreg_session_len_qrt1',
 'tot_click_cnt_qrt1',
 'tot_time_qrt1',
 'has_on_time_assign_share_qtr1',
 'has_assign_sub_cnt_qtr1']

In [64]:
# Attach the indicator table to the LMS measures. The indicator table is the left
# operand, so its columns come first in the merged frame.
online_ind_df = pd.read_stata("~\\Box Sync\\Clickstream\\data\\full\\updated_online_ind.dta")
df = pd.merge(
    online_ind_df,
    df,
    how='inner',
    on=['vccsid', 'strm', 'course', 'college', 'section'],
)
df.shape

(969025, 20)

In [88]:
# Attach the `first` indicator table to the first-sample LMS measures. The indicator
# table is the left operand, so its columns come first in the merged frame.
online_ind_df_first_first = pd.read_stata("~\\Box Sync\\Clickstream\\data\\first\\updated_online_ind.dta")
df_first = pd.merge(
    online_ind_df_first_first,
    df_first,
    how='inner',
    on=['vccsid', 'strm', 'course', 'college', 'section'],
)
df_first.shape

(204853, 20)

In [65]:
# Replace every missing value in the merged frame with 0.
df = df.fillna(value=0)

In [89]:
# Replace every missing value in the first-sample frame with 0.
df_first = df_first.fillna(value=0)

In [66]:
# Four subsets of `df`: online_ind (1 / 0) crossed with term (strm < 2212 / strm == 2212).
df_sub1 = df.loc[(df['online_ind'] == 1) & (df['strm'] < 2212)]
df_sub2 = df.loc[(df['online_ind'] == 0) & (df['strm'] < 2212)]
df_sub3 = df.loc[(df['online_ind'] == 1) & (df['strm'] == 2212)]
df_sub4 = df.loc[(df['online_ind'] == 0) & (df['strm'] == 2212)]

In [90]:
# Four subsets of `df_first`: online_ind (1 / 0) crossed with term (strm < 2212 / strm == 2212).
df_first_sub1 = df_first.loc[(df_first['online_ind'] == 1) & (df_first['strm'] < 2212)]
df_first_sub2 = df_first.loc[(df_first['online_ind'] == 0) & (df_first['strm'] < 2212)]
df_first_sub3 = df_first.loc[(df_first['online_ind'] == 1) & (df_first['strm'] == 2212)]
df_first_sub4 = df_first.loc[(df_first['online_ind'] == 0) & (df_first['strm'] == 2212)]

In [91]:
def generate_stats(table):
    """Return a one-column ('mean') DataFrame of LMS measure means for `table`.

    Every measure named in the notebook-level `predictors` list is averaged over
    all rows of `table`. `assign_sub_cnt_qtr1` and `on_time_assign_share_qtr1`
    are averaged only over the rows whose matching `has_<measure>` column equals 1.

    The result is indexed by measure name in a fixed order; the final `.loc`
    lookup raises KeyError if any listed measure was not computed.
    """
    row_order = [
        'assign_sub_cnt_qtr1', 'has_assign_sub_cnt_qtr1',
        'on_time_assign_share_qtr1', 'has_on_time_assign_share_qtr1',
        'avg_session_len_qrt1', 'irreg_session_len_qrt1',
        'tot_click_cnt_qrt1', 'tot_time_qrt1',
        'avg_depth_post_qtr1', 'avg_word_tot_qtr1',
        'disc_post_cnt_qtr1', 'disc_reply_cnt_qtr1',
    ]
    conditional_measures = ('assign_sub_cnt_qtr1', 'on_time_assign_share_qtr1')

    # Unconditional means over all rows.
    means = {name: table[name].mean() for name in predictors}

    # Conditional means: restrict to rows where the has_<measure> flag is 1.
    for name in conditional_measures:
        flagged = table['has_' + name] == 1
        means[name] = table.loc[flagged, name].mean()

    stats = pd.DataFrame.from_dict(means, orient='index', columns=['mean'])
    return stats.loc[row_order, :]

In [83]:
# One 'mean' column per subset of `df`, side by side, rounded to 4 decimals.
# NOTE(review): index=False omits the measure names from the CSV, so rows are
# identified only by the fixed row order of generate_stats — confirm intended.
pd.concat(
    [generate_stats(subset) for subset in (df_sub1, df_sub2, df_sub3, df_sub4)],
    axis=1,
).round(4).to_csv(
    "~\\Box Sync\\Clickstream\\data\\full\\lms_predictor_comparison_early_term.csv",
    index=False,
)

In [92]:
# One 'mean' column per subset of `df_first`, side by side, rounded to 4 decimals.
# NOTE(review): index=False omits the measure names from the CSV, so rows are
# identified only by the fixed row order of generate_stats — confirm intended.
pd.concat(
    [generate_stats(subset) for subset in (df_first_sub1, df_first_sub2, df_first_sub3, df_first_sub4)],
    axis=1,
).round(4).to_csv(
    "~\\Box Sync\\Clickstream\\data\\full\\lms_predictor_comparison_early_term_first.csv",
    index=False,
)

In [93]:
# Stack the two frames row-wise; original index labels are kept (no re-indexing).
# NOTE(review): if the same row can appear in both `df` and `df_first`, it is counted
# twice in `df_full` — confirm the two samples are disjoint.
df_full = pd.concat(objs=[df, df_first], axis=0)

In [95]:
# Split the pooled frame by the online indicator (all terms together).
df_full_sub1 = df_full[df_full['online_ind'].eq(1)]
df_full_sub2 = df_full[df_full['online_ind'].eq(0)]

In [96]:
# Pooled comparison written to the "_full" file: means for the online_ind == 1 and
# online_ind == 0 splits of `df_full`. This uses df_full_sub1/df_full_sub2 (built
# from df_full) rather than df_first_sub1/df_first_sub2, which are strm < 2212
# subsets of the first-sample frame and do not match the "_full" output name.
# NOTE(review): index=False omits the measure names from the CSV, so rows are
# identified only by the fixed row order of generate_stats — confirm intended.
pd.concat([generate_stats(df_full_sub1), generate_stats(df_full_sub2)], axis=1).round(4)\
.to_csv("~\\Box Sync\\Clickstream\\data\\full\\lms_predictor_comparison_early_term_full.csv", index=False)

In [99]:
# Row-level value counts of the two delivery-mode indicators in the pooled frame.
(
    Counter(df_full['online_ind']),
    Counter(df_full['inperson_ind']),
)

(Counter({0.0: 307742, 1.0: 866136}), Counter({0.0: 871945, 1.0: 301933}))

In [101]:
# Indicator counts over distinct (course, section, strm, indicator) combinations
# in the pooled frame.
# NOTE(review): 'college' is one of the merge keys used in this notebook but is not
# part of this de-duplication key — confirm sections are unique without it.
(
    Counter(df_full[['course', 'section', 'strm', 'online_ind']].drop_duplicates()['online_ind']),
    Counter(df_full[['course', 'section', 'strm', 'inperson_ind']].drop_duplicates()['inperson_ind']),
)

(Counter({0.0: 17672, 1.0: 44654}), Counter({0.0: 44965, 1.0: 17358}))

In [102]:
# Row-level indicator counts in `df` for terms with strm < 2212.
(
    Counter(df.loc[df['strm'] < 2212, 'online_ind']),
    Counter(df.loc[df['strm'] < 2212, 'inperson_ind']),
)

(Counter({0.0: 223257, 1.0: 475104}), Counter({0.0: 479972, 1.0: 218389}))

In [103]:
# Row-level indicator counts in `df` for the term with strm == 2212.
(
    Counter(df.loc[df['strm'] == 2212, 'online_ind']),
    Counter(df.loc[df['strm'] == 2212, 'inperson_ind']),
)

(Counter({0.0: 15358, 1.0: 255306}), Counter({0.0: 255378, 1.0: 15286}))

In [104]:
# Row-level indicator counts in `df_first` for terms with strm < 2212.
(
    Counter(df_first.loc[df_first['strm'] < 2212, 'online_ind']),
    Counter(df_first.loc[df_first['strm'] < 2212, 'inperson_ind']),
)

(Counter({0.0: 68062, 1.0: 113611}), Counter({0.0: 114469, 1.0: 67204}))

In [105]:
# Row-level indicator counts in `df_first` for the term with strm == 2212.
(
    Counter(df_first.loc[df_first['strm'] == 2212, 'online_ind']),
    Counter(df_first.loc[df_first['strm'] == 2212, 'inperson_ind']),
)

(Counter({0.0: 1065, 1.0: 22115}), Counter({0.0: 22126, 1.0: 1054}))

In [106]:
# Indicator counts over distinct (course, section, strm, indicator) combinations
# in `df` for terms with strm < 2212.
# NOTE(review): 'college' is one of the merge keys used in this notebook but is not
# part of this de-duplication key — confirm sections are unique without it.
(
    Counter(
        df.loc[df['strm'] < 2212, ['course', 'section', 'strm', 'online_ind']]
        .drop_duplicates()['online_ind']
    ),
    Counter(
        df.loc[df['strm'] < 2212, ['course', 'section', 'strm', 'inperson_ind']]
        .drop_duplicates()['inperson_ind']
    ),
)

(Counter({0.0: 15891, 1.0: 29914}), Counter({0.0: 30205, 1.0: 15598}))

In [107]:
# Indicator counts over distinct (course, section, strm, indicator) combinations
# in `df` for the term with strm == 2212.
(
    Counter(
        df.loc[df['strm'] == 2212, ['course', 'section', 'strm', 'online_ind']]
        .drop_duplicates()['online_ind']
    ),
    Counter(
        df.loc[df['strm'] == 2212, ['course', 'section', 'strm', 'inperson_ind']]
        .drop_duplicates()['inperson_ind']
    ),
)

(Counter({0.0: 1700, 1.0: 14619}), Counter({0.0: 14637, 1.0: 1681}))

In [108]:
# Indicator counts over distinct (course, section, strm, indicator) combinations
# in `df_first` for terms with strm < 2212.
(
    Counter(
        df_first.loc[df_first['strm'] < 2212, ['course', 'section', 'strm', 'online_ind']]
        .drop_duplicates()['online_ind']
    ),
    Counter(
        df_first.loc[df_first['strm'] < 2212, ['course', 'section', 'strm', 'inperson_ind']]
        .drop_duplicates()['inperson_ind']
    ),
)

(Counter({0.0: 11137, 1.0: 21957}), Counter({0.0: 22144, 1.0: 10950}))

In [109]:
# Indicator counts over distinct (course, section, strm, indicator) combinations
# in `df_first` for the term with strm == 2212.
(
    Counter(
        df_first.loc[df_first['strm'] == 2212, ['course', 'section', 'strm', 'online_ind']]
        .drop_duplicates()['online_ind']
    ),
    Counter(
        df_first.loc[df_first['strm'] == 2212, ['course', 'section', 'strm', 'inperson_ind']]
        .drop_duplicates()['inperson_ind']
    ),
)

(Counter({0.0: 377, 1.0: 7825}), Counter({0.0: 7827, 1.0: 375}))