Generate the summary statistics of admin data for the whole study sample.

In [10]:
import pandas as pd
import numpy as np

In [11]:
df1 = pd.read_stata("~\\Box Sync\\Clickstream\\data\\first\\LMS_data_final.dta").loc[:,['vccsid','strm', 'college', 'course', 'section', 'grade']]
df2 = pd.read_stata("~\\Box Sync\\Clickstream\\data\\full\\LMS_data_final.dta").loc[:,['vccsid','strm', 'college', 'course','section', 'grade']]

In [12]:
subj_dict = {'BUS': ['ACC', 'BUS', 'MKT', 'HRI', 'FIN', 'REA', 'ACQ', 'CON', 'ISR'],
             'ART': ['ART', 'DAN', 'MUS', 'HRT', 'CRF', 'PHT', 'IDS', "DEC"],
             'SCI': ['BIO', 'GOL', 'PHY', 'NAS', 'CHM', 'SCT'],
             'EGR': ['EGR', 'CSC', 'ITD', 'ITE', 'ITN', 'ITP',
                     'HIM', 'ENE', 'GIS', 'ENV', 'HIT', 'ARC',
                     'ETR', 'AST', 'IND', 'IST', 'DRF', 'CAD', 
                     'TEL', 'MEC', 'AMT', 'ARO', 'CIV', "INS", 'NUC', 'UMS', "AVI", "ESR", "NAN"],
             'MTH': ['MTE', 'MTH', 'MTT', 'MCR', 'MDE'],
             'SOC': ['EDU', 'SOC', 'ECO', 'GEO', 'HIS', 'PSY',
                     'PLS', 'ADJ', 'LGL', 'CHD', 'SSC', "PBS", "EIP"],
             'MED': ['EMS', 'NUR', 'HLT', 'PED', 'DMS', 'RAD', 
                     'EMT', 'PNE', 'DNA', 'SUR', 'NSG', 'HCT',
                     'DNH', 'OCT', 'CSP', 'DIT', 'PSG', 'PNG',
                     'PTH', 'MEN', 'MDL', 'RTH', "MDA", "OPT", "ROC", "DNL", "OMP", "PBH"],
             'HUM': ['HUM', 'CST', 'PHI', 'REL', 'ENG', 'ENF',
                     'EDE', 'HMS', 'SDV', 'SPD', 'STD', 'SCM', 'MET'],
             'FLA': ['ASL', 'ARA', 'FRE', 'GER', 'JPN', 'KOR',
                     'ITA', 'LAT', 'SPA', 'CHI', 'ESL', 'RUS', 'INT', 'GRE', 'POR', 'VTN'],
             'OCC': ['FST', 'AGR', 'AIR', 'ARO', 'AUT', 'FOR',
                     'FIR', 'WEL', 'VET', 'EQU', 'LBR', 'BCS',
                     'BLD', 'BSK', 'COS', 'VEN', 'TRK', 'TRV',
                     'PNT', 'MSC', 'MAR', 'MAC', 'ELE', 'RVH',
                     'RPK', 'SAF', 'AUB', "DSL", 'FNS', "MRT",
                     'MTS', 'RPK', "FUR", "GWR", "MIN", "PPT",
                     'HVE']}
subj_inverse_dict = {e:k for k,v in subj_dict.items() for e in v}

In [13]:
course1 = df1[df1.strm < 2212].loc[:,['course']].drop_duplicates()
course1.loc[:,'lvl2_ind'] = course1.course.apply(lambda x: x.split("_")[-1][0] == "2").astype(int)
course1.loc[:,'subject'] = course1.course.apply(lambda x: subj_inverse_dict[x.split("_")[0]])
course2 = df2[df2.strm < 2212].loc[:,['course']].drop_duplicates()
course2.loc[:,'lvl2_ind'] = course2.course.apply(lambda x: x.split("_")[-1][0] == "2").astype(int)
course2.loc[:,'subject'] = course2.course.apply(lambda x: subj_inverse_dict[x.split("_")[0]])
course3 = df1[df1.strm == 2212].loc[:,['course']].drop_duplicates()
course3.loc[:,'lvl2_ind'] = course3.course.apply(lambda x: x.split("_")[-1][0] == "2").astype(int)
course3.loc[:,'subject'] = course3.course.apply(lambda x: subj_inverse_dict[x.split("_")[0]])
course4 = df2[df2.strm == 2212].loc[:,['course']].drop_duplicates()
course4.loc[:,'lvl2_ind'] = course4.course.apply(lambda x: x.split("_")[-1][0] == "2").astype(int)
course4.loc[:,'subject'] = course4.course.apply(lambda x: subj_inverse_dict[x.split("_")[0]])
course = pd.concat([course1, course2, course3, course4]).drop_duplicates()

In [14]:
np.mean(course.lvl2_ind), np.mean(course1.lvl2_ind), np.mean(course2.lvl2_ind),\
np.mean(course3.lvl2_ind), np.mean(course4.lvl2_ind)

(0.5011337868480725,
 0.39313795568263044,
 0.4853072128227961,
 0.39233954451345754,
 0.5082956259426847)

In [15]:
from collections import Counter
{k:v/course.shape[0] for k,v in Counter(course.subject).items()}

{'ART': 0.09674981103552532,
 'BUS': 0.07294028722600152,
 'EGR': 0.2180650037792895,
 'FLA': 0.025699168556311415,
 'HUM': 0.06878306878306878,
 'MED': 0.19879062736205594,
 'MTH': 0.01020408163265306,
 'OCC': 0.17989417989417988,
 'SCI': 0.03250188964474679,
 'SOC': 0.0963718820861678}

In [16]:
{k:v/course1.shape[0] for k,v in Counter(course1.subject).items()}

{'ART': 0.10293066476054324,
 'BUS': 0.08577555396711938,
 'EGR': 0.20371694067190851,
 'FLA': 0.035739814152966405,
 'HUM': 0.09292351679771266,
 'MED': 0.14367405289492494,
 'MTH': 0.017155110793423873,
 'OCC': 0.14224446032880628,
 'SCI': 0.04288777698355969,
 'SOC': 0.13295210864903503}

In [17]:
{k:v/course3.shape[0] for k,v in Counter(course3.subject).items()}

{'ART': 0.07763975155279502,
 'BUS': 0.08799171842650104,
 'EGR': 0.2401656314699793,
 'FLA': 0.036231884057971016,
 'HUM': 0.09420289855072464,
 'MED': 0.10766045548654245,
 'MTH': 0.020703933747412008,
 'OCC': 0.13871635610766045,
 'SCI': 0.053830227743271224,
 'SOC': 0.14285714285714285}

In [18]:
{k:v/course2.shape[0] for k,v in Counter(course2.subject).items()}

{'ART': 0.09928762243989314,
 'BUS': 0.07390917186108638,
 'EGR': 0.21015138023152272,
 'FLA': 0.02804986642920748,
 'HUM': 0.07168299198575245,
 'MED': 0.20213713268032057,
 'MTH': 0.012021371326803205,
 'OCC': 0.1674087266251113,
 'SCI': 0.03383793410507569,
 'SOC': 0.10151380231522707}

In [19]:
{k:v/course4.shape[0] for k,v in Counter(course4.subject).items()}

{'ART': 0.10105580693815988,
 'BUS': 0.07943690296631473,
 'EGR': 0.2212166918049271,
 'FLA': 0.027652086475615886,
 'HUM': 0.07642031171442937,
 'MED': 0.17546505781799898,
 'MTH': 0.011060834590246356,
 'OCC': 0.16792357968828556,
 'SCI': 0.03619909502262444,
 'SOC': 0.1035696329813977}

In [21]:
section_size1 = pd.read_csv("~\\Box Sync\\Clickstream\\data\\first\\course_specific_predictors_new.csv").loc[:,['vccsid', 'strm', 'college', 'course', 'section']]
section_size2 = pd.read_csv("~\\Box Sync\\Clickstream\\data\\full\\course_specific_predictors_new.csv").loc[:,['vccsid', 'strm', 'college', 'course', 'section']]

In [None]:
online_ind_df_1 = pd.read_stata("\\Box Sync\\Clickstream\\data\\first\\updated_online_ind.dta").drop(['inperson_ind'], axis=1)
online_ind_df_2 = pd.read_stata("~\\Box Sync\\Clickstream\\data\\full\\updated_online_ind.dta").drop(['inperson_ind'], axis=1)

In [29]:
section_size1 = section_size1.merge(online_ind_df_1, on=['vccsid','strm','college','course','section'], how='inner')
section_size2 = section_size2.merge(online_ind_df_2, on=['vccsid','strm','college','course','section'], how='inner')

In [32]:
section_size1 = df1.merge(section_size1, on=['vccsid', 'strm', 'college', 'course', 'section'], how='left')
section_size2 = df2.merge(section_size2, on=['vccsid', 'strm', 'college', 'course', 'section'], how='left')
section_size1.loc[:,'valid'] = (section_size1.strm == 2212).astype(int)
section_size2.loc[:,'valid'] = (section_size2.strm == 2212).astype(int)
# section_size1 = section_size1.drop(['vccsid'], axis=1).drop_duplicates()
# section_size2 = section_size2.drop(['vccsid'], axis=1).drop_duplicates()

In [33]:
section_sizes = pd.concat([section_size1,section_size2]).groupby(['strm', 'college', 'course', 'section']).agg({'vccsid':'count'}).reset_index()
section_sizes.vccsid.mean()

18.34356345907429

In [34]:
ss1 = section_size1.drop(['vccsid','grade'], axis=1).drop_duplicates().merge(section_sizes, on=['strm', 'college', 'course', 'section'], how='inner')
ss1[ss1.valid == 0].vccsid.mean(), ss1[ss1.valid == 1].vccsid.mean()

(20.631695244829416, 20.415379043940124)

In [35]:
ss2 = section_size2.drop(['vccsid','grade'], axis=1).drop_duplicates().merge(section_sizes, on=['strm', 'college', 'course', 'section'], how='inner')
ss2[ss2.valid == 0].vccsid.mean(), ss2[ss2.valid == 1].vccsid.mean()

(18.649167462085057, 17.65010513667768)

In [36]:
course_sizes = pd.concat([section_size1, section_size2]).loc[:,['strm', 'course', 'vccsid', 'valid']].groupby(['strm', 'course']).agg({'vccsid':'count'}).reset_index()
course_sizes.vccsid.mean()

153.8906659674882

In [37]:
ss3 = section_size1.loc[:,['strm', 'course', 'valid']].merge(course_sizes, on=['strm', 'course']).drop_duplicates()
ss3[ss3.valid == 0].vccsid.mean(), ss3[ss3.valid == 1].vccsid.mean()

(257.03238741517583, 276.19565217391306)

In [38]:
ss4 = section_size2.loc[:,['strm', 'course', 'valid']].merge(course_sizes, on=['strm', 'course']).drop_duplicates()
ss4[ss4.valid == 0].vccsid.mean(), ss4[ss4.valid == 1].vccsid.mean()

(156.4104159260576, 147.73403720462545)

In [39]:
Counter(section_size1.drop(['vccsid','grade','online_ind'], axis=1).drop_duplicates().valid)

Counter({0: 33942, 1: 8284})

In [40]:
Counter(section_size2.drop(['vccsid','grade','online_ind'], axis=1).drop_duplicates().valid)

Counter({0: 47145, 1: 16645})

In [41]:
section_size1.loc[:,'first_ind'] = 1
section_size2.loc[:,'first_ind'] = 0
section_size_all = pd.concat([section_size1, section_size2])

In [42]:
Counter(section_size_all.strm)

Counter({2193: 102744, 2194: 332187, 2203: 120040, 2204: 325063, 2212: 293844})

In [43]:
for c in ['ENG_111', 'ENG_112', 'BIO_101', 'MTH_154', 'MTH_161']:
    sub = section_size_all[section_size_all.course == c]
    sp = sub[sub.strm.apply(lambda x: x % 10 == 2)]
    su = sub[sub.strm.apply(lambda x: x % 10 == 3)]
    fa = sub[sub.strm.apply(lambda x: x % 10 == 4)]
    print(c)
    print(sp.loc[:,['strm','college','section']].drop_duplicates().groupby(['strm']).agg({'section':'count'}).section.mean())
    print(su.loc[:,['strm','college','section']].drop_duplicates().groupby(['strm']).agg({'section':'count'}).section.mean())
    print(fa.loc[:,['strm','college','section']].drop_duplicates().groupby(['strm']).agg({'section':'count'}).section.mean())
    print(sp.loc[:,['strm','vccsid']].drop_duplicates().groupby(['strm']).agg({'vccsid':'count'}).vccsid.mean())
    print(su.loc[:,['strm','vccsid']].drop_duplicates().groupby(['strm']).agg({'vccsid':'count'}).vccsid.mean())
    print(fa.loc[:,['strm','vccsid']].drop_duplicates().groupby(['strm']).agg({'vccsid':'count'}).vccsid.mean())
    print(sp.groupby(['strm', 'college', 'section']).agg({'vccsid':'count'}).reset_index().groupby(['strm']).agg({'vccsid':'mean'}).vccsid.mean())
    print(su.groupby(['strm', 'college', 'section']).agg({'vccsid':'count'}).reset_index().groupby(['strm']).agg({'vccsid':'mean'}).vccsid.mean())
    print(fa.groupby(['strm', 'college', 'section']).agg({'vccsid':'count'}).reset_index().groupby(['strm']).agg({'vccsid':'mean'}).vccsid.mean())
    print('')

ENG_111
482.0
195.5
972.0
8973.0
3327.5
19291.5
18.62863070539419
16.877949735449736
19.850281517048217

ENG_112
602.0
155.0
351.5
11466.0
2985.5
6996.5
19.05481727574751
19.274558774558777
19.92606238258729

BIO_101
391.0
151.0
525.0
5237.0
2058.0
7573.5
22.713554987212277
20.109984639016897
22.698095238095238

MTH_154
254.0
100.0
335.5
5736.0
1998.0
7715.5
22.590551181102363
19.621757705218187
23.00075987841945

MTH_161
213.0
76.0
293.5
4075.0
1409.5
6528.5
19.154929577464788
18.6395670995671
22.272650227547132



In [44]:
for c in ['ENG_111', 'ENG_112', 'BIO_101', 'MTH_154', 'MTH_161']:
    sub = section_size_all[section_size_all.course == c]
    sp = sub[sub.strm.apply(lambda x: x % 10 == 2)]
    fa = sub[sub.strm.apply(lambda x: x % 10 != 2)]
    print(c)
    print(sp.loc[:,['strm','college','section']].drop_duplicates().groupby(['strm']).agg({'section':'count'}).section.mean())
    print(fa.loc[:,['strm','college','section']].drop_duplicates().groupby(['strm']).agg({'section':'count'}).section.mean())
    print(sp.loc[:,['strm','vccsid']].drop_duplicates().groupby(['strm']).agg({'vccsid':'count'}).vccsid.mean())
    print(fa.loc[:,['strm','vccsid']].drop_duplicates().groupby(['strm']).agg({'vccsid':'count'}).vccsid.mean())
    print(sp.groupby(['strm', 'college', 'section']).agg({'vccsid':'count'}).reset_index().groupby(['strm']).agg({'vccsid':'mean'}).vccsid.mean())
    print(fa.groupby(['strm', 'college', 'section']).agg({'vccsid':'count'}).reset_index().groupby(['strm']).agg({'vccsid':'mean'}).vccsid.mean())
    print('')

ENG_111
482.0
583.75
8973.0
11309.5
18.62863070539419
18.364115626248978

ENG_112
602.0
253.25
11466.0
4991.0
19.05481727574751
19.600310578573033

BIO_101
391.0
338.0
5237.0
4815.75
22.713554987212277
21.404039938556068

MTH_154
254.0
217.75
5736.0
4856.75
22.590551181102363
21.31125879181882

MTH_161
213.0
184.75
4075.0
3969.0
19.154929577464788
20.456108663557117



In [45]:
def num_grade(g):
    if g == "A":
        return 4
    elif g == "B":
        return 3
    elif g == "C":
        return 2
    elif g == "D":
        return 1
    else:
        return 0

In [46]:
for c in ['ENG_111', 'ENG_112', 'BIO_101', 'MTH_154', 'MTH_161']:
    print(c)
    sub = section_size_all[section_size_all.course == c]
    sub.loc[:,'dfw_ind'] = sub.grade.apply(lambda x: x in {'D','F','W'})
    sub_grade = sub[sub.grade != "W"]
    sub_grade.loc[:,'num_grade'] = sub_grade.grade.apply(lambda x: num_grade(x))
    print(sub_grade.num_grade.mean())
    print(sub.dfw_ind.mean())
    print(sub.first_ind.mean())
    print("")

ENG_111


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


2.437199036918138
0.32869892314500665
0.5211867532084378

ENG_112
2.7272695579417094
0.27021012811139017
0.05044982038973837

BIO_101
2.53661584603849
0.26720094830696284
0.1700252538267278

MTH_154
2.322249955428775
0.361390268123138
0.24957298907646475

MTH_161
2.2224116644129577
0.41514438716780944
0.29913417746859516



In [47]:
for c in ['ENG_111', 'ENG_112', 'BIO_101', 'MTH_154', 'MTH_161']:
    print(c)
    sub = section_size_all[section_size_all.course == c]
    sub.loc[:,'dfw_ind'] = sub.grade.apply(lambda x: x in {'D','F','W'})
    sub_grade = sub[sub.grade != "W"]
    sub_grade.loc[:,'num_grade'] = sub_grade.grade.apply(lambda x: num_grade(x))
    print(sub_grade.groupby(['valid']).agg({'num_grade':'mean'}))
    print(sub.groupby(['valid']).agg({'dfw_ind':'mean'}))
    print(sub.groupby(['valid']).agg({'first_ind':'mean'}))
    print(sub.groupby(['valid']).agg({'online_ind':'mean'}))
    print("")

ENG_111


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


       num_grade
valid           
0       2.479077
1       2.219570
        dfw_ind
valid          
0      0.313173
1      0.406950
       first_ind
valid           
0       0.562482
1       0.313064
       online_ind
valid            
0        0.591673
1        0.991870

ENG_112
       num_grade
valid           
0       2.719424
1       2.740626
        dfw_ind
valid          
0      0.278395
1      0.255950
       first_ind
valid           
0       0.069949
1       0.016476
       online_ind
valid            
0        0.691184
1        0.988057

BIO_101
       num_grade
valid           
0       2.543017
1       2.514848
        dfw_ind
valid          
0      0.261487
1      0.286454
       first_ind
valid           
0       0.200301
1       0.068010
       online_ind
valid            
0        0.594018
1        0.999099

MTH_154
       num_grade
valid           
0       2.341853
1       2.255682
        dfw_ind
valid          
0      0.353141
1      0.389334
       first_ind
valid   

In [48]:
for c in ['ENG_111', 'ENG_112', 'BIO_101', 'MTH_154', 'MTH_161']:
    print(c)
    print(Counter(section_size_all[section_size_all.course == c].valid))

ENG_111
Counter({0: 45253, 1: 8979})
ENG_112
Counter({0: 19986, 1: 11471})
BIO_101
Counter({0: 29925, 1: 8881})
MTH_154
Counter({0: 19437, 1: 5738})
MTH_161
Counter({0: 15901, 1: 4080})
