Generate the summary statistics of admin data for the entire VCCS population.

In [4]:
import pandas as pd
import numpy as np

In [8]:
df = pd.read_stata("~\\Box Sync\\Clickstream\\data\\full\\population.dta").loc[:,['vccsid','strm', 'college', 'course', 'section', 'first_ind']]

In [6]:
subj_dict = {'BUS': ['ACC', 'BUS', 'MKT', 'HRI', 'FIN', 'REA', 'ACQ', 'CON', 'ISR'],
             'ART': ['ART', 'DAN', 'MUS', 'HRT', 'CRF', 'PHT', 'IDS', "DEC"],
             'SCI': ['BIO', 'GOL', 'PHY', 'NAS', 'CHM', 'SCT'],
             'EGR': ['EGR', 'CSC', 'ITD', 'ITE', 'ITN', 'ITP',
                     'HIM', 'ENE', 'GIS', 'ENV', 'HIT', 'ARC',
                     'ETR', 'AST', 'IND', 'IST', 'DRF', 'CAD', 
                     'TEL', 'MEC', 'AMT', 'ARO', 'CIV', "INS", 'NUC', 'UMS', "AVI", "ESR", "NAN"],
             'MTH': ['MTE', 'MTH', 'MTT', 'MCR', 'MDE'],
             'SOC': ['EDU', 'SOC', 'ECO', 'GEO', 'HIS', 'PSY',
                     'PLS', 'ADJ', 'LGL', 'CHD', 'SSC', "PBS", "EIP"],
             'MED': ['EMS', 'NUR', 'HLT', 'PED', 'DMS', 'RAD', 
                     'EMT', 'PNE', 'DNA', 'SUR', 'NSG', 'HCT',
                     'DNH', 'OCT', 'CSP', 'DIT', 'PSG', 'PNG',
                     'PTH', 'MEN', 'MDL', 'RTH', "MDA", "OPT", "ROC", "DNL", "OMP", "PBH"],
             'HUM': ['HUM', 'CST', 'PHI', 'REL', 'ENG', 'ENF',
                     'EDE', 'HMS', 'SDV', 'SPD', 'STD', 'SCM', 'MET'],
             'FLA': ['ASL', 'ARA', 'FRE', 'GER', 'JPN', 'KOR',
                     'ITA', 'LAT', 'SPA', 'CHI', 'ESL', 'RUS', 'INT', 'GRE', 'POR', 'VTN'],
             'OCC': ['FST', 'AGR', 'AIR', 'ARO', 'AUT', 'FOR',
                     'FIR', 'WEL', 'VET', 'EQU', 'LBR', 'BCS',
                     'BLD', 'BSK', 'COS', 'VEN', 'TRK', 'TRV',
                     'PNT', 'MSC', 'MAR', 'MAC', 'ELE', 'RVH',
                     'RPK', 'SAF', 'AUB', "DSL", 'FNS', "MRT",
                     'MTS', 'RPK', "FUR", "GWR", "MIN", "PPT",
                     'HVE']}
subj_inverse_dict = {e:k for k,v in subj_dict.items() for e in v}

In [11]:
course1 = df[np.array(df.strm < 2212) & np.array(df.first_ind == 0)].loc[:,['course']].drop_duplicates()
course1.loc[:,'lvl2_ind'] = course1.course.apply(lambda x: x.split("_")[-1][0] == "2").astype(int)
course1.loc[:,'subject'] = course1.course.apply(lambda x: subj_inverse_dict[x.split("_")[0]])
course2 = df[np.array(df.strm < 2212) & np.array(df.first_ind == 1)].loc[:,['course']].drop_duplicates()
course2.loc[:,'lvl2_ind'] = course2.course.apply(lambda x: x.split("_")[-1][0] == "2").astype(int)
course2.loc[:,'subject'] = course2.course.apply(lambda x: subj_inverse_dict[x.split("_")[0]])
course3 = df[np.array(df.strm == 2212) & np.array(df.first_ind == 0)].loc[:,['course']].drop_duplicates()
course3.loc[:,'lvl2_ind'] = course3.course.apply(lambda x: x.split("_")[-1][0] == "2").astype(int)
course3.loc[:,'subject'] = course3.course.apply(lambda x: subj_inverse_dict[x.split("_")[0]])
course4 = df[np.array(df.strm == 2212) & np.array(df.first_ind == 1)].loc[:,['course']].drop_duplicates()
course4.loc[:,'lvl2_ind'] = course4.course.apply(lambda x: x.split("_")[-1][0] == "2").astype(int)
course4.loc[:,'subject'] = course4.course.apply(lambda x: subj_inverse_dict[x.split("_")[0]])
course = pd.concat([course1, course2, course3, course4]).drop_duplicates()

In [12]:
np.mean(course.lvl2_ind), np.mean(course1.lvl2_ind), np.mean(course2.lvl2_ind),\
np.mean(course3.lvl2_ind), np.mean(course4.lvl2_ind)

(0.4989495798319328, 0.4839886501824078, 0.392578125, 0.5, 0.397497593840231)

In [13]:
from collections import Counter
{k:v/course.shape[0] for k,v in Counter(course.subject).items()}

{'ART': 0.09523809523809523,
 'BUS': 0.07072829131652661,
 'EGR': 0.21638655462184875,
 'FLA': 0.029061624649859945,
 'HUM': 0.06827731092436974,
 'MED': 0.19012605042016806,
 'MTH': 0.01015406162464986,
 'OCC': 0.19012605042016806,
 'SCI': 0.0350140056022409,
 'SOC': 0.09488795518207283}

In [14]:
{k:v/course1.shape[0] for k,v in Counter(course1.subject).items()}

{'ART': 0.09850020267531415,
 'BUS': 0.0713417105796514,
 'EGR': 0.20794487231455208,
 'FLA': 0.030401297122010538,
 'HUM': 0.07053100932306446,
 'MED': 0.19294689906769355,
 'MTH': 0.011755168220510741,
 'OCC': 0.18119173084718282,
 'SCI': 0.03607620591811917,
 'SOC': 0.09931090393190109}

In [15]:
{k:v/course3.shape[0] for k,v in Counter(course3.subject).items()}

{'ART': 0.09859154929577464,
 'BUS': 0.07793427230046948,
 'EGR': 0.22347417840375586,
 'FLA': 0.03192488262910798,
 'HUM': 0.07417840375586854,
 'MED': 0.16854460093896714,
 'MTH': 0.010328638497652582,
 'OCC': 0.17652582159624414,
 'SCI': 0.036619718309859155,
 'SOC': 0.10187793427230046}

In [16]:
{k:v/course2.shape[0] for k,v in Counter(course2.subject).items()}

{'ART': 0.10221354166666667,
 'BUS': 0.08268229166666667,
 'EGR': 0.20572916666666666,
 'FLA': 0.03515625,
 'HUM': 0.087890625,
 'MED': 0.146484375,
 'MTH': 0.016276041666666668,
 'OCC': 0.15559895833333334,
 'SCI': 0.041666666666666664,
 'SOC': 0.12630208333333334}

In [17]:
{k:v/course4.shape[0] for k,v in Counter(course4.subject).items()}

{'ART': 0.07892204042348412,
 'BUS': 0.08373435996150144,
 'EGR': 0.23965351299326276,
 'FLA': 0.03657362848893166,
 'HUM': 0.09143407122232916,
 'MED': 0.11838306063522618,
 'MTH': 0.019249278152069296,
 'OCC': 0.14436958614051973,
 'SCI': 0.05101058710298364,
 'SOC': 0.136669874879692}

In [18]:
section_size1 = df[df.first_ind == 0]
section_size2 = df[df.first_ind == 1]

In [19]:
section_size1.loc[:,'valid'] = (section_size1.strm == 2212).astype(int)
section_size2.loc[:,'valid'] = (section_size2.strm == 2212).astype(int)
# section_size1 = section_size1.drop(['vccsid'], axis=1).drop_duplicates()
# section_size2 = section_size2.drop(['vccsid'], axis=1).drop_duplicates()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [20]:
section_sizes = pd.concat([section_size1,section_size2]).groupby(['strm', 'college', 'course', 'section']).agg({'vccsid':'count'}).reset_index()
section_sizes.vccsid.mean()

16.332777069920056

In [21]:
ss1 = section_size1.drop(['vccsid'], axis=1).drop_duplicates().merge(section_sizes, on=['strm', 'college', 'course', 'section'], how='inner')
ss1[ss1.valid == 0].vccsid.mean(), ss1[ss1.valid == 1].vccsid.mean()

(16.845879525544614, 15.540779129518205)

In [22]:
ss2 = section_size2.drop(['vccsid'], axis=1).drop_duplicates().merge(section_sizes, on=['strm', 'college', 'course', 'section'], how='inner')
ss2[ss2.valid == 0].vccsid.mean(), ss2[ss2.valid == 1].vccsid.mean()

(19.731555173155517, 19.90495385594569)

In [23]:
course_sizes = pd.concat([section_size1, section_size2]).loc[:,['strm', 'course', 'vccsid', 'valid']].groupby(['strm', 'course']).agg({'vccsid':'count'}).reset_index()
course_sizes.vccsid.mean()

169.9724243135414

In [24]:
ss3 = section_size1.loc[:,['strm', 'course', 'valid']].merge(course_sizes, on=['strm', 'course']).drop_duplicates()
ss3[ss3.valid == 0].vccsid.mean(), ss3[ss3.valid == 1].vccsid.mean()

(172.2523614609572, 166.27323943661972)

In [25]:
ss4 = section_size2.loc[:,['strm', 'course', 'valid']].merge(course_sizes, on=['strm', 'course']).drop_duplicates()
ss4[ss4.valid == 0].vccsid.mean(), ss4[ss4.valid == 1].vccsid.mean()

(289.5400168491997, 306.29451395572664)

In [106]:
Counter(section_size1.drop(['vccsid','grade','online_ind'], axis=1).drop_duplicates().valid)

Counter({0: 33942, 1: 8284})

In [105]:
Counter(section_size2.drop(['vccsid','grade','online_ind'], axis=1).drop_duplicates().valid)

Counter({0: 47145, 1: 16645})

In [124]:
section_size1.loc[:,'first_ind'] = 1
section_size2.loc[:,'first_ind'] = 0
section_size_all = pd.concat([section_size1, section_size2])

In [125]:
Counter(section_size_all.strm)

Counter({2193: 102744, 2194: 332187, 2203: 120040, 2204: 325063, 2212: 293844})

In [131]:
for c in ['ENG_111', 'ENG_112', 'BIO_101', 'MTH_154', 'MTH_161']:
    sub = section_size_all[section_size_all.course == c]
    sp = sub[sub.strm.apply(lambda x: x % 10 == 2)]
    su = sub[sub.strm.apply(lambda x: x % 10 == 3)]
    fa = sub[sub.strm.apply(lambda x: x % 10 == 4)]
    print(c)
    print(sp.loc[:,['strm','college','section']].drop_duplicates().groupby(['strm']).agg({'section':'count'}).section.mean())
    print(su.loc[:,['strm','college','section']].drop_duplicates().groupby(['strm']).agg({'section':'count'}).section.mean())
    print(fa.loc[:,['strm','college','section']].drop_duplicates().groupby(['strm']).agg({'section':'count'}).section.mean())
    print(sp.loc[:,['strm','vccsid']].drop_duplicates().groupby(['strm']).agg({'vccsid':'count'}).vccsid.mean())
    print(su.loc[:,['strm','vccsid']].drop_duplicates().groupby(['strm']).agg({'vccsid':'count'}).vccsid.mean())
    print(fa.loc[:,['strm','vccsid']].drop_duplicates().groupby(['strm']).agg({'vccsid':'count'}).vccsid.mean())
    print(sp.groupby(['strm', 'college', 'section']).agg({'vccsid':'count'}).reset_index().groupby(['strm']).agg({'vccsid':'mean'}).vccsid.mean())
    print(su.groupby(['strm', 'college', 'section']).agg({'vccsid':'count'}).reset_index().groupby(['strm']).agg({'vccsid':'mean'}).vccsid.mean())
    print(fa.groupby(['strm', 'college', 'section']).agg({'vccsid':'count'}).reset_index().groupby(['strm']).agg({'vccsid':'mean'}).vccsid.mean())
    print('')

ENG_111
482.0
195.5
972.0
8973.0
3327.5
19291.5
18.62863070539419
16.877949735449736
19.850281517048217

ENG_112
602.0
155.0
351.5
11466.0
2985.5
6996.5
19.05481727574751
19.274558774558777
19.92606238258729

BIO_101
391.0
151.0
525.0
5237.0
2058.0
7573.5
22.713554987212277
20.109984639016897
22.698095238095238

MTH_154
254.0
100.0
335.5
5736.0
1998.0
7715.5
22.590551181102363
19.621757705218187
23.00075987841945

MTH_161
213.0
76.0
293.5
4075.0
1409.5
6528.5
19.154929577464788
18.6395670995671
22.272650227547132



In [137]:
for c in ['ENG_111', 'ENG_112', 'BIO_101', 'MTH_154', 'MTH_161']:
    sub = section_size_all[section_size_all.course == c]
    sp = sub[sub.strm.apply(lambda x: x % 10 == 2)]
    fa = sub[sub.strm.apply(lambda x: x % 10 != 2)]
    print(c)
    print(sp.loc[:,['strm','college','section']].drop_duplicates().groupby(['strm']).agg({'section':'count'}).section.mean())
    print(fa.loc[:,['strm','college','section']].drop_duplicates().groupby(['strm']).agg({'section':'count'}).section.mean())
    print(sp.loc[:,['strm','vccsid']].drop_duplicates().groupby(['strm']).agg({'vccsid':'count'}).vccsid.mean())
    print(fa.loc[:,['strm','vccsid']].drop_duplicates().groupby(['strm']).agg({'vccsid':'count'}).vccsid.mean())
    print(sp.groupby(['strm', 'college', 'section']).agg({'vccsid':'count'}).reset_index().groupby(['strm']).agg({'vccsid':'mean'}).vccsid.mean())
    print(fa.groupby(['strm', 'college', 'section']).agg({'vccsid':'count'}).reset_index().groupby(['strm']).agg({'vccsid':'mean'}).vccsid.mean())
    print('')

ENG_111
482.0
583.75
8973.0
11309.5
18.62863070539419
18.364115626248978

ENG_112
602.0
253.25
11466.0
4991.0
19.05481727574751
19.600310578573033

BIO_101
391.0
338.0
5237.0
4815.75
22.713554987212277
21.404039938556068

MTH_154
254.0
217.75
5736.0
4856.75
22.590551181102363
21.31125879181882

MTH_161
213.0
184.75
4075.0
3969.0
19.154929577464788
20.456108663557117



In [132]:
def num_grade(g):
    if g == "A":
        return 4
    elif g == "B":
        return 3
    elif g == "C":
        return 2
    elif g == "D":
        return 1
    else:
        return 0

In [139]:
for c in ['ENG_111', 'ENG_112', 'BIO_101', 'MTH_154', 'MTH_161']:
    print(c)
    sub = section_size_all[section_size_all.course == c]
    sub.loc[:,'dfw_ind'] = sub.grade.apply(lambda x: x in {'D','F','W'})
    sub_grade = sub[sub.grade != "W"]
    sub_grade.loc[:,'num_grade'] = sub_grade.grade.apply(lambda x: num_grade(x))
    print(sub_grade.num_grade.mean())
    print(sub.dfw_ind.mean())
    print(sub.first_ind.mean())
    print("")

ENG_111


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


2.437199036918138
0.32869892314500665
0.5211867532084378

ENG_112
2.7272695579417094
0.27021012811139017
0.05044982038973837

BIO_101
2.53661584603849
0.26720094830696284
0.1700252538267278

MTH_154
2.322249955428775
0.361390268123138
0.24957298907646475

MTH_161
2.2224116644129577
0.41514438716780944
0.29913417746859516



In [134]:
for c in ['ENG_111', 'ENG_112', 'BIO_101', 'MTH_154', 'MTH_161']:
    print(c)
    sub = section_size_all[section_size_all.course == c]
    sub.loc[:,'dfw_ind'] = sub.grade.apply(lambda x: x in {'D','F','W'})
    sub_grade = sub[sub.grade != "W"]
    sub_grade.loc[:,'num_grade'] = sub_grade.grade.apply(lambda x: num_grade(x))
    print(sub_grade.groupby(['valid']).agg({'num_grade':'mean'}))
    print(sub.groupby(['valid']).agg({'dfw_ind':'mean'}))
    print(sub.groupby(['valid']).agg({'first_ind':'mean'}))
    print(sub.groupby(['valid']).agg({'online_ind':'mean'}))
    print("")

ENG_111


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


       num_grade
valid           
0       2.479077
1       2.219570
        dfw_ind
valid          
0      0.313173
1      0.406950
       first_ind
valid           
0       0.562482
1       0.313064
       online_ind
valid            
0        0.301196
1        0.396369

ENG_112
       num_grade
valid           
0       2.719424
1       2.740626
        dfw_ind
valid          
0      0.278395
1      0.255950
       first_ind
valid           
0       0.069949
1       0.016476
       online_ind
valid            
0        0.473832
1        0.442856

BIO_101
       num_grade
valid           
0       2.543017
1       2.514848
        dfw_ind
valid          
0      0.261487
1      0.286454
       first_ind
valid           
0       0.200301
1       0.068010
       online_ind
valid            
0        0.196358
1        0.270690

MTH_154
       num_grade
valid           
0       2.341853
1       2.255682
        dfw_ind
valid          
0      0.353141
1      0.389334
       first_ind
valid   

In [136]:
for c in ['ENG_111', 'ENG_112', 'BIO_101', 'MTH_154', 'MTH_161']:
    print(c)
    print(Counter(section_size_all[section_size_all.course == c].valid))

ENG_111
Counter({0: 45253, 1: 8979})
ENG_112
Counter({0: 19986, 1: 11471})
BIO_101
Counter({0: 29925, 1: 8881})
MTH_154
Counter({0: 19437, 1: 5738})
MTH_161
Counter({0: 15901, 1: 4080})
