This notebook randomly truncates each student's enrollment history to generate an observation window, so that the distribution of enrollment lengths in the study sample matches that of the currently enrolled cohort as closely as the available student pools allow.

In [1]:
import os
import random
from collections import Counter

import numpy as np
import pandas as pd

# Directory that holds the intermediate .dta/.csv files read and written below.
# Set the VCCS_INTERMEDIATE_DIR environment variable to run on another machine;
# when it is unset or empty, the original hard-coded location is used.
fpath = os.environ.get("VCCS_INTERMEDIATE_DIR") or "C:\\Users\\ys8mz\\Box Sync\\Predictive Models of College Completion (VCCS)\\intermediate_files\\new\\"
# Later cells build file names with `fpath + "<name>"`, so a trailing separator is required.
if not fpath.endswith(("\\", "/")):
    fpath += os.sep

#### Random truncation procedure for the training/validation sets

In [2]:
# Load the two Stata inputs: the (vccsid, first_nonde_strm, nth) rows used below,
# and the table of per-term target sample sizes / proportions.
enrolled_nth, truncation_ss = (
    pd.read_stata(fpath + dta_name)
    for dta_name in ("enrolled_nth.dta", "truncation_sample_sizes.dta")
)

In [3]:
# Look-up tables keyed by nth_term: target sample size for the training set
# (train_ss) and for the validation set (valid_ss).
train_ss = dict(zip(truncation_ss.nth_term.values,
                    truncation_ss.train_sample_size.values))
valid_ss = dict(zip(truncation_ss.nth_term.values,
                    truncation_ss.valid_sample_size.values))

In [4]:
# Per-student indicator of validation-set membership (valid == 1) vs. training (valid == 0).
valid_ind = pd.read_stata(fpath + "full_data_enrolled_terms.dta")[['vccsid', 'valid']]

In [5]:
# Number of rows per value of the `valid` flag.
Counter(valid_ind['valid'])

Counter({0.0: 325341, 1.0: 63061})

In [6]:
# Split the enrollment rows by the `valid` flag. The inner merge keeps only
# students that have at least one row in enrolled_nth.
enrolled_nth_1 = pd.merge(valid_ind[valid_ind.valid == 0], enrolled_nth,
                          on=['vccsid'], how='inner')
print("Training sample size:", np.unique(enrolled_nth_1.vccsid).size)

enrolled_nth_2 = pd.merge(valid_ind[valid_ind.valid == 1], enrolled_nth,
                          on=['vccsid'], how='inner')
print("Validation sample size:", np.unique(enrolled_nth_2.vccsid).size)

Training sample size: 323182
Validation sample size: 62618


In [7]:
# vccsid -> first_nonde_strm, built from the distinct (vccsid, first_nonde_strm) pairs.
# If a vccsid appears with several values, the last pair wins.
df1 = enrolled_nth[['vccsid', 'first_nonde_strm']].drop_duplicates()
initial_dict = dict(zip(df1.vccsid.values, df1.first_nonde_strm.values))

In [8]:
# For each nth in 1..17, the set of training-set vccsids that have a row with that nth.
# Rows are visited in frame order; an nth outside 1..17 raises KeyError.
nth_dict_1 = {term: set() for term in range(1, 18)}
for student_id, term in zip(enrolled_nth_1.vccsid.values, enrolled_nth_1.nth.values):
    nth_dict_1[term].add(student_id)

In [9]:
# For each nth in 1..17, the set of validation-set vccsids that have a row with that nth.
# Rows are visited in frame order; an nth outside 1..17 raises KeyError.
nth_dict_2 = {term: set() for term in range(1, 18)}
for student_id, term in zip(enrolled_nth_2.vccsid.values, enrolled_nth_2.nth.values):
    nth_dict_2[term].add(student_id)

In [10]:
# Shallow working copies: the dicts are new, but the sets inside are shared with the
# originals. The sampling cells rebind entries (via .difference) instead of mutating
# the sets, so nth_dict_1 / nth_dict_2 stay intact.
nth_dict_1_cp = dict(nth_dict_1)
nth_dict_2_cp = dict(nth_dict_2)

In [12]:
### Random truncation (sampling) for training sample
# Walk from the longest term (17) down to the shortest (1). For each term, draw the
# target number of students from those still unassigned who have a row with that nth,
# then remove the drawn students from the pools of this and every shorter term.
# If a pool is too small, take everyone in it and carry the shortfall to the next term.
random.seed(12345)
# Rebuild the working copy here so that re-running this cell alone gives the same result.
nth_dict_1_cp = dict(nth_dict_1)
sample_1 = {}
diff = 0  # shortfall carried over from the previous (longer) term
for nth in range(17,0,-1):
    tss = int(train_ss[nth]) + diff
    pool = nth_dict_1_cp[nth]
    pool_size = len(pool)
    if pool_size < tss:
        sample_1[nth] = pool
        diff = tss - pool_size
    else:
        # random.sample() needs a sequence (passing a set is an error on Python 3.11+).
        # Sorting also makes the draw independent of set iteration order.
        # NOTE(review): the students drawn will differ from runs of the old code.
        sample_1[nth] = set(random.sample(sorted(pool), tss))
        # The carried shortfall is now absorbed; without this reset it would be
        # added again to every later term.
        diff = 0
    for i in range(nth,0,-1):
        nth_dict_1_cp[i] = nth_dict_1_cp[i].difference(sample_1[nth])
    print("nth term == {0} is sampled: {1} out of {2}".format(nth,min(tss, pool_size),pool_size))

nth term == 17 is sampled: 3560 out of 14452
nth term == 16 is sampled: 4507 out of 16604
nth term == 15 is sampled: 3229 out of 11938
nth term == 14 is sampled: 4838 out of 15992
nth term == 13 is sampled: 6234 out of 20244
nth term == 12 is sampled: 4720 out of 15933
nth term == 11 is sampled: 7580 out of 22510
nth term == 10 is sampled: 10516 out of 29311
nth term == 9 is sampled: 8351 out of 22872
nth term == 8 is sampled: 13731 out of 33774
nth term == 7 is sampled: 19759 out of 43903
nth term == 6 is sampled: 15096 out of 31694
nth term == 5 is sampled: 27585 out of 48712
nth term == 4 is sampled: 38580 out of 55600
nth term == 3 is sampled: 26651 out of 30398
nth term == 2 is sampled: 49849 out of 49849
nth term == 1 is sampled: 78396 out of 78396


#### The data for Table 1 of the paper

In [13]:
# Share of the truncated training sample assigned to each term, ordered nth = 1, 2, ...
prop_1 = np.array([len(sample_1[nth]) for nth in range(1, len(sample_1) + 1)])
prop_1 = prop_1 / prop_1.sum()

In [14]:
### Random truncation (sampling) for validation sample
# Walk from the longest term (17) down to the shortest (1). For each term, draw the
# target number of students from those still unassigned who have a row with that nth,
# then remove the drawn students from the pools of this and every shorter term.
# If a pool is too small, take everyone in it and carry the shortfall to the next term.
random.seed(12345)
# Rebuild the working copy here so that re-running this cell alone gives the same result.
nth_dict_2_cp = dict(nth_dict_2)
sample_2 = {}
diff = 0  # shortfall carried over from the previous (longer) term
for nth in range(17,0,-1):
    vss = int(valid_ss[nth]) + diff
    pool = nth_dict_2_cp[nth]
    pool_size = len(pool)
    if pool_size < vss:
        sample_2[nth] = pool
        diff = vss - pool_size
    else:
        # random.sample() needs a sequence (passing a set is an error on Python 3.11+).
        # Sorting also makes the draw independent of set iteration order.
        # NOTE(review): the students drawn will differ from runs of the old code.
        sample_2[nth] = set(random.sample(sorted(pool), vss))
        # The carried shortfall is now absorbed; without this reset it would be
        # added again to every later term.
        diff = 0
    for i in range(nth,0,-1):
        nth_dict_2_cp[i] = nth_dict_2_cp[i].difference(sample_2[nth])
    print("nth term == {0} is sampled: {1} out of {2}".format(nth,min(vss, pool_size),pool_size))

nth term == 17 is sampled: 690 out of 2224
nth term == 16 is sampled: 873 out of 2521
nth term == 15 is sampled: 626 out of 1665
nth term == 14 is sampled: 937 out of 2239
nth term == 13 is sampled: 1208 out of 2977
nth term == 12 is sampled: 915 out of 2282
nth term == 11 is sampled: 1469 out of 3512
nth term == 10 is sampled: 2038 out of 4589
nth term == 9 is sampled: 1618 out of 3461
nth term == 8 is sampled: 2660 out of 5429
nth term == 7 is sampled: 3828 out of 7290
nth term == 6 is sampled: 2925 out of 5109
nth term == 5 is sampled: 5345 out of 8239
nth term == 4 is sampled: 7475 out of 9786
nth term == 3 is sampled: 5131 out of 5131
nth term == 2 is sampled: 10076 out of 10107
nth term == 1 is sampled: 14804 out of 14804


In [15]:
# Share of the truncated validation sample assigned to each term, ordered nth = 1, 2, ...
prop_2 = np.array([len(sample_2[nth]) for nth in range(1, len(sample_2) + 1)])
prop_2 = prop_2 / prop_2.sum()

In [16]:
# Side-by-side comparison of the target proportions (truncation_ss.prop) and the
# proportions obtained after truncation, rounded to four decimals.
# NOTE(review): assumes truncation_ss rows are ordered nth_term = 1..17 — confirm.
prop_df = pd.DataFrame({
    "nth_term": range(1,18),
    "crnt_cohorts": truncation_ss.prop.round(4),
    "truncated_train": prop_1.round(4),
    "truncated_validation": prop_2.round(4),
})[['nth_term', 'crnt_cohorts', 'truncated_train', 'truncated_validation']]
print(prop_df)

    nth_term  crnt_cohorts  truncated_train  truncated_validation
0          1        0.2364           0.2426                0.2364
1          2        0.1604           0.1542                0.1609
2          3        0.0825           0.0825                0.0819
3          4        0.1194           0.1194                0.1194
4          5        0.0854           0.0854                0.0854
5          6        0.0467           0.0467                0.0467
6          7        0.0611           0.0611                0.0611
7          8        0.0425           0.0425                0.0425
8          9        0.0258           0.0258                0.0258
9         10        0.0325           0.0325                0.0325
10        11        0.0235           0.0235                0.0235
11        12        0.0146           0.0146                0.0146
12        13        0.0193           0.0193                0.0193
13        14        0.0150           0.0150                0.0150
14        

In [18]:
# Persist the proportion comparison table (no index column).
prop_df.to_csv(path_or_buf=fpath + "proportion_after_truncation.csv", index=False)

#### Find the end term of each observation after truncation

In [19]:
# vccsid -> assigned (truncated) nth, training students first, then validation.
new_nth_dict = {}
for sampled in (sample_1, sample_2):
    for assigned_nth, student_ids in sampled.items():
        new_nth_dict.update(dict.fromkeys(student_ids, assigned_nth))

# Total number of students assigned in each sample.
s1 = sum(len(student_ids) for student_ids in sample_1.values())
s2 = sum(len(student_ids) for student_ids in sample_2.values())
print(s1,s2)

323182 62618


In [20]:
# Largest nth observed for each (vccsid, first_nonde_strm) pair.
old_nth_df = enrolled_nth.groupby(['vccsid', 'first_nonde_strm'])['nth'].max().reset_index()

# The sampled assignment as a two-column frame: vccsid, new_nth.
new_nth_df = (
    pd.DataFrame.from_dict(new_nth_dict, orient="index")
    .reset_index()
    .rename(columns={'index':'vccsid', 0:"new_nth"})
)

# One row per sampled student with the original nth, the assigned nth and the valid flag.
final_nth_df = (
    old_nth_df
    .merge(new_nth_df, on=['vccsid'], how='inner')
    .merge(valid_ind, on=['vccsid'], how='inner')
    .sort_values(['vccsid'])
)

In [21]:
# 1 if the student's original maximum nth exceeds the assigned new_nth, else 0.
# Vectorised comparison instead of a row-wise apply. The explicit int64 cast
# reproduces the dtype the apply produced.
final_nth_df['truncated'] = (final_nth_df['nth'] > final_nth_df['new_nth']).astype('int64')

In [22]:
# Share of truncated students within each value of the valid flag.
final_nth_df.groupby(['valid'])[['truncated']].mean()

Unnamed: 0_level_0,truncated
valid,Unnamed: 1_level_1
0.0,0.36174
1.0,0.275783


In [23]:
# Mean assigned nth for training (valid == 0) and validation (valid == 1), two decimals.
print(*(
    np.round(final_nth_df.loc[final_nth_df.valid == flag, 'new_nth'].mean(), 2)
    for flag in (0, 1)
))

4.76 4.77


In [24]:
# The original maximum nth is no longer needed once `truncated` has been derived.
final_nth_df = final_nth_df.drop(columns=['nth'])

In [25]:
# Split (new_nth - 1) into whole groups of three (`yr`) and the remainder (`t`).
# NOTE(review): presumably three terms per academic year — confirm.
final_nth_df['yr'] = (final_nth_df['new_nth'] - 1) // 3
final_nth_df['t'] = (final_nth_df['new_nth'] - 1) % 3

In [26]:
# Advance the starting term code by 10 for each whole `yr`.
final_nth_df['last_term'] = final_nth_df['first_nonde_strm'] + final_nth_df['yr'] * 10

In [27]:
# Then add the within-year offset `t` (0, 1 or 2) to the term code.
final_nth_df.loc[:, 'last_term'] = final_nth_df['last_term'].add(final_nth_df['t'])

In [28]:
# A code whose last digit is 5 or 6 is shifted by +7, so it ends in 2 or 3 with the
# tens digit increased by one. All other codes are left unchanged.
# NOTE(review): presumably valid term codes end in 2, 3 or 4 — confirm.
final_nth_df.loc[:, 'last_term'] = final_nth_df.last_term.apply(
    lambda term: term + 7 if term % 10 in (5, 6) else term
)

In [29]:
# Keep only the output columns and expose the assigned term count under the name `nth`.
final_nth_df = (
    final_nth_df[['vccsid', 'first_nonde_strm', 'last_term', 'truncated', 'new_nth']]
    .rename(columns={'new_nth': 'nth'})
)

In [30]:
# Write the truncation result, ordered by vccsid, without the index.
(
    final_nth_df
    .sort_values(by=['vccsid'])
    .to_stata(fpath + "truncation_nth_df.dta", write_index=False)
)

In [31]:
# Number of students assigned to each truncated term count.
Counter(final_nth_df['nth'])

Counter({1: 93200,
         2: 59925,
         3: 31782,
         4: 46055,
         5: 32930,
         6: 18021,
         7: 23587,
         8: 16391,
         9: 9969,
         10: 12554,
         11: 9049,
         12: 5635,
         13: 7442,
         14: 5775,
         15: 3855,
         16: 5380,
         17: 4250})

#### Sanity check: dropped students

In [32]:
# Student-level file; only the id and the two term columns compared below are kept.
enrolled_terms = pd.read_stata(
    "C:\\Users\\ys8mz\\Box Sync\\Predictive Models of College Completion (VCCS)\\dta\\new\\student_level_sample_and_outcomes.dta"
)[['vccsid', 'first_nonde_strm', 'first_degree_strm']]

In [33]:
# Training students with no row in enrolled_nth (nth is null after the left merge).
# The check: for every such student, first_nonde_strm equals first_degree_strm.
enrolled_nth_1_new = (
    valid_ind[valid_ind.valid == 0]
    .merge(enrolled_nth, on=['vccsid'], how='left')
    .loc[lambda merged: merged.nth.isnull(), ['vccsid']]
    .merge(enrolled_terms, on=['vccsid'], how='left')
)
assert enrolled_nth_1_new.first_nonde_strm.eq(enrolled_nth_1_new.first_degree_strm).all()

In [34]:
# Validation students with no row in enrolled_nth (nth is null after the left merge).
# The check: for every such student, first_nonde_strm equals first_degree_strm.
enrolled_nth_2_new = (
    valid_ind[valid_ind.valid == 1]
    .merge(enrolled_nth, on=['vccsid'], how='left')
    .loc[lambda merged: merged.nth.isnull(), ['vccsid']]
    .merge(enrolled_terms, on=['vccsid'], how='left')
)
assert enrolled_nth_2_new.first_nonde_strm.eq(enrolled_nth_2_new.first_degree_strm).all()