In [167]:
import pandas as pd
import numpy as np
import random
from collections import Counter

fpath = "C:\\Users\\ys8mz\\Box Sync\\Predictive Models of College Completion (VCCS)\\intermediate_files\\"

#### Random truncation procedure for the training/validation sets

In [168]:
enrolled_nth = pd.read_stata(fpath+"enrolled_nth.dta")
truncation_ss = pd.read_stata(fpath+"truncation_sample_sizes.dta")

In [169]:
train_ss = {}
valid_ss = {}
for i in range(truncation_ss.shape[0]):
    nth_term = truncation_ss.nth_term.iloc[i]
    tss = truncation_ss.train_sample_size.iloc[i]
    vss = truncation_ss.valid_sample_size.iloc[i]
    train_ss[nth_term] = tss
    valid_ss[nth_term] = vss

In [170]:
valid_ind = pd.read_stata(fpath+"full_data_enrolled_terms.dta").loc[:,['vccsid','valid']]

In [171]:
Counter(valid_ind.valid)

Counter({0.0: 300144, 1.0: 33350})

In [172]:
enrolled_nth_1 = valid_ind[valid_ind.valid == 0].merge(enrolled_nth, on=['vccsid'], how='inner')
print("Training sample size:", len(np.unique(enrolled_nth_1.vccsid)))
enrolled_nth_2 = valid_ind[valid_ind.valid == 1].merge(enrolled_nth, on=['vccsid'], how='inner')
print("Validation sample size:", len(np.unique(enrolled_nth_2.vccsid)))

Training sample size: 298139
Validation sample size: 33115


In [173]:
initial_dict = {}
df1 = enrolled_nth.loc[:,['vccsid','first_nonde_strm']].drop_duplicates()
for i in range(df1.shape[0]):
    vccsid = df1.vccsid.iloc[i]
    fns = df1.first_nonde_strm.iloc[i]
    initial_dict[vccsid] = fns

In [174]:
nth_dict_1 = {i:set() for i in range(1,18)}
for i in range(enrolled_nth_1.shape[0]):
    vccsid = enrolled_nth_1.vccsid.iloc[i]
    nth = enrolled_nth_1.nth.iloc[i]
    nth_dict_1[nth].add(vccsid)

In [175]:
nth_dict_2 = {i:set() for i in range(1,18)}
for i in range(enrolled_nth_2.shape[0]):
    vccsid = enrolled_nth_2.vccsid.iloc[i]
    nth = enrolled_nth_2.nth.iloc[i]
    nth_dict_2[nth].add(vccsid)

In [176]:
nth_dict_1_cp = nth_dict_1.copy()
nth_dict_2_cp = nth_dict_2.copy()

In [177]:
### Random truncation (sampling) for training sample
random.seed(12345)
sample_1 = {}
diff = 0
for nth in range(17,0,-1):
    tss = int(train_ss[nth]) + diff
    pool_size = len(nth_dict_1_cp[nth])
    if pool_size < tss:
        sample_1[nth] = nth_dict_1_cp[nth]
        diff = tss - pool_size
    else:
        sample_1[nth] = set(random.sample(nth_dict_1_cp[nth], tss))    
    for i in range(nth,0,-1):
        nth_dict_1_cp[i] = nth_dict_1_cp[i].difference(sample_1[nth])
    print("nth == {0} is sampled: {1} out of {2}".format(nth,min(tss, pool_size),pool_size))

nth == 17 is sampled: 1261 out of 13330
nth == 16 is sampled: 6698 out of 16412
nth == 15 is sampled: 3183 out of 10709
nth == 14 is sampled: 1793 out of 14327
nth == 13 is sampled: 10142 out of 19535
nth == 12 is sampled: 5285 out of 13502
nth == 11 is sampled: 2984 out of 18864
nth == 10 is sampled: 17628 out of 27359
nth == 9 is sampled: 8879 out of 17669
nth == 8 is sampled: 4541 out of 26254
nth == 7 is sampled: 29002 out of 40607
nth == 6 is sampled: 13590 out of 24067
nth == 5 is sampled: 7856 out of 38191
nth == 4 is sampled: 50554 out of 57102
nth == 3 is sampled: 22554 out of 23998
nth == 2 is sampled: 12896 out of 41997
nth == 1 is sampled: 99293 out of 99293


#### The data for Table 1 of the paper

In [178]:
prop_1 = np.array([len(sample_1[i+1]) for i in range(len(sample_1))])
prop_1 = prop_1/np.sum(prop_1)

In [179]:
### Random truncation (sampling) for validation sample
random.seed(12345)
sample_2 = {}
diff = 0
for nth in range(17,0,-1):
    vss = int(valid_ss[nth]) + diff
    pool_size = len(nth_dict_2_cp[nth])
    if pool_size < vss:
        sample_2[nth] = nth_dict_2_cp[nth]
        diff = vss - pool_size
    else:
        sample_2[nth] = set(random.sample(nth_dict_2_cp[nth], vss))    
    for i in range(nth,0,-1):
        nth_dict_2_cp[i] = nth_dict_2_cp[i].difference(sample_2[nth])
    print("nth == {0} is sampled: {1} out of {2}".format(nth,min(vss, pool_size),pool_size))

nth == 17 is sampled: 140 out of 1467
nth == 16 is sampled: 744 out of 1836
nth == 15 is sampled: 353 out of 1169
nth == 14 is sampled: 199 out of 1573
nth == 13 is sampled: 1126 out of 2207
nth == 12 is sampled: 587 out of 1353
nth == 11 is sampled: 331 out of 2072
nth == 10 is sampled: 1958 out of 3001
nth == 9 is sampled: 986 out of 1947
nth == 8 is sampled: 504 out of 2914
nth == 7 is sampled: 3221 out of 4495
nth == 6 is sampled: 1510 out of 2719
nth == 5 is sampled: 873 out of 4308
nth == 4 is sampled: 5615 out of 6349
nth == 3 is sampled: 2505 out of 2730
nth == 2 is sampled: 1432 out of 4591
nth == 1 is sampled: 11031 out of 11031


In [180]:
prop_2 = np.array([len(sample_2[i+1]) for i in range(len(sample_2))])
prop_2 = prop_2/np.sum(prop_2)

In [181]:
prop_df = pd.DataFrame({"nth_term": range(1,18),
                        "crnt_cohort": np.round(truncation_ss.prop,4), 
                        "truncated_train": np.round(prop_1,4),
                        "truncated_validation": np.round(prop_2,4)})\
.loc[:,['nth_term', 'crnt_cohort', 'truncated_train', 'truncated_validation']]
print(prop_df)

    nth_term  crnt_cohort  truncated_train  truncated_validation
0          1       0.3330           0.3330                0.3331
1          2       0.0433           0.0433                0.0432
2          3       0.0757           0.0756                0.0756
3          4       0.1696           0.1696                0.1696
4          5       0.0263           0.0264                0.0264
5          6       0.0456           0.0456                0.0456
6          7       0.0973           0.0973                0.0973
7          8       0.0152           0.0152                0.0152
8          9       0.0298           0.0298                0.0298
9         10       0.0591           0.0591                0.0591
10        11       0.0100           0.0100                0.0100
11        12       0.0177           0.0177                0.0177
12        13       0.0340           0.0340                0.0340
13        14       0.0060           0.0060                0.0060
14        15       0.0107

In [182]:
prop_df.to_csv(fpath + "proportion_after_truncation.csv", index=False)

#### Find the end term of each observation after truncation

In [183]:
new_nth_dict = {}
s1 = 0
for k,v in sample_1.items():
    s1 += len(v)
    for vccsid in v:
        new_nth_dict[vccsid] = k
s2 = 0
for k,v in sample_2.items():
    s2 += len(v)
    for vccsid in v:
        new_nth_dict[vccsid] = k
print(s1,s2)

298139 33115


In [184]:
old_nth_df = enrolled_nth.groupby(['vccsid', 'first_nonde_strm']).agg({'nth':'max'}).reset_index()
new_nth_df = pd.DataFrame.from_dict(new_nth_dict, orient="index").reset_index().rename(columns={0:"new_nth", 'index':'vccsid'})
final_nth_df = old_nth_df.merge(new_nth_df, on=['vccsid'], how='inner').merge(valid_ind, on=['vccsid'], how='inner').sort_values(['vccsid'])

In [185]:
final_nth_df.loc[:,'truncated'] = final_nth_df.apply(lambda r: int(r.loc['nth'] > r.loc['new_nth']), axis=1)

In [186]:
final_nth_df.groupby(['valid']).agg({'truncated':'mean'})

Unnamed: 0_level_0,truncated
valid,Unnamed: 1_level_1
0.0,0.419821
1.0,0.4183


In [187]:
print(np.round(final_nth_df[final_nth_df.valid==0].new_nth.mean(), 2),
      np.round(final_nth_df[final_nth_df.valid==1].new_nth.mean(), 2))

4.83 4.83


In [188]:
final_nth_df = final_nth_df.drop(['nth'], axis=1)

In [189]:
final_nth_df.loc[:,'yr'] = (final_nth_df.new_nth-1) // 3
final_nth_df.loc[:,'t'] = (final_nth_df.new_nth-1) % 3

In [190]:
final_nth_df.loc[:,'last_term'] = final_nth_df.first_nonde_strm + 10*final_nth_df.yr

In [191]:
final_nth_df.loc[:,'last_term'] = final_nth_df.last_term + final_nth_df.t

In [192]:
final_nth_df.loc[:,'last_term'] = final_nth_df.last_term.apply(lambda x: x+7 if x % 10 == 5 or x % 10 == 6 else x)

In [193]:
final_nth_df = final_nth_df.loc[:,['vccsid', 'first_nonde_strm', 'last_term', 'truncated','new_nth']]\
.rename(columns={'new_nth':'nth'})

In [194]:
final_nth_df.sort_values(['vccsid']).to_stata(fpath + "truncation_nth_df.dta", write_index=False)

In [195]:
Counter(final_nth_df.nth)

Counter({1: 110324,
         2: 14328,
         3: 25059,
         4: 56169,
         5: 8729,
         6: 15100,
         7: 32223,
         8: 5045,
         9: 9865,
         10: 19586,
         11: 3315,
         12: 5872,
         13: 11268,
         14: 1992,
         15: 3536,
         16: 7442,
         17: 1401})

#### Sanity check: dropped students

In [196]:
enrolled_terms = pd.read_stata("C:\\Users\\ys8mz\\Box Sync\\Predictive Models of College Completion (VCCS)\\dta\\student_level_sample_and_outcomes.dta").loc[:,['vccsid', 'first_nonde_strm', 'first_degree_strm']]

In [197]:
enrolled_nth_1_new = valid_ind[valid_ind.valid == 0].merge(enrolled_nth, on=['vccsid'], how='left')
enrolled_nth_1_new = enrolled_nth_1_new[pd.isnull(enrolled_nth_1_new.nth)].loc[:,['vccsid']]
enrolled_nth_1_new = enrolled_nth_1_new.merge(enrolled_terms, on=['vccsid'], how='left')
assert (enrolled_nth_1_new.first_nonde_strm == enrolled_nth_1_new.first_degree_strm).all()

In [198]:
enrolled_nth_2_new = valid_ind[valid_ind.valid == 1].merge(enrolled_nth, on=['vccsid'], how='left')
enrolled_nth_2_new = enrolled_nth_2_new[pd.isnull(enrolled_nth_2_new.nth)].loc[:,['vccsid']]
enrolled_nth_2_new = enrolled_nth_2_new.merge(enrolled_terms, on=['vccsid'], how='left')
assert (enrolled_nth_2_new.first_nonde_strm == enrolled_nth_2_new.first_degree_strm).all()