In [1]:
import pandas as pd
import numpy as np
import random
from collections import Counter

fpath = "C:\\Users\\ys8mz\\Box Sync\\Predictive Models of College Completion (VCCS)\\intermediate_files\\"

In [2]:
enrolled_nth = pd.read_stata(fpath+"enrolled_nth_alternative.dta")

In [3]:
valid_ind = pd.read_stata(fpath+"full_data_enrolled_terms.dta").loc[:,['vccsid','valid']]

In [4]:
Counter(valid_ind.valid)

Counter({0.0: 300144, 1.0: 33350})

In [5]:
enrolled_nth_1 = valid_ind[valid_ind.valid == 0].merge(enrolled_nth, on=['vccsid'], how='inner')
print("Training sample size:", len(np.unique(enrolled_nth_1.vccsid)))
enrolled_nth_2 = valid_ind[valid_ind.valid == 1].merge(enrolled_nth, on=['vccsid'], how='inner')
print("Validation sample size:", len(np.unique(enrolled_nth_2.vccsid)))

Training sample size: 298624
Validation sample size: 33161


#### Random truncation procedure for the training/validation sets

##### Note: Use the above number to calculate training/validation sample sizes corresponding to each nth_term (in "identify_truncated_sample_sizes.do")

In [6]:
initial_dict = {}
df1 = enrolled_nth.loc[:,['vccsid','first_nonde_strm']].drop_duplicates()
for i in range(df1.shape[0]):
    vccsid = df1.vccsid.iloc[i]
    fns = df1.first_nonde_strm.iloc[i]
    initial_dict[vccsid] = fns

In [7]:
nth_dict_1 = {i:set() for i in range(1,18)}
for i in range(enrolled_nth_1.shape[0]):
    vccsid = enrolled_nth_1.vccsid.iloc[i]
    nth = enrolled_nth_1.nth.iloc[i]
    nth_dict_1[nth].add(vccsid)

In [8]:
nth_dict_2 = {i:set() for i in range(1,18)}
for i in range(enrolled_nth_2.shape[0]):
    vccsid = enrolled_nth_2.vccsid.iloc[i]
    nth = enrolled_nth_2.nth.iloc[i]
    nth_dict_2[nth].add(vccsid)

In [9]:
nth_dict_1_cp = nth_dict_1.copy()
nth_dict_2_cp = nth_dict_2.copy()

In [10]:
truncation_ss = pd.read_stata(fpath+"truncation_sample_sizes_alternative.dta")
train_ss = {}
valid_ss = {}
for i in range(truncation_ss.shape[0]):
    nth_term = truncation_ss.nth_term.iloc[i]
    tss = truncation_ss.train_sample_size.iloc[i]
    vss = truncation_ss.valid_sample_size.iloc[i]
    train_ss[nth_term] = tss
    valid_ss[nth_term] = vss

In [11]:
### Random truncation (sampling) for training sample
random.seed(12345)
sample_1 = {}
diff = 0
for nth in range(17,0,-1):
    tss = int(train_ss[nth]) + diff
    pool_size = len(nth_dict_1_cp[nth])
    if pool_size < tss:
        sample_1[nth] = nth_dict_1_cp[nth]
        diff = tss - pool_size
    else:
        sample_1[nth] = set(random.sample(nth_dict_1_cp[nth], tss))    
    for i in range(nth,0,-1):
        nth_dict_1_cp[i] = nth_dict_1_cp[i].difference(sample_1[nth])
    print("nth == {0} is sampled: {1} out of {2}".format(nth,min(tss, pool_size),pool_size))

nth == 17 is sampled: 1486 out of 14190
nth == 16 is sampled: 6886 out of 17036
nth == 15 is sampled: 3265 out of 11318
nth == 14 is sampled: 2010 out of 15053
nth == 13 is sampled: 10241 out of 20065
nth == 12 is sampled: 5360 out of 14088
nth == 11 is sampled: 3167 out of 19540
nth == 10 is sampled: 17701 out of 27793
nth == 9 is sampled: 8909 out of 18058
nth == 8 is sampled: 4601 out of 26692
nth == 7 is sampled: 28924 out of 40802
nth == 6 is sampled: 13596 out of 24315
nth == 5 is sampled: 7889 out of 38326
nth == 4 is sampled: 50346 out of 57328
nth == 3 is sampled: 22492 out of 24211
nth == 2 is sampled: 12868 out of 42141
nth == 1 is sampled: 98883 out of 98883


In [12]:
prop_1 = np.array([len(sample_1[i+1]) for i in range(len(sample_1))])
prop_1 = prop_1/np.sum(prop_1)

In [13]:
### Random truncation (sampling) for validation sample
random.seed(12345)
sample_2 = {}
diff = 0
for nth in range(17,0,-1):
    vss = int(valid_ss[nth]) + diff
    pool_size = len(nth_dict_2_cp[nth])
    if pool_size < vss:
        sample_2[nth] = nth_dict_2_cp[nth]
        diff = vss - pool_size
    else:
        sample_2[nth] = set(random.sample(nth_dict_2_cp[nth], vss))    
    for i in range(nth,0,-1):
        nth_dict_2_cp[i] = nth_dict_2_cp[i].difference(sample_2[nth])
    print("nth == {0} is sampled: {1} out of {2}".format(nth,min(vss, pool_size),pool_size))

nth == 17 is sampled: 165 out of 1560
nth == 16 is sampled: 765 out of 1898
nth == 15 is sampled: 363 out of 1237
nth == 14 is sampled: 223 out of 1642
nth == 13 is sampled: 1137 out of 2252
nth == 12 is sampled: 595 out of 1450
nth == 11 is sampled: 352 out of 2174
nth == 10 is sampled: 1966 out of 3080
nth == 9 is sampled: 989 out of 2011
nth == 8 is sampled: 511 out of 2941
nth == 7 is sampled: 3212 out of 4566
nth == 6 is sampled: 1510 out of 2724
nth == 5 is sampled: 876 out of 4339
nth == 4 is sampled: 5591 out of 6366
nth == 3 is sampled: 2498 out of 2788
nth == 2 is sampled: 1429 out of 4653
nth == 1 is sampled: 10979 out of 10979


In [14]:
prop_2 = np.array([len(sample_2[i+1]) for i in range(len(sample_2))])
prop_2 = prop_2/np.sum(prop_2)

In [15]:
prop_df = pd.DataFrame({"nth_term": range(1,18),
                        "crnt_cohort": np.round(truncation_ss.prop,4), 
                        "truncated_train": np.round(prop_1,4),
                        "truncated_validation": np.round(prop_2,4)})\
.loc[:,['nth_term', 'crnt_cohort', 'truncated_train', 'truncated_validation']]
print(prop_df)

    nth_term  crnt_cohort  truncated_train  truncated_validation
0          1       0.3311           0.3311                0.3311
1          2       0.0431           0.0431                0.0431
2          3       0.0753           0.0753                0.0753
3          4       0.1686           0.1686                0.1686
4          5       0.0264           0.0264                0.0264
5          6       0.0455           0.0455                0.0455
6          7       0.0969           0.0969                0.0969
7          8       0.0154           0.0154                0.0154
8          9       0.0298           0.0298                0.0298
9         10       0.0593           0.0593                0.0593
10        11       0.0106           0.0106                0.0106
11        12       0.0179           0.0179                0.0179
12        13       0.0343           0.0343                0.0343
13        14       0.0067           0.0067                0.0067
14        15       0.0109

In [16]:
prop_df.to_csv(fpath + "proportion_after_truncation_alternative.csv", index=False)

#### Find the end term of each observation after truncation

In [17]:
new_nth_dict = {}
s1 = 0
for k,v in sample_1.items():
    s1 += len(v)
    for vccsid in v:
        new_nth_dict[vccsid] = k
s2 = 0
for k,v in sample_2.items():
    s2 += len(v)
    for vccsid in v:
        new_nth_dict[vccsid] = k
print(s1,s2)

298624 33161


In [18]:
old_nth_df = enrolled_nth.groupby(['vccsid', 'first_nonde_strm']).agg({'nth':'max'}).reset_index()
new_nth_df = pd.DataFrame.from_dict(new_nth_dict, orient="index").reset_index().rename(columns={0:"new_nth", 'index':'vccsid'})
final_nth_df = old_nth_df.merge(new_nth_df, on=['vccsid'], how='inner').merge(valid_ind, on=['vccsid'], how='inner').sort_values(['vccsid'])

In [19]:
final_nth_df.loc[:,'truncated'] = final_nth_df.apply(lambda r: int(r.loc['nth'] > r.loc['new_nth']), axis=1)

In [20]:
final_nth_df.groupby(['valid']).agg({'truncated':'mean'})

Unnamed: 0_level_0,truncated
valid,Unnamed: 1_level_1
0.0,0.428174
1.0,0.429028


In [21]:
print(np.round(final_nth_df[final_nth_df.valid==0].new_nth.mean(), 2),
      np.round(final_nth_df[final_nth_df.valid==1].new_nth.mean(), 2))

4.88 4.88


In [22]:
final_nth_df = final_nth_df.drop(['nth'], axis=1)

In [23]:
final_nth_df.loc[:,'yr'] = (final_nth_df.new_nth-1) // 3
final_nth_df.loc[:,'t'] = (final_nth_df.new_nth-1) % 3

In [24]:
final_nth_df.loc[:,'last_term'] = final_nth_df.first_nonde_strm + 10*final_nth_df.yr

In [25]:
final_nth_df.loc[:,'last_term'] = final_nth_df.last_term + final_nth_df.t

In [26]:
final_nth_df.loc[:,'last_term'] = final_nth_df.last_term.apply(lambda x: x+7 if x % 10 == 5 or x % 10 == 6 else x)

In [27]:
final_nth_df = final_nth_df.loc[:,['vccsid', 'first_nonde_strm', 'last_term', 'truncated','new_nth']]\
.rename(columns={'new_nth':'nth'})

In [28]:
final_nth_df.sort_values(['vccsid']).to_stata(fpath + "truncation_nth_df_alternative.dta", write_index=False)

In [29]:
Counter(final_nth_df.nth)

Counter({1: 109862,
         2: 14297,
         3: 24990,
         4: 55937,
         5: 8765,
         6: 15106,
         7: 32136,
         8: 5112,
         9: 9898,
         10: 19667,
         11: 3519,
         12: 5955,
         13: 11378,
         14: 2233,
         15: 3628,
         16: 7651,
         17: 1651})

#### Sanity check: dropped students

In [36]:
enrolled_terms = pd.read_stata("C:\\Users\\ys8mz\\Box Sync\\Predictive Models of College Completion (VCCS)\\dta\\student_level_sample_and_outcomes.dta")\
.loc[:,['vccsid', 'first_nonde_strm', 'deg_vccs_associate_strm', 'deg_vccs_certificate_strm', 'deg_vccs_diploma_strm']]
enrolled_terms.loc[:,'first_degree_strm'] =\
enrolled_terms.loc[:,['deg_vccs_associate_strm', 'deg_vccs_certificate_strm', 'deg_vccs_diploma_strm']].min(axis=1)
enrolled_terms.drop(['deg_vccs_associate_strm', 'deg_vccs_certificate_strm', 'deg_vccs_diploma_strm'], axis=1, inplace=True)

In [37]:
enrolled_nth_1_new = valid_ind[valid_ind.valid == 0].merge(enrolled_nth, on=['vccsid'], how='left')
enrolled_nth_1_new = enrolled_nth_1_new[pd.isnull(enrolled_nth_1_new.nth)].loc[:,['vccsid']]
enrolled_nth_1_new = enrolled_nth_1_new.merge(enrolled_terms, on=['vccsid'], how='left')
assert (enrolled_nth_1_new.first_nonde_strm == enrolled_nth_1_new.first_degree_strm).all()

In [38]:
enrolled_nth_2_new = valid_ind[valid_ind.valid == 1].merge(enrolled_nth, on=['vccsid'], how='left')
enrolled_nth_2_new = enrolled_nth_2_new[pd.isnull(enrolled_nth_2_new.nth)].loc[:,['vccsid']]
enrolled_nth_2_new = enrolled_nth_2_new.merge(enrolled_terms, on=['vccsid'], how='left')
assert (enrolled_nth_2_new.first_nonde_strm == enrolled_nth_2_new.first_degree_strm).all()