<b>Name:</b> find_prior_terms_gpa_and_enrl_intensity.ipynb <br>
<b>Author:</b> Yifeng Song <br>
<b>Purpose:</b> Reorganize the term-specific GPA and enrollment intensity data into dictionaries that map each student x term to the lists of that student's prior-term GPA values and prior-term enrollment intensity values, so that they can be processed later to compute the trendlines (slopes) of term GPA and term enrollment intensity, which are used as predictors in the grade prediction models.

In [3]:
import pandas as pd
import numpy as np
import pickle
import getpass

# Directory that holds the input .dta file read below and receives the pickled results written at the end.
# NOTE(review): this is an absolute, machine-specific Windows path; it must be edited before running elsewhere.
fpath = "C:\\Users\\ys8mz\\Box Sync\\Clickstream\\data\\full"

In [4]:
# Read the student x term table of term enrollment intensity and term GPA (one row per actively enrolled term).
# The source file was created by the Stata script "processing_additional.do".
term_lvl_path = "{}\\term_lvl_gpa_enrl_intensity.dta".format(fpath)
kept_columns = ['vccsid', 'strm', 'term_credits_attempted', 'term_gpa', 'term_num']
# Keep only the columns needed downstream, in this exact order
df = pd.read_stata(term_lvl_path).loc[:, kept_columns]
df.shape

(1223978, 5)

In [5]:
# For each student x term, create the list of term_gpa values within all prior terms with respect to the "current" term.
# Key:   vccsid + str(strm)
# Value: the non-missing term_gpa values from that student's earlier rows, in row order
#        (an empty list for the student's first row).
# Assumes df has been sorted by vccsid and then strm, so each student's rows are contiguous and
# a change in vccsid marks the start of the next student.
results_dict_1 = {} # Use a dictionary to store the values, mapping each student x term to the list of all prior term GPA values
crnt_id = ""
prior_gpa = []  # non-missing term GPA values seen so far for the current student

# Pull the columns out once: a scalar df.iloc[i, j] lookup on every one of the ~1.2M rows is very slow.
id_values = df['vccsid'].values
strm_values = df['strm'].values
gpa_values = df['term_gpa'].tolist()  # plain Python scalars, same as iterating the Series

for i, (vccsid, strm, term_gpa) in enumerate(zip(id_values, strm_values, gpa_values)):
    if i % 100000 == 0:
        print(i)  # progress indicator
    if vccsid != crnt_id:
        # First row of a new student: there are no prior terms yet
        crnt_id = vccsid
        prior_gpa = []
    # Store a copy so that appending later terms does not alter the lists already recorded
    results_dict_1[vccsid + str(strm)] = list(prior_gpa)
    if not pd.isnull(term_gpa):  # exclude the term where term GPA value is missing
        prior_gpa.append(term_gpa)

0
100000
200000
300000
400000
500000
600000
700000
800000
900000
1000000
1100000
1200000


In [6]:
# For each student x term, create the list of term enrollment intensity values within all prior terms with respect to the "current" term.
# Key:   vccsid + str(strm)
# Value: the non-missing term_credits_attempted values from that student's earlier rows, in row order
#        (an empty list for the student's first row).
# Assumes df has been sorted by vccsid and then strm, so each student's rows are contiguous and
# a change in vccsid marks the start of the next student.
results_dict_2 = {} # Use a dictionary to store the values, mapping each student x term to the list of all prior term enrollment intensity values
crnt_id = ""
prior_intensity = []  # non-missing enrollment intensity values seen so far for the current student

# Pull the columns out once: a scalar df.iloc[i, j] lookup on every one of the ~1.2M rows is very slow.
id_values = df['vccsid'].values
strm_values = df['strm'].values
intensity_values = df['term_credits_attempted'].tolist()  # plain Python scalars, same as iterating the Series

for i, (vccsid, strm, term_intensity) in enumerate(zip(id_values, strm_values, intensity_values)):
    if i % 100000 == 0:
        print(i)  # progress indicator
    if vccsid != crnt_id:
        # First row of a new student: there are no prior terms yet
        crnt_id = vccsid
        prior_intensity = []
    # Store a copy so that appending later terms does not alter the lists already recorded
    results_dict_2[vccsid + str(strm)] = list(prior_intensity)
    if not pd.isnull(term_intensity):  # exclude the term where term enrollment intensity value is missing
        prior_intensity.append(term_intensity)

0
100000
200000
300000
400000
500000
600000
700000
800000
900000
1000000
1100000
1200000


In [7]:
# Save the dictionaries to file for later retrieval.
# Each dictionary is written as a list of (key, value) tuples, so a reader can rebuild it with dict(...).
# The files are opened in `with` blocks so the handles are closed (and the data flushed) as soon as
# each dump finishes, instead of being left open until garbage collection.
results_1 = list(results_dict_1.items())
with open("{}\\results_1.p".format(fpath), "wb") as out_file:
    pickle.dump(results_1, out_file)

results_2 = list(results_dict_2.items())
with open("{}\\results_2.p".format(fpath), "wb") as out_file:
    pickle.dump(results_2, out_file)