<b>Name:</b> cum_gpa_by_term.ipynb <br>
<b>Author:</b> Yifeng Song <br>
<b>Purpose:</b> Calculate the aggregated cumulative GPA value by student x term for all students included in the study sample.

In [1]:
import os
import getpass
import pandas as pd
import numpy as np
import gc
import subprocess

# Root folder holding the input/output data files, located under the user's home directory.
# Each path component is passed separately so os.path.join inserts the separator
# that is correct for the current operating system (the result on Windows is unchanged).
home_dir = os.path.expanduser('~')
fpath = os.path.join(home_dir, 'Box Sync', 'Clickstream', 'data')

In [2]:
# Load the merged GPA data and keep only the columns needed below.
# Note: in the future we might use an input data file different from the one created by
# the script "A1: create_merged_class_and_gpa.do", because for the students in the study sample
# for course recommendation we must use the most up-to-date data, while the Merge_Class.dta used
# for building grade prediction models on the historical VCCS data doesn't have to be updated.
df = pd.read_stata(os.path.join(fpath, "Merged_GPA.dta"))
df = df.loc[:, ['vccsid', 'institution', 'collnum', 'strm', 'cum_gpa', 'tot_taken_prgrss']]
# Sorted list of every distinct term value that appears in the data
all_terms = sorted(np.unique(df.strm))

296


In [3]:
# Construct the nested dictionary 'cum_gpa_dict':
#   primary key   -> "<vccsid>-<institution>" (one entry per student x college)
#   secondary key -> term number (strm)
#   value         -> (tot_taken_prgrss, cum_gpa) pair recorded for that student x college x term
# The column arrays are iterated in lockstep instead of doing a scalar .iloc lookup per cell,
# which avoids the per-access indexing overhead on every row. If the same
# student x college x term appears more than once, the last row wins (as before).
cum_gpa_dict = {}
rows = zip(
    df.vccsid.values,
    df.institution.values,
    df.strm.values,
    df.tot_taken_prgrss.values,
    df.cum_gpa.values,
)
for i, (stuid, college, term, credits_taken, cum_gpa) in enumerate(rows):
    if i % 100000 == 0:  # run garbage collection every 100000 rows, as the original author did to limit memory growth
        gc.collect()
    stuid_college = stuid + "-" + college
    cum_gpa_dict.setdefault(stuid_college, {})[term] = (credits_taken, cum_gpa)
del rows
del df
gc.collect()

7

In [4]:
# Build "cum_gpa_dict_2": the same student x college -> term -> (credits, GPA) mapping,
# but with an entry for every term in all_terms.
#   * If the earliest term is absent for a student x college, it is seeded with (0,0),
#     i.e. 0 credits attempted and 0 GPA.
#   * Every later term that is absent inherits the pair of the term immediately before it,
#     so terms before the first enrollment stay (0,0) and stop-out terms carry the most
#     recent values forward -- this makes the aggregated cumulative GPA easier to compute.
first_term = all_terms[0]
cum_gpa_dict_2 = {}
for n_done, (stu_college, by_term) in enumerate(cum_gpa_dict.items()):
    if n_done % 100000 == 0:
        gc.collect()
    filled = dict(by_term)
    filled.setdefault(first_term, (0,0))
    # Walk consecutive (previous, current) term pairs in chronological order
    for prev_term, next_term in zip(all_terms, all_terms[1:]):
        if next_term not in filled:
            filled[next_term] = filled[prev_term]
    cum_gpa_dict_2[stu_college] = filled
del cum_gpa_dict
gc.collect()

0

In [5]:
# Regroup by student: "cum_gpa_dict_3" maps student_id -> term number -> list of
# (cumulative credits, cumulative GPA) pairs, one pair per college entry of that student.
# The student id is the part of the "<vccsid>-<institution>" key before the first "-".
cum_gpa_dict_3 = {}
for n_done, (stu_college, by_term) in enumerate(cum_gpa_dict_2.items()):
    if n_done % 100000 == 0:
        gc.collect()
    student = stu_college.split("-", 1)[0]
    per_term_lists = cum_gpa_dict_3.get(student)
    if per_term_lists is None:
        # First college seen for this student: start a one-element list per term
        cum_gpa_dict_3[student] = {term: [pair] for term, pair in by_term.items()}
        continue
    # Additional college for a known student: extend the existing per-term lists
    for term, pair in by_term.items():
        per_term_lists[term].append(pair)
del cum_gpa_dict_2
gc.collect()

0

In [6]:
def calc_agg_cum_gpa(l):
    """Return the aggregated cumulative GPA of a student for one term.

    If the student has records at more than one institution, the result is the
    credit-weighted average of the cumulative GPAs of all those institutions,
    even if the student is only actively enrolled in one of them.

    Parameters
    ----------
    l : list of (credits, gpa) pairs
        One pair per college: the cumulative credits and the cumulative GPA.

    Returns
    -------
    float
        The aggregated GPA rounded to 3 decimals, or NaN when there is nothing
        to average (empty input, or no credits in total).
    """
    a = np.array(l, dtype=float)
    if a.size == 0:
        # No college records at all: nothing to average
        return np.nan
    weights = a[:, 0]
    gpas = a[:, 1]
    if a.shape[0] == 1:
        # Single college: its GPA is only meaningful once credits were taken
        r = gpas[0] if weights[0] > 0 else np.nan
    else:
        # Rows with zero credits carry no weight; drop them so that a missing (NaN)
        # GPA on such a row cannot turn the whole weighted average into NaN
        counted = weights != 0
        total_credits = np.sum(weights[counted])
        if total_credits == 0:
            r = np.nan
        else:
            r = np.sum(weights[counted] * gpas[counted]) / total_credits
    if pd.isnull(r):
        return r
    return round(r, 3)

In [7]:
# Build the nested dictionary "agg_cum_gpa": student_id -> term number -> a single
# aggregated cumulative GPA value, obtained by passing each student x term list of
# (credits, GPA) pairs through calc_agg_cum_gpa
agg_cum_gpa = {}
for n_done, (student, pairs_by_term) in enumerate(cum_gpa_dict_3.items()):
    if n_done % 100000 == 0:
        gc.collect()
    gpa_by_term = {}
    for term, pairs in pairs_by_term.items():
        gpa_by_term[term] = calc_agg_cum_gpa(pairs)
    agg_cum_gpa[student] = gpa_by_term
del cum_gpa_dict_3
gc.collect()

0

In [8]:
# Turn the student -> term -> GPA dictionary into a wide table (one row per student,
# one column per term) and save it to file in the .dta format
final_df = pd.DataFrame.from_dict(agg_cum_gpa, orient="index")
term_order = sorted(final_df.columns.values)  # columns are ordered by term
final_df = final_df.loc[:, term_order]
gc.collect()
# Prefix every term column with "term_"
final_df.columns = ["term_" + str(term) for term in final_df.columns.values]
# Move the student id out of the index into a regular 'vccsid' column
final_df = final_df.reset_index().rename(columns={'index': 'vccsid'})
out_path = os.path.join(fpath, "agg_cum_gpa_by_term.dta")
final_df.to_stata(out_path, write_index=False)