This script creates the subject-specific predictors: interactions between the subject cluster of the target course and the subject clusters of the courses the student took previously.

In [7]:
import pandas as pd
import numpy as np

In [4]:
# Load the two Stata extracts used throughout this notebook:
# df1 comes from LMS_data_new.dta and df2 from prior_courses.dta.
lms_path = "~\\Box Sync\\Clickstream\\data\\LMS_data_new.dta"
prior_courses_path = "~\\Box Sync\\Clickstream\\data\\prior_courses.dta"
df1 = pd.read_stata(lms_path)
df2 = pd.read_stata(prior_courses_path)

In [8]:
# Gather the subject prefix (the text before the first "_" of the course
# name) for every course row in either dataset, then keep the sorted
# distinct prefixes.
all_subjects = np.unique(
    [
        course_name.split("_")[0]
        for course_name in list(df1.course) + list(df2.course)
    ]
)

In [9]:
# Display the distinct subject prefixes so they can be grouped into clusters by hand.
all_subjects

array(['ACC', 'ADJ', 'AGR', 'AIR', 'ARA', 'ARC', 'ARO', 'ART', 'ASL',
       'AST', 'AUT', 'BCS', 'BIO', 'BLD', 'BSK', 'BUS', 'CAD', 'CHD',
       'CHI', 'CHM', 'COS', 'CRF', 'CSC', 'CSP', 'CST', 'DAN', 'DIT',
       'DMS', 'DNA', 'DNH', 'DRF', 'ECO', 'EDE', 'EDU', 'EGR', 'ELE',
       'EMS', 'EMT', 'ENE', 'ENF', 'ENG', 'ENV', 'EQU', 'ESL', 'ETR',
       'FIN', 'FIR', 'FOR', 'FRE', 'FST', 'GEO', 'GER', 'GIS', 'GOL',
       'HCT', 'HIM', 'HIS', 'HIT', 'HLT', 'HMS', 'HRI', 'HRT', 'HUM',
       'IND', 'IST', 'ITA', 'ITD', 'ITE', 'ITN', 'ITP', 'JPN', 'KOR',
       'LAT', 'LBR', 'LGL', 'MAC', 'MAR', 'MCR', 'MDE', 'MDL', 'MEC',
       'MEN', 'MKT', 'MSC', 'MTE', 'MTH', 'MTT', 'MUS', 'NAS', 'NSG',
       'NUR', 'OCT', 'PED', 'PHI', 'PHT', 'PHY', 'PLS', 'PNE', 'PNG',
       'PNT', 'PSG', 'PSY', 'PTH', 'RAD', 'REA', 'REL', 'RPK', 'RTH',
       'RUS', 'RVH', 'SAF', 'SCM', 'SCT', 'SDV', 'SOC', 'SPA', 'SPD',
       'SSC', 'STD', 'SUR', 'TEL', 'TRK', 'TRV', 'VEN', 'VET', 'WEL'],
      dtype='<U3')

In [13]:
# Hand-built grouping of course-subject prefixes into ten broader clusters.
# Keys are cluster labels; values are the subject prefixes assigned to each.
subj_dict = {'BUS': ['ACC', 'BUS', 'MKT', 'HRI', 'FIN', 'REA'],
             'ART': ['ART', 'DAN', 'MUS', 'HRT', 'CRF', 'PHT'],
             'SCI': ['BIO', 'GOL', 'PHY', 'NAS', 'CHM', 'SCT'],
             'EGR': ['EGR', 'CSC', 'ITD', 'ITE', 'ITN', 'ITP',
                     'HIM', 'ENE', 'GIS', 'ENV', 'HIT', 'ARC',
                     'ETR', 'AST', 'IND', 'IST', 'DRF', 'CAD',
                     'TEL', 'MEC'],
             'MTH': ['MTE', 'MTH', 'MTT', 'MCR', 'MDE'],
             'SOC': ['EDU', 'SOC', 'ECO', 'GEO', 'HIS', 'PSY',
                     'PLS', 'ADJ', 'LGL', 'CHD', 'SSC'],
             'MED': ['EMS', 'NUR', 'HLT', 'PED', 'DMS', 'RAD',
                     'EMT', 'PNE', 'DNA', 'SUR', 'NSG', 'HCT',
                     'DNH', 'OCT', 'CSP', 'DIT', 'PSG', 'PNG',
                     'PTH', 'MEN', 'MDL', 'RTH'],
             'HUM': ['HUM', 'CST', 'PHI', 'REL', 'ENG', 'ENF',
                     'EDE', 'HMS', 'SDV', 'SPD', 'STD', 'SCM'],
             'FLA': ['ASL', 'ARA', 'FRE', 'GER', 'JPN', 'KOR',
                     'ITA', 'LAT', 'SPA', 'CHI', 'ESL', 'RUS'],
             'OCC': ['FST', 'AGR', 'AIR', 'ARO', 'AUT', 'FOR',
                     'FIR', 'WEL', 'VET', 'EQU', 'LBR', 'BCS',
                     'BLD', 'BSK', 'COS', 'VEN', 'TRK', 'TRV',
                     'PNT', 'MSC', 'MAR', 'MAC', 'ELE', 'RVH',
                     'RPK', 'SAF']}

# Reverse lookup: subject prefix -> cluster label.
subj_inverse_dict = {
    subject: cluster
    for cluster, subjects in subj_dict.items()
    for subject in subjects
}

# The mapping must cover exactly the subjects present in the data. Comparing
# the sets (rather than only their sizes) catches a mapping that has the right
# number of entries but the wrong members, and names the offending prefixes
# instead of failing later with a bare KeyError during the cluster lookup.
unmapped_subjects = sorted(set(all_subjects) - set(subj_inverse_dict))
unknown_subjects = sorted(set(subj_inverse_dict) - set(all_subjects))
assert not unmapped_subjects and not unknown_subjects, (
    f"subj_dict does not match the subjects in the data: "
    f"missing from subj_dict={unmapped_subjects}, "
    f"not present in data={unknown_subjects}"
)

In [28]:
# Annotate every row of df2 with its subject prefix (text before the first
# "_" of the course name), the cluster that prefix belongs to, and the
# grade points earned (credit multiplied by estimated grade).
df2.loc[:, 'subject'] = df2['course'].apply(lambda course_name: course_name.split("_")[0])
df2.loc[:, 'cluster'] = df2['subject'].apply(lambda subject: subj_inverse_dict[subject])
df2.loc[:, 'grade_point'] = df2['credit'].mul(df2['est_grade'])

In [26]:
# For each hard-coded term code, summarise every student's earlier coursework
# per subject cluster as a credit-weighted mean grade.
# cluster_dict maps "<vccsid>_<strm>" -> [(cluster, weighted mean grade), ...].
cluster_dict = {}
for strm in [2193, 2194, 2203, 2204, 2212]:
    # Only rows from terms strictly before this term are aggregated.
    df2_sub = df2[df2.strm < strm]
    df2_sub_agg = (
        df2_sub.groupby(['vccsid', 'cluster'])
        .agg({'credit': 'sum', 'grade_point': 'sum'})
        .reset_index()
    )
    # Credit-weighted mean: total grade points divided by total credits.
    df2_sub_agg['est_grade'] = df2_sub_agg.grade_point / df2_sub_agg.credit

    key_suffix = "_" + str(strm)
    # Iterate the columns in lockstep instead of indexing row by row with
    # .iloc, and let setdefault create the per-student list on first use.
    for vccsid, cluster, est_grade in zip(df2_sub_agg.vccsid,
                                          df2_sub_agg.cluster,
                                          df2_sub_agg.est_grade):
        cluster_dict.setdefault(vccsid + key_suffix, []).append((cluster, est_grade))

In [31]:
# Annotate every row of df1 with its subject prefix (text before the first
# "_" of the course name) and the cluster that prefix belongs to.
df1.loc[:, 'subject'] = df1['course'].apply(lambda course_name: course_name.split("_")[0])
df1.loc[:, 'cluster'] = df1['subject'].apply(lambda subject: subj_inverse_dict[subject])

In [37]:
# Build the interaction features for every row of df1.
# full_cluster_dict maps "<vccsid>-<strm>-<course>-<section>" to a dict that
# holds, for each (prior cluster P, grade) pair stored in cluster_dict under
# "<vccsid>_<strm>":
#   "<P>_<cluster of this row>"        -> 1 (indicator)
#   "<P>_<cluster of this row>_grade"  -> the stored grade for P
# Rows with no entry in cluster_dict map to an empty dict.
full_cluster_dict = {}
for vccsid, strm, course, section, cluster in zip(df1.vccsid, df1.strm,
                                                  df1.course, df1.section,
                                                  df1.cluster):
    features = {}
    for prior_cluster, prior_grade in cluster_dict.get(vccsid + "_" + str(strm), []):
        interaction = prior_cluster + "_" + cluster
        features[interaction] = 1
        features[interaction + "_grade"] = prior_grade
    # The "-"-joined key format is relied upon when the dict is turned into a
    # DataFrame, so it must stay exactly as is.
    full_cluster_dict["-".join([vccsid, str(strm), course, section])] = features

In [69]:
# Assemble the wide feature table: one row per row of df1, with the four
# identifier columns first and one column per interaction feature after them.
# Features a row does not have are filled with 0.
key_columns = ['vccsid', 'strm', 'course', 'section']
full_cluster_df = pd.DataFrame.from_dict(full_cluster_dict, orient='index')

# Recompute each df1 row's dictionary key (the same "-".join format used when
# full_cluster_dict was filled) and align on it directly. This avoids parsing
# the identifiers back out of the key with split("-"), which would misalign
# rows if any identifier itself contained a hyphen, and avoids casting the
# parsed term code back to int.
row_keys = [
    "-".join([vccsid, str(strm), course, section])
    for vccsid, strm, course, section in zip(*(df1[col] for col in key_columns))
]
# The dict keys are unique, so reindex may safely repeat rows for duplicate
# df1 rows and yields all-NaN rows for keys with no features.
feature_rows = full_cluster_df.reindex(row_keys).reset_index(drop=True)
identifier_rows = df1.loc[:, key_columns].reset_index(drop=True)
full_cluster_df = pd.concat([identifier_rows, feature_rows], axis=1).fillna(0)

In [72]:
# Write the feature table to CSV without the row index.
output_path = "~\\Box Sync\\Clickstream\\data\\cluster_specific_predictors.csv"
full_cluster_df.to_csv(output_path, index=False)