## Part 1: Generating Training and Testing Data
Before we can get started with differential privacy models, we first need to get the data to train and test the models.  
To do that, we will take the grade distributions from the UW-Madison Registrar's office and randomly sample from them to generate synthetic student transcripts.

In [1]:
# Directory (relative to the notebook's working directory) that read_file_into_pd loads CSV files from.
DATA_FOLDER = './data'

In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
import random

In [3]:
# Apply seaborn's default theme so the figures drawn below share one consistent style.
sns.set()

### Reading the Dataset

In [4]:
def plot():
    """Open a new 8x5 inch matplotlib figure rendered at 200 DPI.

    Subsequent pyplot/seaborn calls draw onto this figure.
    """
    figure_options = {"figsize": (8, 5), "dpi": 200}
    plt.figure(**figure_options)

def read_file_into_pd(file_name):
    """Load the CSV file named `file_name` from DATA_FOLDER into a DataFrame."""
    csv_path = os.path.join(DATA_FOLDER, file_name)
    return pd.read_csv(csv_path)

In [5]:
# Load the grade, offering and course tables from DATA_FOLDER.
grade_distributions = read_file_into_pd('grade_distributions.csv')
course_offerings = read_file_into_pd('course_offerings.csv')
courses = read_file_into_pd('courses.csv')
# NOTE: unlike the files above, cs.csv is read relative to the working directory
# (not DATA_FOLDER); `names` supplies the label 'name' for its single column.
cs_courses = pd.read_csv('cs.csv', names=['name'])

In [6]:
# Preview the per-section grade counts, keyed by course_offering_uuid.
grade_distributions.head()

Unnamed: 0,course_offering_uuid,section_number,a_count,ab_count,b_count,bc_count,c_count,d_count,f_count,s_count,u_count,cr_count,n_count,p_count,i_count,nw_count,nr_count,other_count
0,344b3ebe-da7e-314c-83ed-9425269695fd,1,105,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,f718e6cd-33f0-3c14-a9a6-834d9c3610a8,1,158,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
2,ea3b717c-d66b-30dc-8b37-964d9688295f,1,139,12,2,0,3,0,0,0,0,0,0,0,0,0,0,0
3,075da420-5f49-3dd0-93df-13e3c152e1b1,1,87,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
4,2b4e216d-a728-3713-8c7c-19afffc6b2fd,1,70,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0


In [7]:
# Preview the offerings table, which links each offering uuid to a course_uuid and term_code.
course_offerings.head()

Unnamed: 0,uuid,course_uuid,term_code,name
0,344b3ebe-da7e-314c-83ed-9425269695fd,a3e3e1c3-543d-3bb5-ae65-5f2aec4ad1de,1092,Cooperative Education Prog
1,f718e6cd-33f0-3c14-a9a6-834d9c3610a8,a3e3e1c3-543d-3bb5-ae65-5f2aec4ad1de,1082,Cooperative Education Prog
2,ea3b717c-d66b-30dc-8b37-964d9688295f,a3e3e1c3-543d-3bb5-ae65-5f2aec4ad1de,1172,Cooperative Education Prog
3,075da420-5f49-3dd0-93df-13e3c152e1b1,a3e3e1c3-543d-3bb5-ae65-5f2aec4ad1de,1114,Cooperative Education Prog
4,2b4e216d-a728-3713-8c7c-19afffc6b2fd,a3e3e1c3-543d-3bb5-ae65-5f2aec4ad1de,1104,Cooperative Education Prog


In [8]:
# Preview the course catalogue (uuid, name, number).
courses.head()

Unnamed: 0,uuid,name,number
0,a3e3e1c3-543d-3bb5-ae65-5f2aec4ad1de,Cooperative Education Program,1
1,c070a84f-648f-351d-9499-5d0e30ad02cc,Cooperative Education/Co-op in Life Sciences C...,1
2,e6b4b7ae-0e0b-3aa5-9d77-7fcd90c9cfa3,Cooperative Education Program,1
3,8f63bde1-ff7f-3fe7-9901-862908bf134c,Workshop in Dance Activity,1
4,f3541888-584a-3923-9ce7-6341ff3d84a1,Cooperative Education/Co-op in Agricultural & ...,1


In [9]:
# Preview the list of course names that transcripts are generated from.
cs_courses.head()

Unnamed: 0,name
0,PROGRAMMING I
1,INTRODUCTION TO COMPUTATION
2,DATA PROGRAMMING I
3,INTRODUCTION TO DISCRETE MATHEMATICS
4,DIGITAL SOCIETY: THE IMPACT OF COMPUTERS AND C...


In [10]:
# Maps the letter-grade count columns of `grade_distributions` to short grade labels,
# ordered from best (A) to worst (F).
distribution_renaming_dictionary = dict(
    a_count="A",
    ab_count="AB",
    b_count="B",
    bc_count="BC",
    c_count="C",
    d_count="D",
    f_count="F",
)

# Count columns that are not A-F letter grades and are excluded from distributions.
distribution_grades_to_drop = [
    "s_count",
    "u_count",
    "cr_count",
    "n_count",
    "p_count",
    "i_count",
    "nw_count",
    "nr_count",
    "other_count",
]

def get_ids_for_course_name(name):
    """Return the uuids of all rows in `courses` whose name equals `name`, ignoring case."""
    wanted = name.lower()
    name_matches = courses['name'].str.lower() == wanted
    return courses.loc[name_matches, 'uuid'].tolist()

def get_offering_ids_for_course_ids(ids):
    """Return the uuids of all offerings in `course_offerings` whose course_uuid is in `ids`."""
    belongs_to_course = course_offerings['course_uuid'].isin(ids)
    return course_offerings.loc[belongs_to_course, 'uuid'].tolist()

def get_ids_for_course_offering_name(name):
    """Return the uuids of all offerings in `course_offerings` named `name`, ignoring case."""
    wanted = name.lower()
    name_matches = course_offerings['name'].str.lower() == wanted
    return course_offerings.loc[name_matches, 'uuid'].tolist()

def get_grades_for_course_offerings(course_offering_ids):
    """Total the A-F grade counts over every section of the given offerings.

    Parameters
    ----------
    course_offering_ids : list-like
        Offering uuids to match against `grade_distributions.course_offering_uuid`.

    Returns
    -------
    pandas.Series
        Integer counts indexed by the grade labels of
        `distribution_renaming_dictionary` ('A', 'AB', ..., 'F'). All counts are
        zero when no section matches.
    """
    # Select the letter-grade columns explicitly *before* summing. Summing the whole
    # frame would also "sum" (concatenate) the string uuid column and would let any
    # column not named in a drop list leak into the distribution.
    grade_columns = list(distribution_renaming_dictionary)
    matching_sections = grade_distributions['course_offering_uuid'].isin(course_offering_ids)
    totals = grade_distributions.loc[matching_sections, grade_columns].sum()
    return totals.rename(distribution_renaming_dictionary)

def get_grade_distribution_for_course_name(name):
    """Return the summed grade counts for every offering of the courses named `name`.

    The lookup goes course name -> course uuids -> offering uuids -> grade counts.
    """
    return get_grades_for_course_offerings(
        get_offering_ids_for_course_ids(
            get_ids_for_course_name(name)
        )
    )

def get_grade_distribution_for_course_offering_name(name):
    """Return the summed grade counts for every offering whose own name is `name`."""
    return get_grades_for_course_offerings(get_ids_for_course_offering_name(name))

def plot_grade_distribution(distribution, course_name):
    """Draw a bar chart of `distribution` on a new figure titled `course_name`.

    `distribution` is a Series; its index labels become the x categories and
    its values the bar heights.
    """
    grade_labels = distribution.index
    grade_counts = distribution.values
    plot()
    plt.title(course_name)
    sns.barplot(x=grade_labels, y=grade_counts)
    
def sample_distribution(distribution):
    """Draw one label from `distribution` with probability proportional to its count.

    Parameters
    ----------
    distribution : pandas.Series
        Non-negative integer counts indexed by label (e.g. grade -> count).

    Returns
    -------
    The index label of the sampled bucket.

    Raises
    ------
    ValueError
        If the distribution is empty or its counts sum to zero, since there is
        nothing to sample from.
    """
    if len(distribution) == 0:
        raise ValueError("cannot sample from an empty distribution")

    cumulative_sum = distribution.cumsum()
    # The last cumulative value is the grand total; reading it positionally avoids
    # depending on the final bucket being labelled 'F'.
    total = int(cumulative_sum.iloc[-1])
    if total <= 0:
        raise ValueError("cannot sample from a distribution whose counts sum to zero")

    # Pick the r-th observation (1-based) and return the bucket that contains it.
    r = random.randint(1, total)
    for index, value in cumulative_sum.items():
        if value >= r:
            return index

In [11]:
def generate_transcripts(num_students=10_000, courses_per_student=8, seed=None):
    """Generate synthetic student transcripts and write them to 'transcripts.csv'.

    Each student is assigned `courses_per_student` distinct course names drawn
    uniformly at random from `cs_courses`. For every chosen course that has at
    least one recorded letter grade, a grade is sampled in proportion to that
    course's grade counts; chosen courses without any recorded grades, and courses
    that were not chosen, are left empty (NaN) in the output.

    Parameters
    ----------
    num_students : int
        Number of transcript rows to generate.
    courses_per_student : int
        Number of distinct courses drawn for each student.
    seed : int or None
        If given, seeds Python's `random` module so the output is reproducible.

    Raises
    ------
    ValueError
        If `cs_courses` holds fewer distinct names than `courses_per_student`
        (the draw loop could otherwise never finish).
    """
    if seed is not None:
        random.seed(seed)

    course_names = cs_courses['name'].tolist()
    if len(set(course_names)) < courses_per_student:
        raise ValueError(
            "need at least %d distinct courses, found %d"
            % (courses_per_student, len(set(course_names)))
        )

    # A course's grade distribution does not depend on the student, so compute it
    # once per course instead of once per draw.
    distribution_cache = {}

    records = []
    for i in range(num_students):
        chosen_courses = set()
        student_transcript = {}
        while len(chosen_courses) < courses_per_student:
            name = random.choice(course_names)
            if name in chosen_courses:
                continue
            chosen_courses.add(name)

            if name not in distribution_cache:
                distribution_cache[name] = get_grade_distribution_for_course_name(name)
            distribution = distribution_cache[name]

            # Guard on the total number of recorded grades (not on the F count
            # alone) so courses that simply have no F grades still get sampled.
            if distribution.sum() > 0:
                student_transcript[name] = sample_distribution(distribution)
        records.append(student_transcript)

        if i % 100 == 0:
            print("%d/%d..." % (i, num_students))

    # Build the frame once from all records: DataFrame.append was removed in
    # pandas 2.0, and growing a frame row by row is quadratic.
    transcripts = pd.DataFrame(records, columns=cs_courses['name'])
    transcripts.to_csv('transcripts.csv')

In [12]:
# Generate a small batch of 500 transcripts (the function's default is 10,000).
generate_transcripts(num_students=500)

0/500...
100/500...
200/500...
300/500...
400/500...
