## Part 1: Generating Training and Testing Data
Before we can get started with differential privacy models, we first need to get the data to train and test the models.  
To do that, we will take the grade distributions from the UW-Madison Registrar's office and randomly sample them.

In [1]:
# Directory that holds the input CSV files; read_file_into_pd() resolves file names against it.
DATA_FOLDER = './data'

In [2]:
import pandas as pd
import os
import random
import numpy as np
import math

### Reading the Dataset

In [3]:
def read_file_into_pd(file_name):
    """Load a CSV file from ``DATA_FOLDER`` into a pandas DataFrame.

    Args:
        file_name: Name of the CSV file, relative to ``DATA_FOLDER``.

    Returns:
        The DataFrame produced by ``pd.read_csv`` with its default options.
    """
    csv_path = os.path.join(DATA_FOLDER, file_name)
    frame = pd.read_csv(csv_path)
    return frame

In [4]:
# Load the grade data set: per-section grade counts, the course offerings and the course catalogue.
grade_distributions = read_file_into_pd('grade_distributions.csv')
course_offerings = read_file_into_pd('course_offerings.csv')
courses = read_file_into_pd('courses.csv')
# names= supplies the single column name, so the first line of cs.csv is treated as data, not as a header.
# NOTE(review): unlike the files above, cs.csv is read from the working directory rather than DATA_FOLDER - confirm this is intended.
cs_courses = pd.read_csv('cs.csv', names=['name'])

In [5]:
# Preview the first rows: one row per course-offering section with a count column per grade.
grade_distributions.head()

Unnamed: 0,course_offering_uuid,section_number,a_count,ab_count,b_count,bc_count,c_count,d_count,f_count,s_count,u_count,cr_count,n_count,p_count,i_count,nw_count,nr_count,other_count
0,344b3ebe-da7e-314c-83ed-9425269695fd,1,105,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,f718e6cd-33f0-3c14-a9a6-834d9c3610a8,1,158,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
2,ea3b717c-d66b-30dc-8b37-964d9688295f,1,139,12,2,0,3,0,0,0,0,0,0,0,0,0,0,0
3,075da420-5f49-3dd0-93df-13e3c152e1b1,1,87,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
4,2b4e216d-a728-3713-8c7c-19afffc6b2fd,1,70,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0


In [6]:
# Preview the first rows: each offering links its own uuid to a course_uuid and a term_code.
course_offerings.head()

Unnamed: 0,uuid,course_uuid,term_code,name
0,344b3ebe-da7e-314c-83ed-9425269695fd,a3e3e1c3-543d-3bb5-ae65-5f2aec4ad1de,1092,Cooperative Education Prog
1,f718e6cd-33f0-3c14-a9a6-834d9c3610a8,a3e3e1c3-543d-3bb5-ae65-5f2aec4ad1de,1082,Cooperative Education Prog
2,ea3b717c-d66b-30dc-8b37-964d9688295f,a3e3e1c3-543d-3bb5-ae65-5f2aec4ad1de,1172,Cooperative Education Prog
3,075da420-5f49-3dd0-93df-13e3c152e1b1,a3e3e1c3-543d-3bb5-ae65-5f2aec4ad1de,1114,Cooperative Education Prog
4,2b4e216d-a728-3713-8c7c-19afffc6b2fd,a3e3e1c3-543d-3bb5-ae65-5f2aec4ad1de,1104,Cooperative Education Prog


In [7]:
# Preview the first rows of the course catalogue (uuid, name, number).
courses.head()

Unnamed: 0,uuid,name,number
0,a3e3e1c3-543d-3bb5-ae65-5f2aec4ad1de,Cooperative Education Program,1
1,c070a84f-648f-351d-9499-5d0e30ad02cc,Cooperative Education/Co-op in Life Sciences C...,1
2,e6b4b7ae-0e0b-3aa5-9d77-7fcd90c9cfa3,Cooperative Education Program,1
3,8f63bde1-ff7f-3fe7-9901-862908bf134c,Workshop in Dance Activity,1
4,f3541888-584a-3923-9ce7-6341ff3d84a1,Cooperative Education/Co-op in Agricultural & ...,1


In [8]:
# Preview the first rows of the list of CS course names.
cs_courses.head()

Unnamed: 0,name
0,PROGRAMMING I
1,INTRODUCTION TO COMPUTATION
2,DATA PROGRAMMING I
3,INTRODUCTION TO DISCRETE MATHEMATICS
4,DIGITAL SOCIETY: THE IMPACT OF COMPUTERS AND C...


### Helpers to query grade distributions and to sample them (optionally biased by per-grade weights)

In [9]:
# Dictionary to help convert the dataset's letter-grade count columns into easier to use grade labels.
# The keys are listed from A through F; the number of entries is also used as the number of bias weights.
distribution_renaming_dictionary = {
    "a_count": "A",
    "ab_count": "AB",
    "b_count": "B",
    "bc_count": "BC",
    "c_count": "C",
    "d_count": "D",
    "f_count": "F",
}


# Count columns for the grades that we're not interested in (everything other than A through F).
# A course whose students only received these grades has no letter grades at all and is eventually filtered out.
distribution_grades_to_drop = ["s_count","u_count","cr_count","n_count","p_count","i_count","nw_count","nr_count","other_count"]


# Functions to get data out of the dataset
def get_ids_for_course_name(name):
    """Return the uuids of every course in ``courses`` whose name equals ``name``, ignoring case."""
    wanted = name.lower()
    name_matches = courses['name'].str.lower() == wanted
    return courses.loc[name_matches, 'uuid'].tolist()

def get_offering_ids_for_course_ids(ids):
    """Return the uuids of all course offerings whose ``course_uuid`` is one of ``ids``."""
    belongs_to_course = course_offerings['course_uuid'].isin(ids)
    matching_offerings = course_offerings[belongs_to_course]
    return matching_offerings['uuid'].tolist()

def get_ids_for_course_offering_name(name):
    """Return the uuids of every course offering whose name equals ``name``, ignoring case."""
    wanted = name.lower()
    name_matches = course_offerings['name'].str.lower() == wanted
    return course_offerings.loc[name_matches, 'uuid'].tolist()

def get_grades_for_course_offerings(course_offering_ids):
    """Total the letter-grade counts over all sections of the given course offerings.

    Args:
        course_offering_ids: Iterable of course-offering uuids to include.

    Returns:
        A pandas Series indexed by grade label (the values of
        ``distribution_renaming_dictionary``, in that order) holding the summed counts.
        All counts are 0 when no section matches.
    """
    # Select only the letter-grade columns *before* summing. Summing the whole frame would also
    # concatenate every uuid string of the matching rows just to throw the result away, and this
    # function is called once per sampled course.
    grade_columns = list(distribution_renaming_dictionary)
    in_offerings = grade_distributions['course_offering_uuid'].isin(course_offering_ids)
    return grade_distributions.loc[in_offerings, grade_columns] \
        .sum() \
        .rename(distribution_renaming_dictionary)

def get_grade_distribution_for_course_name(name):
    """Return the summed grade distribution of every offering of the courses named ``name``.

    The lookup goes course name -> course uuids -> offering uuids -> summed grade counts.
    """
    return get_grades_for_course_offerings(
        get_offering_ids_for_course_ids(
            get_ids_for_course_name(name)
        )
    )

def get_grade_distribution_for_course_offering_name(name):
    """Return the summed grade distribution of every course offering named ``name``."""
    return get_grades_for_course_offerings(get_ids_for_course_offering_name(name))

def plot_grade_distribution(distribution, course_name):
    """Draw a bar chart of a grade distribution on a new figure.

    Args:
        distribution: pandas Series whose index holds the grade labels and whose values hold the counts.
        course_name: Text used as the chart title.
    """
    # Imported locally because the notebook's import cell does not load the plotting libraries;
    # without these names (and with the undefined plot() call) the function raised NameError.
    import matplotlib.pyplot as plt
    import seaborn as sns

    # Use an explicit Axes so the chart never lands on a figure left over from another cell.
    _, ax = plt.subplots()
    sns.barplot(x=distribution.index, y=distribution.values, ax=ax)
    ax.set_title(course_name)
    

# Functions to sample probability distributions
# All the functions follow a similar algorithm:
    # Each distribution contains a count for each element in the distribution
    # The cumulative sum of these counts is taken
    # A random integer no larger than the final (highest) cumulative sum is drawn from a uniform distribution
    # The first element whose cumulative sum reaches this integer is picked
# If a distribution is biased, then the algorithm is slightly modified:
    # Each count is first scaled up or down according to the bias and the weight of its element
    # Then the above algorithm is applied
def sample_distribution(distribution):
    """Draw one label from a count distribution with probability proportional to its count.

    Args:
        distribution: pandas Series mapping labels (e.g. grades) to non-negative integer counts.

    Returns:
        The label of the sampled element.

    Raises:
        ValueError: If the counts add up to zero, i.e. there is nothing to sample from.
    """
    cumulative_sum = distribution.cumsum()
    # The last cumulative value is the total count; this does not require a label named 'F'.
    total = int(cumulative_sum.iloc[-1]) if len(cumulative_sum) else 0
    if total <= 0:
        raise ValueError("Cannot sample from a distribution with no observations")
    # random.randint is inclusive on both ends. Drawing from 1..total gives every observation
    # exactly one slot; a draw of 0 would always select the first label, even if its count is 0.
    r = random.randint(1, total)
    for (index, value) in cumulative_sum.items():
        if value >= r:
            return index

def sample_courses(courses):
    """Draw one course with probability proportional to its number of students.

    Args:
        courses: DataFrame with a ``students`` column of non-negative integer counts.

    Returns:
        The index *label* (not the position) of the sampled row of ``courses``.

    Raises:
        ValueError: If ``courses`` is empty or its student counts add up to zero.
    """
    cumulative_sum = courses['students'].cumsum()
    # Take the total from the frame that was passed in instead of measuring a global frame.
    total = int(cumulative_sum.iloc[-1]) if len(cumulative_sum) else 0
    if total <= 0:
        raise ValueError("Cannot sample from courses with no students")
    # random.randint is inclusive on both ends. Drawing from 1..total gives every student exactly
    # one slot; a draw of 0 would always select the first row and over-weight it.
    r = random.randint(1, total)
    for (index, value) in cumulative_sum.items():
        if value >= r:
            return index

def sample_distribution_biased(distribution, bias, weights=None):
    """Draw one label from a count distribution after skewing the counts by a bias.

    Each count is replaced by ``floor(count + count * bias * weight)`` (never below 0), where
    ``weight`` is the entry of ``weights`` at the same position. The label is then drawn with
    probability proportional to the skewed counts.

    Args:
        distribution: pandas Series mapping labels (e.g. grades) to non-negative integer counts.
        bias: Multiplier applied to every weight; with the notebook's weights a positive bias
            favours the labels listed first and a negative bias favours the labels listed last.
        weights: One weight per label, in the order of ``distribution``. Defaults to the
            notebook-level ``weight_range``.

    Returns:
        The label of the sampled element.

    Raises:
        ValueError: If the original counts add up to zero, i.e. there is nothing to sample from.
    """
    if weights is None:
        weights = weight_range
    distribution_copy = distribution.copy()
    for ((index, value), weight) in zip(distribution.items(), weights):
        delta = value * bias * weight
        new_value = value + delta
        # Clamp at 0: a large bias must not produce negative counts, which would make the
        # cumulative sum non-monotonic.
        distribution_copy[index] = max(math.floor(new_value), 0)
    cumulative_sum = distribution_copy.cumsum()
    total = int(cumulative_sum.iloc[-1]) if len(cumulative_sum) else 0
    if total <= 0:
        # Flooring can wipe out every count of a small distribution; fall back to the unbiased
        # counts instead of returning a label nobody received.
        cumulative_sum = distribution.cumsum()
        total = int(cumulative_sum.iloc[-1]) if len(cumulative_sum) else 0
    if total <= 0:
        raise ValueError("Cannot sample from a distribution with no observations")
    # random.randint is inclusive on both ends. Drawing from 1..total gives every observation
    # exactly one slot; a draw of 0 would always select the first label, even if its count is 0.
    r = random.randint(1, total)
    for (index, value) in cumulative_sum.items():
        if value >= r:
            return index

### Setting the weightage for each grade, from A through F

In [10]:
# One weight per letter grade, evenly spaced from +weight_limit (first grade, A) to -weight_limit (last grade, F).
# sample_distribution_biased() changes each grade's count by count * bias * weight, so a positive bias
# increases the counts of the better grades and decreases the counts of the worse ones.
num_grades = len(distribution_renaming_dictionary)
weight_limit = 0.5
weight_range = np.linspace(start=weight_limit, stop=-weight_limit, num=num_grades)

### Drop all courses with no students

In [11]:
# Keep only the CS courses that have at least one letter grade on record.
# The rows are collected in a list and turned into a DataFrame once: DataFrame.append was removed
# in pandas 2.0 and copied the whole frame on every call.
course_rows = []
for course in cs_courses['name']:
    num_students = get_grade_distribution_for_course_name(course).sum()
    if (num_students != 0):
        course_rows.append({'name': course, 'students': num_students})
filtered_cs_courses = pd.DataFrame(course_rows, columns=['name', 'students'])

In [12]:
# Report how many CS courses were removed by the filter and how many remain.
print("Dropped %d courses after filtering." % (len(cs_courses) - len(filtered_cs_courses)))
print("Now using %d courses." % len(filtered_cs_courses))

Dropped 29 courses after filtering.
Now using 94 courses.


### Sort the filtered CS courses so we can sample them

In [13]:
# Order the courses from most to fewest students.
# NOTE: sort_values keeps the original index labels, so after this cell a row's label no longer
# equals its position (see the labels 52, 4, 2, ... in the preview below).
filtered_cs_courses = filtered_cs_courses.sort_values(by='students', ascending=False)
filtered_cs_courses.head()

Unnamed: 0,name,students
52,DIRECTED STUDY,7231
4,INTRODUCTION TO COMPUTER ENGINEERING,5185
2,INTRODUCTION TO DISCRETE MATHEMATICS,3842
10,MACHINE ORGANIZATION AND PROGRAMMING,3660
9,DIGITAL SYSTEM FUNDAMENTALS,3222


In [14]:
def generate_transcripts(num_students=100_000, courses_per_student=8, seed=None):
    """Generate random student transcripts and write them to ``transcripts.csv``.

    Every student receives a random bias and then ``courses_per_student`` distinct courses, each
    drawn from ``filtered_cs_courses`` with probability proportional to its number of students.
    The grade for each course is drawn from that course's grade distribution, skewed by the
    student's bias. The CSV has one row per student and one column per course in
    ``filtered_cs_courses``; courses a student did not take are left empty.

    Args:
        num_students: Number of transcripts to generate.
        courses_per_student: Number of distinct courses on each transcript. It is capped at the
            number of distinct course names so the sampling loop always terminates.
        seed: Optional seed for the ``random`` module's global generator, for reproducible runs.
            When None the generator is left untouched.

    Returns:
        None. Progress is printed every 100 students.
    """
    if seed is not None:
        random.seed(seed)

    course_names = filtered_cs_courses['name']
    # A student cannot take more distinct courses than exist; without this cap the loop below
    # would never finish for a short course list.
    courses_needed = min(courses_per_student, course_names.nunique())

    # A course's grade distribution does not change between students, so compute it only once.
    distribution_cache = {}

    # One {course name: grade} dict per student. The DataFrame is built once at the end:
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame for every student.
    rows = []

    for i in range(num_students):  # For each student
        # Keyed by course name, which also guarantees unique courses per student
        student_transcript = {}

        # Generate a random bias for a student.
        # Higher values mean that a student is better, so they will have higher chances of getting better grades.
        # Lower values mean that a student is worse, so they will have higher chances of getting worse grades.
        bias = random.uniform(-2, 2)

        while len(student_transcript) < courses_needed:
            # First sample the CS courses to get a course with a weighted probability
            course_label = sample_courses(filtered_cs_courses)

            # sample_courses returns an index label, and the frame is sorted by student count, so
            # labels differ from positions: look the name up with .loc, not .iloc.
            name = filtered_cs_courses.loc[course_label, 'name']

            if name in student_transcript:  # This course already exists in the current student's transcript
                continue

            if name not in distribution_cache:
                distribution_cache[name] = get_grade_distribution_for_course_name(name)

            # Get the grade for a student from a course's grade distribution, biasing the sampling using the student bias
            student_transcript[name] = sample_distribution_biased(distribution_cache[name], bias)

        rows.append(student_transcript)

        # Report progress
        if i%100 == 0:
            print("%d/%d..." % (i, num_students))

    # reindex puts the columns in course order and adds empty columns for courses nobody took.
    transcripts = pd.DataFrame(rows).reindex(columns=pd.Index(course_names))

    print("Done!")
    transcripts.to_csv('transcripts.csv')

In [15]:
# Generate 50,000 transcripts; the function prints its progress and writes the result to transcripts.csv.
generate_transcripts(num_students=50_000)

0/50000...
100/50000...
200/50000...
300/50000...
400/50000...
500/50000...
600/50000...
700/50000...
800/50000...
900/50000...
1000/50000...
1100/50000...
1200/50000...
1300/50000...
1400/50000...
1500/50000...
1600/50000...
1700/50000...
1800/50000...
1900/50000...
2000/50000...
2100/50000...
2200/50000...
2300/50000...
2400/50000...
2500/50000...
2600/50000...
2700/50000...
2800/50000...
2900/50000...
3000/50000...
3100/50000...
3200/50000...
3300/50000...
3400/50000...
3500/50000...
3600/50000...
3700/50000...
3800/50000...
3900/50000...
4000/50000...
4100/50000...
4200/50000...
4300/50000...
4400/50000...
4500/50000...
4600/50000...
4700/50000...
4800/50000...
4900/50000...
5000/50000...
5100/50000...
5200/50000...
5300/50000...
5400/50000...
5500/50000...
5600/50000...
5700/50000...
5800/50000...
5900/50000...
6000/50000...
6100/50000...
6200/50000...
6300/50000...
6400/50000...
6500/50000...
6600/50000...
6700/50000...
6800/50000...
6900/50000...
7000/50000...
7100/50000...
7200