This notebook generates random test data: student records, each student's ranking of partner schools, and each partner school's ranking of students.

In [1]:
from faker import Faker
import pandas as pd
from random import randint, choice, sample
import numpy as np

## Prepare Constants

In [2]:
# Number of student records to generate.
NUM_STUDENTS = 50
# Each student ranks between 1 and MAX_STU_RANKS partners, drawn without
# replacement from PARTNERS, so 1 <= MAX_STU_RANKS <= NUM_PARTNERS is required.
MAX_STU_RANKS = 2

# Number of partner schools to generate; must be >= 1.
NUM_PARTNERS = 5
# Each partner ranks between 1 and MAX_PARTNER_RANKS students, drawn without
# replacement from the student list, so 1 <= MAX_PARTNER_RANKS <= NUM_STUDENTS
# is required.
# NOTE(review): the original note also asked for
# NUM_PARTNERS * MAX_PARTNER_RANKS < NUM_STUDENTS; nothing in this notebook
# needs that, presumably a downstream consumer does -- TODO confirm.
MAX_PARTNER_RANKS = 5

# Fail fast: an out-of-range value would otherwise only surface intermittently,
# when a random draw happens to ask `sample()` for more items than exist.
if NUM_STUDENTS < 1 or NUM_PARTNERS < 1:
    raise ValueError("NUM_STUDENTS and NUM_PARTNERS must both be at least 1.")
if not 1 <= MAX_STU_RANKS <= NUM_PARTNERS:
    raise ValueError("MAX_STU_RANKS must be between 1 and NUM_PARTNERS.")
if not 1 <= MAX_PARTNER_RANKS <= NUM_STUDENTS:
    raise ValueError("MAX_PARTNER_RANKS must be between 1 and NUM_STUDENTS.")

# Partner names: 'College 1' ... 'College <NUM_PARTNERS>'.
PARTNERS = ['College {0}'.format(i) for i in range(1, NUM_PARTNERS + 1)]

In [3]:
# data factory to create test data: a Faker instance, used below to generate
# random first and last names for the student records
dfactory = Faker()

## Create Student Data

In [4]:
# Build one row per student.
# SlateIDs are drawn WITHOUT replacement from 1..999,999,999 and zero-padded to
# 9 digits, so they are guaranteed unique (independent randint() draws could
# collide and force a re-run).
stu_df = pd.DataFrame({
    'slate_id': ["{:09d}".format(n)
                 for n in sample(range(1, 10 ** 9), NUM_STUDENTS)],
    'first_name': [dfactory.first_name() for _ in range(NUM_STUDENTS)],
    'last_name': [dfactory.last_name() for _ in range(NUM_STUDENTS)],
    # gender is one of 'M', 'F' or an empty string
    'gender': [choice(['M', 'F', '']) for _ in range(NUM_STUDENTS)],
    # both scores are integers from 1 to 5 inclusive
    'ach_score': [randint(1, 5) for _ in range(NUM_STUDENTS)],
    'fin_score': [randint(1, 5) for _ in range(NUM_STUDENTS)]
})
stu_df.head()

Unnamed: 0,slate_id,first_name,last_name,gender,ach_score,fin_score
0,356611809,Devin,Silva,M,1,1
1,963487360,Brian,Nguyen,M,3,1
2,600569956,Desiree,Hines,F,3,1
3,684156216,Matthew,Mcdonald,F,2,2
4,971999253,Joseph,Bray,M,2,3


In [5]:
# check for duplicates: abort if any SlateID occurs more than once
if stu_df['slate_id'].duplicated().any():
    raise ValueError("Duplicate SlateIDs found... re-run.")

## Create Student Ranking for Schools

In [6]:
def create_school_rank(slate_id, max_ranks, partners):
    '''Function to randomly assign ranks for partners from students

    Use with `apply()` on student column to generate random partner ranks
    for each student up to max allowed ranks. Must at least rank 1 partner
    per student. Not all partners will be ranked; partners the student did
    not rank are still present in the result and hold NaN, so every call
    returns the same set of labels.

    Args:
        slate_id <str>: SlateID of student
        max_ranks <int>: Maximum number of ranks per student allowed; values
            larger than the number of partners are capped at that number
        partners <list>: List of partner/college names to rank from

    Return <pd.Series>:
        Random student ranks per partner (1 = first choice, NaN = unranked),
        indexed by every partner name followed by 'slate_id'

    Raises:
        ValueError: if `max_ranks` is below 1 or `partners` is empty
            (raised by `randint` on the empty range)
    '''
    # choose random number of ranks per student up to `max_ranks`;
    # a student cannot rank more partners than exist
    rank_count = randint(1, min(max_ranks, len(partners)))

    # prep container: every partner starts out unranked (NaN)
    ranks = dict.fromkeys(partners, np.nan)

    # the sampled partners get ranks 1..rank_count in the order they were drawn
    ranks.update(
        (partner, rank)
        for rank, partner in enumerate(sample(partners, rank_count), 1)
    )

    # add slate_id
    ranks['slate_id'] = slate_id

    return pd.Series(ranks)

In [7]:
# create student rankings for schools for each student, must rank at least 1 school
# `reindex` (rather than label-indexing) fixes the column order AND keeps a
# column (all NaN) for any partner that no student happened to rank, where
# plain `[...]` indexing would raise KeyError
stu_rank_df = (
    stu_df['slate_id']
    .apply(create_school_rank, args=(MAX_STU_RANKS, PARTNERS))
    .reindex(columns=['slate_id'] + PARTNERS)
)

In [8]:
# sanity check: the student frame and the ranking frame must have one row per student each
if len(stu_df) != len(stu_rank_df):
    raise ValueError("Shape of Student dataframes DO NOT match up! They need to have the same number of students!")

In [9]:
# tidy up: unranked partners are NaN -> store them as rank 0, then cast every
# partner column to int
stu_rank_df = (
    stu_rank_df
    .replace(np.nan, 0)
    .astype({partner_col: int for partner_col in PARTNERS})
)

## Create Partner Ranks

In [10]:
# NOTE(review): leftover scratch expression -- list.extend() mutates the
# temporary list literal in place and returns None, so this cell shows no
# output and changes no state; it looks safe to delete.
[1, 2, 3].extend([1, 2, 3])

In [11]:
# one record per (partner, student) pairing; turned into a frame at the end
rank_rows = []

for college in PARTNERS:
    # how many students this partner ranks: between 1 and MAX_PARTNER_RANKS
    n_picks = randint(1, MAX_PARTNER_RANKS)

    # draw that many distinct students at random
    picked_ids = sample(list(stu_df['slate_id']), n_picks)

    # ranks run 1..n_picks in the order the students were drawn
    rank_rows.extend(
        {'partner_name': college, 'slate_id': picked_id, 'rank': position}
        for position, picked_id in enumerate(picked_ids, 1)
    )

partner_ranks = pd.DataFrame(rank_rows, columns=['partner_name', 'slate_id', 'rank'])

## Create outputs

In [12]:
# student data: write the student records as UTF-8 CSV without the index column
stu_df.to_csv(
    './../data/raw/test/test_student_data.csv',
    index=False,
    encoding='utf8',
)

In [13]:
# student ranks: write each student's partner rankings as UTF-8 CSV without the index column
stu_rank_df.to_csv(
    './../data/raw/test/test_student_ranks.csv',
    index=False,
    encoding='utf8',
)

In [14]:
# partner ranks: write each partner's student rankings as UTF-8 CSV without the index column
partner_ranks.to_csv(
    './../data/raw/test/test_partner_ranks.csv',
    index=False,
    encoding='utf8',
)