In [2]:
import time
import csv
import re

import pandas as pd
import numpy as np

## Import and Tidy Data from Form

In [153]:
df = pd.read_csv('./data/data_1.csv')

In [154]:
df = df[[col for col in df.columns if (("Preferences" in col) | ("Email" in col) | ("Name" in col) | ("Year" in col))]]

In [155]:
df.columns = list(df.columns[0:3]) + [re.split('\[|\]',name)[1] for name in df.columns[3:]]
cohort_names = list(df.columns[3:])

In [156]:
for col in cohort_names:
    df[col] = pd.to_numeric(df[col].str.extract('(\d)', expand=False).str.strip())

In [157]:
df.fillna(0,inplace=True)

In [158]:
df

Unnamed: 0,Email Address,First Name,Last Name,Economic History,Development Economics,Economics of Inequality,Economics and Machine Learning
0,gwenythross@uchicago.edu,Gwenyth,Ross,4.0,3.0,2.0,1.0
1,yasminemara@uchicago.edu,Yasmin,Hashem,4.0,1.0,2.0,3.0
2,brycklen@uchicago.edu,Brycklen,Arnold,1.0,0.0,2.0,4.0
3,alecboyajian@uchicago.edu,Alec,Boyajian,2.0,1.0,4.0,0.0
4,victorqian@uchicago.edu,Victor,Qian,3.0,4.0,1.0,2.0
...,...,...,...,...,...,...,...
58,panc@uchicago.edu,Christopher,Pan,3.0,1.0,4.0,2.0
59,tanyadholakia@uchicago.edu,Tanya,Dholakia,4.0,1.0,0.0,2.0
60,malvarezdemalde@uchicago.edu,Matias,Alvarez Demalde,2.0,1.0,4.0,3.0
61,bermann@uchicago.edu,Benjamin,Bermann,4.0,1.0,2.0,3.0


## Sort Students

Fill each cohort with people who selected as first choice then cut down and re-sort

In [139]:
cohorts = {name:[] for name in cohort_names}

In [140]:
temp = df.copy()
for rank in range(1,len(cohort_names)+1):
    for name in cohort_names:
        for row in range(0,temp.shape[0]):
            if ((pd.notna(temp.loc[row,'Email Address'])) and (len(cohorts[name]) < 20) and (temp[name][row] == rank)):
                cohorts[name].append(temp['Email Address'][row])
                temp.loc[row,'Email Address'] = np.nan

In [141]:
print(list(cohorts.keys()))
print([len(cohorts[key]) for key in cohorts.keys()])

['Economic History', 'Development Economics', 'Economics of Inequality', 'Economics and Machine Learning']
[13, 20, 9, 20]


In [142]:
for c in cohorts.keys():
    pref = {email:rank for email,rank in zip(df['Email Address'],df[c])}
    print(c,': ', pd.Series(cohorts[c]).replace(pref).unique())

Economic History :  [1. 2. 4.]
Development Economics :  [1.]
Economics of Inequality :  [1. 2.]
Economics and Machine Learning :  [1. 2.]


In [148]:
econ_hist = {email:email for email in cohorts['Economic History']}

In [149]:
pref_hist = {email:rank for email,rank in zip(df['Email Address'],df['Economic History'])}
for mail in econ_hist.keys():
    econ_hist[mail] = pref_hist[mail]

In [150]:
econ_hist

{'brycklen@uchicago.edu': 1.0,
 'kmeyyappan@uchicago.edu': 1.0,
 'cbuyalos@uchicago.edu': 1.0,
 'ashivaram27@uchicago.edu': 1.0,
 'lilychen@uchicago.edu': 1.0,
 'gabicampos@uchicago.edu': 1.0,
 'jhiverson@uchicago.edu': 1.0,
 'adrakopoulou@uchicago.edu': 1.0,
 'raylee@uchicago.edu': 1.0,
 'rachelalper@uchicago.edu': 1.0,
 'shivsawhney@uchicago.edu': 2.0,
 'malvarezdemalde@uchicago.edu': 2.0,
 'tanyadholakia@uchicago.edu': 4.0}

In [152]:
df[df['Email Address'].str.contains('tanyadholakia')]

Unnamed: 0,Email Address,First Name,Last Name,Economic History,Development Economics,Economics of Inequality,Economics and Machine Learning
59,tanyadholakia@uchicago.edu,Tanya,Dholakia,4.0,1.0,0.0,2.0


In [173]:
cohort_assignments = pd.DataFrame(dict([(k,pd.Series(v)) for k,v in cohorts.items()]))

In [183]:
for cohort, emails in cohorts.items():
    indiv_assign = df[df['Email Address'].isin(emails)]
    path = './assignments/' + '_'.join(cohort.lower().split(' ')) + '.csv'
    indiv_assign.to_csv(path_or_buf=path)
    print(path)

./assignments/economic_history.csv
./assignments/development_economics.csv
./assignments/economics_of_inequality.csv
./assignments/economics_and_machine_learning.csv


In [174]:
cohort_assignments

Unnamed: 0,Economic History,Development Economics,Economics of Inequality,Economics and Machine Learning
0,brycklen@uchicago.edu,yasminemara@uchicago.edu,victorqian@uchicago.edu,gwenythross@uchicago.edu
1,kmeyyappan@uchicago.edu,alecboyajian@uchicago.edu,sycalvind@uchicago.edu,gasparolea@uchicago.edu
2,cbuyalos@uchicago.edu,ghwhorton@uchicago.edu,iwang1@uchicago.edu,locdang@uchicago.edu
3,ashivaram27@uchicago.edu,avalenzuelapidal@uchicago.edu,edosani@uchicago.edu,aaronz@uchicago.edu
4,lilychen@uchicago.edu,hilfersd@uchicago.edu,maxz@uchicago.edu,owenrhumphries@uchicago.edu
5,gabicampos@uchicago.edu,graceoh@uchicago.edu,jxiong3@uchicago.edu,kaizewu@uchicago.edu
6,jhiverson@uchicago.edu,jzmiller97@uchicago.edu,connollyb@uchicago.edu,nmkhan@uchicago.edu
7,adrakopoulou@uchicago.edu,lucashou@uchicago.edu,alsong@uchicago.edu,ethanjiang@uchicago.edu
8,raylee@uchicago.edu,sitongliu@uchicago.edu,bermann@uchicago.edu,epan02@uchicago.edu
9,rachelalper@uchicago.edu,msnasir@uchicago.edu,,nicholaschen2003@uchicago.edu


In [161]:
np.array([len(cohort) for cohort in cohorts.values()]).sum()

62

In [168]:
assigned = sum(cohorts.values(), [])

In [171]:
set(assigned).symmetric_difference(set(df['Email Address'].tolist()))

{'giyoung@uchicago.edu'}

In [None]:
df[df['Email Address'].str.contains('tanyadholakia')]

In [178]:
leftovers = list(set(assigned).symmetric_difference(set(df['Email Address'].tolist())))

In [180]:
if len(leftovers) != 0:
    df[df['Email Address'].isin(leftovers)].to_csv(path_or_buf='./assignments/leftovers.csv')

In [182]:
df[df['Email Address'].isin(leftovers)]

Unnamed: 0,Email Address,First Name,Last Name,Economic History,Development Economics,Economics of Inequality,Economics and Machine Learning
53,giyoung@uchicago.edu,Giyoung,Kwon,0.0,1.0,0.0,0.0


## Test

In [3]:
df = pd.read_csv('./data/data_1.csv')
df = df[[col for col in df.columns if (("Preferences" in col) | ("Email" in col) | ("Name" in col) | ("Year" in col))]]
df.columns = list(df.columns[0:4]) + [re.split('\[|\]',name)[1] for name in df.columns[4:]]

IndexError: list index out of range

In [4]:
df

Unnamed: 0,Email Address,First Name,Last Name,Year,Please Rank Your Cohort Preferences [Economic History],Please Rank Your Cohort Preferences [Development Economics],Please Rank Your Cohort Preferences [Economics of Inequality],Please Rank Your Cohort Preferences [Economics and Machine Learning]
0,gwenythross@uchicago.edu,Gwenyth,Ross,2,4 (Last Choice),3,2,1 (First Choice)
1,yasminemara@uchicago.edu,Yasmin,Hashem,3,4 (Last Choice),1 (First Choice),2,3
2,brycklen@uchicago.edu,Brycklen,Arnold,1,1 (First Choice),Cannot Attend,2,4 (Last Choice)
3,alecboyajian@uchicago.edu,Alec,Boyajian,1,2,1 (First Choice),4 (Last Choice),Cannot Attend
4,victorqian@uchicago.edu,Victor,Qian,2,3,4 (Last Choice),1 (First Choice),2
...,...,...,...,...,...,...,...,...
63,gflegueras@uchicago.edu,Gianina,Flegueras,2,Cannot Attend,1 (First Choice),2,Cannot Attend
64,rmguthrie@uchicago.edu,Ryan,Guthrie,1,Cannot Attend,1 (First Choice),Cannot Attend,Cannot Attend
65,laraburuk@uchicago.edu,Lara,Buruk,1,4 (Last Choice),2,3,1 (First Choice)
66,bettyerose@uchicago.edu,Bettye,Igbenebor,2,4 (Last Choice),1 (First Choice),3,2
