In [87]:
import csv
import random

import numpy as np
import pandas as pd

Assumption: 536 total student accounts

Using the distribution of the number of apps, we get 3901 total apps: 3043 to the 20 most popular colleges, 858 to all the rest.

Of those 3901 applications, 536 will be early decision/admission apps (one per student), and the remaining 3365 are regular decision. Early admission and early decision are bucketed together for simplicity, so that we don't have to look up which type of early application each university offers.


In [88]:
# Load the university reference table; the first CSV column is used as the row index.
UNIVERSITIES_CSV = 'UNIVERSITIES.csv'
universities = pd.read_csv(UNIVERSITIES_CSV, header=[0], index_col=[0])
universities.head()

Unnamed: 0_level_0,institution_name,state,region,year,student_count,percent_admitted,percent_admitted_men,percent_admitted_women,yield_total,yield_men,yield_women
university_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
110404,California Institute of Technology,California,"Far West (AK, CA, HI, NV, OR, WA)",2022,2518,3,2.0,4.0,50,55.0,46.0
166027,Harvard University,Massachusetts,"New England (CT, ME, MA, NH, RI, VT)",2022,37613,3,3.0,3.0,83,81.0,84.0
166683,Massachusetts Institute of Technology,Massachusetts,"New England (CT, ME, MA, NH, RI, VT)",2022,12829,4,3.0,6.0,85,88.0,82.0
190150,Columbia University in the City of New York,New York,"Mid East (DE, DC, MD, NJ, NY, PA)",2022,37954,4,5.0,3.0,64,64.0,63.0
243744,Stanford University,California,"Far West (AK, CA, HI, NV, OR, WA)",2022,20955,4,3.0,4.0,84,84.0,84.0


In [None]:
# 78% of schools applied to are in this "popular" group.
# Each weight is that school's share of the 78%; keeping name and weight on
# the same line makes it impossible for the two lists to drift out of step.
popular_uni_weights = {
    "Harvard University": .116,
    "Boston College": .091,
    "Brown University": .077,
    "Dartmouth College": .077,
    "Middlebury College": .064,
    "Georgetown University": .051,
    "Bowdoin College": .051,
    "Williams College": .051,
    "Tufts University": .051,
    "Duke University": .051,
    "Colby College": .038,
    "Princeton University": .038,
    "Cornell University": .038,
    "Yale University": .038,
    "University of Virginia-Main Campus": .038,
    "Hamilton College": .026,
    "Bates College": .026,
    "Boston University": .026,
    "University of Chicago": .026,
    "Columbia University in the City of New York": .026,
}

# Parallel lists (same order as the mapping above) used by the sampling cells.
popular_unis = list(popular_uni_weights.keys())
distribution = list(popular_uni_weights.values())

In [92]:
# Seed both random number generators used in this notebook (`random` here,
# NumPy's global state behind `DataFrame.sample` further down) so that a
# restart-and-run-all regenerates exactly the same synthetic data.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Draw the 3043 applications that go to the popular group, weighted by each
# school's share of that group.
N_POPULAR_APPS = 3043
pop_picks = random.choices(population=popular_unis, weights=distribution, k=N_POPULAR_APPS)
len(pop_picks)

3043

In [93]:
# Show how many of the sampled applications landed on each popular school.
for school in popular_unis:
    n_picked = pop_picks.count(school)
    print(f"{school}: {n_picked}")

Harvard University: 316
Boston College: 282
Brown University: 231
Dartmouth College: 251
Middlebury College: 179
Georgetown University: 173
Bowdoin College: 158
Williams College: 169
Tufts University: 154
Duke University: 179
Colby College: 103
Princeton University: 97
Cornell University: 112
Yale University: 106
University of Virginia-Main Campus: 134
Hamilton College: 95
Bates College: 69
Boston University: 87
University of Chicago: 75
Columbia University in the City of New York: 73


In [None]:
# Convert the sampled institution names to university IDs.
# A few schools in the "all the rest" category share a name, so working with
# IDs from here on cuts down on the confusion.
# Each distinct sampled name is resolved once (a handful of table scans
# instead of one per application); `.item()` raises if a name matches zero or
# several rows, so an ambiguous name cannot slip through silently.
name_to_id = {
    name: universities[universities['institution_name'] == name].index.item()
    for name in set(pop_picks)
}
pop_apps = [name_to_id[name] for name in pop_picks]

In [95]:
# Region labels (spelled exactly as in the universities table's `region`
# column) for the applications that go outside the popular group.
regions = [
    "New England (CT, ME, MA, NH, RI, VT)",
    "Mid East (DE, DC, MD, NJ, NY, PA)",
    "Southeast (AL, AR, FL, GA, KY, LA, MS, NC, SC, TN, VA, WV)",
    "Great Lakes (IL, IN, MI, OH, WI)",
    "Far West (AK, CA, HI, NV, OR, WA)",
    "Plains (IA, KS, MN, MO, NE, ND, SD)",
    "Rocky Mountains (CO, ID, MT, UT, WY)",
    "Southwest (AZ, NM, OK, TX)",
]

# Share of those applications per region, in the same order as `regions`.
# The weights are two-decimal fractions that sum to 1.
region_distribution = [.39, .22, .13, .09, .08, .03, .03, .03]

In [96]:
# Pick a region for each of the 858 applications outside the popular group.
N_OTHER_APPS = 858
all_the_rest_regions = random.choices(regions, weights=region_distribution, k=N_OTHER_APPS)

In [97]:
# Tally of sampled applications per region.
for region_name in regions:
    n_in_region = all_the_rest_regions.count(region_name)
    print(f"{region_name}: {n_in_region}")

New England (CT, ME, MA, NH, RI, VT): 341
Mid East (DE, DC, MD, NJ, NY, PA): 175
Southeast (AL, AR, FL, GA, KY, LA, MS, NC, SC, TN, VA, WV): 111
Great Lakes (IL, IN, MI, OH, WI): 77
Far West (AK, CA, HI, NV, OR, WA): 73
Plains (IA, KS, MN, MO, NE, ND, SD): 23
Rocky Mountains (CO, ID, MT, UT, WY): 29
Southwest (AZ, NM, OK, TX): 29


In [106]:
# Drop the popular schools so the remaining 858 applications are drawn only
# from every other university.
is_popular = universities['institution_name'].isin(popular_unis)
all_the_rest = universities[~is_popular]

In [99]:
# Pick one random non-popular university for every sampled region.
# The table is filtered once per distinct region rather than once per
# application. `sample(n=1)` is still called once per application, in the
# same order, and still raises if a region has no candidate universities.
candidates_by_region = {
    region: all_the_rest[all_the_rest['region'] == region]
    for region in set(all_the_rest_regions)
}
atr_uni_picks = [
    candidates_by_region[region].sample(n=1).index.item()
    for region in all_the_rest_regions
]
len(atr_uni_picks)

858

In [None]:
# Merge both groups of picks into a single list and randomize its order.
all_applied_unis = [*pop_apps, *atr_uni_picks]
random.shuffle(all_applied_unis)
len(all_applied_unis)


3901

In [None]:
# Build the applications table: one row per application, rows numbered from 1.
applications = pd.DataFrame({'university_id': all_applied_unis})
applications.index = applications.index + 1
applications.head()

Unnamed: 0,university_id
1,131496
2,164924
3,186131
4,161004
5,159568


In [None]:
# randomizing admission status
def admit(university_id):
    """Randomly decide whether an application to `university_id` is admitted.

    Looks up the school's `percent_admitted` in the global `universities`
    table and returns True with probability percent/100.
    """
    percent = universities.loc[university_id]['percent_admitted'].item()
    # randrange(100) is uniform over 0..99, so exactly `percent` of those
    # values satisfy `< percent`: a 0% school never admits, a 100% school
    # always does.
    return random.randrange(100) < percent

# Draw one admit/reject outcome (1/0) per application, in row order.
admissions = [int(admit(university_id)) for university_id in all_applied_unis]
# Plain column assignment instead of `insert`: on a fresh run the column lands
# in the same place (right after `university_id`), and re-running this cell
# overwrites it rather than failing because the column already exists.
applications['admitted'] = admissions
applications.head()

Unnamed: 0,university_id,admitted
1,131496,0
2,164924,0
3,186131,0
4,161004,0
5,159568,1


In [None]:
# Randomly flag each application as early decision (ed) or regular decision (rd).
N_EARLY = 536  # total number of early applications
early = [1] * N_EARLY
# Every application that is not early is regular; the count is derived from
# the table so it cannot drift from the actual number of rows.
regular = [0] * (len(applications) - N_EARLY)
decisions = early + regular
random.shuffle(decisions)
# Column assignment (not `insert`) keeps the cell re-runnable; on a fresh run
# the columns are appended in the same order: ed, then rd.
applications['ed'] = decisions
applications['rd'] = [1 - flag for flag in decisions]
applications.head()

Unnamed: 0,university_id,admitted,ed,rd
1,131496,0,0,1
2,164924,0,0,1
3,186131,0,1,0
4,161004,0,0,1
5,159568,1,1,0


In [None]:
# Draw 536 distinct 4-digit student IDs (1000-9999).
N_STUDENTS = 536
student_ids = random.sample(range(1000, 10_000), k=N_STUDENTS)

In [None]:
# Randomly choose how many regular decision apps each student makes.
# The counts are total apps minus 1, since every student makes one ED app.
app_nums = [0, 1, 2, 3, 4, 5, 6,
            7, 8, 9, 10, 11, 12, 13, 14]
app_distribution = [.1, .02, .03, .02, .06, .17, .16,
                    .11, .09, .07, .04, .04, .03, .03, .03]

# One weighted draw per student, then repeat each student's ID once per RD
# app (a student who drew 0 contributes nothing).
rd_counts = random.choices(population=app_nums, weights=app_distribution, k=len(student_ids))
student_rd_apps = []
for student_id, n_rd in zip(student_ids, rd_counts):
    student_rd_apps.extend([student_id] * n_rd)


In [None]:
# Assign a student to every application row: each ED row takes one distinct
# student (one ED app per student), each RD row draws from the RD pool.
ids_copy = student_ids.copy()

# Work on a copy so `student_rd_apps` stays intact and this cell can be re-run.
rd_pool = student_rd_apps.copy()
# The pool size is random (the sum of the per-student draws) and can come up
# short of the number of RD rows. Top it up with randomly chosen students
# instead of failing with "pop from empty list"; surplus entries stay unused.
n_rd_rows = decisions.count(0)
if len(rd_pool) < n_rd_rows:
    rd_pool.extend(random.choices(student_ids, k=n_rd_rows - len(rd_pool)))
random.shuffle(rd_pool)

student_apps = []
for is_early in decisions:
    if is_early == 1:
        student_apps.append(ids_copy.pop())
    else:
        student_apps.append(rd_pool.pop())

In [142]:
# Attach the student assigned to each application. Column assignment (not
# `insert`) appends the column on a fresh run and overwrites it on a re-run
# instead of failing because it already exists.
applications['student_ids'] = student_apps

In [143]:
# Preview the first rows of the applications table.
applications.head(n=5)

Unnamed: 0,university_id,admitted,ed,rd,student_ids
1,131496,0,0,1,7075
2,164924,0,0,1,6094
3,186131,0,1,0,3694
4,161004,0,0,1,7765
5,159568,1,1,0,2185


The cells below are purely for exporting the data so that it can be loaded into the Google Sheet.

In [144]:
# Write the applications table, including its row index, to disk.
APPLICATIONS_CSV = 'applications.csv'
applications.to_csv(APPLICATIONS_CSV, index=True)

In [145]:
# Write all student IDs to students.csv as a single row, every value quoted.
# (`csv` is imported with the other modules in the first cell.)
with open('students.csv', 'w', newline='') as students_file:
    writer = csv.writer(students_file, quoting=csv.QUOTE_ALL)
    writer.writerow(student_ids)