# Step 0: Grab data files from NCES
# This page is just to grab the files we need

## At the core, each "survey" file has the csv data along with a dictionary for deciphering it
### These will be downloaded to a local "input directory"

In [1]:
import pandas as pd
import numpy as np
from zipfile import ZipFile
import csv

In [2]:
# This function is just a utility for grabbing the files
import urllib
import io
import os

def grab_zipfile(baseUrl, fileroot):
    '''Download ``baseUrl + fileroot + '.zip'`` and extract its first member.

    The whole archive is read into memory, its member list is printed, and
    only the first listed member is extracted into the current working
    directory. Any further members of the archive are not extracted.

    Args:
        baseUrl: URL prefix the archive lives under (joined by plain string
            concatenation, so it must already end with the separator).
        fileroot: Archive name without the ``.zip`` extension.

    Returns:
        The name of the extracted member, as recorded in the archive.

    Raises:
        IndexError: If the archive contains no members.
    '''
    # A bare ``import urllib`` does not load the ``request`` submodule, so
    # import it explicitly here rather than relying on another package
    # having loaded it as a side effect.
    import urllib.request

    full_url = baseUrl + fileroot + '.zip'
    print(full_url, flush=True)

    # Close the connection as soon as the payload has been read.
    with urllib.request.urlopen(full_url) as remote_file:
        payload = io.BytesIO(remote_file.read())

    with ZipFile(payload, 'r') as archive:
        members = archive.namelist()
        print(members, flush=True)
        if not members:
            # Same exception type as the old ``namelist()[0]`` failure, but
            # with a message that says which download was empty.
            raise IndexError('no files found in archive ' + full_url)
        first_member = members[0]
        archive.extract(first_member)
    return first_member

In [3]:
# Create the local input directory and make it the working directory, so
# every later download is extracted there.
# exist_ok=True lets this cell be re-run in a fresh session when 'inputs'
# already exists (os.mkdir would raise FileExistsError).
# NOTE: running this cell twice in the same session still nests a second
# 'inputs' directory, because the path is relative to the current directory.
os.makedirs('inputs', exist_ok=True)
os.chdir('inputs')

In [6]:
# Every archive below is fetched from this location:
baseUrl = 'https://nces.ed.gov/ipeds/datacenter/data/'

# Grad rate data; for each file, we typically try 2 more years than the
# prior year and, if that file doesn't exist, try one more
fileroot = 'GR2021'

# First the survey data, then the dictionary archive that goes with it
grab_zipfile(baseUrl=baseUrl, fileroot=fileroot)
grab_zipfile(baseUrl=baseUrl, fileroot=f'{fileroot}_DICT')


https://nces.ed.gov/ipeds/datacenter/data/GR2021.zip
['gr2021.csv']
https://nces.ed.gov/ipeds/datacenter/data/GR2021_DICT.zip
['gr2021.xlsx']


'gr2021.xlsx'

In [7]:
# Grad rate is the only survey where the 3 most recent years are grabbed,
# so fetch the two years before GR2021 as well
grab_zipfile(baseUrl=baseUrl, fileroot='GR2020')
grab_zipfile(baseUrl=baseUrl, fileroot='GR2019')

https://nces.ed.gov/ipeds/datacenter/data/GR2020.zip
['gr2020.csv', 'gr2020_rv.csv']
https://nces.ed.gov/ipeds/datacenter/data/GR2019.zip
['gr2019.csv', 'gr2019_rv.csv']


'gr2019.csv'

In [10]:
# The master directory (HD2021 archive)
grab_zipfile(baseUrl=baseUrl, fileroot='HD2021')

https://nces.ed.gov/ipeds/datacenter/data/HD2021.zip
['hd2021.csv']


'hd2021.csv'

In [11]:
# Dictionary archive for the HD2021 file
grab_zipfile(baseUrl=baseUrl, fileroot='HD2021_DICT')

https://nces.ed.gov/ipeds/datacenter/data/HD2021_DICT.zip
['hd2021.xlsx']


'hd2021.xlsx'

In [12]:
# Admissions: survey data followed by its dictionary archive
grab_zipfile(baseUrl=baseUrl, fileroot='ADM2021')
grab_zipfile(baseUrl=baseUrl, fileroot='ADM2021_DICT')

https://nces.ed.gov/ipeds/datacenter/data/ADM2021.zip
['adm2021.csv']
https://nces.ed.gov/ipeds/datacenter/data/ADM2021_DICT.zip
['adm2021.xlsx']


'adm2021.xlsx'

In [13]:
# Ethnicity: survey data followed by its dictionary archive
grab_zipfile(baseUrl=baseUrl, fileroot='EF2021A')
grab_zipfile(baseUrl=baseUrl, fileroot='EF2021A_DICT')

https://nces.ed.gov/ipeds/datacenter/data/EF2021A.zip
['ef2021a.csv']
https://nces.ed.gov/ipeds/datacenter/data/EF2021A_DICT.zip
['ef2021a.xlsx']


'ef2021a.xlsx'

In [14]:
# Retention: survey data followed by its dictionary archive
grab_zipfile(baseUrl=baseUrl, fileroot='EF2021D')
grab_zipfile(baseUrl=baseUrl, fileroot='EF2021D_DICT')

https://nces.ed.gov/ipeds/datacenter/data/EF2021D.zip
['ef2021d.csv']
https://nces.ed.gov/ipeds/datacenter/data/EF2021D_DICT.zip
['ef2021d.xlsx']


'ef2021d.xlsx'

In [15]:
# Institutional Characteristics: survey data followed by its dictionary archive
grab_zipfile(baseUrl=baseUrl, fileroot='IC2021')
grab_zipfile(baseUrl=baseUrl, fileroot='IC2021_DICT')

https://nces.ed.gov/ipeds/datacenter/data/IC2021.zip
['ic2021.csv']
https://nces.ed.gov/ipeds/datacenter/data/IC2021_DICT.zip
['ic2021.xlsx']


'ic2021.xlsx'

In [16]:
# Student Financial Aid: survey data followed by its dictionary archive
grab_zipfile(baseUrl=baseUrl, fileroot='SFA2021')
grab_zipfile(baseUrl=baseUrl, fileroot='SFA2021_DICT')

https://nces.ed.gov/ipeds/datacenter/data/SFA2021.zip
['sfa2021.csv']
https://nces.ed.gov/ipeds/datacenter/data/SFA2021_DICT.zip
['sfa2021.xlsx']


'sfa2021.xlsx'