# Step 0: Grab data files from NCES
# This page is just to grab the files we need

## At the core, each "survey" file consists of the CSV data along with a dictionary for deciphering it
### These will be downloaded to a local "inputs" directory

In [1]:
import pandas as pd
import numpy as np
from zipfile import ZipFile
import csv

In [2]:
# This function is just a utility for grabbing the files
import urllib
import io
import os

def grab_zipfile(baseUrl, fileroot):
    '''Download a zipped archive and extract its first member into the current directory.

    The archive URL is built as ``baseUrl + fileroot + '.zip'``. The URL and
    the archive's member list are printed as progress output.

    Args:
        baseUrl: URL prefix the archive lives under; it is concatenated as-is,
            so it has to carry its own trailing '/'.
        fileroot: archive name without the '.zip' extension.

    Returns:
        The name of the first member of the archive, which is the only
        member that gets extracted.

    Raises:
        urllib.error.URLError: if the archive cannot be fetched.
        zipfile.BadZipFile: if the downloaded payload is not a zip archive.
        IndexError: if the archive has no members.
    '''
    # A bare `import urllib` does not load the `request` submodule; import it
    # explicitly so this does not rely on another package having done so.
    import urllib.request

    full_url = baseUrl + fileroot + '.zip'
    print(full_url, flush=True)

    # Read the whole payload into memory and release the connection right away.
    with urllib.request.urlopen(full_url) as remoteFile:
        payload = io.BytesIO(remoteFile.read())

    with ZipFile(payload, 'r') as z:
        members = z.namelist()
        print(members, flush=True)
        # Only the first member is extracted; any further members (some
        # archives also carry a '*_rv.csv' file) stay in the archive.
        first_member = members[0]
        z.extract(first_member)
    return first_member

In [3]:
# Create the download directory unless it already exists (a plain os.mkdir
# raises FileExistsError when the notebook is run a second time), then make
# it the working directory: grab_zipfile extracts into the current directory.
os.makedirs('inputs', exist_ok=True)
os.chdir('inputs')

In [5]:
# URL prefix under which the zipped survey archives are published
baseUrl = 'https://nces.ed.gov/ipeds/datacenter/data/'

# Graduation-rate survey. When moving to newer data: try the file two years
# after the one used previously and, if that archive does not exist yet,
# try the one a single year later instead.
fileroot = 'GR2018'

# Survey data first, then the matching dictionary archive
grab_zipfile(baseUrl=baseUrl, fileroot=fileroot)
grab_zipfile(baseUrl=baseUrl, fileroot=fileroot + '_DICT')


https://nces.ed.gov/ipeds/datacenter/data/GR2018.zip
['gr2018.csv']
https://nces.ed.gov/ipeds/datacenter/data/GR2018_DICT.zip
['gr2018.xlsx']


'gr2018.xlsx'

In [6]:
# Graduation rate is the only survey pulled for the 3 most recent years,
# so the two years before GR2018 are fetched as well
grab_zipfile(baseUrl=baseUrl, fileroot='GR2017')
grab_zipfile(baseUrl=baseUrl, fileroot='GR2016')

https://nces.ed.gov/ipeds/datacenter/data/GR2017.zip
['gr2017.csv', 'gr2017_rv.csv']
https://nces.ed.gov/ipeds/datacenter/data/GR2016.zip
['gr2016.csv', 'gr2016_rv.csv']


'gr2016.csv'

In [7]:
# Master directory survey (data archive)
grab_zipfile(baseUrl=baseUrl, fileroot='HD2018')

https://nces.ed.gov/ipeds/datacenter/data/HD2018.zip
['hd2018.csv']


'hd2018.csv'

In [8]:
# Dictionary archive for the master directory survey
grab_zipfile(baseUrl=baseUrl, fileroot='HD2018_DICT')

https://nces.ed.gov/ipeds/datacenter/data/HD2018_DICT.zip
['hd2018.xlsx']


'hd2018.xlsx'

In [9]:
# Admissions survey: data archive, then its dictionary
grab_zipfile(baseUrl=baseUrl, fileroot='ADM2018')
grab_zipfile(baseUrl=baseUrl, fileroot='ADM2018_DICT')

https://nces.ed.gov/ipeds/datacenter/data/ADM2018.zip
['adm2018.csv']
https://nces.ed.gov/ipeds/datacenter/data/ADM2018_DICT.zip
['adm2018.xlsx']


'adm2018.xlsx'

In [10]:
# Ethnicity survey: data archive, then its dictionary
grab_zipfile(baseUrl=baseUrl, fileroot='EF2018A')
grab_zipfile(baseUrl=baseUrl, fileroot='EF2018A_DICT')

https://nces.ed.gov/ipeds/datacenter/data/EF2018A.zip
['ef2018a.csv']
https://nces.ed.gov/ipeds/datacenter/data/EF2018A_DICT.zip
['ef2018a.xlsx']


'ef2018a.xlsx'

In [11]:
# Retention survey: data archive, then its dictionary
grab_zipfile(baseUrl=baseUrl, fileroot='EF2018D')
grab_zipfile(baseUrl=baseUrl, fileroot='EF2018D_DICT')

https://nces.ed.gov/ipeds/datacenter/data/EF2018D.zip
['ef2018d.csv']
https://nces.ed.gov/ipeds/datacenter/data/EF2018D_DICT.zip
['ef2018d.xlsx']


'ef2018d.xlsx'

In [12]:
# Institutional Characteristics survey: data archive, then its dictionary
grab_zipfile(baseUrl=baseUrl, fileroot='IC2018')
grab_zipfile(baseUrl=baseUrl, fileroot='IC2018_DICT')

https://nces.ed.gov/ipeds/datacenter/data/IC2018.zip
['ic2018.csv', 'ic2018_rv.csv']
https://nces.ed.gov/ipeds/datacenter/data/IC2018_DICT.zip
['ic2018.xlsx']


'ic2018.xlsx'

In [13]:
# Student Financial Aid survey: data archive, then its dictionary
grab_zipfile(baseUrl=baseUrl, fileroot='SFA1718')
grab_zipfile(baseUrl=baseUrl, fileroot='SFA1718_DICT')

https://nces.ed.gov/ipeds/datacenter/data/SFA1718.zip
['sfa1718.csv']
https://nces.ed.gov/ipeds/datacenter/data/SFA1718_DICT.zip
['sfa1718.xlsx']


'sfa1718.xlsx'