In [1]:
#Import the libraries needed to run the script
import getpass

import pandas as pd
import requests

In [2]:
#Base URL of the RDH endpoint used to retrieve the list of datasets on the website.
#It is the default value of the `baseurl` argument of get_list below.
baseurl = 'https://redistrictingdatahub.org/wp-json/download/list'

In [3]:
"""This function retrieves a list of all datasets on the RDH site. In order to run, you must be an API user and registered with the RDH site.
Inputs: username (string), password (string)
Optional Inputs: form (string -- either 'csv' or 'json'), baseurl"""
def get_list(username, password, form='csv', baseurl=baseurl):
    """Download the list of all datasets on the RDH site to a local file.

    The response body is written to 'RDH_FileListing.<form>' in the current
    working directory. In order to run, you must be an API user and
    registered with the RDH site.

    Inputs: username (string), password (string)
    Optional Inputs: form (string -- either 'csv' or 'json'), baseurl (string)
    Output: N/A (the listing is written to disk)
    Raises: requests.HTTPError if the server answers with an error status.
    """
    print('Retrieving list of datasets on RDH Website...')
    params = {
        'username': username,
        'password': password,
        'format': form,
    }
    r = requests.get(baseurl, params=params)
    #Fail loudly instead of saving an error page as if it were the listing
    r.raise_for_status()
    #Write the request to a file in the current working directory; the
    #with-block guarantees the file is flushed and closed
    with open('RDH_FileListing.'+form, 'wb') as listing_file:
        listing_file.write(r.content)
    print('List of datasets retrieved.')

In [4]:
"""This function reads in the list of datasets as a pandas dataframe.
Optional input: form (string -- either 'csv' or 'json')"""
def read_list(form='csv'):
    """Load the saved RDH file listing from the current working directory.

    Optional input: form (string -- 'csv' reads RDH_FileListing.csv; any
    other value reads RDH_FileListing.json)
    Output: the listing as a pandas dataframe
    """
    #Pick the pandas reader and the matching file in one step
    loader, listing_path = (
        (pd.read_csv, r'./RDH_FileListing.csv')
        if form == 'csv'
        else (pd.read_json, r'./RDH_FileListing.json')
    )
    return loader(listing_path)

In [5]:
"""This function checks all strings in a list and returns either True or False based on if the the string is in that row
Inputs: string list (list), and row
Outputs: T/F"""
def check_string(string_list, row):
    """Return True only if every string in string_list is found in row.

    Inputs: string_list (list of strings), row (the value being searched)
    Outputs: True/False. An empty string_list gives False, and so does a row
    that cannot be searched with `in` (for example a float NaN).
    """
    #Nothing to search for counts as "no match"
    if not string_list:
        return False
    try:
        #all() stops at the first string that is missing from the row
        return all(i in row for i in string_list)
    except TypeError:
        #The row does not support membership tests, so it cannot match
        return False

In [6]:
'''This function extracts the data that meets input specifications to the current working directory. In order to run, you must be an API user and registered with the RDH site.
Inputs: username (string), password (string), state_name (string), add_string (list of strings)
Output: N/A'''
def get_data(username,password,state_name,add_string):
    """Download every listed dataset for one state whose title matches all search strings.

    The dataset listing is refreshed with get_list and loaded with read_list.
    Rows are kept when their 'State' equals state_name and their 'Title'
    passes check_string(add_string, title). Each remaining 'Download URL' is
    fetched and written to the current working directory, with '_csv' or
    '_shp' inserted before the file extension so both versions of a dataset
    can be stored side by side. In order to run, you must be an API user and
    registered with the RDH site.

    Inputs: username (string), password (string), state_name (string), add_string (list of strings)
    Output: N/A (files are written to disk)
    """
    #get list of datasets and read it in
    get_list(username, password)
    df = read_list()
    #the credentials are sent as query parameters with every download request
    params = (
        ('username', username),
        ('password', password),
    )
    #subset the df by state name, then by the additional string info.
    #Boolean masks are used instead of writing a helper column onto an
    #already-filtered frame, which triggers pandas' SettingWithCopyWarning.
    df = df[df['State'] == state_name]
    title_matches = df['Title'].apply(lambda title: check_string(add_string, title))
    #astype(bool) keeps the mask boolean even when no rows are left
    df = df[title_matches.astype(bool)]
    #strip the query string from each url to get the baseurl of the dataset (no params)
    new_urls = [url.split('?')[0] for url in df['Download URL']]
    total = len(new_urls)
    #iterate over all of the new urls and retrieve the data
    for counter, url in enumerate(new_urls, start=1):
        print('Retrieving', str(counter), 'of', str(total), 'files')
        #get the data from the url and the params listed above
        response = requests.get(url, params=params)
        #the file name is the last path segment, whether or not the slashes are url-encoded
        file_name = url.split('%2F')[-1].split('/')[-1]
        #because we have multiple datasets with the same name (for CSV and SHP), but we may want SHP or CSV, we need to make them unique filenames
        dtype = '_csv' if 'csv' in url else '_shp'
        #split on the last dot so names without an extension, or with extra dots, are handled
        stem, dot, extension = file_name.rpartition('.')
        if dot:
            file_name = stem + dtype + '.' + extension
        else:
            file_name = file_name + dtype
        print('Retrieving ', file_name)
        #do not save an error page under a dataset's file name
        if not response.ok:
            print('Skipping', file_name, '- server returned status', response.status_code)
            continue
        #write the data; the with-block closes the file even if the write fails
        with open(file_name, "wb") as out_file:
            out_file.write(response.content)


In [7]:
#Instantiate the inputs to be run
username = input('RDH Username: ')
#getpass reads the password without echoing it, so it does not end up in the saved cell output
password = getpass.getpass('RDH Password: ')

state = input('What state do you want data for? ')
#capitalize() lowercases every word after the first ("new york" -> "New york");
#title() capitalizes each word instead. NOTE(review): presumably the 'State'
#column of the listing uses this capitalization -- confirm against the data.
state = state.strip().title()
string = input('Any other parameters? Please separate by comma (e.g. VEST, 2011, precinct, shp, csv). ')
#split the comma-separated answer into a list of trimmed search strings
string = [i.strip() for i in string.split(',')]

RDH Username: spencer.rdh.test
RDH Password: [REDACTED]
What state do you want data for? missouri
Any other parameters? Please separate by comma (e.g. VEST, 2011, precinct, shp, csv). VEST, 2016


In [8]:
#Run the download with the inputs collected above: refreshes the dataset listing,
#filters it by state and search strings, and writes each matching file to the
#current working directory
get_data(username,password,state,string)

Retrieving list of datasets on RDH Website...
List of datasets retrieved.
Retrieving 1 of 1 files
Retrieving  mo_vest_16_shp.zip
