This is MODIFIED code originally copied directly from https://github.com/FEMA/openfema-samples/blob/master/code-samples/api_paging_jsonoutput.py

Which is the github repo of working code samples provided by OpenFEMA https://www.fema.gov/about/openfema/developer-resources#paging

see also OpenFEMA API documentation https://www.fema.gov/about/openfema/api#filter

# LAST MODIFIED 1/9/2023 TO REMOVE COMMENTS AND ANY PARTS OF THE CODE THAT AREN'T GOING IN THE PAPER. SEE DRAFT 2 VERSION FOR THE FILE THAT IS BEST COMMENTED AND CLOSEST TO THE ORIGINAL

In [1]:
#!/usr/bin/env python3
# Paging example using Python 3. Output in JSON.
# update the code to the get CA claims from NFIP

import sys
import urllib.request
import json
import math
from datetime import datetime

# Base URL for this endpoint. Add filters, column selection, and sort order to this.
baseUrl = "https://www.fema.gov/api/open/v1/FimaNfipClaims?$filter=state%20eq%20%27CA%27&"

top = 5000      # number of records to get per call
skip = 0        # number of records to skip

# Return 1 record with your criteria to get total record count. 
# Specify only 1 column here to reduce amount of data returned.
# Need inlinecount to get record count.
webUrl = urllib.request.urlopen(
    baseUrl + "$inlinecount=allpages&$select=id&$top=1")
result = webUrl.read()
jsonData = json.loads(result.decode())

# calculate the number of calls we will need to get all of our data (using the maximum of 5000)
recCount = jsonData['metadata']['count']
loopNum = math.ceil(recCount / top)
#loopNum = 1

# send some logging info to the console so we know what is happening
print("START " + str(datetime.now()) + ", " + str(recCount) + " records, " +
      str(top) + " returned per call, " + str(loopNum) + " iterations needed.")

# Initialize our file. Only doing this because of the type of file wanted. See the loop below.
# The root json entity is usually the name of the dataset, but you can use any name.
# Set a new name each time the script is run because it keeps adding to the old file
outFile = open("output4.json", "a")
outFile.write('{"FimaNfipClaims":[')

# Loop and call the API endpoint changing the record start each iteration. 
# The metadata is being suppressed as we no longer need it.
i = 0
while (i < loopNum):
    # By default data is returned as a JSON object, the data set name being the root element. Unless
    #   you extract records as you process, you will end up with 1 distinct JSON object for EVERY
    #   call/iteration. An alternative is to return the data as JSONA (an array of json objects) with
    #   no root element - just a bracket at the start and end. This is easier to manipulate.
    webUrl = urllib.request.urlopen(
        baseUrl + "&$metadata=off&$format=jsona&$skip=" + str(skip) + "&$top=" + str(top))
    result = webUrl.read()

    # The data is already returned in a JSON format. There is no need to decode and load as a JSON object.
    #   If you want to begin working with and manipulating the JSON, import the json library and load with
    #   something like: jsonData = json.loads(result.decode())

    # Append results to file, trimming off first and last JSONA brackets, adding comma except for last call,
    #   AND root element terminating array bracket and brace to end unless on last call. The goal here is to
    #   create a valid JSON file that contains ALL the records. This can be done differently.
    if (i == (loopNum - 1)):
        # on the last so terminate the single JSON object
        outFile.write(str(result[1:-1], 'utf-8') + "]}")
    else:
        outFile.write(str(result[1:-1], 'utf-8') + ",")

    # increment the loop counter and skip value
    i += 1
    skip = i * top

    print("Iteration " + str(i) + " done")

outFile.close()


START 2023-01-09 16:46:08.710367, 50994 records, 5000 returned per call, 11 iterations needed.
Iteration 1 done
Iteration 2 done
Iteration 3 done
Iteration 4 done
Iteration 5 done
Iteration 6 done
Iteration 7 done
Iteration 8 done
Iteration 9 done
Iteration 10 done
Iteration 11 done


In [2]:
# lets re-open the file and see if we got the number of records we expected
inFile = open("output4.json", "r")
my_data = json.load(inFile)
print("END " + str(datetime.now()) + ", " + str(len(my_data['FimaNfipClaims'])) + " records in file")
inFile.close()

END 2023-01-09 16:47:45.967360, 50994 records in file


# THIS NEXT PART IS HOW TO GET THE RESULTS READ FROM THE WEBPAGE EXPORTED AS A .CSV
# YOU CAN ALSO READ THE .JSON DIRECTLY FROM THE OUTPUT FILE INTO A DATAFRAME, WHICH IS HOW IT IS DONE IN MY PAPER

In [10]:
#[naomi says: this loads the results just returned from the website, not the file that was written]

import json

jsonData = json.loads(result.decode())

In [12]:
#there are two keys, the first is the file name, the second is the one with the data
print(jsonData[:1])

[{'agricultureStructureIndicator': False, 'asOfDate': '2022-02-26T00:00:00.000Z', 'baseFloodElevation': None, 'basementEnclosureCrawlspace': 0, 'reportedCity': 'Temporarily Unavailable', 'condominiumIndicator': 'N', 'policyCount': 1, 'countyCode': '06097', 'communityRatingSystemDiscount': 10, 'dateOfLoss': '2019-02-26T00:00:00.000Z', 'elevatedBuildingIndicator': False, 'elevationCertificateIndicator': 3, 'elevationDifference': None, 'censusTract': '06097153705', 'floodZone': 'AE', 'houseWorship': False, 'latitude': '38.5', 'longitude': '-122.9', 'locationOfContents': 3, 'lowestAdjacentGrade': None, 'lowestFloorElevation': 0, 'numberOfFloorsInTheInsuredBuilding': 1, 'nonProfitIndicator': False, 'obstructionType': None, 'occupancyType': 6, 'originalConstructionDate': '1960-01-01T00:00:00.000Z', 'originalNBDate': '2000-01-01T00:00:00.000Z', 'amountPaidOnBuildingClaim': 221485.01, 'amountPaidOnContentsClaim': 49423.76, 'amountPaidOnIncreasedCostOfComplianceClaim': None, 'postFIRMConstructi

In [14]:
import pandas as pd

In [15]:
df = pd.DataFrame(jsonData)
df.head()

Unnamed: 0,agricultureStructureIndicator,asOfDate,baseFloodElevation,basementEnclosureCrawlspace,reportedCity,condominiumIndicator,policyCount,countyCode,communityRatingSystemDiscount,dateOfLoss,...,postFIRMConstructionIndicator,rateMethod,smallBusinessIndicatorBuilding,state,totalBuildingInsuranceCoverage,totalContentsInsuranceCoverage,yearOfLoss,reportedZipcode,primaryResidence,id
0,False,2022-02-26T00:00:00.000Z,,0.0,Temporarily Unavailable,N,1,6097,10.0,2019-02-26T00:00:00.000Z,...,False,1,False,CA,500000,100000,2019,95436,False,3c92bad1-5db5-48c3-9ee6-4559bdc0cb6e
1,False,2022-02-26T00:00:00.000Z,77.0,0.0,Temporarily Unavailable,N,1,6097,10.0,2019-02-26T00:00:00.000Z,...,True,1,False,CA,100000,50000,2019,95436,False,7d6dde0e-6f73-4834-a6de-78a1e076a0f0
2,False,2022-02-26T00:00:00.000Z,,,Temporarily Unavailable,N,1,6067,,1995-01-10T00:00:00.000Z,...,False,3,False,CA,62000,26900,1995,95641,False,f7f84dac-a63a-40c1-910d-b254bae24cae
3,False,2022-02-26T00:00:00.000Z,,,Temporarily Unavailable,N,1,6067,,1997-01-03T00:00:00.000Z,...,False,3,False,CA,75000,29600,1997,95641,False,c84d7984-da02-42b2-8638-f9c7de49258d
4,False,2022-02-26T00:00:00.000Z,,0.0,Temporarily Unavailable,N,1,6037,,2004-12-28T00:00:00.000Z,...,False,7,False,CA,250000,60000,2004,91301,True,68bf0af0-8d5c-4b41-90a4-10eb073f3334


In [12]:
df.columns

Index(['agricultureStructureIndicator', 'asOfDate', 'baseFloodElevation',
       'basementEnclosureCrawlspace', 'reportedCity', 'condominiumIndicator',
       'policyCount', 'countyCode', 'communityRatingSystemDiscount',
       'dateOfLoss', 'elevatedBuildingIndicator',
       'elevationCertificateIndicator', 'elevationDifference', 'censusTract',
       'floodZone', 'houseWorship', 'latitude', 'longitude',
       'locationOfContents', 'lowestAdjacentGrade', 'lowestFloorElevation',
       'numberOfFloorsInTheInsuredBuilding', 'nonProfitIndicator',
       'obstructionType', 'occupancyType', 'originalConstructionDate',
       'originalNBDate', 'amountPaidOnBuildingClaim',
       'amountPaidOnContentsClaim',
       'amountPaidOnIncreasedCostOfComplianceClaim',
       'postFIRMConstructionIndicator', 'rateMethod',
       'smallBusinessIndicatorBuilding', 'state',
       'totalBuildingInsuranceCoverage', 'totalContentsInsuranceCoverage',
       'yearOfLoss', 'reportedZipcode', 'primaryReside

In [16]:
df.describe()

Unnamed: 0,baseFloodElevation,basementEnclosureCrawlspace,policyCount,communityRatingSystemDiscount,elevationCertificateIndicator,elevationDifference,locationOfContents,lowestAdjacentGrade,lowestFloorElevation,numberOfFloorsInTheInsuredBuilding,occupancyType,amountPaidOnBuildingClaim,amountPaidOnContentsClaim,amountPaidOnIncreasedCostOfComplianceClaim,totalBuildingInsuranceCoverage,totalContentsInsuranceCoverage,yearOfLoss
count,107.0,400.0,994.0,362.0,154.0,110.0,994.0,96.0,994.0,994.0,994.0,659.0,234.0,37.0,994.0,994.0,994.0
mean,331.46729,0.725,1.32495,6.914365,1.837662,0.7,2.096579,159.798958,35.755131,1.629779,2.609658,26055.334992,9075.001752,810.810811,204920.5,39128.169014,2004.419517
std,1028.155918,1.281984,3.944739,1.747452,0.881701,4.852976,1.929975,307.005357,351.464033,0.824938,2.909952,57259.604916,21948.995495,4931.969619,791804.2,67317.651434,13.839392
min,0.0,0.0,1.0,1.0,1.0,-15.0,0.0,-0.5,-5.0,1.0,1.0,-5387.05,-1156.04,0.0,0.0,0.0,1977.0
25%,11.0,0.0,1.0,6.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,2400.225,406.39,0.0,52000.0,0.0,1995.0
50%,53.0,0.0,1.0,7.0,2.0,1.0,3.0,33.25,0.0,1.0,1.0,8923.9,2340.905,0.0,146700.0,10000.0,2005.0
75%,256.0,1.0,1.0,7.0,2.0,2.0,3.0,95.55,0.0,2.0,3.0,24783.06,10000.0,0.0,250000.0,60000.0,2017.0
max,9990.0,4.0,103.0,10.0,4.0,22.0,7.0,1329.1,9990.0,6.0,18.0,839906.06,250000.0,30000.0,22656800.0,500000.0,2022.0


In [17]:
df.to_csv('ca_claims2.csv')