# Analyzing FEMA's NFIP Claims Dataset With DuckDB
Author: Mark Bauer

In [1]:
import duckdb
import pandas as pd

In [2]:
# Record the DuckDB version this notebook was executed with.
print(f"duckdb version: {duckdb.__version__}")

duckdb version: 0.10.0


In [3]:
# Open an in-memory DuckDB connection. Unsigned extensions are allowed because
# the httpfs extension below is loaded from a local build artifact.
con = duckdb.connect(config={"allow_unsigned_extensions": "true"})

# NOTE(review): this path points at a locally built DuckDB v0.10.0 / osx_amd64
# extension; other machines presumably need a different path -- confirm.
path = (
    "../duckdb/build/release/repository/"
    "v0.10.0/osx_amd64/httpfs.duckdb_extension"
)

# Install the extension from the local file, then load it into this connection.
con.install_extension(path)
con.load_extension(path)

In [4]:
# Materialize the Parquet file as a table named `claims`.
# CREATE OR REPLACE keeps this cell re-runnable: a plain CREATE TABLE raises
# an error as soon as `claims` already exists in the connection.
con.sql(
    """
    CREATE OR REPLACE TABLE claims AS
    FROM read_parquet('data/FimaNfipClaims.parquet')
    """
)

# Sanity check: number of rows loaded into the table.
con.sql("SELECT COUNT(*) FROM claims").show()

┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│      2609973 │
└──────────────┘



In [5]:
# Preview ten rows of the claims table (no ORDER BY, so row choice is arbitrary).
claims_preview = con.sql("SELECT * FROM claims LIMIT 10")
claims_preview.show()

┌──────────────────────┬──────────────────────┬──────────────────────┬───┬──────────────────────┬──────────┬───────────┐
│          id          │ agricultureStructu…  │       asOfDate       │ … │ censusBlockGroupFips │ latitude │ longitude │
│       varchar        │       boolean        │      timestamp       │   │       varchar        │  double  │  double   │
├──────────────────────┼──────────────────────┼──────────────────────┼───┼──────────────────────┼──────────┼───────────┤
│ 61a2811b-a92b-486d…  │ false                │ 2020-12-11 16:25:4…  │ … │ 010030114072         │     30.3 │     -87.7 │
│ b9123f27-fa0e-4097…  │ false                │ 2020-11-13 14:50:3…  │ … │ 010030114073         │     30.3 │     -87.7 │
│ 9de3700b-bbbd-408c…  │ false                │ 2022-04-21 19:56:4…  │ … │ 010010204001         │     32.5 │     -86.4 │
│ c2450563-0de0-40df…  │ false                │ 2020-03-09 19:28:2…  │ … │ 010010204001         │     32.5 │     -86.4 │
│ 6b13ea94-ef8d-428a…  │ false  

In [6]:
# Run DESCRIBE on the claims table and keep the result as a pandas DataFrame
# so it can be sliced in the following cells.
describe_relation = con.sql("DESCRIBE claims")
describe_df = describe_relation.df()
describe_df

Unnamed: 0,column_name,column_type,null,key,default,extra
0,id,VARCHAR,YES,,,
1,agricultureStructureIndicator,BOOLEAN,YES,,,
2,asOfDate,TIMESTAMP,YES,,,
3,basementEnclosureCrawlspaceType,SMALLINT,YES,,,
4,policyCount,SMALLINT,YES,,,
...,...,...,...,...,...,...
68,countyCode,VARCHAR,YES,,,
69,censusTract,VARCHAR,YES,,,
70,censusBlockGroupFips,VARCHAR,YES,,,
71,latitude,DOUBLE,YES,,,


In [7]:
# Rows 0-24 of the DESCRIBE output (all columns).
describe_df.iloc[:25]

Unnamed: 0,column_name,column_type,null,key,default,extra
0,id,VARCHAR,YES,,,
1,agricultureStructureIndicator,BOOLEAN,YES,,,
2,asOfDate,TIMESTAMP,YES,,,
3,basementEnclosureCrawlspaceType,SMALLINT,YES,,,
4,policyCount,SMALLINT,YES,,,
5,crsClassificationCode,SMALLINT,YES,,,
6,dateOfLoss,DATE,YES,,,
7,elevatedBuildingIndicator,BOOLEAN,YES,,,
8,elevationCertificateIndicator,VARCHAR,YES,,,
9,elevationDifference,DOUBLE,YES,,,


In [8]:
# Rows 25-49 of the DESCRIBE output (all columns).
describe_df.iloc[25:50]

Unnamed: 0,column_name,column_type,null,key,default,extra
25,postFIRMConstructionIndicator,BOOLEAN,YES,,,
26,rateMethod,VARCHAR,YES,,,
27,smallBusinessIndicatorBuilding,BOOLEAN,YES,,,
28,totalBuildingInsuranceCoverage,BIGINT,YES,,,
29,totalContentsInsuranceCoverage,BIGINT,YES,,,
30,yearOfLoss,SMALLINT,YES,,,
31,primaryResidenceIndicator,BOOLEAN,YES,,,
32,buildingDamageAmount,BIGINT,YES,,,
33,buildingDeductibleCode,VARCHAR,YES,,,
34,netBuildingPaymentAmount,DOUBLE,YES,,,


In [9]:
# Remaining rows (50 onward) of the DESCRIBE output (all columns).
describe_df.iloc[50:]

Unnamed: 0,column_name,column_type,null,key,default,extra
50,netIccPaymentAmount,DOUBLE,YES,,,
51,nfipRatedCommunityNumber,VARCHAR,YES,,,
52,nfipCommunityNumberCurrent,VARCHAR,YES,,,
53,nfipCommunityName,VARCHAR,YES,,,
54,nonPaymentReasonContents,VARCHAR,YES,,,
55,nonPaymentReasonBuilding,VARCHAR,YES,,,
56,numberOfUnits,INTEGER,YES,,,
57,buildingReplacementCost,BIGINT,YES,,,
58,contentsReplacementCost,BIGINT,YES,,,
59,replacementCostBasis,VARCHAR,YES,,,


In [10]:
# Run SUMMARIZE on the claims table and keep the per-column statistics as a
# pandas DataFrame so they can be sliced in the following cells.
summarize_relation = con.sql("SUMMARIZE claims")
summarize_df = summarize_relation.df()
summarize_df

Unnamed: 0,column_name,column_type,min,max,approx_unique,avg,std,q25,q50,q75,count,null_percentage
0,id,VARCHAR,000006af-51d1-453e-8c06-5791060175ab,fffffeef-b65f-4405-ab4e-60257523aa4f,2662059,,,,,,2609973,0.00
1,agricultureStructureIndicator,BOOLEAN,false,true,2,,,,,,2609973,0.00
2,asOfDate,TIMESTAMP,2019-09-19 06:12:43.388,2024-05-01 20:18:36.77,425022,,,,,,2609973,0.00
3,basementEnclosureCrawlspaceType,SMALLINT,0,4,4,1.228374267022299,1.056693471847669,0,1,2,2609973,70.18
4,policyCount,SMALLINT,1,1090,398,1.2797094069555508,6.67351428857184,1,1,1,2609973,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...
68,countyCode,VARCHAR,01001,78030,2942,,,,,,2609973,2.38
69,censusTract,VARCHAR,01001020100,78030961200,54666,,,,,,2609973,5.27
70,censusBlockGroupFips,VARCHAR,010010201001,780309612002,110152,,,,,,2609973,5.27
71,latitude,DOUBLE,-36.0,69.9,335,33.86958959935469,5.826147648469369,29.890613345766084,31.08104547546888,39.748689975453026,2609973,1.53


In [11]:
# Rows 0-24 of the SUMMARIZE output (all columns).
summarize_df.iloc[:25]

Unnamed: 0,column_name,column_type,min,max,approx_unique,avg,std,q25,q50,q75,count,null_percentage
0,id,VARCHAR,000006af-51d1-453e-8c06-5791060175ab,fffffeef-b65f-4405-ab4e-60257523aa4f,2662059,,,,,,2609973,0.0
1,agricultureStructureIndicator,BOOLEAN,false,true,2,,,,,,2609973,0.0
2,asOfDate,TIMESTAMP,2019-09-19 06:12:43.388,2024-05-01 20:18:36.77,425022,,,,,,2609973,0.0
3,basementEnclosureCrawlspaceType,SMALLINT,0,4,4,1.228374267022299,1.056693471847669,0.0,1.0,2.0,2609973,70.18
4,policyCount,SMALLINT,1,1090,398,1.2797094069555508,6.67351428857184,1.0,1.0,1.0,2609973,0.0
5,crsClassificationCode,SMALLINT,1,10,10,6.848428755453079,1.5049489596949983,5.0,7.0,8.0,2609973,82.3
6,dateOfLoss,DATE,1978-01-01,2024-05-01,16687,,,,,,2609973,0.0
7,elevatedBuildingIndicator,BOOLEAN,false,true,2,,,,,,2609973,0.0
8,elevationCertificateIndicator,VARCHAR,1,E,9,,,,,,2609973,77.41
9,elevationDifference,DOUBLE,-9989.0,998.0,370,1.281729690047788,29.36265619963509,0.0,1.0,2.73404367789926,2609973,72.86


In [12]:
# Rows 25-49 of the SUMMARIZE output (all columns).
summarize_df.iloc[25:50]

Unnamed: 0,column_name,column_type,min,max,approx_unique,avg,std,q25,q50,q75,count,null_percentage
25,postFIRMConstructionIndicator,BOOLEAN,false,true,2,,,,,,2609973,0.0
26,rateMethod,VARCHAR,1,W,22,,,,,,2609973,1.9
27,smallBusinessIndicatorBuilding,BOOLEAN,false,true,2,,,,,,2609973,0.0
28,totalBuildingInsuranceCoverage,BIGINT,0,243903000,11951,166889.41585940993,1228597.5907276445,39052.0,99935.0,210133.0,2609973,0.0
29,totalContentsInsuranceCoverage,BIGINT,0,6000000,3048,30733.689877477143,50621.5474246335,0.0,11530.0,44693.0,2609973,0.0
30,yearOfLoss,SMALLINT,1978,2024,47,2002.143508381121,12.501478526203302,1993.0,2005.0,2012.0,2609973,0.0
31,primaryResidenceIndicator,BOOLEAN,false,true,2,,,,,,2609973,0.0
32,buildingDamageAmount,BIGINT,0,927700000,201765,36311.10027419284,814576.355110798,3437.0,11220.0,40212.0,2609973,22.4
33,buildingDeductibleCode,VARCHAR,0,H,15,,,,,,2609973,11.99
34,netBuildingPaymentAmount,DOUBLE,-162432.16,10000000.0,1280807,24695.03533074863,59271.29895574316,0.8144092536265172,4708.958993265915,24777.876218092988,2609973,0.0


In [13]:
# Remaining rows (50 onward) of the SUMMARIZE output (all columns).
summarize_df.iloc[50:]

Unnamed: 0,column_name,column_type,min,max,approx_unique,avg,std,q25,q50,q75,count,null_percentage
50,netIccPaymentAmount,DOUBLE,-6450.0,60000.0,8708,361.2506056652691,3098.892784914053,0.0,0.0,0.0,2609973,0.0
51,nfipRatedCommunityNumber,VARCHAR,000000,999999,16288,,,,,,2609973,0.0
52,nfipCommunityNumberCurrent,VARCHAR,0000,815000,11987,,,,,,2609973,72.96
53,nfipCommunityName,VARCHAR,ABBEVILLE COUNTY *,"ZUMBRO FALLS, CITY OF",9794,,,,,,2609973,72.43
54,nonPaymentReasonContents,VARCHAR,01,99,23,,,,,,2609973,68.43
55,nonPaymentReasonBuilding,VARCHAR,01,99,23,,,,,,2609973,77.62
56,numberOfUnits,INTEGER,0,99999,445,1.3626484479940102,66.17856487595735,1.0,1.0,1.0,2609973,0.11
57,buildingReplacementCost,BIGINT,0,9999000000,443562,8613520.156327939,235672086.9084631,0.0,120386.0,222737.0,2609973,22.4
58,contentsReplacementCost,BIGINT,0,20000000,8867,2802.1066546353063,46646.03669656595,0.0,0.0,0.0,2609973,59.11
59,replacementCostBasis,VARCHAR,A,R,2,,,,,,2609973,7.27


In [14]:
# Inspect the pandas dtypes DuckDB produces for each column.
# Only 10 rows are fetched, so the non-null counts describe this sample,
# not the full table.
claims_sample_df = con.sql("SELECT * FROM claims LIMIT 10").df()
claims_sample_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 73 columns):
 #   Column                                      Non-Null Count  Dtype         
---  ------                                      --------------  -----         
 0   id                                          10 non-null     object        
 1   agricultureStructureIndicator               10 non-null     bool          
 2   asOfDate                                    10 non-null     datetime64[ns]
 3   basementEnclosureCrawlspaceType             3 non-null      float64       
 4   policyCount                                 10 non-null     int16         
 5   crsClassificationCode                       7 non-null      float64       
 6   dateOfLoss                                  10 non-null     datetime64[ns]
 7   elevatedBuildingIndicator                   10 non-null     bool          
 8   elevationCertificateIndicator               1 non-null      object        
 9   elevationDiff

In [15]:
# Top 20 states by total amount paid, broken down into building, contents and
# Increased Cost of Compliance payments (rounded to whole units).
#
# SUM() yields NULL for a group with no non-NULL inputs, and NULL + x is NULL,
# so every component of the total is coalesced to 0 before adding. Without
# this, a group missing one payment type would lose its whole TotalClaim.
con.sql(
    """
    SELECT
        state,
        ROUND(
            COALESCE(SUM(amountPaidOnBuildingClaim), 0)
            + COALESCE(SUM(amountPaidOnContentsClaim), 0)
            + COALESCE(SUM(amountPaidOnIncreasedCostOfComplianceClaim), 0),
            0
        )::INT64 AS TotalClaim,
        ROUND(SUM(amountPaidOnBuildingClaim), 0)::INT64 AS BuildingClaim,
        ROUND(SUM(amountPaidOnContentsClaim), 0)::INT64 AS ContentsClaim,
        ROUND(SUM(amountPaidOnIncreasedCostOfComplianceClaim), 0)::INT64 AS IncreasedCostOfComplianceClaim
    FROM claims
    GROUP BY state
    ORDER BY TotalClaim DESC
    LIMIT 20
    """
)

┌─────────┬─────────────┬───────────────┬───────────────┬────────────────────────────────┐
│  state  │ TotalClaim  │ BuildingClaim │ ContentsClaim │ IncreasedCostOfComplianceClaim │
│ varchar │    int64    │     int64     │     int64     │             int64              │
├─────────┼─────────────┼───────────────┼───────────────┼────────────────────────────────┤
│ LA      │ 20838923032 │   16431635833 │    4129930893 │                      277356306 │
│ TX      │ 17083042210 │   13136650534 │    3888589851 │                       57801825 │
│ FL      │ 11166770151 │    9458190896 │    1671373622 │                       37205633 │
│ NJ      │  6465642998 │    5345477364 │     869804214 │                      250361420 │
│ NY      │  5714558618 │    4944383614 │     717516208 │                       52658796 │
│ MS      │  3121617364 │    2385247785 │     675466706 │                       60902873 │
│ NC      │  2002988094 │    1692155613 │     273870198 │                       36962283 │

In [16]:
# Top 20 states by total amount paid for losses dated 2023-10-01 through
# 2024-09-30 (inclusive), with the same building / contents / ICC breakdown.
# NOTE(review): the window presumably represents federal fiscal year 2024 --
# confirm. The SUMMARIZE output above reports a maximum dateOfLoss of
# 2024-05-01, so the window is only partially covered by this dataset.
#
# SUM() yields NULL for a group with no non-NULL inputs, and NULL + x is NULL,
# so every component of the total is coalesced to 0 before adding.
con.sql(
    """
    SELECT
        state,
        ROUND(
            COALESCE(SUM(amountPaidOnBuildingClaim), 0)
            + COALESCE(SUM(amountPaidOnContentsClaim), 0)
            + COALESCE(SUM(amountPaidOnIncreasedCostOfComplianceClaim), 0),
            0
        )::INT64 AS TotalClaim,
        ROUND(SUM(amountPaidOnBuildingClaim), 0)::INT64 AS BuildingClaim,
        ROUND(SUM(amountPaidOnContentsClaim), 0)::INT64 AS ContentsClaim,
        ROUND(SUM(amountPaidOnIncreasedCostOfComplianceClaim), 0)::INT64 AS IncreasedCostOfComplianceClaim
    FROM claims
    WHERE dateOfLoss BETWEEN '2023-10-01' AND '2024-09-30'
    GROUP BY state
    ORDER BY TotalClaim DESC
    LIMIT 20
    """
)

┌─────────┬────────────┬───────────────┬───────────────┬────────────────────────────────┐
│  state  │ TotalClaim │ BuildingClaim │ ContentsClaim │ IncreasedCostOfComplianceClaim │
│ varchar │   int64    │     int64     │     int64     │             int64              │
├─────────┼────────────┼───────────────┼───────────────┼────────────────────────────────┤
│ ME      │   28553103 │      26108035 │       2435049 │                          10019 │
│ NY      │   23722092 │      21998415 │       1723677 │                              0 │
│ FL      │   23229815 │      21388638 │       1841177 │                              0 │
│ CA      │   21744654 │      20373883 │       1370772 │                              0 │
│ NJ      │   19296081 │      17150204 │       2145877 │                              0 │
│ RI      │    8546350 │       8129389 │        416960 │                              0 │
│ SC      │    8444517 │       7837273 │        607244 │                              0 │
│ NH      

In [17]:
# Top 20 county codes by total amount paid, broken down into building,
# contents and Increased Cost of Compliance payments.
#
# SUM() yields NULL for a group with no non-NULL inputs, and NULL + x is NULL,
# so every component of the total is coalesced to 0 before adding. Without
# this, a county missing one payment type would lose its whole TotalClaim.
con.sql(
    """
    SELECT
        countyCode,
        ROUND(
            COALESCE(SUM(amountPaidOnBuildingClaim), 0)
            + COALESCE(SUM(amountPaidOnContentsClaim), 0)
            + COALESCE(SUM(amountPaidOnIncreasedCostOfComplianceClaim), 0),
            0
        )::INT64 AS TotalClaim,
        ROUND(SUM(amountPaidOnBuildingClaim), 0)::INT64 AS BuildingClaim,
        ROUND(SUM(amountPaidOnContentsClaim), 0)::INT64 AS ContentsClaim,
        ROUND(SUM(amountPaidOnIncreasedCostOfComplianceClaim), 0)::INT64 AS IncreasedCostOfComplianceClaim
    FROM claims
    GROUP BY countyCode
    ORDER BY TotalClaim DESC
    LIMIT 20
    """
)

┌────────────┬────────────┬───────────────┬───────────────┬────────────────────────────────┐
│ countyCode │ TotalClaim │ BuildingClaim │ ContentsClaim │ IncreasedCostOfComplianceClaim │
│  varchar   │   int64    │     int64     │     int64     │             int64              │
├────────────┼────────────┼───────────────┼───────────────┼────────────────────────────────┤
│ 48201      │ 8705456456 │    6756752295 │    1928687883 │                       20016278 │
│ 22071      │ 7294446959 │    5890983538 │    1292969906 │                      110493515 │
│ 22051      │ 3527668438 │    2619027797 │     847127476 │                       61513166 │
│ 12071      │ 3333790855 │    2975824827 │     356154969 │                        1811059 │
│ 34029      │ 2607850866 │    2160793243 │     275246922 │                      171810701 │
│ 48167      │ 2456431694 │    1887282122 │     548178802 │                       20970769 │
│ 36059      │ 2279190677 │    1972284395 │     290167645 │           

In [18]:
# Top 20 loss years by total amount paid, broken down into building, contents
# and Increased Cost of Compliance payments.
#
# SUM() yields NULL for a group with no non-NULL inputs, and NULL + x is NULL,
# so every component of the total is coalesced to 0 before adding. Without
# this, a year missing one payment type would lose its whole TotalClaim.
con.sql(
    """
    SELECT
        yearOfLoss,
        ROUND(
            COALESCE(SUM(amountPaidOnBuildingClaim), 0)
            + COALESCE(SUM(amountPaidOnContentsClaim), 0)
            + COALESCE(SUM(amountPaidOnIncreasedCostOfComplianceClaim), 0),
            0
        )::INT64 AS TotalClaim,
        ROUND(SUM(amountPaidOnBuildingClaim), 0)::INT64 AS BuildingClaim,
        ROUND(SUM(amountPaidOnContentsClaim), 0)::INT64 AS ContentsClaim,
        ROUND(SUM(amountPaidOnIncreasedCostOfComplianceClaim), 0)::INT64 AS IncreasedCostOfComplianceClaim
    FROM claims
    GROUP BY yearOfLoss
    ORDER BY TotalClaim DESC
    LIMIT 20
    """
)

┌────────────┬─────────────┬───────────────┬───────────────┬────────────────────────────────┐
│ yearOfLoss │ TotalClaim  │ BuildingClaim │ ContentsClaim │ IncreasedCostOfComplianceClaim │
│   int16    │    int64    │     int64     │     int64     │             int64              │
├────────────┼─────────────┼───────────────┼───────────────┼────────────────────────────────┤
│       2005 │ 17770847244 │   13879514035 │    3604298083 │                      287035126 │
│       2017 │ 10657091054 │    8282076880 │    2350474755 │                       24539420 │
│       2012 │  9738239360 │    8306050543 │    1124096129 │                      308092688 │
│       2022 │  4997567787 │    4435519701 │     560003742 │                        2044343 │
│       2016 │  4468404400 │    3782715597 │     663420857 │                       22267946 │
│       2008 │  3491287805 │    2708717648 │     704664019 │                       77906138 │
│       2011 │  2431706170 │    2047987566 │     335400032 │

In [19]:
# Top 20 named flood events by total amount paid, broken down into building,
# contents and Increased Cost of Compliance payments. Claims without a
# floodEvent value are excluded.
#
# SUM() yields NULL for a group with no non-NULL inputs, and NULL + x is NULL,
# so every component of the total is coalesced to 0 before adding.
# The filter uses the standard `IS NOT NULL` predicate.
con.sql(
    """
    SELECT
        floodEvent,
        ROUND(
            COALESCE(SUM(amountPaidOnBuildingClaim), 0)
            + COALESCE(SUM(amountPaidOnContentsClaim), 0)
            + COALESCE(SUM(amountPaidOnIncreasedCostOfComplianceClaim), 0),
            0
        )::INT64 AS TotalClaim,
        ROUND(SUM(amountPaidOnBuildingClaim), 0)::INT64 AS BuildingClaim,
        ROUND(SUM(amountPaidOnContentsClaim), 0)::INT64 AS ContentsClaim,
        ROUND(SUM(amountPaidOnIncreasedCostOfComplianceClaim), 0)::INT64 AS IncreasedCostOfComplianceClaim
    FROM claims
    WHERE floodEvent IS NOT NULL
    GROUP BY floodEvent
    ORDER BY TotalClaim DESC
    LIMIT 20
    """
)

┌──────────────────────────┬─────────────┬───────────────┬───────────────┬────────────────────────────────┐
│        floodEvent        │ TotalClaim  │ BuildingClaim │ ContentsClaim │ IncreasedCostOfComplianceClaim │
│         varchar          │    int64    │     int64     │     int64     │             int64              │
├──────────────────────────┼─────────────┼───────────────┼───────────────┼────────────────────────────────┤
│ Hurricane Katrina        │ 16261697056 │   12659081935 │    3360020221 │                      242594900 │
│ Hurricane Harvey         │  9055446500 │    6925357028 │    2115077279 │                       15012193 │
│ Hurricane Sandy          │  8956175518 │    7707763973 │     951584858 │                      296826687 │
│ Hurricane Ian            │  4653622925 │    4132266664 │     519501586 │                        1854675 │
│ Flooding                 │  3886101980 │    2880486344 │    1003281617 │                        2334019 │
│ Hurricane Ike            │

In [20]:
# Top 20 rated flood zones by total amount paid, broken down into building,
# contents and Increased Cost of Compliance payments.
#
# SUM() yields NULL for a group with no non-NULL inputs, and NULL + x is NULL,
# so every component of the total is coalesced to 0 before adding. Without
# this, a zone missing one payment type would lose its whole TotalClaim.
con.sql(
    """
    SELECT
        ratedFloodZone,
        ROUND(
            COALESCE(SUM(amountPaidOnBuildingClaim), 0)
            + COALESCE(SUM(amountPaidOnContentsClaim), 0)
            + COALESCE(SUM(amountPaidOnIncreasedCostOfComplianceClaim), 0),
            0
        )::INT64 AS TotalClaim,
        ROUND(SUM(amountPaidOnBuildingClaim), 0)::INT64 AS BuildingClaim,
        ROUND(SUM(amountPaidOnContentsClaim), 0)::INT64 AS ContentsClaim,
        ROUND(SUM(amountPaidOnIncreasedCostOfComplianceClaim), 0)::INT64 AS IncreasedCostOfComplianceClaim
    FROM claims
    GROUP BY ratedFloodZone
    ORDER BY TotalClaim DESC
    LIMIT 20
    """
)

┌────────────────┬─────────────┬───────────────┬───────────────┬────────────────────────────────┐
│ ratedFloodZone │ TotalClaim  │ BuildingClaim │ ContentsClaim │ IncreasedCostOfComplianceClaim │
│    varchar     │    int64    │     int64     │     int64     │             int64              │
├────────────────┼─────────────┼───────────────┼───────────────┼────────────────────────────────┤
│ AE             │ 35111157725 │   29248455727 │    5349491157 │                      513210841 │
│ X              │ 13293684798 │   10258338304 │    3006893246 │                       28453247 │
│ B              │  3826003058 │    2835227048 │     971988985 │                       18787025 │
│ A              │  3097345563 │    2457942866 │     602004885 │                       37397812 │
│ C              │  2851078461 │    2117465674 │     723537437 │                       10075351 │
│ A04            │  2185900631 │    1800894098 │     364396806 │                       20609726 │
│ A03            │  

In [21]:
# Top 20 occupancy type codes by total amount paid, broken down into building,
# contents and Increased Cost of Compliance payments.
#
# SUM() yields NULL for a group with no non-NULL inputs, and NULL + x is NULL,
# so every component of the total is coalesced to 0 before adding. Without
# this, a group missing one payment type would lose its whole TotalClaim.
con.sql(
    """
    SELECT
        occupancyType,
        ROUND(
            COALESCE(SUM(amountPaidOnBuildingClaim), 0)
            + COALESCE(SUM(amountPaidOnContentsClaim), 0)
            + COALESCE(SUM(amountPaidOnIncreasedCostOfComplianceClaim), 0),
            0
        )::INT64 AS TotalClaim,
        ROUND(SUM(amountPaidOnBuildingClaim), 0)::INT64 AS BuildingClaim,
        ROUND(SUM(amountPaidOnContentsClaim), 0)::INT64 AS ContentsClaim,
        ROUND(SUM(amountPaidOnIncreasedCostOfComplianceClaim), 0)::INT64 AS IncreasedCostOfComplianceClaim
    FROM claims
    GROUP BY occupancyType
    ORDER BY TotalClaim DESC
    LIMIT 20
    """
)

┌───────────────┬─────────────┬───────────────┬───────────────┬────────────────────────────────┐
│ occupancyType │ TotalClaim  │ BuildingClaim │ ContentsClaim │ IncreasedCostOfComplianceClaim │
│     int16     │    int64    │     int64     │     int64     │             int64              │
├───────────────┼─────────────┼───────────────┼───────────────┼────────────────────────────────┤
│             1 │ 58068945128 │   46435050309 │   10743686447 │                      890208372 │
│             4 │  6399065992 │    4292302635 │    2095965716 │                       10797641 │
│             3 │  4374189293 │    4228633027 │     138249314 │                        7306952 │
│             2 │  3730087287 │    3407331381 │     287664029 │                       35091878 │
│             6 │  3358494979 │    2511223292 │     845669820 │                        1601866 │
│            11 │  2491648290 │    2158598189 │     331902381 │                        1147721 │
│            15 │   861271847 