In [1]:
import pandas as pd

# Load the parquet file
df = pd.read_parquet("FimaNfipClaims.parquet")

# Display the first 5 rows
df.head()

Unnamed: 0,agricultureStructureIndicator,asOfDate,basementEnclosureCrawlspaceType,policyCount,crsClassificationCode,dateOfLoss,elevatedBuildingIndicator,elevationCertificateIndicator,elevationDifference,baseFloodElevation,...,rentalPropertyIndicator,state,reportedCity,reportedZipCode,countyCode,censusTract,censusBlockGroupFips,latitude,longitude,id
0,False,2020-12-11 16:25:40.587000+00:00,0.0,1,,2020-09-16,True,,4.0,10.0,...,False,AL,Currently Unavailable,36542,1003,1003011407,10030114072,30.3,-87.7,b'\xac++rKqD\x15\x96o\x16/oO\xebl'
1,False,2020-11-13 14:50:38.288000+00:00,2.0,1,,2020-09-16,False,,6.0,10.0,...,False,AL,Currently Unavailable,36542,1003,1003011407,10030114073,30.3,-87.7,"b""\xdd\x95V\x0b\xaf\xd1F\xcb\xb7j\x19\xc5\x8f-..."
2,False,2022-04-21 19:56:43.051000+00:00,,1,8.0,2020-03-05,True,,,,...,False,AL,Currently Unavailable,36066,1001,1001020400,10010204001,32.5,-86.4,b'\xe44\xf7\xe3\xee\xffB\xe4\xab\xc6\xa5\xd4\x...
3,False,2020-03-09 19:28:27.372000+00:00,,1,8.0,2017-06-22,True,,,,...,False,AL,Currently Unavailable,36066,1001,1001020400,10010204001,32.5,-86.4,b']\xdd\x1b\x02\xa8GK \x87\xbeX\x02\xda\x96 d'
4,False,2020-01-22 16:55:53.194965+00:00,,1,8.0,2016-01-22,False,,,,...,False,AL,Currently Unavailable,36067,1001,1001020600,10010206001,32.5,-86.5,b'\xa6\xa3\x9c<\x8e\xe5G\xe9\x97_j\xd0f\xa3\xa...


In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2709121 entries, 0 to 2709120
Data columns (total 73 columns):
 #   Column                                      Dtype              
---  ------                                      -----              
 0   agricultureStructureIndicator               bool               
 1   asOfDate                                    datetime64[ns, UTC]
 2   basementEnclosureCrawlspaceType             float64            
 3   policyCount                                 int16              
 4   crsClassificationCode                       float64            
 5   dateOfLoss                                  object             
 6   elevatedBuildingIndicator                   bool               
 7   elevationCertificateIndicator               object             
 8   elevationDifference                         object             
 9   baseFloodElevation                          object             
 10  ratedFloodZone                              object    

In [3]:
df['buildingDescriptionCode'].value_counts().sort_index()
# 10-11 apartments, 12-13 Co-ops, 14 is commercial building, 15-16 condos, 

1.0     924505
2.0       2342
3.0        772
4.0        411
5.0       2415
6.0       1425
7.0        976
8.0      38780
9.0        176
10.0      2487
11.0       358
12.0       140
13.0       115
14.0         8
15.0         4
16.0         1
18.0         2
21.0         2
Name: buildingDescriptionCode, dtype: int64

In [7]:
num_unique_events = df['floodEvent'].nunique()
print(f"Number of unique flood events: {num_unique_events}")
missing_count = df['floodEvent'].isna().sum()
print(f"Number of missing values in 'floodEvent': {missing_count}")
missing_zip = df['reportedZipCode'].isna().sum()
print(f"Number of missing values in 'reportedZipCode': {missing_zip}")
missing_tract = df['censusTract'].isna().sum()
print(f"Number of missing values in 'censusTract': {missing_tract}")
missing_lat = df['latitude'].isna().sum()
print(f"Number of missing values in 'latitude': {missing_lat}")

Number of unique flood events: 177
Number of missing values in 'floodEvent': 764610
Number of missing values in 'reportedZipCode': 0
Number of missing values in 'censusTract': 137680
Number of missing values in 'latitude': 41311


In [8]:
top_50_df = df['floodEvent'].value_counts().head(50).reset_index()
top_50_df.columns = ['floodEvent', 'count']
display(top_50_df)

Unnamed: 0,floodEvent,count
0,Flooding,357614
1,Hurricane Katrina,208348
2,Hurricane Sandy,144848
3,Hurricane Harvey,92397
4,Hurricane Irene,73685
5,Hurricane Ike,58126
6,Hurricane Helene,57738
7,Hurricane Ian,48755
8,Tropical Storm Allison,39463
9,Hurricane Irma,33339


In [7]:
missing_count_ednum = df['eventDesignationNumber'].isna().sum()
print(f"Number of missing values in 'eventDesignationNumber': {missing_count_ednum}")

missing_count_year = df['yearOfLoss'].isna().sum()
print(f"Number of missing values in 'yearOfLoss': {missing_count_year}")


Number of missing values in 'eventDesignationNumber': 2525643
Number of missing values in 'yearOfLoss': 0


In [14]:
# Keep only Florida zip codes
df = df[df['reportedZipCode'].str.startswith(('32', '33'))]

In [15]:
columns_to_check = [
    'amountPaidOnBuildingClaim',
    'buildingDamageAmount',
    'netBuildingPaymentAmount',
    'buildingReplacementCost',
    'buildingPropertyValue',
    'reportedZipCode',
    'countyCode',
    'censusTract',
    'censusBlockGroupFips'
]

missing_counts = df[columns_to_check].isna().sum().reset_index()
missing_counts.columns = ['column', 'missing_values']
print(missing_counts)

                      column  missing_values
0  amountPaidOnBuildingClaim          101392
1       buildingDamageAmount          105469
2   netBuildingPaymentAmount               0
3    buildingReplacementCost          105469
4      buildingPropertyValue          105469
5            reportedZipCode               0
6                 countyCode             578
7                censusTract            5780
8       censusBlockGroupFips            5780


In [16]:
# Filter to events from 2000 onward
df_filtered = df[df['yearOfLoss'] >= 2000].copy()

# Create primary ID using eventDesignationNumber or ficoNumber
df_filtered['primaryID'] = df_filtered['eventDesignationNumber'].fillna(df_filtered['ficoNumber'])

# Drop rows where both identifiers are missing
df_filtered = df_filtered[~df_filtered['primaryID'].isna()]

df_filtered['dateOfLoss'] = pd.to_datetime(df_filtered['dateOfLoss'], errors='coerce')

# Function to create grouped summaries
def build_event_summary(df, group_cols):
    return (
        df.groupby(group_cols, dropna=False)
        .agg(
            num_observations=('primaryID', 'size'),
            total_damages=('netBuildingPaymentAmount', 'sum'),
            states_involved=('state', lambda x: ', '.join(sorted(x.dropna().unique()))),
            min_dateOfLoss=('dateOfLoss', 'min'),
            max_dateOfLoss=('dateOfLoss', 'max')
        )
        .reset_index()
    )

# Define groupings
group_id = ['primaryID', 'floodEvent', 'yearOfLoss']
group_zip = group_id + ['reportedZipCode']
group_block = group_id + ['censusBlockGroupFips']

# Build summaries
event_summary_id = build_event_summary(df_filtered, group_id)
event_summary_zip = build_event_summary(df_filtered, group_zip)
event_summary_block = build_event_summary(df_filtered, group_block)

# Clean for Stata export
def clean_for_stata(df):
    df = df.copy()
    df['primaryID'] = df['primaryID'].astype(str).where(df['primaryID'].notna(), None)
    df['total_damages'] = pd.to_numeric(df['total_damages'], errors='coerce')
    df['num_observations'] = df['num_observations'].astype('Int64')

    # Format date columns as strings (ISO 8601) for Stata
    df['min_dateOfLoss'] = df['min_dateOfLoss'].dt.strftime('%Y-%m-%d').where(df['min_dateOfLoss'].notna(), None)
    df['max_dateOfLoss'] = df['max_dateOfLoss'].dt.strftime('%Y-%m-%d').where(df['max_dateOfLoss'].notna(), None)
    
    return df

event_summary_id_clean = clean_for_stata(event_summary_id)
event_summary_zip_clean = clean_for_stata(event_summary_zip)
event_summary_block_clean = clean_for_stata(event_summary_block)

# Export to CSV
event_summary_id_clean.to_csv("event_summary_id.csv", index=False)
event_summary_zip_clean.to_csv("event_summary_zip.csv", index=False)
event_summary_block_clean.to_csv("event_summary_block.csv", index=False)

# Export to Stata
event_summary_id_clean.to_stata("event_summary_id.dta", write_index=False)
event_summary_zip_clean.to_stata("event_summary_zip.dta", write_index=False)
event_summary_block_clean.to_stata("event_summary_block.dta", write_index=False)