# Historical Flood Data

In [None]:
import pandas as pd
# historical flood data
def clean_flood_data():
    flood_df = pd.read_excel('FloodArchive.xlsx', engine='openpyxl')

    #filter only for United States
    flood_df = flood_df[flood_df['Country'] == 'USA']
    #add a zip code based on the long and lat

    return flood_df

flood_df = clean_flood_data()

FileNotFoundError: [Errno 2] No such file or directory: 'FloodArchive.xlsx'

In [None]:
print(f'USA number of flood incidents: {len(flood_df)}')
print(flood_df.head())
#save as csv
flood_df.to_csv('united_states_floods.csv', index=False)


In [None]:
# !pip install folium
import folium
import math

#visual representation of FLOOD DATA
map_usa = folium.Map(location=[39.8283, -98.5795], zoom_start=5)
severity_colors = {
    1.0: "yellow", #large flood events: 1-2 decades-long reported interval since the last similar event
    1.5: "orange", #very large events: greater than 2 decades but less than 100 year estimated recurrence interval
    2.0: "red" #Extreme events: with an estimated recurrence interval greater than 100 years.
}

# Add a marker for each flood occurrence
for index, row in flood_df.iterrows():
    # Add a circle marker at each flood location (lat, long)
    folium.CircleMarker(
        location=[row['lat'], row['long']],
        popup=f"ID: {row['ID']} | Severity: {row['Severity']} | Displaced: {row['Displaced']} | Date: {row['Began']}",
        color=severity_colors[row['Severity']] ,
        fill=True,
        fill_opacity=0.6
    ).add_to(map_usa)

# Save map to HTML file
map_usa.save('flood_map.html')


In [None]:
!pip install sqlalchemy_mate==2.0.0.0

#add zipcode column to flood data
from uszipcode import SearchEngine

search = SearchEngine()

def get_zipcode(lat, lon):
    result = search.by_coordinates(lat, lon)
    if result:
        return result[0].zipcode
    return None

# Apply the get_zipcode function to each row and create a new column 'zipcode'
flood_df['zipcode'] = flood_df.apply(lambda row: get_zipcode(row['lat'], row['long']), axis=1)


In [None]:
#save as csv
pd.set_option('display.max_columns', None)
print(flood_df.head())
flood_df.to_csv('united_states_floods.csv', index=False)

#Historical Housing Data

In [None]:
valid_zipcodes = flood_df[~flood_df['zipcode'].isna()]['zipcode'].unique()
def clean_housing_data():
    df = pd.read_csv('hpi_at_bdl_zip5.csv', dtype={'Five-Digit ZIP Code': str})
    df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
    #zipcodes from flood data
    df = df[df['Five-Digit ZIP Code'].isin(valid_zipcodes)]
    return df

df_housing = clean_housing_data()

In [None]:
print(df_housing.head(10))
#save as csv
df_housing.to_csv('united_states_housing.csv', index=False)


     Five-Digit ZIP Code  Year  Annual Change (%)     HPI  HPI with 1990 base  \
2401               01253  2001                NaN  100.00                 NaN   
2402               01253  2002               9.38  109.38                 NaN   
2403               01253  2003               2.17  111.75                 NaN   
2404               01253  2004              21.80  136.12                 NaN   
2405               01253  2005               8.09  147.13                 NaN   
2406               01253  2006              12.92  166.14                 NaN   
2407               01253  2007              -2.73  161.60                 NaN   
2408               01253  2008              -1.61  159.00                 NaN   
2409               01253  2009              -2.66  154.77                 NaN   
2410               01253  2010               0.33  155.28                 NaN   

      HPI with 2000 base  
2401                 NaN  
2402                 NaN  
2403                 NaN  


#Random Forest Regressor

In [23]:
#code for predicing HPI based on flood data
#load data
import pandas as pd
flood_data = pd.read_csv('united_states_floods.csv', dtype={'zipcode': str})
housing_data = pd.read_csv('united_states_housing.csv', dtype={'Five-Digit ZIP Code': str})
print(flood_data.head(5))


   ID GlideNumber Country OtherCountry      long      lat       Area  \
0   9           0     USA            0  -85.1742  40.6691  210527.96   
1  11           0     USA            0  -89.5537  40.6814   26266.14   
2  12           0     USA            0 -108.0930  35.3824   26527.13   
3  13           0     USA            0  -96.7845  29.6044  141508.00   
4  14           0     USA            0  -83.5377  42.0122   16883.54   

        Began       Ended Validation  Dead  Displaced          MainCause  \
0  1985-02-22  1985-03-01       News     7       2250  Rain and snowmelt   
1  1985-03-03  1985-03-08       News     4       2400  Rain and snowmelt   
2  1985-03-13  1985-03-14       News     0         80  Rain and snowmelt   
3  1985-03-14  1985-03-15       News     0          0         Heavy rain   
4  1985-03-30  1985-03-31       News     0        300         Heavy rain   

   Severity zipcode  
0       2.0   46781  
1       2.0   61611  
2       1.0   87045  
3       1.0   78962  


In [24]:
print(housing_data.head(5))

  Five-Digit ZIP Code  Year  Annual Change (%)     HPI  HPI with 1990 base  \
0               01253  2001                NaN  100.00                 NaN   
1               01253  2002               9.38  109.38                 NaN   
2               01253  2003               2.17  111.75                 NaN   
3               01253  2004              21.80  136.12                 NaN   
4               01253  2005               8.09  147.13                 NaN   

   HPI with 2000 base  
0                 NaN  
1                 NaN  
2                 NaN  
3                 NaN  
4                 NaN  


In [25]:
#SUMMARIZE FLOOD EVENTS
# For each zip code, get a summary of the floods that occurred up to and including that year.
# calculate duration(in days)
flood_data['Began'] = pd.to_datetime(flood_data['Began'])
flood_data['Ended'] = pd.to_datetime(flood_data['Ended'])
flood_data.loc[:, 'Duration'] = (flood_data['Ended'] - flood_data['Began']).dt.days
flood_data['Year'] = flood_data['Began'].dt.year


# # aggregate flood data by zip code and year
flood_summary = flood_data.groupby(['zipcode', 'Year']).agg(
    flood_count=('Severity', 'count'),
    max_severity=('Severity', lambda x: x.max()),
    median_dead=('Dead', 'median'),
    median_displaced=('Displaced', 'median'),
    median_duration=('Duration', 'median'),
    median_area = ('Area', 'median')
).reset_index()

flood_summary



Unnamed: 0,zipcode,Year,flood_count,max_severity,median_dead,median_displaced,median_duration,median_area
0,00641,1985,1,1.0,2.0,4500.0,1.0,1423.92
1,00664,1985,1,1.0,99.0,6200.0,2.0,7219.80
2,00698,2001,1,1.0,2.0,0.0,1.0,245.95
3,01031,1996,1,2.0,5.0,700.0,4.0,67013.98
4,01031,2007,1,1.0,9.0,0.0,6.0,109079.09
...,...,...,...,...,...,...,...,...
450,99705,1991,1,1.0,2.0,800.0,4.0,22130.06
451,99705,2003,1,1.0,0.0,0.0,4.0,1681.67
452,99709,2003,1,1.0,0.0,0.0,5.0,6718.98
453,99743,2003,1,1.0,0.0,20.0,4.0,13111.55


In [26]:
housing_data.rename(columns={'Five-Digit ZIP Code': 'zipcode'}, inplace=True)

# Merge flood summary with housing price index data based on 'zipcode' and 'Year'
merged_data = pd.merge(flood_summary, housing_data, on=['zipcode', 'Year'], how='left')

# Display the first few rows of the merged data
merged_data

Unnamed: 0,zipcode,Year,flood_count,max_severity,median_dead,median_displaced,median_duration,median_area,Annual Change (%),HPI,HPI with 1990 base,HPI with 2000 base
0,00641,1985,1,1.0,2.0,4500.0,1.0,1423.92,,,,
1,00664,1985,1,1.0,99.0,6200.0,2.0,7219.80,,,,
2,00698,2001,1,1.0,2.0,0.0,1.0,245.95,,,,
3,01031,1996,1,2.0,5.0,700.0,4.0,67013.98,,,,
4,01031,2007,1,1.0,9.0,0.0,6.0,109079.09,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
450,99705,1991,1,1.0,2.0,800.0,4.0,22130.06,20.76,72.44,120.76,69.72
451,99705,2003,1,1.0,0.0,0.0,4.0,1681.67,4.43,113.15,188.63,108.90
452,99709,2003,1,1.0,0.0,0.0,5.0,6718.98,2.88,246.91,162.72,110.30
453,99743,2003,1,1.0,0.0,20.0,4.0,13111.55,,,,


In [27]:
#drop unmatched rows
#NOTE ROWS DROPPED PRETTY SIGNIFICANTLY
merged_data = merged_data.dropna()
merged_data

Unnamed: 0,zipcode,Year,flood_count,max_severity,median_dead,median_displaced,median_duration,median_area,Annual Change (%),HPI,HPI with 1990 base,HPI with 2000 base
6,01749,2018,1,1.5,0.0,0.0,2.0,5095.45,6.20,468.17,230.13,164.19
12,03903,2006,1,2.0,1.0,2500.0,11.0,42910.99,3.83,314.67,214.37,164.46
20,07753,2011,1,2.0,20.0,370000.0,17.0,252912.89,-7.22,562.36,194.21,161.89
22,08088,2004,1,2.0,0.0,500.0,3.0,2441.81,14.87,395.73,163.53,144.36
32,17036,2018,1,1.5,1.0,0.0,1.0,25377.62,4.30,265.31,187.85,152.03
...,...,...,...,...,...,...,...,...,...,...,...,...
442,98503,2008,1,1.5,2.0,0.0,6.0,75156.68,-1.42,578.90,289.04,181.96
443,98506,2007,1,1.0,2.0,1100.0,4.0,41684.03,4.41,653.48,320.03,188.83
450,99705,1991,1,1.0,2.0,800.0,4.0,22130.06,20.76,72.44,120.76,69.72
451,99705,2003,1,1.0,0.0,0.0,4.0,1681.67,4.43,113.15,188.63,108.90


In [28]:

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score


#Use RandomForestRegressor to predict HPI based on floods

X = merged_data[['flood_count', 'max_severity', 'median_dead', 'median_displaced', 'median_duration', 'median_area']]
y = merged_data['HPI']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)

# evaluate the model performance
print("Random Forest Regressor: Predicting HPI(no past history, just current year):")
r2 = r2_score(y_test, y_pred)
print(f'R² score: {r2:.3f}')

mae = mean_absolute_error(y_test, y_pred)
print(f'Mean Absolute Error (MAE): {mae:.3f}')

print(f"Predictions: {y_pred[:10]}")
print(f"Actual: {y_test[:10].values}")

Random Forest Regressor: Predicting HPI(no past history, just current year):
R² score: -0.707
Mean Absolute Error (MAE): 167.823
Predictions: [245.7141 278.2456 384.5934 364.4457 326.7193 148.5004 654.1843 298.0053
 294.3737 352.6352]
Actual: [508.46 236.46 160.39 171.49 348.77 188.54 286.93 194.88 242.57 468.17]


### Random Forest Regressor w/ past memory(10 years prior)

In [29]:
flood_data = flood_data.dropna(subset=['zipcode', 'Year'])

In [30]:
# Create the 3-area zipcode column (first 3 digits of zipcode)
flood_summary['zipcode_3area'] = flood_summary['zipcode'].astype(str).str[:3]
flood_summary = flood_summary.sort_values(by=['zipcode_3area', 'Year'])

flood_summary_10_years = flood_summary.copy()

# Rolling average for each feature, grouped by the 3-area zipcode
flood_summary_10_years['flood_count_total_10'] = flood_summary.groupby('zipcode_3area')['flood_count'].transform(lambda x: x.rolling(window=10, min_periods=1).sum())
flood_summary_10_years['max_severity_avg_10'] = flood_summary.groupby('zipcode_3area')['max_severity'].transform(lambda x: x.rolling(window=10, min_periods=1).mean())
flood_summary_10_years['median_dead_avg_10'] = flood_summary.groupby('zipcode_3area')['median_dead'].transform(lambda x: x.rolling(window=10, min_periods=1).median())
flood_summary_10_years['median_displaced_avg_10'] = flood_summary.groupby('zipcode_3area')['median_displaced'].transform(lambda x: x.rolling(window=10, min_periods=1).median())
flood_summary_10_years['median_duration_avg_10'] = flood_summary.groupby('zipcode_3area')['median_duration'].transform(lambda x: x.rolling(window=10, min_periods=1).median())
flood_summary_10_years['median_area_avg_10'] = flood_summary.groupby('zipcode_3area')['median_area'].transform(lambda x: x.rolling(window=10, min_periods=1).median())


flood_summary_10_years.head(10)


Unnamed: 0,zipcode,Year,flood_count,max_severity,median_dead,median_displaced,median_duration,median_area,zipcode_3area,flood_count_total_10,max_severity_avg_10,median_dead_avg_10,median_displaced_avg_10,median_duration_avg_10,median_area_avg_10
0,641,1985,1,1.0,2.0,4500.0,1.0,1423.92,6,1.0,1.0,2.0,4500.0,1.0,1423.92
1,664,1985,1,1.0,99.0,6200.0,2.0,7219.8,6,2.0,1.0,50.5,5350.0,1.5,4321.86
2,698,2001,1,1.0,2.0,0.0,1.0,245.95,6,3.0,1.0,2.0,4500.0,1.0,1423.92
3,1031,1996,1,2.0,5.0,700.0,4.0,67013.98,10,1.0,2.0,5.0,700.0,4.0,67013.98
4,1031,2007,1,1.0,9.0,0.0,6.0,109079.09,10,2.0,1.5,7.0,350.0,5.0,88046.535
5,1253,1998,1,2.0,0.0,0.0,2.0,1697.01,12,1.0,2.0,0.0,0.0,2.0,1697.01
6,1749,2018,1,1.5,0.0,0.0,2.0,5095.45,17,1.0,1.5,0.0,0.0,2.0,5095.45
7,3262,1990,1,1.0,0.0,400.0,4.0,20288.6,32,1.0,1.0,0.0,400.0,4.0,20288.6
8,3448,2005,1,2.0,11.0,3000.0,9.0,38291.77,34,1.0,2.0,11.0,3000.0,9.0,38291.77
10,3585,1998,1,2.0,21.0,11000.0,7.0,18116.79,35,1.0,2.0,21.0,11000.0,7.0,18116.79


In [31]:
merged_data = pd.merge(flood_summary_10_years, housing_data, on=['zipcode', 'Year'], how='left')
merged_data = merged_data.dropna()
merged_data

Unnamed: 0,zipcode,Year,flood_count,max_severity,median_dead,median_displaced,median_duration,median_area,zipcode_3area,flood_count_total_10,max_severity_avg_10,median_dead_avg_10,median_displaced_avg_10,median_duration_avg_10,median_area_avg_10,Annual Change (%),HPI,HPI with 1990 base,HPI with 2000 base
6,01749,2018,1,1.5,0.0,0.0,2.0,5095.45,017,1.0,1.5,0.0,0.0,2.0,5095.450,6.20,468.17,230.13,164.19
12,03903,2006,1,2.0,1.0,2500.0,11.0,42910.99,039,1.0,2.0,1.0,2500.0,11.0,42910.990,3.83,314.67,214.37,164.46
20,07753,2011,1,2.0,20.0,370000.0,17.0,252912.89,077,1.0,2.0,20.0,370000.0,17.0,252912.890,-7.22,562.36,194.21,161.89
21,08088,2004,1,2.0,0.0,500.0,3.0,2441.81,080,1.0,2.0,0.0,500.0,3.0,2441.810,14.87,395.73,163.53,144.36
32,17036,2018,1,1.5,1.0,0.0,1.0,25377.62,170,1.0,1.5,1.0,0.0,1.0,25377.620,4.30,265.31,187.85,152.03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
443,98503,2008,1,1.5,2.0,0.0,6.0,75156.68,985,3.0,1.5,2.0,1100.0,6.0,75156.680,-1.42,578.90,289.04,181.96
444,98501,2009,1,1.5,9.0,0.0,1.0,47811.61,985,4.0,1.5,2.0,550.0,5.0,61484.145,-5.55,577.53,281.05,170.92
449,99705,1991,1,1.0,2.0,800.0,4.0,22130.06,997,1.0,1.0,2.0,800.0,4.0,22130.060,20.76,72.44,120.76,69.72
451,99705,2003,1,1.0,0.0,0.0,4.0,1681.67,997,3.0,1.0,0.0,0.0,4.0,1681.670,4.43,113.15,188.63,108.90


In [33]:
X = merged_data[['flood_count', 'max_severity', 'median_dead', 'median_displaced',
                 'median_duration', 'median_area',
                 'flood_count_total_10', 'max_severity_avg_10',
                 'median_dead_avg_10', 'median_displaced_avg_10',
                 'median_duration_avg_10', 'median_area_avg_10']]

y = merged_data['HPI']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)

# evaluate the model's performance
print("Random Forest Regressor: Predicting HPI(w/ past history, prior 10 year flood averages):")
r2 = r2_score(y_test, y_pred)
print(f'R² score: {r2:.3f}')

mae = mean_absolute_error(y_test, y_pred)
print(f'Mean Absolute Error (MAE): {mae:.3f}')

print(f"Predictions: {y_pred[:10]}")  # First 10 predicted values
print(f"Actual: {y_test[:10].values}")  # First 10 actual values

Random Forest Regressor: Predicting HPI(w/ past history, prior 10 year flood averages):
R² score: -0.603
Mean Absolute Error (MAE): 145.056
Predictions: [321.9379 327.8695 449.8243 205.7653 217.6442 122.0748 440.8597 690.8394
 237.1035 707.6181]
Actual: [508.46 236.46 160.39 171.49 348.77 188.54 286.93 194.88 242.57 468.17]
