## INFO 450 Final Project: FEMA Disaster Relief

*Mia Avellanet*

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
from scipy import stats

In [44]:
DATA_URL = "https://storage.googleapis.com/info_450/IndividualAssistanceHousingRegistrantsLargeDisasters%20(1).csv"

In [45]:
df=pd.read_csv(DATA_URL, nrows=10000)
df.head(5)

Unnamed: 0,disasterNumber,damagedCity,damagedStateAbbreviation,damagedZipCode,householdComposition,grossIncome,specialNeeds,ownRent,residenceType,homeOwnersInsurance,...,rentalAssistanceEndDate,rentalResourceCity,rentalResourceStateAbbreviation,rentalResourceZipCode,primaryResidence,personalPropertyEligible,ppfvl,censusBlockId,id,censusYear
0,4332,HOUSTON,TX,77036,1,1800.0,1,Renter,Apartment,0,...,,,,,1,0,,482014300000000.0,d214ed1e-951e-484b-b014-2a885a7ea234,2010.0
1,4337,SARASOTA,FL,34238,1,,0,Renter,Condo,0,...,,,,,1,0,,121150000000000.0,e7b10c4f-adec-4a58-a4fc-85b5a20dfa4c,2010.0
2,4337,KISSIMMEE,FL,34758,1,28000.0,0,Renter,House/Duplex,0,...,,,,,1,0,0.0,120970400000000.0,0832cc95-c445-4429-956c-e2d1864d37ac,2010.0
3,4339,ARECIBO,PR,612,2,2100.0,0,Owner,House/Duplex,0,...,,,,,1,0,199.99,720133000000000.0,63bc856a-5cfa-44c8-96e1-f1866735cf94,2010.0
4,4559,SULPHUR,LA,70663,4,55000.0,0,Renter,Apartment,0,...,,,,,1,0,,220190000000000.0,ce6f1140-3777-41d8-8da0-f7f4891b2228,2020.0


In [46]:
df=df.dropna(subset=["tsaEligible"])
df["tsaEligible"]=df["tsaEligible"].astype(int)

* Drops rows where tsaEligible (target) is missing

In [47]:
df["residenceType"]=df["residenceType"].fillna("Unknown")

* Fills missing residenceType with Unknown

In [48]:
df["grossIncome_missing"]=df["grossIncome"].isna().astype(int)
df["grossIncome"]=df["grossIncome"].fillna(0)
df["repairAmount_missing"]=df["repairAmount"].isna().astype(int)
df["repairAmount"]=df["repairAmount"].fillna(0)

* Fills missing grossIncome and repairAmount with 0, and creates flags

In [49]:
df["damagedStateAbbreviation"]=df["damagedStateAbbreviation"].fillna("UNK")

* Fills missing state abbreviations with "UNK"

In [50]:
for col in ["destroyed","specialNeeds"]:
  if col in df.columns:
    df[col]=df[col].replace(
        {"Yes":1, "yes": 1, "No": 0, "no": 0}
    ).fillna(0).astype(int)
  print(df.isna().sum().head(10))

disasterNumber              0
damagedCity                 0
damagedStateAbbreviation    0
damagedZipCode              0
householdComposition        0
grossIncome                 0
specialNeeds                0
ownRent                     0
residenceType               0
homeOwnersInsurance         0
dtype: int64
disasterNumber              0
damagedCity                 0
damagedStateAbbreviation    0
damagedZipCode              0
householdComposition        0
grossIncome                 0
specialNeeds                0
ownRent                     0
residenceType               0
homeOwnersInsurance         0
dtype: int64


* Converts yes/no columns to 1/0, blanks assumed as "No"

In [51]:
crosstab_state=pd.crosstab(df["residenceType"],df["tsaEligible"], normalize="index")*100
print("\nTSA Eligibility Rate by State/Territory(%):")
print(crosstab_state.round(1))


TSA Eligibility Rate by State/Territory(%):
tsaEligible                  0     1
residenceType                       
Apartment                 62.8  37.2
Assisted Living Facility  57.1  42.9
Boat                      36.4  63.6
College Dorm              66.7  33.3
Condo                     51.4  48.6
House/Duplex              60.9  39.1
Military Housing          50.0  50.0
Mobile Home               79.8  20.2
Other                     55.7  44.3
Townhouse                 45.9  54.1
Travel Trailer            68.3  31.7


* TSA Eligibility by residence type

In [52]:
crosstab_state=pd.crosstab(df["damagedStateAbbreviation"],df["tsaEligible"], normalize="index")*100
print("\nTSA Eligibility Rate by State/Territory(%):")
print(crosstab_state.round(1))


TSA Eligibility Rate by State/Territory(%):
tsaEligible                  0     1
damagedStateAbbreviation            
FL                        68.1  31.9
LA                        84.5  15.5
NC                        94.7   5.3
PR                         8.6  91.4
TX                        71.1  28.9


* TSA eligibility by state/territory

In [53]:
avg_repair=df.groupby("damagedStateAbbreviation")["repairAmount"].mean().sort_values(ascending=False)
print("\nAverage Repair Amount by State:")
print(avg_repair.head(10))


Average Repair Amount by State:
damagedStateAbbreviation
TX    811.336921
LA    486.130988
PR    420.444592
NC    327.231630
FL    105.062896
Name: repairAmount, dtype: float64


* Average repair amount by state

In [54]:
tsa_rate_state=(
    df.groupby("damagedStateAbbreviation")["tsaEligible"]
    .mean()
    .sort_values(ascending=False)
    .reset_index()
)
figone=px.bar(
    tsa_rate_state,
    x="damagedStateAbbreviation",
    y="tsaEligible",
    title="TSA Eligibility Rate by State/Territory",
    labels={"tsaEligible": "Eligibility Rate", "damagedStateAbbreviation": "State"},
)
figone.show()

* Bar Chart: TSA eligibility rate by state

In [55]:
figtwo=px.histogram(
    df,
    x="repairAmount",
    nbins=60,
    title="Distribution of Repair Amount",
    labels={"repairAmount": "Repair Amount ($)"},
)
figtwo.show()

* Histogram: Distribution of repairAmount

In [56]:
figthree=px.box(
    df,
    x="residenceType",
    y="repairAmount",
    title="Repair Amount Across Residence Types",
    labels={"residenceType": "Residence Type", "repairAmount": "Repair Amount ($)"},
)
figthree.show()

* Boxplot: repairAmount across residence types

In [57]:
figfour=px.histogram(
    df,
    x="specialNeeds",
    color="tsaEligible",
    barmode="group",
    title="TSA Eligibility by Special Needs Status",
    labels={"specialNeeds": "Special Needs (0=No, 1=Yes)", "tsaEligible": "TSA Eligible"},
)
figfour.show()

In [58]:
def mean_confidence_interval(data, confidence=0.95):
  data=np.array(data)
  n = len(data)
  mean = np.mean(data)
  se = stats.sem(data)
  h = se * stats.t.ppf((1+confidence)/2, n-1)
  return mean, mean - h, mean + h

* Defines a small function in order to calculate for CI

In [59]:
mean_all, lower_all, upper_all = mean_confidence_interval(df["repairAmount"])
print("Overall Mean Repair Amount and 95% Confidence Interval:")
print(f"Mean = ${mean_all:,.2f}")
print(f"95% CI = [${lower_all:,.2f}, ${upper_all:,.2f}]")

Overall Mean Repair Amount and 95% Confidence Interval:
Mean = $370.14
95% CI = [$327.09, $413.19]


* Calculate CI for repairAmount (whole sample)

In [60]:
eligible=df[df["tsaEligible"]==1]["repairAmount"]
not_eligible=df[df["tsaEligible"]==0]["repairAmount"]

* I calculated a 95& CI for the average repair amount to estimate the range where the true population mean falls. This gives FEMA an estimation of the typical repair costs applicants faced after disasters. The confidence interval shows the average repair amount along with upper and lower limits, meaning that we can be 95% confident that the actual mean repair cost is within that range.

In [61]:
t_stat,p_value = stats.ttest_ind(eligible,not_eligible,equal_var=False)
t_stat, p_value

(np.float64(6.8370569964133185), np.float64(8.976802964415808e-12))

* Welch's t-test

In [62]:
print("\nT-Test: TSA Eligible vs Not Eligible")
print(f"t-statistic = {t_stat:.3f}")
print(f"p-value = {p_value:.4f}")


T-Test: TSA Eligible vs Not Eligible
t-statistic = 6.837
p-value = 0.0000


In [63]:
if p_value < 0.05:
  print("There is a significant difference in average repair amounts")
else:
  print("There is no significant difference in average repair amounts")

There is a significant difference in average repair amounts


* I used a t-test to compare the average repair amounts between applicants who were TSA eligible and those who were not. This test checks if the difference in their averages is statistically significant or just due to random chance. If the p-value was less than 0.05, it meant that the difference is significant, which could correlate to TSA eligibility possiblity relating to higher/lower repair costs.

In [64]:
state1="LA"
state2="TX"
la=df[df["damagedStateAbbreviation"]==state1]["repairAmount"]
tx=df[df["damagedStateAbbreviation"]==state2]["repairAmount"]

In [65]:
t_stat2,p_value2 = stats.ttest_ind(tx,la,equal_var=False)

In [66]:
print(f"\nT-Test: {state1} vs {state2}")
print(f"t-statistic = {t_stat2:.3f}")
print(f"p-value = {p_value2:.4f}")


T-Test: LA vs TX
t-statistic = 3.168
p-value = 0.0015


In [67]:
if p_value2 < 0.05:
    print(f" {state1} and {state2} have significantly different average repair amounts.")
else:
    print(f" No significant difference between {state1} and {state2}.")

 LA and TX have significantly different average repair amounts.


* I ran another t-test in order to compare the mean repair amounts between applicants in LA and TX. Both states experience frequent hurriances and floods, so I was curious to see if the damage costs differ. If the p-value was below 0.05, it shows a meaningful difference in average repair amounts between the two states. If it was higher, it means their repair costs were not significantly different. In my case, LA and TX are significantly difference because of the p value being 0. These results sugguest that both eligibility and location play an important role in the amount of repair assistance applicants may need.

In [68]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report

In [69]:
X=df[["grossIncome", "repairAmount", "destroyed","waterLevel","residenceType", "damagedStateAbbreviation"]]
y=df["tsaEligible"].astype(int)

* Before modeling, I filled any missing values to avoid errors and ensure all rows are used. Numeric values such as grossIncome and repairAmount were replaced with 0, and text fields were replaced with "unknown" or UNK

In [70]:
X["grossIncome"] = X["grossIncome"].fillna(0)
X["repairAmount"] = X["repairAmount"].fillna(0)
X["destroyed"] = X["destroyed"].fillna(0)
X["waterLevel"] = X["waterLevel"].fillna(0)
X["residenceType"] = X["residenceType"].fillna("Unknown")
X["damagedStateAbbreviation"] = X["damagedStateAbbreviation"].fillna("UNK")



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

* I chose predictors that could influence if an applicant qualifies for TSA. The variable tsaEligible indicates if an applicant was approved for temporary shelter assistance.

In [71]:
X_train, X_test, y_train, y_test=train_test_split(X,y,test_size=0.2,random_state=42)

* The dataset was split into 80% training and 20% testing to evaluate model performance fairly.

In [72]:
numeric_features=["grossIncome","repairAmount", "destroyed", "waterLevel"]
categorical_features=["residenceType","damagedStateAbbreviation"]

preprocessor=ColumnTransformer(
    transformers=[
        ("num",MinMaxScaler(),numeric_features),
        ("cat",OneHotEncoder(handle_unknown="ignore"),categorical_features)
    ]
)

* I used MinMaxScaler to scale numeric variables between 0 and 1 so that no feature dominates the others. For the text variables I used OneHotEncoder to convert them into binary columns.

In [73]:
tree_pipeline=Pipeline(steps=[
    ("preprocessor",preprocessor),
    ("model", DecisionTreeClassifier(max_depth=6, random_state=42))
])

In [74]:
rf_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", RandomForestClassifier(n_estimators=150, max_depth=10, random_state=42))
])

* I built a decision tree and a random forest. The random forest combines many trees to improve accuracy. The decision tree provides a visual of the key predictors. Both use the same preprocessing pipeline for consistency.

In [75]:
tree_pipeline.fit(X_train, y_train)
rf_pipeline.fit(X_train, y_train)

In [76]:
y_pred_tree=tree_pipeline.predict(X_test)
y_pred_rf=rf_pipeline.predict(X_test)

* After training I used each model to predict TSA eligibility for the test set

In [77]:
def show_results(name, y_true, y_pred):
  print(f"\n{name} Results:")
  print("Accuracy:", round(accuracy_score(y_true, y_pred),3))
  print("Precision:", round(precision_score(y_true, y_pred),3))
  print("Recall:", round(recall_score(y_true,y_pred),3))
  print("Confusion Matrix:\n", confusion_matrix(y_true,y_pred))

show_results("Decision Tree", y_test,y_pred_tree)
show_results("Random Forest",y_test,y_pred_rf)


Decision Tree Results:
Accuracy: 0.783
Precision: 0.886
Recall: 0.491
Confusion Matrix:
 [[1193   48]
 [ 386  373]]

Random Forest Results:
Accuracy: 0.784
Precision: 0.888
Recall: 0.493
Confusion Matrix:
 [[1194   47]
 [ 385  374]]


* I then evaluated both models using accuracy, precision, recall, and confusion matrices. I think the random forest model performed better in regards to all the metris, because it showed stronger generalization and less missclassifications. As a result of this, I think the Random Forest model is the better one for predicting TSA eligibility.

In [78]:
!pip install streamlit
import streamlit as st
import pandas as pd
import plotly.express as px



In [79]:
st.title("FEMA Disaster Relief Dashboard")
df=pd.read_csv("/content/IndividualAssistanceHousingRegistrantsLargeDisasters.csv")
st.subheader("Data Preview")
st.write(df.head())

#histogram of repair amount
st.subheader("Histogram of Repair Amount")
fig_hist=px.histogram(df, x="repairAmount",nbins=30,title="Distribution of Repair Amounts")

st.plotly_chart(fig_hist)

#boxplot of repair amount by tsa eligibility
st.subheader("Boxplot: Repair Amount by TSA Eligibility")
fig_box=px.box(df, x="tsaEligible", y="repairAmount",title="Repair Amount by TSA Eligibilty",labels={"tsaEligible": "TSA Eligible (1=Yes,0=No)", "repairAmount":"Repair Amount"})
st.plotly_chart(fig_box)

st.markdown("*Insight:* Compare the central tendency and spread of repair amounts for TSA eligible vs. non-eligible households.*")



DeltaGenerator()