In [1]:
import pandas as pd
import os
from pathlib import Path, PureWindowsPath
import numpy as np

# Load file in

In [2]:
filename = 'IndividualsAndHouseholdsProgramValidRegistrations.csv'
url = Path(PureWindowsPath('C:\\Users\\woodn\\github\\UCSD_MDS\\DSC267R'))

In [3]:
filepath = url / filename

In [4]:
df = pd.read_csv(filepath,
                 on_bad_lines = 'warn',
                 low_memory = False
                )

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20644428 entries, 0 to 20644427
Data columns (total 71 columns):
 #   Column                         Dtype  
---  ------                         -----  
 0   incidentType                   object 
 1   declarationDate                object 
 2   disasterNumber                 int64  
 3   county                         object 
 4   damagedStateAbbreviation       object 
 5   damagedCity                    object 
 6   damagedZipCode                 object 
 7   applicantAge                   object 
 8   householdComposition           object 
 9   occupantsUnderTwo              object 
 10  occupants2to5                  object 
 11  occupants6to18                 object 
 12  occupants19to64                object 
 13  occupants65andOver             object 
 14  grossIncome                    object 
 15  ownRent                        object 
 16  primaryResidence               float64
 17  residenceType                  object 
 18  

In [6]:
df.head()

Unnamed: 0,incidentType,declarationDate,disasterNumber,county,damagedStateAbbreviation,damagedCity,damagedZipCode,applicantAge,householdComposition,occupantsUnderTwo,...,repairAmount,replacementAssistanceEligible,replacementAmount,personalPropertyEligible,personalPropertyAmount,ihpMax,haMax,onaMax,lastRefresh,id
0,Severe Storm,2003-04-24T00:00:00.000Z,1459,Lawrence (County),MS,MONTICELLO,39654,35-49,1,0,...,345.6,0,0.0,0,0.0,0,0,0,2023-03-18T05:26:48.000Z,20f5e26f-6bc7-4ca7-b68c-cebee055bc7e
1,Severe Storm,2003-04-24T00:00:00.000Z,1459,Pike (County),MS,MCCOMB,39648,35-49,3,0,...,0.0,0,0.0,1,671.54,0,0,0,2023-03-18T05:26:48.000Z,6275ebf0-0d66-451f-ac09-4b974c813acc
2,Severe Storm,2003-04-24T00:00:00.000Z,1459,Lauderdale (County),MS,MERIDIAN,39305,35-49,3,0,...,0.0,0,0.0,0,0.0,0,0,0,2023-03-18T05:26:48.000Z,ae868ee2-3e48-4c8a-91d3-3ef69ce10730
3,Severe Storm,2003-04-24T00:00:00.000Z,1459,Lincoln (County),MS,BROOKHAVEN,39601,35-49,>5,0,...,365.59,0,0.0,1,488.27,0,0,0,2023-03-18T05:26:48.000Z,e1a91d20-2e65-48f1-a189-303043529446
4,Severe Storm,2003-04-24T00:00:00.000Z,1459,Marion (County),MS,SANDY HOOK,39478,50-64,1,0,...,1290.98,0,0.0,1,305.92,0,0,0,2023-03-18T05:26:48.000Z,2879fea6-8b7f-4515-a928-6ac6523d33c8


# Clean to just needed columns

In [7]:
rel_list = ['declarationDate', 'county', 'damagedStateAbbreviation'
            , 'damagedCity', 'damagedZipCode', 'grossIncome'
            , 'floodDamageAmount', 'foundationDamageAmount', 'roofDamageAmount'
            , 'rentalAssistanceAmount', 'repairAmount', 'replacementAmount'
            , 'personalPropertyAmount'
           ]

In [8]:
df = df.loc[:,rel_list]

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20644428 entries, 0 to 20644427
Data columns (total 13 columns):
 #   Column                    Dtype  
---  ------                    -----  
 0   declarationDate           object 
 1   county                    object 
 2   damagedStateAbbreviation  object 
 3   damagedCity               object 
 4   damagedZipCode            object 
 5   grossIncome               object 
 6   floodDamageAmount         float64
 7   foundationDamageAmount    float64
 8   roofDamageAmount          float64
 9   rentalAssistanceAmount    float64
 10  repairAmount              float64
 11  replacementAmount         float64
 12  personalPropertyAmount    float64
dtypes: float64(7), object(6)
memory usage: 2.0+ GB


# Clean date column

In [10]:
df.loc[:,'declarationDate'].head()

0    2003-04-24T00:00:00.000Z
1    2003-04-24T00:00:00.000Z
2    2003-04-24T00:00:00.000Z
3    2003-04-24T00:00:00.000Z
4    2003-04-24T00:00:00.000Z
Name: declarationDate, dtype: object

In [11]:
try:
    df['declarationDate'] = pd.to_datetime(arg=df.loc[:,'declarationDate'],
                                           errors='raise',
                                           format="%Y-%m-%dT%H:%M:%S"
                                          )
except:
    print('that ain\'t it man')

In [12]:
df.loc[:,'declarationDate'].agg(['min','max'])

min   2002-11-05 00:00:00+00:00
max   2023-04-27 00:00:00+00:00
Name: declarationDate, dtype: datetime64[ns, UTC]

# Change columns to something nicer

In [13]:
mapper = {'damagedStateAbbreviation':'state'
          , 'damagedCity':'city'
          , 'damagedZipCode':'zip'
          , 'declarationDate':'date'
         }

In [14]:
df = df.rename(columns = mapper)

In [15]:
df.columns

Index(['date', 'county', 'state', 'city', 'zip', 'grossIncome',
       'floodDamageAmount', 'foundationDamageAmount', 'roofDamageAmount',
       'rentalAssistanceAmount', 'repairAmount', 'replacementAmount',
       'personalPropertyAmount'],
      dtype='object')

# Clean region columns information

In [16]:
df.loc[:,'county'] = df.loc[:,'county'].str.replace(' \(.*', '', case=False, regex=True)

In [22]:
pd.Series(df.loc[:,'county'].unique()).to_csv('counties.csv')

# Explore State and zip code for anything unusual

# Combine damages and reimbursements

In [23]:
df.loc[:,'damage'] = df.loc[:,'floodDamageAmount']\
                    + df.loc[:,'foundationDamageAmount']\
                    + df.loc[:,'roofDamageAmount']

In [24]:
df.loc[:,'reimbursements'] = df.loc[:,'rentalAssistanceAmount']\
                                + df.loc[:,'repairAmount']\
                                + df.loc[:,'replacementAmount']\
                                + df.loc[:,'personalPropertyAmount']

# Final dataset conditioning and write

In [25]:
# add state and county back in if needed later
drop_list = ['floodDamageAmount', 'foundationDamageAmount', 'roofDamageAmount'
            , 'rentalAssistanceAmount', 'repairAmount', 'replacementAmount'
            , 'personalPropertyAmount', 'grossIncome'
           ]
df = df.drop(columns = drop_list)

In [26]:
df.info(verbose = True, show_counts = True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20644428 entries, 0 to 20644427
Data columns (total 7 columns):
 #   Column          Non-Null Count     Dtype              
---  ------          --------------     -----              
 0   date            20644428 non-null  datetime64[ns, UTC]
 1   county          20644428 non-null  object             
 2   state           20644428 non-null  object             
 3   city            20644414 non-null  object             
 4   zip             20644392 non-null  object             
 5   damage          20644428 non-null  float64            
 6   reimbursements  20644428 non-null  float64            
dtypes: datetime64[ns, UTC](1), float64(2), object(4)
memory usage: 1.1+ GB


In [27]:
df = df.dropna()

In [28]:
df = df.astype({'damage':'float16','reimbursements':'float16'})

In [29]:
df.loc[:,'date'] = df.loc[:,'date'].dt.date

In [30]:
df.info(verbose = True, show_counts = True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20644383 entries, 0 to 20644427
Data columns (total 7 columns):
 #   Column          Non-Null Count     Dtype  
---  ------          --------------     -----  
 0   date            20644383 non-null  object 
 1   county          20644383 non-null  object 
 2   state           20644383 non-null  object 
 3   city            20644383 non-null  object 
 4   zip             20644383 non-null  object 
 5   damage          20644383 non-null  float16
 6   reimbursements  20644383 non-null  float16
dtypes: float16(2), object(5)
memory usage: 1023.8+ MB


In [31]:
df.head()

Unnamed: 0,date,county,state,city,zip,damage,reimbursements
0,2003-04-24,Lawrence,MS,MONTICELLO,39654,345.5,345.5
1,2003-04-24,Pike,MS,MCCOMB,39648,0.0,671.5
2,2003-04-24,Lauderdale,MS,MERIDIAN,39305,0.0,0.0
3,2003-04-24,Lincoln,MS,BROOKHAVEN,39601,0.0,854.0
4,2003-04-24,Marion,MS,SANDY HOOK,39478,0.0,1597.0


# Write to dataset

In [32]:
filename2 = 'FEMA_claims.csv.gz'
url2 = Path(PureWindowsPath('C:\\Users\\woodn\\github\\datasets'))
filepath2 = url2 / filename2

In [33]:
df.to_csv(filepath2,
          index = False,
          compression = 'gzip'
         )