# IMPORTS

In [67]:
import requests
import pandas as pd
import numpy as np

In [68]:
# citation: https://www.geeksforgeeks.org/using-apply-in-pandas-lambda-functions-with-multiple-if-statements/
def hud_email_condition_func(x):
    if pd.isnull(x):
        return np.NaN
    elif str(x)[::-1][3] != '.':
        return np.NaN
    else:
        return x

# GET HUD DATA

In [69]:
#sample_url = 'http://data.hud.gov/Housing_Counselor/search?AgencyName=&City=&State='

In [70]:
url = f'http://data.hud.gov/Housing_Counselor/search?AgencyName=&City=&State='

In [71]:
url

'http://data.hud.gov/Housing_Counselor/search?AgencyName=&City=&State='

In [72]:
requests.get(url=url).status_code

200

In [73]:
requests.get(url=url).headers

{'Date': 'Tue, 12 Jul 2022 07:40:36 GMT', 'Server': 'Apache', 'Strict-Transport-Security': 'max-age=63072000; includeSubDomains; preload', 'X-Frame-Options': 'DENY', 'X-Content-Type-Options': 'nosniff', 'X-ORACLE-DMS-ECID': '70e35437-0e8f-4457-90c6-cf8aa0fc673d-00146214', 'X-ORACLE-DMS-RID': '0', 'Keep-Alive': 'timeout=5, max=100', 'Connection': 'Keep-Alive', 'Transfer-Encoding': 'chunked', 'Content-Type': 'application/json; charset=iso-8859-1'}

In [74]:
res = requests.get(url=url)
res_json = res.json()

In [75]:
df = pd.DataFrame(res_json)

In [76]:
list(df.columns)

['services',
 'languages',
 'agcid',
 'adr1',
 'adr2',
 'city',
 'email',
 'fax',
 'nme',
 'phone1',
 'statecd',
 'weburl',
 'zipcd',
 'agc_ADDR_LATITUDE',
 'agc_ADDR_LONGITUDE',
 'parentid',
 'county_NME',
 'phone2',
 'mailingadr1',
 'mailingadr2',
 'mailingcity',
 'mailingzipcd',
 'mailingstatecd',
 'state_NME',
 'state_FIPS_CODE',
 'faithbased',
 'colonias_IND',
 'migrantwkrs_IND',
 'agc_STATUS',
 'agc_SRC_CD',
 'counslg_METHOD']

In [77]:
df.shape

(1511, 31)

In [78]:
keep_features = ['services',
                 'agcid',
                 'adr1',
                 'adr2',
                 'city',
                 'email',
                 'fax',
                 'nme',
                 'phone1',
                 'statecd',
                 'weburl',
                 'zipcd',
                 'agc_ADDR_LATITUDE',
                 'agc_ADDR_LONGITUDE',
                 'languages',
                 'faithbased',
                 'counslg_METHOD']

In [79]:
df = df[keep_features]

In [80]:
print(f"{'Feature':<20} NaNs")
for f in df.columns:
    print(f"{f:<20} {df[f].str.strip().isnull().sum()}")

Feature              NaNs
services             21
agcid                0
adr1                 0
adr2                 988
city                 0
email                21
fax                  565
nme                  0
phone1               1
statecd              0
weburl               13
zipcd                0
agc_ADDR_LATITUDE    40
agc_ADDR_LONGITUDE   40
languages            0
faithbased           0
counslg_METHOD       92


In [81]:
df['adr2'] = df['adr2'].replace('N/A', np.NaN)
df['adr2'] = df['adr2'].apply(lambda x: np.NaN if str(x).isspace() else x)

In [82]:
df['email'] = df['email'].apply(lambda x: np.NaN if '@' not in str(x) else x)
df['email'] = df['email'].apply(lambda x: np.NaN if str(x).isspace() else x)
df['email'] = df['email'].apply(lambda x: np.NaN if pd.isnull(x) else ''.join(str(x).split()))
df['email'] = df['email'].apply(lambda x: hud_email_condition_func(x))

In [83]:
df['fax'] = df['fax'].replace('000-000-0000', np.NaN)
df['fax'] = df['fax'].replace('111-111-1111', np.NaN)
df['fax'] = df['fax'].replace('111-222-3333', np.NaN)
df['fax'] = df['fax'].apply(lambda x: np.NaN if str(x).isspace() else x)

In [84]:
df['phone1'] = df['phone1'].replace('000-000-0000', np.NaN)
df['phone1'] = df['phone1'].apply(lambda x: np.NaN if str(x).isspace() else x)

In [85]:
df['weburl'] = df['weburl'].replace('N/A', np.NaN)
df['weburl'] = df['weburl'].apply(lambda x: np.NaN if '@' in str(x) else x)
df['weburl'] = df['weburl'].apply(lambda x: np.NaN if str(x).isspace() else x)
df['weburl'] = df['weburl'].apply(lambda x: np.NaN if 'n/a' in str(x) else x)
df['weburl'] = df['weburl'].apply(lambda x: x if ''.join(str(x).split()) == str(x) else np.NaN)
df['weburl'] = df['weburl'].apply(lambda x: np.NaN if '.www' in str(x) else x)
df['weburl'] = df['weburl'].apply(lambda x: str(x).replace(',','.'))

In [86]:
df['zipcd'] = df['zipcd'].apply(lambda x: x.split('-')[0])

In [87]:
df['agc_ADDR_LATITUDE'] = df['agc_ADDR_LATITUDE'].replace('0', np.NaN)

In [88]:
df['agc_ADDR_LONGITUDE'] = df['agc_ADDR_LONGITUDE'].replace('0', np.NaN)
df['agc_ADDR_LONGITUDE'] = df['agc_ADDR_LONGITUDE'].apply(lambda x: x if '-' in str(x) or pd.isnull(x) else '-' + x)

In [89]:
df['faithbased'] = df['faithbased'].map({'N':'No',
                      'Y':'Yes'})

In [90]:
print(f"{'Feature':<20} NaNs")
for f in df.columns:
    print(f"{f:<20} {df[f].str.strip().isnull().sum()}")

Feature              NaNs
services             21
agcid                0
adr1                 0
adr2                 1287
city                 0
email                433
fax                  682
nme                  0
phone1               17
statecd              0
weburl               0
zipcd                0
agc_ADDR_LATITUDE    43
agc_ADDR_LONGITUDE   43
languages            0
faithbased           0
counslg_METHOD       92


In [91]:
# lang_dict = dict()

# lang_dict = {'ARA':'Arabic',
#              'ASL':'American Sign Language',
#              'CAM':'Cambodian',
#              'CAN':'Cantonese',
#              'CHI':'Chinese Mandarin',
#              'CRE':'Creole',
#              'CZE':'Czech',
#              'ENG':'English',
#              'FAR':'Farsi',
#              'FRE':'French',
#              'GER':'German',
#              'HIN':'Hindi',
#              'HMO':'Hmong',
#              'IND':'Indonesian',
#              'ITA':'Italian',
#              'KOR':'Korean',
#              'OTH':'Other',
#              'POL':'Polish',
#              'POR':'Portuguese',
#              'RUS':'Russian',
#              'SPA':'Spanish',
#              'SWA':'Swahili',
#              'TUR':'Turkish',
#              'UKR':'Ukranian',
#              'VIE':'Vietnamese'}

s = df['languages'].apply(lambda x: np.NaN if pd.isnull(x)
                               else ['language_' + m for m in x.split(',')]).explode()

df = df.join(pd.crosstab(s.index, s))

In [92]:
# srvc_dict = dict()

# srvc_dict = {'DFC':'Mortgage Delinquency and Default Resolution Counseling',
#              'DFW':'Resolving Preventing Mortgage Delinquency Workshops',
#              'FBC':'Financial Management Budget Counseling',
#              'FBW':'Financial, Budgeting, and Credit Workshops',
#              'FHW':'Fair Housing PrePurchase Education Workshops',
#              'HIC':'Home Improvement and Rehabilitation Counseling',
#              'HMC':'Services for Homeless Counseling',
#              'NDW':'Non-Delinquency Post Purchase Workshops',
#              'PLW':'Predatory Lending Education Workshops',
#              'PPC':'PrePurchase Counseling',
#              'PPW':'PrePurchase Homebuyer Education Workshops',
#              'RHC':'Rental Housing Counseling',
#              'RHW':'Rental Housing Workshops',
#              'RMC':'Reverse Mortgage Counseling'}

s = df['services'].apply(lambda x: np.NaN if pd.isnull(x)
                               else ['service_' + m for m in x.split(',')]).explode()

df = df.join(pd.crosstab(s.index, s))

In [93]:
# citation: https://stackoverflow.com/questions/45312377/how-to-one-hot-encode-from-a-pandas-column-containing-a-list
cons_dict = dict()

cons_dict = {'Face to Face Counseling':'FTF',
             'Group Counseling':'GRP',
             'Internet Counseling':'WEB',
             'Other Counseling':'OTH',
             'Phone Counseling':'TEL',
             'Phone Counseling Only':'TEL',
             'Video Conference':'VID'}

s = df['counslg_METHOD'].apply(lambda x: np.NaN if pd.isnull(x)
                               else ['counseling_' + cons_dict[m] for m in x.split(',')]).explode()

df = df.join(pd.crosstab(s.index, s))

In [94]:
df['counslg_METHOD'] = df['counslg_METHOD'].apply(lambda x: np.NaN if pd.isnull(x) else ','.join([cons_dict[s] for s in set(x.split(','))]))

In [95]:
df.to_csv('./data/hud_data.csv', index=False)

# ADD TO HUD DATA AFTER MODELING

In [96]:
hud_df = pd.read_csv('./data/hud_data.csv')
state_df = pd.read_csv('./data/data_raw/state_codes.csv')
dqf_df = pd.read_csv('./data/delinquency_join_to_hud_data.csv')
pp_df = pd.read_csv('./data/prepurchase_join_to_hud_data.csv')

In [97]:
dqf_df = dqf_df.drop(columns=['Unnamed: 0','rmse_baseline','mae_baseline','mape_baseline'])
pp_df = pp_df.drop(columns=['Unnamed: 0','rmse_baseline','mae_baseline','mape_baseline'])

In [98]:
pp_df['state'] = pp_df['state'].apply(lambda x: x.replace('_',' ').title())
dqf_df['state'] = dqf_df['state'].apply(lambda x: x.replace('_',' ').title())

In [99]:
state_name_dict = dict()

for c in state_df['name']:

    state_name_dict[c] = state_df[state_df['name'] == c]['postal_code'].values[0]

state_name_dict['District Of Columbia'] = 'DC'

In [100]:
pp_df['state'] = pp_df['state'].map(state_name_dict)
dqf_df['state'] = dqf_df['state'].map(state_name_dict)

In [101]:
dqf_names_dict = dict()

for k, v in zip(dqf_df.columns, ['dqf_' + c  if c != 'state' else c for c in dqf_df.columns]):
    
    dqf_names_dict[k] = v

In [102]:
pp_names_dict = dict()

for k, v in zip(pp_df.columns, ['pp_' + c  if c != 'state' else c for c in pp_df.columns]):
    
    pp_names_dict[k] = v

In [103]:
hud_df = pd.merge(hud_df, dqf_df.rename(columns=dqf_names_dict), how='left', left_on='statecd', right_on='state').drop(columns=['state'])

In [104]:
hud_df = pd.merge(hud_df, pp_df.rename(columns=pp_names_dict), how='left', left_on='statecd', right_on='state').drop(columns=['state'])

In [105]:
hud_df.to_csv('./data/hud_data.csv', index=False)