# Goal
This script transforms the raw target data (i.e., monthly housing inventory by state) and its associated features into a single DataFrame to be used for machine learning.

## 0) Setup

In [1]:
# Import dependencies
import pandas as pd
import numpy as np
import sqlalchemy as sa
import psycopg2

## 1) Pull Target CSV (Inventory Data)

In [2]:
# Read the raw inventory table from disk; the bare `df` on the last line
# makes the notebook render a preview of the loaded frame.
path = 'data/Target.csv'
df = pd.read_csv(filepath_or_buffer=path)
df

Unnamed: 0,month_date_yyyymm,state,state_id,median_listing_price,median_listing_price_mm,median_listing_price_yy,active_listing_count,active_listing_count_mm,active_listing_count_yy,median_days_on_market,...,average_listing_price,average_listing_price_mm,average_listing_price_yy,total_listing_count,total_listing_count_mm,total_listing_count_yy,pending_ratio,pending_ratio_mm,pending_ratio_yy,quality_flag
0,202301,Alabama,AL,313495,0.0116,0.1654,11755,-0.0611,0.6740,78,...,410280,0.0131,0.0858,15873,-0.041196013,0.2687,0.3513,0.0260,-0.4278,0.0
1,202212,Alabama,AL,309900,-0.0038,0.1442,12519,-0.0298,0.5652,69,...,404967,-0.0070,0.0719,16555,-0.059214639,0.1882,0.3253,-0.0402,-0.4214,0.0
2,202211,Alabama,AL,311083,-0.0124,0.1312,12904,0.0314,0.5070,59,...,407813,-0.0184,0.0671,17597,-0.010737576,0.1501,0.3656,-0.0671,-0.4229,0.0
3,202210,Alabama,AL,315000,-0.0139,0.1455,12511,0.0476,0.3801,54,...,415469,-0.0047,0.0938,17788,-0.006534488,0.1185,0.4327,-0.0682,-0.3233,0.0
4,202209,Alabama,AL,319450,-0.0061,0.1555,11943,0.0111,0.2965,52,...,417431,-0.0082,0.0982,17905,0.018428986,0.0892,0.5009,0.0096,-0.2863,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4024,201611,Wyoming,WY,239000,,,4081,,,112,...,451977,,,4510,-0.072207365,,0.1038,,,
4025,201610,Wyoming,WY,244000,,,4398,,,104,...,459457,,,4861,-0.042921835,,0.1032,,,
4026,201609,Wyoming,WY,245975,,,4578,,,96,...,469306,,,5079,-0.034043362,,0.1104,,,
4027,201608,Wyoming,WY,245738,,,4716,,,87,...,459690,,,5258,0.01251685,,0.1153,,,


In [3]:
# Discard the columns that are not needed downstream: the full state name,
# every year-over-year (*_yy) metric, and the quality flag.
# Then remove the row labelled 4028 (the last row of the raw file), which is
# dropped to avoid a divide-by-zero problem.
df = df.drop(
    columns=[
        'state',
        'median_listing_price_yy',
        'active_listing_count_yy',
        'median_days_on_market_yy',
        'new_listing_count_yy',
        'price_increased_count_yy',
        'price_reduced_count_yy',
        'pending_listing_count_yy',
        'median_listing_price_per_square_foot_yy',
        'median_square_feet_yy',
        'average_listing_price_yy',
        'total_listing_count_yy',
        'pending_ratio_yy',
        'quality_flag',
    ]
).drop(index=4028)

In [4]:
# Make sure the month-over-month change in total listings is numeric
# before it is compared against zero.
df['total_listing_count_mm'] = df['total_listing_count_mm'].astype(float)

# Categorical target: 'Hot' when the month-over-month change is strictly
# positive, 'Not hot' otherwise (missing values also fall into 'Not hot'
# because a NaN comparison evaluates to False).
df['Target'] = np.where(df['total_listing_count_mm'].gt(0), 'Hot', 'Not hot')

## 2) Pull Features from CSVs
Perform the following for each CSV:
- Load in raw CSV.
- Unpivot to match the DataFrame made from Section 1.
- Clean up data.
- Aggregate data by month and state, if necessary.
- Join to DataFrame by month and state.

In [5]:
# Read the three raw feature tables (hotness, unemployment, temperature),
# in that order, and bind each to its own DataFrame.
hot_df, unemp_df, temp_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/Hotness.csv',
        'data/US_States_Unemployment_Rate_by_Month.csv',
        'data/US_Temp_AVG_Per_Month.csv',
    )
)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [7]:
# The row labelled 106865 (the last one) is not a data record; remove it.
hot_df.drop(index=106865, inplace=True)

# Cast the month key to int so it can be used as a join key.
hot_df['month_date_yyyymm'] = hot_df['month_date_yyyymm'].astype('int')

# county_name is split on ', ' into a county part and a state part; the
# state part is upper-cased to serve as the state_id join key.
county_and_state = hot_df['county_name'].str.split(', ', expand=True)
hot_df[['county', 'state_id']] = county_and_state
hot_df['state_id'] = hot_df['state_id'].str.upper()

# Keep only the join keys and the two hotness measures.
hotter_df = hot_df[
    ['month_date_yyyymm', 'hotness_rank_mm', 'hotness_score', 'state_id']
]

In [8]:
# Reshape the temperature table from wide (one column per month) to long
# (one row per state and month) so it lines up with the target DataFrame.

# Drop unneeded State column (the abbreviation is used as the key instead)
temp_df = temp_df.drop(['State'], axis=1)

# Melt date columns to match format: every remaining column header becomes a
# value in 'month_date_yyyymm' and its cell becomes 'Temperature (F)'.
temp_df = temp_df.melt(
    id_vars=['Abbreviation'], var_name='month_date_yyyymm', value_name='Temperature (F)')

# Change state column to match other DF.
# DataFrame.rename returns a new frame, so the result must be assigned back;
# otherwise the column silently keeps the name 'Abbreviation'.
temp_df = temp_df.rename(columns={'Abbreviation': 'state_id'})

# Change date type to int so the key has the same type as in the other frames
temp_df['month_date_yyyymm'] = temp_df['month_date_yyyymm'].astype('int')

In [9]:
# Reshape the unemployment table from wide (one column per month) to long
# (one row per state and month) so it lines up with the target DataFrame.

# Drop unneeded State column (the abbreviation is used as the key instead)
unemp_df = unemp_df.drop(['State'], axis=1)

# Melt date columns to match format: every remaining column header becomes a
# value in 'month_date_yyyymm' and its cell becomes 'Unemployment Rate'.
unemp_df = unemp_df.melt(
    id_vars=['Abbreviation'], var_name='month_date_yyyymm', value_name='Unemployment Rate')

# Change state column to match other DF.
# DataFrame.rename returns a new frame, so the result must be assigned back;
# otherwise the column silently keeps the name 'Abbreviation'.
unemp_df = unemp_df.rename(columns={'Abbreviation': 'state_id'})

# Change date type to int so the key has the same type as in the other frames
unemp_df['month_date_yyyymm'] = unemp_df['month_date_yyyymm'].astype('int')

## 3) Clean Up Final DataFrame
- Remove rows where there are any empty features (?).
- Check datatypes and cast to proper datatype (i.e., one that makes sense and one that matches the SQL table).

In [None]:
# df.merge(hotter_df, on=['month_date_yyyymm', 'state_id'], how='left', validate='1:1')

## 4) Load to SQL Database
Before this, set up new database & table in Postgres.

In [None]:
# final_df.to_sql(name='hotness', con=conn, if_exists='append')