In [1]:
import pandas as pd
import os
from pathlib import Path, PureWindowsPath
import numpy as np

# Load the source file

In [2]:
# Input CSV file name.
filename = 'County_zhvi_uc_sfrcondo_tier_0.33_0.67_sm_sa_month.csv'
# Directory containing the input file. Despite its name, `url` is a local
# filesystem path: the Windows-style string is parsed first and then wrapped
# in a concrete Path so it can be joined with other path components.
windows_dir = PureWindowsPath('C:\\Users\\woodn\\github\\UCSD_MDS\\DSC267R')
url = Path(windows_dir)

In [3]:
# Full path of the input CSV: the data directory joined with the file name.
filepath = url.joinpath(filename)

In [4]:
# Parse the CSV into a DataFrame. With on_bad_lines='warn', malformed rows
# produce a warning instead of aborting the load.
read_options = {'on_bad_lines': 'warn', 'low_memory': True}
df = pd.read_csv(filepath, **read_options)

In [5]:
# Print the full per-column summary (every column listed, with non-null counts).
df.info(verbose = True, show_counts = True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3078 entries, 0 to 3077
Data columns (total 288 columns):
 #    Column             Non-Null Count  Dtype  
---   ------             --------------  -----  
 0    RegionID           3078 non-null   int64  
 1    SizeRank           3078 non-null   int64  
 2    RegionName         3078 non-null   object 
 3    RegionType         3078 non-null   object 
 4    StateName          3078 non-null   object 
 5    State              3078 non-null   object 
 6    Metro              1832 non-null   object 
 7    StateCodeFIPS      3078 non-null   int64  
 8    MunicipalCodeFIPS  3078 non-null   int64  
 9    2000-01-31         1036 non-null   float64
 10   2000-02-29         1038 non-null   float64
 11   2000-03-31         1040 non-null   float64
 12   2000-04-30         1042 non-null   float64
 13   2000-05-31         1045 non-null   float64
 14   2000-06-30         1047 non-null   float64
 15   2000-07-31         1048 non-null   float64
 16   2000

# Convert to long form and clean columns up

In [6]:
# Columns that are not carried into the long-form table.
drop_list = [
    'SizeRank',
    'RegionID',
    'RegionType',
    'StateName',
    'Metro',
]
# Remove them, keeping the remaining columns in their original order.
df = df.drop(drop_list, axis='columns')

In [7]:
# The first four column labels, as a plain list; shown as the cell output.
cols = df.columns[:4].to_list()
cols

['RegionName', 'State', 'StateCodeFIPS', 'MunicipalCodeFIPS']

In [8]:
# Every column label from the fifth onward, as a plain list; these become
# the value columns when the frame is melted.
vals = df.columns[4:].to_list()

In [9]:
# Show the first two rows of the wide table.
df.head(2)

Unnamed: 0,RegionName,State,StateCodeFIPS,MunicipalCodeFIPS,2000-01-31,2000-02-29,2000-03-31,2000-04-30,2000-05-31,2000-06-30,...,2022-06-30,2022-07-31,2022-08-31,2022-09-30,2022-10-31,2022-11-30,2022-12-31,2023-01-31,2023-02-28,2023-03-31
0,Los Angeles County,CA,6,37,208179.556892,208402.487335,209259.859164,210952.378857,213117.713834,215196.34901,...,856459.777569,858997.565355,857505.213577,853918.663929,850108.731052,847577.629371,844277.565852,836894.142476,826900.301415,818632.879337
1,Cook County,IL,17,31,131446.801823,131421.655494,131639.47353,132265.480295,133073.117848,133918.529103,...,280457.262562,279823.129207,278506.365622,276435.749606,274694.011148,273915.877877,273175.143042,273785.95142,274519.513193,275811.19923


In [10]:
# Trial run of the wide-to-long reshape on just the first two rows.
df.head(2).melt(id_vars=['RegionName', 'State'], value_vars=vals)

Unnamed: 0,RegionName,State,variable,value
0,Los Angeles County,CA,2000-01-31,208179.556892
1,Cook County,IL,2000-01-31,131446.801823
2,Los Angeles County,CA,2000-02-29,208402.487335
3,Cook County,IL,2000-02-29,131421.655494
4,Los Angeles County,CA,2000-03-31,209259.859164
...,...,...,...,...
553,Cook County,IL,2023-01-31,273785.951420
554,Los Angeles County,CA,2023-02-28,826900.301415
555,Cook County,IL,2023-02-28,274519.513193
556,Los Angeles County,CA,2023-03-31,818632.879337


In [11]:
# Reshape the whole table from wide to long: one row per
# (RegionName, State, column in `vals`). Columns not listed in id_vars or
# value_vars are not carried over.
df = pd.melt(
    df,
    id_vars=['RegionName', 'State'],
    value_vars=vals,
)

In [12]:
# Old column label -> new column label, applied after the melt.
mapper = dict(
    RegionName='county',
    State='state',
    variable='date',
    value='ZHVI',
)

In [13]:
# Apply the label mapping to the column axis.
df = df.rename(mapper, axis='columns')

In [14]:
# Strip a single trailing administrative suffix (" County", " Parish",
# " Municipio", " city") from each county name.
# The pattern is anchored with `$` so only a suffix at the very end of the
# name is removed. Unanchored removal also deletes these words from the
# middle of a name (a name such as "James City County" would collapse to
# "James" instead of "James City").
# Matching stays case-insensitive, as before, so a trailing " City" is
# removed as well.
# NOTE(review): after stripping, regions in the same state that differ only
# by suffix end up with the same (county, state) pair -- confirm downstream
# consumers do not need them kept apart.
suffix_pattern = r' (?:County|Parish|Municipio|city)$'
df.loc[:, 'county'] = df.loc[:, 'county'].str.replace(
    suffix_pattern, '', case=False, regex=True
)

In [15]:
# Show the column labels of the DataFrame as the cell output.
df.columns

Index(['county', 'state', 'date', 'ZHVI'], dtype='object')

In [16]:
# Print the full per-column summary (every column listed, with non-null counts).
df.info(verbose = True, show_counts = True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 858762 entries, 0 to 858761
Data columns (total 4 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   county  858762 non-null  object 
 1   state   858762 non-null  object 
 2   date    858762 non-null  object 
 3   ZHVI    583927 non-null  float64
dtypes: float64(1), object(3)
memory usage: 26.2+ MB


In [17]:
# Preview the first rows of the DataFrame.
df.head()

Unnamed: 0,county,state,date,ZHVI
0,Los Angeles,CA,2000-01-31,208179.556892
1,Cook,IL,2000-01-31,131446.801823
2,Harris,TX,2000-01-31,107566.187362
3,Maricopa,AZ,2000-01-31,143016.539578
4,San Diego,CA,2000-01-31,214460.263299


# Write to dataset

In [18]:
# Output file name; the .gz extension matches the gzip compression used on write.
filename2 = 'ZHVI.csv.gz'
# Output directory. Despite its name, `url2` is a local filesystem path
# parsed from a Windows-style string.
windows_out_dir = PureWindowsPath('C:\\Users\\woodn\\github\\datasets')
url2 = Path(windows_out_dir)
# Full output path: directory joined with the file name.
filepath2 = url2.joinpath(filename2)

In [19]:
# Write the DataFrame as a gzip-compressed CSV, leaving out the row index.
write_options = {'index': False, 'compression': 'gzip'}
df.to_csv(filepath2, **write_options)