In [2]:
import pandas as pd
import tabula
import requests, zipfile, io
from sqlalchemy import create_engine

In [3]:
# Options
# Never truncate the column list when a wide DataFrame is rendered.
pd.options.display.max_columns = None

In [4]:
# Variables
# Both HCAD datasets are currently fetched from the same CAMA download root.
hcad_hearings_data_url = hcad_real_property_data_url = 'https://download.hcad.org/data/CAMA/'

# Local extraction folders share one root; each path ends with the
# "extracted_files_" prefix because the loader appends the tax year to it.
local_doc_root = "/Users/paz/Documents/github/home-property-tax/wickersham/tax_year_2023/doc"
arb_hearing_data_path = f"{local_doc_root}/arb_hearing_data/extracted_files_"
real_property_data_path = f"{local_doc_root}/real_acct_data/extracted_files_"

# Account id used below to filter the real_acct rows.
zwicker_account_id = '931920000058'

In [5]:
# database
# SQLite file that receives the loaded tables. The path is absolute, which is
# why the resulting URL has four slashes after "sqlite:".
sqlite_db_path = '/Users/paz/Documents/github/home-property-tax/wickersham/tax_year_2023/database/home_property_tax.db'
engine = create_engine(f'sqlite:///{sqlite_db_path}')

In [6]:
def download_zip_files(url, file_name, folder_name, start_year, end_year,
                       base_dir='/Users/paz/Documents/github/home-property-tax/wickersham/tax_year_2023/doc',
                       timeout=60):
    """Download and extract one zip archive per tax year.

    For every year in ``start_year``..``end_year`` (inclusive) the archive at
    ``{url}{tax_year}/{file_name}.zip`` is fetched into memory and unpacked
    into ``{base_dir}/{folder_name}/extracted_files_{tax_year}``.

    Parameters
    ----------
    url : str
        Base URL; the tax year is appended directly, so it should end in "/".
    file_name : str
        Archive name without the ``.zip`` extension.
    folder_name : str
        Sub-folder of ``base_dir`` that receives the per-year folders.
    start_year, end_year : int
        Inclusive range of tax years to download.
    base_dir : str, optional
        Root folder for extracted files. Defaults to the location this
        notebook has always used.
    timeout : float, optional
        Seconds ``requests`` waits for the server to connect or send data
        before giving up (this is not a cap on total download time).

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status for any year.
    zipfile.BadZipFile
        If a successfully downloaded payload is not a valid zip archive.
    """
    for tax_year in range(start_year, end_year + 1):
        tax_year_url = f"{url}{tax_year}/{file_name}.zip"
        print(tax_year_url)
        # The whole body is read below, so streaming would buy nothing.
        response = requests.get(tax_year_url, timeout=timeout)
        # Fail on HTTP errors here; otherwise an error page would be handed to
        # ZipFile and surface as a misleading BadZipFile.
        response.raise_for_status()
        target_dir = f'{base_dir}/{folder_name}/extracted_files_{tax_year}'
        with zipfile.ZipFile(io.BytesIO(response.content)) as archive:
            archive.extractall(target_dir)

def unify_data(path, file_name, start_year, end_year):
    """Read one tab-separated file per tax year and stack them into one frame.

    For every year in ``start_year``..``end_year`` (inclusive) the file
    ``{path}{tax_year}/{file_name}.txt`` is read as latin-1, tab-delimited text.

    Parameters
    ----------
    path : str
        Path prefix; the tax year is appended directly to it.
    file_name : str
        File name without the ``.txt`` extension.
    start_year, end_year : int
        Inclusive range of tax years to read.

    Returns
    -------
    pandas.DataFrame
        Rows of all years with a fresh 0..n-1 index, most recent year first.
        An empty frame when the year range is empty.
    """
    yearly_frames = []
    for tax_year in range(start_year, end_year + 1):
        file_path = f'{path}{tax_year}/{file_name}.txt'
        yearly_frames.append(
            pd.read_csv(file_path, sep="\t", encoding='latin1', low_memory=False)
        )
        print(f'finished {tax_year}.')

    # pd.concat rejects an empty list, so return an empty frame for an empty range.
    if not yearly_frames:
        return pd.DataFrame()

    # Concatenate once instead of inside the loop, which re-copied every
    # accumulated row on each iteration. Reversing keeps the newest year first.
    return pd.concat(yearly_frames[::-1], ignore_index=True)

In [7]:
# download_zip_files(hcad_hearings_data_url, 'Hearing_files', 'arb_hearing_data', 2005, 2023)

In [8]:
# download_zip_files(hcad_real_property_data_url, 'Real_acct_owner', 'real_acct_data', 2005, 2023)

In [9]:
# Stack the 2005-2023 ARB hearing extracts into a single frame.
arb_hearings_real_df = unify_data(
    path=arb_hearing_data_path,
    file_name='arb_hearings_real',
    start_year=2005,
    end_year=2023,
)

finished 2005.
finished 2006.
finished 2007.
finished 2008.
finished 2009.
finished 2010.
finished 2011.
finished 2012.
finished 2013.
finished 2014.
finished 2015.
finished 2016.
finished 2017.
finished 2018.
finished 2019.
finished 2020.
finished 2021.
finished 2022.
finished 2023.


In [10]:
# Identifier-like columns are stored as strings.
for id_column in ('acct', 'Tax_Year'):
    arb_hearings_real_df[id_column] = arb_hearings_real_df[id_column].astype(str)

# Date columns become datetimes; values that cannot be parsed turn into NaT
# (errors='coerce') instead of raising.
for date_column in ('Scheduled_for_Date', 'Actual_Hearing_Date', 'Release_Date'):
    arb_hearings_real_df[date_column] = pd.to_datetime(arb_hearings_real_df[date_column], errors='coerce')

arb_hearings_real_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6239680 entries, 0 to 6239679
Data columns (total 15 columns):
 #   Column                   Dtype         
---  ------                   -----         
 0   acct                     object        
 1   Tax_Year                 object        
 2   Real_Personal_Property   object        
 3   Hearing_Type             object        
 4   State_Class_Code         object        
 5   Owner_Name               object        
 6   Scheduled_for_Date       datetime64[ns]
 7   Actual_Hearing_Date      datetime64[ns]
 8   Release_Date             datetime64[ns]
 9   Letter_Type              object        
 10  Agent_Code               object        
 11  Initial_Appraised_Value  float64       
 12  Initial_Market_Value     float64       
 13  Final_Appraised_Value    float64       
 14  Final_Market_Value       float64       
dtypes: datetime64[ns](3), float64(4), object(8)
memory usage: 714.1+ MB


In [18]:
# Each iteration reads a single tax year of real_acct, normalizes its column
# types and appends it to the `real_acct` table.
# NOTE(review): if_exists='append' means re-running this cell inserts the same
# rows again.
real_acct_date_columns = ('certified_date', 'notice_dt', 'rev_dt', 'new_own_dt', 'splt_dt')
for year in range(2023, 2024):
    real_acct_df = unify_data(real_property_data_path, 'real_acct', year, year)
    for id_column in ('acct', 'yr'):
        real_acct_df[id_column] = real_acct_df[id_column].astype(str)
    # Unparseable dates become NaT rather than raising.
    for date_column in real_acct_date_columns:
        real_acct_df[date_column] = pd.to_datetime(real_acct_df[date_column], errors='coerce')
    real_acct_df.to_sql('real_acct', engine, if_exists='append', index=False)
    print(f'loaded data to sqlite for {year}')

finished 2023.
loaded data to sqlite for 2023


In [13]:
# Show the first rows whose account id matches the account of interest.
is_target_account = real_acct_df['acct'].eq(zwicker_account_id)
real_acct_df.loc[is_target_account].head()

In [15]:
# arb_hearings_real_df.to_sql('arb_hearings_real', engine, if_exists='replace', index=False)

6239680