<a href="https://colab.research.google.com/github/mattwantshouses/name_parsing/blob/main/Prod_Property_Address_parsing_2_Columns.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Parse Property Addresses from Two Columns

In [1]:
import pandas as pd
import re
import os
from google.colab import files
import logging
# Ensure openpyxl is installed
!pip install openpyxl

In [4]:
# Version 2 updated 07/08/24
# Logging: create this notebook's module-level logger, then ask the root
# logger for INFO-level output. Note that logging.basicConfig() is a no-op
# when the root logger already has handlers attached.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# Input Validation: Improve user input handling
def get_valid_column_input(prompt, max_index):
    """Prompt repeatedly until the user enters an integer in ``[0, max_index]``.

    Args:
        prompt: Text passed to ``input()`` on every attempt.
        max_index: Largest acceptable index (inclusive).

    Returns:
        int: The validated index.

    Raises:
        ValueError: If ``max_index`` is negative. No answer could ever fall in
            the accepted range then, so the prompt loop would never terminate.
    """
    if max_index < 0:
        raise ValueError("No columns available to choose from")

    while True:
        # Only the int() conversion can raise ValueError, so keep the try narrow.
        try:
            col_index = int(input(prompt))
        except ValueError:
            print("Please enter a valid number")
            continue

        if 0 <= col_index <= max_index:
            return col_index
        print(f"Please enter a number between 0 and {max_index}")

# File upload: opens the Colab upload widget. The returned mapping is iterated
# below to obtain the name of the uploaded file (presumably keyed by the
# filename as saved in the working directory - confirm against Colab docs).
uploaded = files.upload()

# An empty mapping means nothing was uploaded (e.g. the dialog was cancelled);
# fail with a clear message instead of a bare StopIteration from next().
if not uploaded:
    raise ValueError("No file was uploaded")

# Get the filename of the uploaded file (only the first entry is processed)
filename = next(iter(uploaded))

# Determine file type and read accordingly
file_extension = os.path.splitext(filename)[1].lower()

try:
    if file_extension == '.csv':
        df = pd.read_csv(filename, encoding='utf-8')
    elif file_extension == '.xlsx':
        df = pd.read_excel(filename, engine='openpyxl')
    elif file_extension == '.xls':
        # openpyxl only reads the newer Excel formats; leave the engine unset so
        # pandas selects its reader for legacy .xls workbooks (xlrd).
        df = pd.read_excel(filename)
    else:
        raise ValueError(f"Unsupported file format: {file_extension}")
except Exception as e:
    # Log for context, then re-raise so the cell still stops on failure.
    logger.error(f"Error reading file: {e}")
    raise

# List every column next to its positional index so the user can choose by number
print("Available columns:")
for position, column_name in enumerate(df.columns):
    print(f"{position}: {column_name}")

# Ask for the two source columns (validated against the last valid index) and
# translate each chosen position straight into its column label.
last_column_index = len(df.columns) - 1

street_col = df.columns[
    get_valid_column_input(
        "Enter the number of the column containing the street address: ",
        last_column_index,
    )
]
city_state_zip_col = df.columns[
    get_valid_column_input(
        "Enter the number of the column containing city, state, and zip: ",
        last_column_index,
    )
]

# Address Parsing: Improve parsing function to handle various formats and edge cases
def clean_street_address(address):
    """Insert a missing space between a number and the word that follows it.

    For example ``"123Main St"`` becomes ``"123 Main St"``. Ordinal street
    names such as ``"14TH"``, ``"1st"``, ``"22ND"`` or ``"3rd"`` are left
    intact; splitting them would turn ``"202 SE 14TH PL"`` into
    ``"202 SE 14 TH PL"``.

    Args:
        address: Street value from the input data; non-string values are
            converted with ``str()``.

    Returns:
        The cleaned street as ``str``. Missing values (``None``/``NaN``) are
        returned unchanged rather than being stringified to ``"None"``/``"nan"``.
    """
    if pd.isna(address):
        return address

    # The negative lookahead skips a digit run that is directly followed by a
    # complete ordinal suffix (st/nd/rd/th in any case, ending at a word
    # boundary). "1STREET" is still split because "ST" is not a whole word there.
    return re.sub(
        r'(\d+)(?!(?:[Ss][Tt]|[Nn][Dd]|[Rr][Dd]|[Tt][Hh])\b)([A-Za-z])',
        r'\1 \2',
        str(address),
    )

def parse_address(street, city_state_zip, default_state='FL'):
    """Split a combined city/state/zip value and pair it with the cleaned street.

    Recognised layouts for ``city_state_zip`` (after trimming):

    * ``CITY, ST- 12345`` - the hyphen after the state is optional, so
      ``CITY, ST 12345`` is accepted as well.
    * ``CITY, 12345`` - no state present; ``default_state`` is used instead.

    In both layouts the zip may carry a ``-1234`` extension (ZIP+4), which is
    kept as part of ``Property Zip``.

    Args:
        street: Raw street value; passed through ``clean_street_address``.
        city_state_zip: Raw combined city/state/zip value.
        default_state: State assigned when the value has a city and zip but no
            state. Defaults to ``'FL'``.

    Returns:
        pandas.Series with keys ``Property Street``, ``Property City``,
        ``Property State``, ``Property Zip`` and ``Address Flag``. The flag is
        ``'Missing City/State/Zip'`` when ``city_state_zip`` is missing,
        ``'Incomplete Address'`` when any of city/state/zip could not be
        determined, and ``None`` otherwise.
    """
    result = {'Property Street': clean_street_address(street),
              'Property City': None, 'Property State': None, 'Property Zip': None,
              'Address Flag': None}

    if pd.isna(city_state_zip):
        result['Address Flag'] = 'Missing City/State/Zip'
        return pd.Series(result)

    city_state_zip = str(city_state_zip).strip()

    # Five digits, optionally followed by a ZIP+4 extension.
    zip_pattern = r'(\d{5}(?:-\d{4})?)'

    # Layout 1: CITY, STATE[-] ZIP
    match = re.match(r'^(.*?),\s*([A-Z]{2})\s*-?\s*' + zip_pattern + r'$', city_state_zip)
    if match:
        result['Property City'], result['Property State'], result['Property Zip'] = match.groups()
    else:
        # Layout 2: CITY, ZIP (missing state)
        match = re.match(r'^(.*?),\s*' + zip_pattern + r'$', city_state_zip)
        if match:
            result['Property City'], result['Property Zip'] = match.groups()
            result['Property State'] = default_state  # Infer the missing state
        else:
            # Unrecognised layout: keep the raw text as the city so no data is lost.
            result['Property City'] = city_state_zip
            result['Address Flag'] = 'Incomplete Address'

    # A matched layout can still yield an empty part (e.g. ", FL- 32601" has an
    # empty city, or default_state may be empty/None), so check every part.
    if not all([result['Property City'], result['Property State'], result['Property Zip']]):
        result['Address Flag'] = 'Incomplete Address'

    return pd.Series(result)

# Data Inspection: show the first rows of the two chosen columns before parsing
columns_to_parse = [street_col, city_state_zip_col]
print("Sample data before parsing:")
print(df[columns_to_parse].head())

# Data Cleaning: Clean the data before parsing
def clean_address(address):
    """Normalise whitespace in an address value.

    Leading and trailing whitespace is removed and every internal run of
    whitespace is collapsed to a single space. Non-string values are converted
    with ``str()`` first. Missing values (as reported by ``pd.isna``) are
    returned unchanged.
    """
    is_missing = pd.isna(address)
    if not is_missing:
        trimmed = str(address).strip()
        return re.sub(r'\s+', ' ', trimmed)
    return address

# Normalise whitespace in the city/state/zip column (overwrites it in place)
df[city_state_zip_col] = df[city_state_zip_col].apply(clean_address)


def _parse_row(row):
    """Run parse_address on the two user-selected columns of one row."""
    return parse_address(row[street_col], row[city_state_zip_col])


# Parse row by row; each returned Series becomes one row of parsed_df
try:
    parsed_df = df.apply(_parse_row, axis=1)
except Exception as parse_error:
    # Log the failure together with a small data sample, then re-raise.
    logger.error(f"Error parsing addresses: {parse_error}")
    logger.error(f"Sample data: {df[[street_col, city_state_zip_col]].head()}")
    raise

# Pick the parsed columns in the order they should appear in the output
output_columns = [
    'Property Street',
    'Property City',
    'Property State',
    'Property Zip',
    'Address Flag',
]
new_df = parsed_df[output_columns]

# Data Inspection: show the first parsed rows
print("\nSample data after parsing:")
print(new_df.head())

# Append the parsed columns to the right of the original columns
df = pd.concat([df, new_df], axis=1)

# Build the output name from the uploaded file's name without its extension.
# An underscore separates the prefix from that stem; previously the two were
# run together (e.g. "parsed_property_addressCombined ...csv").
source_stem = os.path.splitext(filename)[0]
output_filename = f'parsed_property_address_{source_stem}.csv'

# Save the DataFrame to a CSV file (without the index column)
df.to_csv(output_filename, index=False)

# Download the file through the Colab browser session
files.download(output_filename)

print("\nProcessing complete. Please check the downloaded file for results.")

Saving Combined Base-Scraper Results_2024-07-08_11-54-45.xlsx to Combined Base-Scraper Results_2024-07-08_11-54-45 (2).xlsx
Available columns:
0: County
1: Auction Starts
2: Auction Type:
3: Case #:
4: Final Judgment Amount:
5: Parcel ID:
6: Property Address:
7: Unnamed: 7
8: Assessed Value:
9: Plaintiff Max Bid:
10: Unnamed: 10
11: parcel_id
12: location
13: property_use_code
14: acreage
15: gross_sq_ft
16: finished_sq_ft
17: year_built
18: effective_year_built
19: bedrooms
20: full_bathrooms
21: 0_name
22: 0_address1
23: 0_address2
24: 0_cityStateZip
25: sale_date
26: sale_price
27: instrument
28: 1_name
29: 1_address1
30: 1_address2
31: 1_cityStateZip
32: 2_name
33: 2_address1
34: 2_address2
35: 2_cityStateZip
Enter the number of the column containing the street address: 6
Enter the number of the column containing city, state, and zip: 7
Sample data before parsing:
    Property Address:              Unnamed: 7
0      202 SE 14TH PL  GAINESVILLE, FL- 32601
1     1610 NE 17TH PL  GAIN

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


Processing complete. Please check the downloaded file for results.
