<a href="https://colab.research.google.com/github/mattwantshouses/name_parsing/blob/main/Prod_Address_Parsing_1_and_2_Columns.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Parse out the Property Address - 2 column
When it's listed in two columns like this:

Street Address | City ST Zip

```
ex: "202 SE 14TH PL|GAINESVILLE, FL- 32601"
```

## Instructions
Run the cells in order.
Each cell must complete before the next one is run.

Video instructions here: https://www.loom.com/share/29394d4d70cc440a9f3d8dd3f0623c32?sid=2912910f-e59f-43e0-8dfe-a760211ffdef


In [None]:
import pandas as pd
import re
import os
from google.colab import files
import logging
# Ensure openpyxl is installed
!pip install openpyxl



In [None]:
# Version 2 updated 07/08/24
# Logging: create this notebook's logger, then ask the root logger for INFO-level output
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# Input Validation: keep asking until the user types a usable column index
def get_valid_column_input(prompt, max_index):
    """Prompt until the user enters an integer in the range 0..max_index.

    Args:
        prompt: Text shown to the user on every attempt.
        max_index: Largest acceptable value (inclusive).

    Returns:
        The accepted integer.
    """
    while True:
        raw_value = input(prompt)
        try:
            chosen = int(raw_value)
        except ValueError:
            # Not an integer at all: explain and ask again
            print("Please enter a valid number")
            continue
        if chosen < 0 or chosen > max_index:
            print(f"Please enter a number between 0 and {max_index}")
            continue
        return chosen

# File upload
uploaded = files.upload()
if not uploaded:
    # Nothing was selected; fail with a clear message instead of a bare StopIteration
    raise ValueError("No file was uploaded.")

# Get the filename of the uploaded file (only the first uploaded file is processed)
filename = next(iter(uploaded))

# Determine file type and read accordingly
file_extension = os.path.splitext(filename)[1].lower()

try:
    if file_extension == '.csv':
        df = pd.read_csv(filename, encoding='utf-8')
    elif file_extension == '.xlsx':
        df = pd.read_excel(filename, engine='openpyxl')
    elif file_extension == '.xls':
        # openpyxl only reads the newer .xlsx format, so let pandas pick the
        # engine for legacy .xls workbooks instead of forcing openpyxl.
        df = pd.read_excel(filename)
    else:
        raise ValueError(f"Unsupported file format: {file_extension}")
except Exception as e:
    # Log for the notebook user, then re-raise so the cell stops here
    logger.error(f"Error reading file: {e}")
    raise

# Print column names and ask for input
print("Available columns:")
for i, col in enumerate(df.columns):
    print(f"{i}: {col}")

# Ask for the two column positions (validated against the number of columns)
street_col = get_valid_column_input("Enter the number of the column containing the street address: ", len(df.columns) - 1)
city_state_zip_col = get_valid_column_input("Enter the number of the column containing city, state, and zip: ", len(df.columns) - 1)

# Convert the chosen positions into the actual column labels used below
street_col = df.columns[street_col]
city_state_zip_col = df.columns[city_state_zip_col]

# Address Parsing: Improve parsing function to handle various formats and edge cases
def clean_street_address(address):
    """Insert a space where a number runs straight into a street name.

    "123MAIN ST" becomes "123 MAIN ST". Ordinal street numbers such as
    "14TH", "21ST", "2ND" or "3RD" are left intact rather than being split
    into "14 TH".

    Args:
        address: Street address value; non-string values are converted
            with str().

    Returns:
        The cleaned address string, or the input unchanged when it is a
        missing value (None / NaN), so blanks do not become the text "nan".
    """
    if pd.api.types.is_scalar(address) and pd.isna(address):
        return address
    # The negative lookahead skips digits that are followed by a complete
    # ordinal suffix (ST/ND/RD/TH ending at a word boundary).
    return re.sub(
        r'(\d+)(?!(?:ST|ND|RD|TH)\b)([A-Za-z])',
        r'\1 \2',
        str(address),
        flags=re.IGNORECASE,
    )

def parse_address(street, city_state_zip):
    """Split a "CITY, ST- ZIP" value into separate property address fields.

    Args:
        street: Street address value; passed through clean_street_address.
        city_state_zip: Combined city/state/zip value, possibly missing.

    Returns:
        pandas Series with the keys 'Property Street', 'Property City',
        'Property State', 'Property Zip' and 'Address Flag'. The flag is
        'Missing City/State/Zip' when the combined value is missing,
        'Incomplete Address' when any of city, state or zip ends up empty,
        and None otherwise.
    """
    cleaned_street = clean_street_address(street)
    city = state = zip_code = flag = None

    if pd.isna(city_state_zip):
        flag = 'Missing City/State/Zip'
    else:
        text = str(city_state_zip).strip()

        # Preferred layout: CITY, STATE- ZIP
        with_state = re.match(r'^(.*?),\s*([A-Z]{2})-\s*(\d{5})$', text)
        if with_state:
            city, state, zip_code = with_state.groups()
        else:
            # Fallback layout: CITY, ZIP (state missing, so Florida is assumed)
            without_state = re.match(r'^(.*?),\s*(\d{5})$', text)
            if without_state:
                city, zip_code = without_state.groups()
                state = 'FL'  # Infer state as FL
            else:
                # Unrecognised layout: keep the raw text as the city
                city = text

        # Any empty component marks the row for manual review
        if not (city and state and zip_code):
            flag = 'Incomplete Address'

    return pd.Series({
        'Property Street': cleaned_street,
        'Property City': city,
        'Property State': state,
        'Property Zip': zip_code,
        'Address Flag': flag,
    })

# Data Inspection: preview the two selected columns before any parsing happens
print("Sample data before parsing:")
sample_before = df[[street_col, city_state_zip_col]].head()
print(sample_before)

# Data Cleaning: normalise whitespace before parsing
def clean_address(address):
    """Trim the value and collapse every run of whitespace to one space.

    Args:
        address: Raw cell value; non-string values are converted with str().

    Returns:
        The normalised string, or the input unchanged when it is missing
        (None / NaN).
    """
    if not pd.isna(address):
        trimmed = str(address).strip()
        return re.sub(r'\s+', ' ', trimmed)
    return address

# Normalise whitespace in the city/state/zip column (this overwrites that column in df)
df[city_state_zip_col] = df[city_state_zip_col].apply(clean_address)

# Apply the parsing function to the DataFrame, one row at a time
try:
    parsed_df = df.apply(lambda row: parse_address(row[street_col], row[city_state_zip_col]), axis=1)
except Exception as e:
    # Log the failure plus a data sample to help diagnose it, then stop the cell
    logger.error(f"Error parsing addresses: {e}")
    logger.error(f"Sample data: {df[[street_col, city_state_zip_col]].head()}")
    raise

# Create a new DataFrame with the desired column order
new_df = parsed_df[['Property Street', 'Property City', 'Property State', 'Property Zip', 'Address Flag']]

# Data Inspection: Inspect data after parsing
print("\nSample data after parsing:")
print(new_df.head())

# Add the new columns to the original DataFrame
df = pd.concat([df, new_df], axis=1)

# Save the DataFrame to a CSV file.
# The underscore keeps the prefix from running straight into the original
# file name (previously "parsed_property_address" + name with no separator).
output_filename = 'parsed_property_address_' + os.path.splitext(filename)[0] + '.csv'
df.to_csv(output_filename, index=False)

# Download the file
files.download(output_filename)

print("\nProcessing complete. Please check the downloaded file for results.")

Saving Combined Base-Scraper Results_2024-07-08_11-54-45.xlsx to Combined Base-Scraper Results_2024-07-08_11-54-45.xlsx
Available columns:
0: County
1: Auction Starts
2: Auction Type:
3: Case #:
4: Final Judgment Amount:
5: Parcel ID:
6: Property Address:
7: Unnamed: 7
8: Assessed Value:
9: Plaintiff Max Bid:
10: Unnamed: 10
11: parcel_id
12: location
13: property_use_code
14: acreage
15: gross_sq_ft
16: finished_sq_ft
17: year_built
18: effective_year_built
19: bedrooms
20: full_bathrooms
21: 0_name
22: 0_address1
23: 0_address2
24: 0_cityStateZip
25: sale_date
26: sale_price
27: instrument
28: 1_name
29: 1_address1
30: 1_address2
31: 1_cityStateZip
32: 2_name
33: 2_address1
34: 2_address2
35: 2_cityStateZip
Enter the number of the column containing the street address: 6
Enter the number of the column containing city, state, and zip: 7
Sample data before parsing:
    Property Address:              Unnamed: 7
0      202 SE 14TH PL  GAINESVILLE, FL- 32601
1     1610 NE 17TH PL  GAINESVI

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


Processing complete. Please check the downloaded file for results.


# Parse out the Owner Address - 1 column
When it's listed in one column and the street name runs into the city like this:

"142  DOGWOOD WAYPANAMA CITY 32404"

**Instructions:**

Run each cell in order. Each cell must complete BEFORE running the next.

Follow the prompt to upload the file you just parsed. You must use the output file from the script above. You will be prompted for the column that contains complete addresses.

**You will need the "florida_cities.txt" file** to upload. You can find it here: https://docs.google.com/document/d/1JGuZJb1QR696GQBwZ1gDAY33bKG1_T9RKxRjanZPXnY/edit?usp=drive_link

When prompted, upload it, or drag and drop it into the file section before running the script.

In [None]:
import pandas as pd
import re
from google.colab import files
import numpy as np
import json
import os
import requests



In [None]:
# Version 3
# Common street suffixes: abbreviated forms first, then the spelled-out forms.
# The order is preserved because the parser tries the suffixes in list order.
street_suffixes = [
    'St', 'Ave', 'Blvd', 'Dr', 'Rd', 'Ln', 'Way', 'Pl', 'Ct', 'Ter', 'Cir',
    'Pkwy', 'Hwy',
] + [
    'Street', 'Avenue', 'Boulevard', 'Drive', 'Road', 'Lane', 'Place',
    'Court', 'Terrace', 'Circle', 'Parkway', 'Highway',
]

# Two-letter abbreviations of the 50 US states
us_states = (
    'AL AK AZ AR CA CO CT DE FL GA '
    'HI ID IL IN IA KS KY LA ME MD '
    'MA MI MN MS MO MT NE NV NH NJ '
    'NM NY NC ND OH OK OR PA RI SC '
    'SD TN TX UT VT VA WA WV WI WY'
).split()

def _read_florida_cities_file(path='florida_cities.txt'):
    """Read one city name per line, upper-cased, skipping blank lines."""
    # utf-8-sig drops a leading byte-order mark when the file has one;
    # otherwise the mark would be glued to the first city name.
    with open(path, 'r', encoding='utf-8-sig') as f:
        names = (line.strip().upper() for line in f)
        # Blank entries must go: an empty string is a substring of every
        # address and would be "found" as the city of every row.
        return [name for name in names if name]


def get_florida_cities():
    """Return the upper-cased Florida city names used for city matching.

    Reads 'florida_cities.txt' from the working directory. If the file is
    missing, the user is asked to upload it; if the upload does not contain
    a file with that exact name, a built-in default list is returned.

    Returns:
        List of upper-case city names with no empty entries.
    """
    try:
        # Try to read the local file
        return _read_florida_cities_file()
    except FileNotFoundError:
        print("Florida cities file not found. Please upload 'florida_cities.txt'")
        uploaded = files.upload()
        if 'florida_cities.txt' in uploaded:
            return _read_florida_cities_file()
        print("Florida cities file not uploaded. Using a default list.")
        return ["JACKSONVILLE", "MIAMI", "TAMPA", "ORLANDO", "ST. PETERSBURG", "HIALEAH", "TALLAHASSEE",
                "FORT LAUDERDALE", "PORT ST. LUCIE", "CAPE CORAL", "PEMBROKE PINES", "HOLLYWOOD",
                "GAINESVILLE", "MIRAMAR", "CORAL SPRINGS", "CLEARWATER", "PALM BAY", "POMPANO BEACH",
                "WEST PALM BEACH", "LAKELAND", "DAVIE", "MIAMI GARDENS", "SUNRISE", "PLANTATION",
                "BOCA RATON", "DELTONA", "MIAMI BEACH", "LARGO", "MELBOURNE", "PALM COAST", "DEERFIELD BEACH",
                "BOYNTON BEACH", "LAUDERHILL", "WESTON", "FORT MYERS", "KISSIMMEE", "HOMESTEAD", "DELRAY BEACH",
                "TAMARAC", "DAYTONA BEACH", "NORTH MIAMI", "WELLINGTON", "NORTH PORT", "JUPITER", "OCALA",
                "PORT ORANGE", "MARGATE", "COCONUT CREEK", "SANFORD", "SARASOTA", "PENSACOLA", "BRADENTON",
                "PALM BEACH GARDENS", "PINELLAS PARK", "CORAL GABLES", "DORAL", "BONITA SPRINGS", "APOPKA",
                "TITUSVILLE", "NORTH MIAMI BEACH", "OAKLAND PARK", "FORT PIERCE", "NORTH LAUDERDALE",
                "CUTLER BAY", "ALTAMONTE SPRINGS", "ST. CLOUD", "GREENACRES", "ORMOND BEACH", "OCOEE",
                "HALLANDALE BEACH", "WINTER GARDEN", "LAKE WORTH", "ORANGE PARK", "MIDDLEBURG", "LYNN HAVEN",
                "PANAMA CITY", "YOUNGSTOWN", "SOUTHPORT", "KEYSTONE HEIGHTS"]
def _is_missing_value(value):
    """Return True for None / NaN-like scalars."""
    return bool(pd.api.types.is_scalar(value) and pd.isna(value))


def _owner_address_result(street, city, state, zip_code, county):
    """Build the output Series, filling placeholders for unknown parts."""
    if city:
        city_text = city.title()
    elif county:
        city_text = f"UNKNOWN ({county})"
    else:
        city_text = "UNKNOWN"
    # A row is incomplete when either the city or the zip could not be determined
    flag = 'INCOMPLETE' if not city or zip_code is None else ''
    return pd.Series({
        'Owner Street': street.title(),
        'Owner City': city_text,
        'Owner State': state,
        'Owner Zip': zip_code if zip_code else "UNKNOWN",
        'Flag': flag
    })


def parse_owner_address(row):
    """Split a one-column owner address into street, city, state and zip.

    The address may have the street running straight into the city, e.g.
    "142  DOGWOOD WAYPANAMA CITY 32404". Parsing proceeds in steps:
    trailing zip (5 digits, optionally ZIP+4), then a ", ST" state
    abbreviation, then a known city from the module-level florida_cities
    list, then a split at the last street suffix, then the last word.

    Args:
        row: Mapping or Series with an 'address' entry and optionally a
            'county' entry (used only in the "UNKNOWN (...)" placeholder).

    Returns:
        pandas Series with 'Owner Street', 'Owner City', 'Owner State',
        'Owner Zip' and 'Flag' ('INCOMPLETE' when city or zip is unknown,
        otherwise ''). The state defaults to 'FL' when none is found.
    """
    address = row['address']
    county = row['county'] if 'county' in row else None
    if _is_missing_value(county):
        county = None  # avoid rendering "UNKNOWN (nan)"

    # Blank cells arrive as NaN (a float); treat them like an empty address
    # instead of crashing on .upper().
    address_text = '' if _is_missing_value(address) else str(address).strip()
    if not address_text:
        return _owner_address_result('', None, 'FL', None, county)

    # Convert address to uppercase for case-insensitive matching
    address_upper = address_text.upper()

    # Extract zip code (5 digits, optionally followed by -NNNN) from the end
    zip_code = None
    zip_match = re.search(r'\d{5}(?:-\d{4})?$', address_upper)
    if zip_match:
        zip_code = zip_match.group()
        address_upper = address_upper[:zip_match.start()].strip()

    # Find the state: a comma followed by a complete two-letter abbreviation.
    # The word boundary stops ", IN" from matching inside ", INDIANAPOLIS";
    # the last match wins because the state follows the city.
    state = None
    state_matches = [
        m for m in re.finditer(r',\s*([A-Z]{2})\b', address_upper)
        if m.group(1) in us_states
    ]
    if state_matches:
        found = state_matches[-1]
        state = found.group(1)
        address_upper = (address_upper[:found.start()] + address_upper[found.end():]).strip()

    if state is None:
        state = 'FL'  # Default to Florida if no state is found

    # Find the city and street
    city = None
    street = address_upper

    # First, try to find a known city. Prefer the match that ends furthest
    # to the right and, on a tie, the longest name, so "MIAMI BEACH" beats
    # "MIAMI" and a city named inside the street does not beat the real one.
    best_rank = None
    for fl_city in florida_cities:
        if not fl_city:
            continue  # an empty name would match every address
        position = address_upper.rfind(fl_city)
        if position == -1:
            continue
        rank = (position + len(fl_city), len(fl_city))
        if best_rank is None or rank > best_rank:
            best_rank = rank
            city = fl_city
            street = address_upper[:position].strip()

    # If city wasn't found, try to split at the last occurrence of a street suffix
    if city is None:
        for suffix in street_suffixes:
            suffix_upper = suffix.upper()
            head, separator, tail = address_upper.rpartition(suffix_upper)
            if separator:
                street = (head + suffix_upper).strip()
                city = tail.strip()  # may be empty; flagged as incomplete below
                break

    # If still no city found, use the last word as city
    if city is None:
        parts = address_upper.rsplit(None, 1)
        if len(parts) > 1:
            street, city = parts

    # Clean up street and city
    street = re.sub(r'\s+', ' ', street).strip()
    if city:
        city = re.sub(r'\s+', ' ', city).strip()

    return _owner_address_result(street, city, state, zip_code, county)

# Main script
print("Please upload your data file (CSV or Excel)")
uploaded = files.upload()
if not uploaded:
    # Nothing was selected; fail with a clear message instead of a bare StopIteration
    raise ValueError("No file was uploaded.")

# Get the filename of the uploaded file (only the first uploaded file is processed)
filename = next(iter(uploaded))
base_name, file_extension = os.path.splitext(filename)
file_extension = file_extension.lower()  # accept .CSV / .XLSX as well

# Read the file
if file_extension == '.csv':
    df = pd.read_csv(filename)
elif file_extension in ('.xls', '.xlsx'):
    df = pd.read_excel(filename)
else:
    raise ValueError("Unsupported file format. Please upload a CSV or Excel file.")

# Print column names and ask for input
print("\nAvailable columns:")
for i, col in enumerate(df.columns):
    print(f"{i}: {col}")

while True:
    try:
        address_index = int(input("\nEnter the number of the column containing the full Owner address: "))
        if address_index < 0:
            # A negative index would silently count from the end of the columns
            raise IndexError(address_index)
        address_col = df.columns[address_index]
        break
    except (ValueError, IndexError):
        print("Invalid input. Please enter a valid column number.")

# Ask for county column if available
county_col = None
while True:
    county_input = input("\nEnter the number of the column containing the county (or press Enter if not available): ")
    if county_input == "":
        break
    try:
        county_index = int(county_input)
        if county_index < 0:
            raise IndexError(county_index)
        county_col = df.columns[county_index]
        break
    except (ValueError, IndexError):
        print("Invalid input. Please enter a valid column number or press Enter to skip.")

# Load the list of Florida cities (local file, upload, or built-in default)
florida_cities = get_florida_cities()

# Prepare input for parsing function
parse_input = df[[address_col]].rename(columns={address_col: 'address'})
if county_col is not None:  # a column label may itself be falsy (e.g. 0 or '')
    parse_input['county'] = df[county_col]

# Apply the parsing function
parsed_addresses = parse_input.apply(parse_owner_address, axis=1)

# Add parsed columns to the original dataframe
df = pd.concat([df, parsed_addresses], axis=1)

# Display the first few rows of the result
print("\nFirst few rows of the parsed data:")
print(df.head())

# Save the result. The output is always CSV, so give it a .csv extension even
# when the uploaded file was an Excel workbook.
output_filename = 'parsed_owner_' + base_name + '.csv'
df.to_csv(output_filename, index=False)
print(f"\nParsed data saved to {output_filename}")
files.download(output_filename)

# Print summary of flagged addresses. Read the flags from the freshly parsed
# frame so an existing 'Flag' column in the uploaded data cannot interfere.
flagged_addresses = parsed_addresses[parsed_addresses['Flag'] == 'INCOMPLETE']
print(f"\nNumber of incomplete addresses: {len(flagged_addresses)}")
if len(flagged_addresses) > 0:
    print("First few incomplete addresses:")
    print(flagged_addresses[['Owner Street', 'Owner City', 'Owner State', 'Owner Zip']].head())

Please upload your data file (CSV or Excel)


Saving parsed_property_address Combined Base-Scraper Results_2024-07-08_11-54-45 (2).csv to parsed_property_address Combined Base-Scraper Results_2024-07-08_11-54-45 (2) (1).csv

Available columns:
0: County
1: Auction Starts
2: Auction Type:
3: Case #:
4: Final Judgment Amount:
5: Parcel ID:
6: Property Address:
7: Unnamed: 7
8: Assessed Value:
9: Plaintiff Max Bid:
10: Unnamed: 10
11: parcel_id
12: location
13: property_use_code
14: acreage
15: gross_sq_ft
16: finished_sq_ft
17: year_built
18: effective_year_built
19: bedrooms
20: full_bathrooms
21: 0_name
22: 0_address1
23: 0_address2
24: 0_cityStateZip
25: sale_date
26: sale_price
27: instrument
28: 1_name
29: 1_address1
30: 1_address2
31: 1_cityStateZip
32: 2_name
33: 2_address1
34: 2_address2
35: 2_cityStateZip
36: Property Street
37: Property City
38: Property State
39: Property Zip
40: Address Flag

Enter the number of the column containing the full Owner address: 22

Enter the number of the column containing the county (or pre

AttributeError: 'float' object has no attribute 'upper'