<a href="https://colab.research.google.com/github/mattwantshouses/name_parsing/blob/main/RMD_Dev_Address_Parsing_Script.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Parse Mailing and Primary Addresses from one cell into multiple cells

Must have columns named "mailing address" and "primary address".

In [None]:
# 1:  Imports
import pandas as pd
from datetime import datetime
import pytz
from google.colab import files
# Install usaddress

!pip install usaddress
import usaddress




In [None]:
# 2: File upload and processing
try:
    uploaded = files.upload()
    if not uploaded:
        print("No file uploaded.")
    else:
        file_name = next(iter(uploaded))
        if file_name.endswith('.csv'):
            df = pd.read_csv(file_name)
        elif file_name.endswith(('.xls', '.xlsx')):
            df = pd.read_excel(file_name)
        else:
            print("Unsupported file type.")
except StopIteration:
    print("No file was found in the uploaded dictionary.")
except Exception as e:
    print(f"Failed to upload or process file: {e}")

Saving khaliq_lis_pendens (1).xlsx to khaliq_lis_pendens (1) (2).xlsx


In [None]:
# 3: Define the function to parse address using usaddress
def parse_address(address):
    try:
        parsed_address = usaddress.tag(address)
        components = {
            'Street_Address': '',
            'City': '',
            'St': '',
            'Zip': ''
        }

        # Extract relevant parts from the parsed address
        for key, value in parsed_address[0].items():
            if 'AddressNumber' in key or 'StreetName' in key or 'StreetNamePostType' in key:
                components['Street_Address'] += f"{value} "
            elif 'PlaceName' in key:
                components['City'] = value
            elif 'StateName' in key:
                components['St'] = value
            elif 'ZipCode' in key:
                components['Zip'] = value.strip().split('-')[0]

        # Clean up any extra spaces
        components = {k: v.strip() for k, v in components.items()}
        return components
    except usaddress.RepeatedLabelError:
        return {"Street_Address": None, "City": None, "St": None, "Zip": None}
    except Exception as e:
        # Handle other exceptions
        print(f"Error parsing address: {e}")
        return {"Street_Address": None, "City": None, "St": None, "Zip": None}

In [None]:
# 4: Define Process Address Columns
def process_address_columns(df, column_name, prefix):
    address_components = df[column_name].dropna().apply(lambda x: pd.Series(parse_address(x)))
    address_components.columns = [f"{prefix}_{col}" for col in address_components.columns]
    return address_components


In [None]:
# 5: Process and Concatenate Address Columns
try:
    if not df.empty:
        # 5.1: Process primary address columns
        primary_parsed_df = process_address_columns(df, 'primary address', 'primary')

        # 5.2: Process mailing address columns
        mailing_parsed_df = process_address_columns(df, 'mailing address', 'mailing')

        # 5.3: Concatenate the original and processed DataFrames
        df = pd.concat([df, primary_parsed_df, mailing_parsed_df], axis=1)

        # 5.4: Display the first few rows of the DataFrame
        print(df.head())
except Exception as e:
    print(f"Failed to process address columns: {e}")


                owner name                             mailing address  \
0   GONZALEZ RENATA HESTER   7457 SWEET ROSE LN JACKSONVILLE, FL 32244   
1                      NaN                                         NaN   
2         BALLESTEROS JUAN  8990 BRIDGECREEK DR JACKSONVILLE, FL 32244   
3                      NaN                                         NaN   
4         GRIMES KENNETH M       2425 HOLMES ST JACKSONVILLE, FL 32207   

                              primary address  \
0   7457 SWEET ROSE LN Jacksonville FL 32244-   
1                                         NaN   
2  8990 BRIDGECREEK DR Jacksonville FL 32244-   
3                                         NaN   
4       2425 HOLMES ST Jacksonville FL 32207-   

                                   legal description  year built  \
0           42-77 35-3S-25E AMANDAS CROSSING LOT 163      2013.0   
1                                                NaN         NaN   
2  49-26 33-3S-25E ARGYLE FOREST CHIMNEY LAKES UN...    

In [None]:
# 6: Output file saving
est = pytz.timezone('US/Eastern')
current_time = datetime.now(est).strftime('%Y-%m-%d %H-%M-%S')
output_file_name = f"Address parse {file_name.split('.')[0]} {current_time}.csv"
df.to_csv(output_file_name, index=False)
print(f"File saved as {output_file_name}")

File saved as Address parse khaliq_lis_pendens (1) (2) 2024-06-17 22-00-23.csv


# Address Parsing

In [1]:
# 1:  Imports
import pandas as pd
from datetime import datetime
import pytz
from google.colab import files
# Install usaddress

!pip install usaddress
import usaddress

# 2: File upload and processing
try:
    uploaded = files.upload()
    if not uploaded:
        print("No file uploaded.")
    else:
        file_name = next(iter(uploaded))
        if file_name.endswith('.csv'):
            df = pd.read_csv(file_name)
        elif file_name.endswith(('.xls', '.xlsx')):
            df = pd.read_excel(file_name)
        else:
            print("Unsupported file type.")
except StopIteration:
    print("No file was found in the uploaded dictionary.")
except Exception as e:
    print(f"Failed to upload or process file: {e}")
# 3: Define the function to parse address using usaddress
def parse_address(address):
    try:
        parsed_address = usaddress.tag(address)
        components = {
            'Street_Address': '',
            'City': '',
            'St': '',
            'Zip': ''
        }

        # Extract relevant parts from the parsed address
        for key, value in parsed_address[0].items():
            if 'AddressNumber' in key or 'StreetName' in key or 'StreetNamePostType' in key:
                components['Street_Address'] += f"{value} "
            elif 'PlaceName' in key:
                components['City'] = value
            elif 'StateName' in key:
                components['St'] = value
            elif 'ZipCode' in key:
                components['Zip'] = value.strip().split('-')[0]

        # Clean up any extra spaces
        components = {k: v.strip() for k, v in components.items()}
        return components
    except usaddress.RepeatedLabelError:
        return {"Street_Address": None, "City": None, "St": None, "Zip": None}
    except Exception as e:
        # Handle other exceptions
        print(f"Error parsing address: {e}")
        return {"Street_Address": None, "City": None, "St": None, "Zip": None}
# 4: Define Process Address Columns
def process_address_columns(df, column_name, prefix):
    address_components = df[column_name].dropna().apply(lambda x: pd.Series(parse_address(x)))
    address_components.columns = [f"{prefix}_{col}" for col in address_components.columns]
    return address_components

# 5: Process and Concatenate Address Columns
try:
    if not df.empty:
        # 5.1: Process primary address columns
        primary_parsed_df = process_address_columns(df, 'primary address', 'primary')

        # 5.2: Process mailing address columns
        mailing_parsed_df = process_address_columns(df, 'mailing address', 'mailing')

        # 5.3: Concatenate the original and processed DataFrames
        df = pd.concat([df, primary_parsed_df, mailing_parsed_df], axis=1)

        # 5.4: Display the first few rows of the DataFrame
        print(df.head())
except Exception as e:
    print(f"Failed to process address columns: {e}")

# 6: Output file saving
est = pytz.timezone('US/Eastern')
current_time = datetime.now(est).strftime('%Y-%m-%d %H-%M-%S')
output_file_name = f"Address parse {file_name.split('.')[0]} {current_time}.csv"
df.to_csv(output_file_name, index=False)
print(f"File saved as {output_file_name}")

Collecting usaddress
  Downloading usaddress-0.5.10-py2.py3-none-any.whl (63 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.9/63.9 kB[0m [31m889.5 kB/s[0m eta [36m0:00:00[0m
Collecting probableparsing (from usaddress)
  Downloading probableparsing-0.0.1-py2.py3-none-any.whl (3.1 kB)
Collecting python-crfsuite>=0.7 (from usaddress)
  Downloading python_crfsuite-0.9.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-crfsuite, probableparsing, usaddress
Successfully installed probableparsing-0.0.1 python-crfsuite-0.9.10 usaddress-0.5.10


Saving chatgpt parsed_owner_names_with_keywords_removed.csv to chatgpt parsed_owner_names_with_keywords_removed.csv
Failed to process address columns: 'primary address'
File saved as Address parse chatgpt parsed_owner_names_with_keywords_removed 2024-06-18 17-34-30.csv


# Address Parsing - Use This
(Dynamically finds Address Columns)

In [4]:
# 1:  Imports
import pandas as pd
from datetime import datetime
import pytz
from google.colab import files
# Install usaddress

!pip install usaddress
import usaddress

# 2: File upload and processing
try:
    uploaded = files.upload()
    if not uploaded:
        print("No file uploaded.")
    else:
        file_name = next(iter(uploaded))
        if file_name.endswith('.csv'):
            df = pd.read_csv(file_name)
        elif file_name.endswith(('.xls', '.xlsx')):
            df = pd.read_excel(file_name)
        else:
            print("Unsupported file type.")
except StopIteration:
    print("No file was found in the uploaded dictionary.")
except Exception as e:
    print(f"Failed to upload or process file: {e}")
# 3: Define the function to parse address using usaddress
def parse_address(address):
    try:
        parsed_address = usaddress.tag(address)
        components = {
            'Street_Address': '',
            'City': '',
            'St': '',
            'Zip': ''
        }

        # Extract relevant parts from the parsed address
        for key, value in parsed_address[0].items():
            if 'AddressNumber' in key or 'StreetName' in key or 'StreetNamePostType' in key:
                components['Street_Address'] += f"{value} "
            elif 'PlaceName' in key:
                components['City'] = value
            elif 'StateName' in key:
                components['St'] = value
            elif 'ZipCode' in key:
                components['Zip'] = value.strip().split('-')[0]

        # Clean up any extra spaces
        components = {k: v.strip() for k, v in components.items()}
        return components
    except usaddress.RepeatedLabelError:
        return {"Street_Address": None, "City": None, "St": None, "Zip": None}
    except Exception as e:
        # Handle other exceptions
        print(f"Error parsing address: {e}")
        return {"Street_Address": None, "City": None, "St": None, "Zip": None}
# 4: Define Process Address Columns
def process_address_column(df, column_name, prefix):
    address_components = df[column_name].dropna().apply(lambda x: pd.Series(parse_address(x)))
    address_components.columns = [f"{prefix}_{col}" for col in address_components.columns]
    return address_components

def identify_address_columns(df):
    address_keywords = ['address', 'street', 'city', 'zip']
    address_columns = []
    for col in df.columns:
        if any(keyword in col.lower() for keyword in address_keywords):
            address_columns.append(col)
    return address_columns

def process_address_columns(df, column_names):
    for column_name in column_names:
        prefix = column_name.replace(' ', '_').lower()
        try:
            address_components = df[column_name].dropna().apply(lambda x: pd.Series(parse_address(x)))
            address_components.columns = [f"{prefix}_{col}" for col in address_components.columns]
            df = pd.concat([df, address_components], axis=1)
        except Exception as e:
            print(f"Error processing column {column_name}: {e}")
    return df

# 5: Process and Concatenate Address Columns
try:
    if not df.empty:
        address_columns = identify_address_columns(df)
        df = process_address_columns(df, address_columns)
        print(df.head())
except Exception as e:
    print(f"Failed to process address columns: {e}")

        # 5.3: Concatenate the original and processed DataFrames
    df = pd.concat([df, primary_parsed_df, mailing_parsed_df], axis=1)

        # 5.4: Display the first few rows of the DataFrame
    print(df.head())
except Exception as e:
    print(f"Failed to process address columns: {e}")

# 6: Output file saving
est = pytz.timezone('US/Eastern')
current_time = datetime.now(est).strftime('%Y-%m-%d %H-%M-%S')
output_file_name = f"Address parse {file_name.split('.')[0]} {current_time}.csv"
df.to_csv(output_file_name, index=False)
print(f"File saved as {output_file_name}")

Collecting usaddress
  Downloading usaddress-0.5.10-py2.py3-none-any.whl (63 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/63.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.9/63.9 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Collecting probableparsing (from usaddress)
  Downloading probableparsing-0.0.1-py2.py3-none-any.whl (3.1 kB)
Collecting python-crfsuite>=0.7 (from usaddress)
  Downloading python_crfsuite-0.9.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-crfsuite, probableparsing, usaddress
Successfully installed probableparsing-0.0.1 python-crfsuite-0.9.10 usaddress-0.5.10


Saving marketing prep - 061824 - khaliq_foreclosure_scrape_results - khaliq scraped results - name parsed.csv to marketing prep - 061824 - khaliq_foreclosure_scrape_results - khaliq scraped results - name parsed.csv
                                     mailing_address  \
0           38 S CENTRAL AVE VALLEY STREAM, NY 11580   
1  800 CORPORATE DR SUITE 210 FORT LAUDERDALE, FL...   
2  800 CORPORATE DR SUITE 210 FORT LAUDERDALE, FL...   
3              2501 JAMMES RD JACKSONVILLE, FL 32210   
4       5300 CHANDLER BEND DR JACKSONVILLE, FL 32224   

                     owner_name                               primary_address  \
0         CYCLONE OASIS 256 LLC         3355 CLAIRE LN Jacksonville FL 32216-   
1   NH NORTHLAKE APARTMENTS LLC          2445 DUNN AVE Jacksonville FL 32218-   
2           OAKS AT RED BAY LLC    7528 ARLINGTON EXPY Jacksonville FL 32211-   
3    JJTA15 REAL PROPERTIES LLC    5641 CALIFORNIA AVE Jacksonville FL 32244-   
4             CALVERT SPENCER T  5300 CHAN