<a href="https://colab.research.google.com/github/mattwantshouses/name_parsing/blob/main/Copy_of_RMD_Working_Scripts.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Name Parser
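This script takes a full name stored in a single cell and splits it into First, Middle, and Last name columns. Names are expected in "Last First Middle" order. Corporations and other organizations are placed, unsplit, in the Last Name column. It also compares the results against a list of common names to flag potential issues, such as a first name that ended up in the Last Name column.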

**Here's how to run it**


1.   Upload the file as prompted.
2.   Choose the column number where the names are located.


> Note: column numbers start at zero, so the first column is column 0. If the names are in the 2nd column, enter 1.


3.   Check the files in this notebook for the output, and download it.


> "Lis Pendens Parsed Names [time]"






In [None]:
# 1. Import Libraries
import pandas as pd
import logging
from google.colab import files
import os
from datetime import datetime
import nltk
nltk.download('names')
from nltk.corpus import names


# 2. Setup Logging
# Configure the root logger to emit INFO-and-above records formatted as
# "timestamp - level - message".
# NOTE(review): logging.basicConfig() does nothing when the root logger already
# has handlers attached; confirm that INFO messages actually show up in the
# notebook environment this runs in.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# 3. File Upload
def upload_file():
    """Open the Colab upload prompt and log every file that was received.

    Returns:
        The mapping returned by ``files.upload()``, keyed by file name.
        (Values are presumably the raw file bytes; only their length is
        used here.)
    """
    received = files.upload()
    for file_name, content in received.items():
        logging.info(f'Uploaded file "{file_name}" with length {len(content)} bytes')
    return received

def read_file_to_dataframe(uploaded_file):
    """Parse the raw bytes of a CSV file into a pandas DataFrame.

    Args:
        uploaded_file: CSV content as a bytes object.

    Returns:
        The DataFrame produced by ``pd.read_csv`` with default options.
    """
    from io import BytesIO

    # read_csv needs a file-like object, so wrap the bytes in a buffer.
    buffer = BytesIO(uploaded_file)
    return pd.read_csv(buffer)

# 4. Data Reading
def get_data():
    """Prompt for an upload and load the first uploaded file as a DataFrame.

    Returns:
        The DataFrame for the first uploaded file, or ``None`` when nothing
        was uploaded. Any additional uploaded files are ignored.
    """
    uploaded = upload_file()
    if not uploaded:
        return None
    first_name = next(iter(uploaded))
    return read_file_to_dataframe(uploaded[first_name])

# 5. Name Parsing Function
# Lower-case words that mark a name as a corporation or organization.
# They are compared against whole words, so surnames that merely contain one
# of them ("Scott" has "co", "Vincent" has "inc", "Ralph" has "lp") are not
# mistaken for companies. The phrase "limited liability company" needs no
# entry of its own because "limited" and "company" are already listed.
_ORGANIZATION_WORDS = frozenset({
    "corp", "corporation", "inc", "company", "co", "limited", "ltd",
    "incorporated", "plc", "corporate",
    "llc", "lc", "llp", "lp", "partnership",
    "foundation", "trust", "nonprofit", "association", "committee",
    "church", "temple", "mosque", "synagogue", "ministry",
    "school", "university", "college", "institute", "academy",
    "department", "agency", "bureau", "office", "commission",
    "club", "society", "union", "organization", "group", "board",
    "council", "league", "bank",
})

# Single words and two-word phrases that belong in the "Estate" column.
_ESTATE_WORDS = frozenset({"deceased", "estate", "representative"})
_ESTATE_PHRASES = frozenset({("et", "al"), ("personal", "representative")})

# Generational suffixes, compared after periods have been removed.
_NAME_SUFFIXES = frozenset({"jr", "sr", "ii", "iii"})


def _normalize_token(token):
    """Lower-case *token*, drop periods and strip surrounding punctuation."""
    return token.lower().replace(".", "").strip(",;:()")


def parse_name(full_name):
    """Split a "Last First Middle" name into its components.

    Args:
        full_name: The raw cell value. Anything that is not a string
            (for example NaN from an empty cell) is treated as an empty name.

    Returns:
        A ``(first_name, middle_name, last_name, estate_part)`` tuple of
        strings:

        * If any whole word is an organization keyword, the entire input is
          returned as ``last_name`` and the other fields are empty.
        * Estate markers ("Deceased", "Estate", "et al", "Personal
          Representative", ...) are removed from the name and returned,
          joined by spaces, as ``estate_part``.
        * Of the remaining words the first is the last name, the second the
          first name and the rest the middle name.
        * When at least three words remain and the final one is a suffix
          (Jr, Sr, II, III, with or without a period), it is appended to the
          last name instead of being treated as a middle name.
    """
    if not isinstance(full_name, str):
        return "", "", "", ""

    parts = full_name.split()
    normalized = [_normalize_token(part) for part in parts]

    # Organizations are not split: the whole name goes in the last-name slot.
    if any(token in _ORGANIZATION_WORDS for token in normalized):
        return "", "", full_name, ""

    # Collect the positions of every estate marker, including two-word phrases.
    estate_positions = set()
    for position, token in enumerate(normalized):
        if token in _ESTATE_WORDS:
            estate_positions.add(position)
        elif (
            position + 1 < len(normalized)
            and (token, normalized[position + 1]) in _ESTATE_PHRASES
        ):
            estate_positions.update((position, position + 1))

    estate_part = " ".join(parts[position] for position in sorted(estate_positions))
    name_parts = [
        part for position, part in enumerate(parts)
        if position not in estate_positions
    ]

    # A trailing suffix belongs with the last name. Two-word names are left
    # alone because "Last First" is the more likely reading there.
    suffix = ""
    if len(name_parts) >= 3 and _normalize_token(name_parts[-1]) in _NAME_SUFFIXES:
        suffix = name_parts.pop()

    if not name_parts:
        return "", "", "", estate_part

    last_name, *given_names = name_parts
    first_name = given_names[0] if given_names else ""
    middle_name = " ".join(given_names[1:])
    if suffix:
        last_name = f"{last_name} {suffix}"

    return first_name, middle_name, last_name, estate_part


# 6. Processing
def process_names(df, column_number):
    """Parse the names in one column and add the results as new columns.

    Args:
        df: DataFrame holding the raw names. It is modified in place.
        column_number: Zero-based position of the column that holds the names.

    Returns:
        The same DataFrame with 'First Name', 'Middle Name', 'Last Name' and
        'Estate' columns added (or overwritten).
    """
    parsed = [parse_name(value) for value in df.iloc[:, column_number]]
    output_columns = ('First Name', 'Middle Name', 'Last Name', 'Estate')
    # Assign column by column rather than unpacking zip(*parsed), which fails
    # when the frame has no rows.
    for position, column in enumerate(output_columns):
        df[column] = [fields[position] for fields in parsed]
    return df


# 7. Deduplication
def remove_duplicates(df):
    """Return a copy of *df* without fully duplicated rows.

    Rows are compared across all columns and the first occurrence of each
    duplicate group is the one that is kept.
    """
    deduplicated = df.drop_duplicates(keep='first')
    return deduplicated

# 8. Save Output
def save_output(df):
    """Write *df* to a timestamped CSV file in the working directory.

    The timestamp uses the local date and time with dashes as separators.
    Colons are avoided because they are not valid in file names on every
    platform, and the date is included so that runs on different days do not
    overwrite each other.

    Args:
        df: DataFrame to save. The index is not written.

    Returns:
        The name of the file that was written.
    """
    current_time = datetime.now().strftime('%Y-%m-%d %H-%M-%S')
    filename = f"Lis Pendens Parsed Names {current_time}.csv"
    df.to_csv(filename, index=False)
    file_size = os.path.getsize(filename)
    logging.info(f'File "{filename}" was saved at {current_time} with size {file_size} bytes')
    return filename

# 9. Frequency Calculation and Identification of Potential Issues
def analyze_names(df):
    """Add a boolean 'Potential Issue' column to *df*.

    A row is flagged when both of the following hold:

    * its 'First Name' value also occurs somewhere in the 'Last Name' column;
    * its 'Last Name' value is not in the NLTK ``names`` corpus.

    NOTE(review): the notebook description talks about finding first names in
    the last-name column; confirm that "last name is NOT a common first name"
    is the intended second condition.

    Args:
        df: DataFrame with 'First Name' and 'Last Name' columns. It is
            modified in place.

    Returns:
        The same DataFrame with the 'Potential Issue' column added.
    """
    known_last_names = df['Last Name'].dropna().unique()
    common_first_names = set(names.words())

    first_is_a_last_name = df['First Name'].isin(known_last_names)
    last_is_common_first = df['Last Name'].isin(common_first_names)
    df['Potential Issue'] = first_is_a_last_name & ~last_is_common_first

    return df

# Main Execution Flow
def main():
    """Run the name-parsing pipeline from upload through to the saved CSV.

    Steps: upload a file, ask which column holds the names, parse them, drop
    duplicate rows, flag potential issues and save the result. An error is
    logged and nothing else happens when no data could be loaded.
    """
    logging.info("Starting the name parsing script.")
    df = get_data()
    if df is None:
        logging.error("Failed to load data.")
        return

    column_number = int(input("Enter the column number containing the names (starting from 0): "))
    parsed = process_names(df, column_number)
    deduplicated = remove_duplicates(parsed)

    # Analyze and flag potential issues
    flagged = analyze_names(deduplicated)

    save_output(flagged)
    logging.info("Script completed successfully.")

# Run the whole name-parsing pipeline when this cell is executed.
main()



[nltk_data] Downloading package names to /root/nltk_data...
[nltk_data]   Package names is already up-to-date!


Saving Lis Pendens A - Legal Description Parsing Test.csv to Lis Pendens A - Legal Description Parsing Test (18).csv
Enter the column number containing the names (starting from 0): 1


# Address Parser

Parses the mailing and primary addresses, each stored in a single cell, into separate street address, city, state, and ZIP code columns.

The uploaded file must have columns named "mailing address" and "primary address".

In [None]:
# 1:  Imports
import pandas as pd
from datetime import datetime
import pytz
from google.colab import files
# Install usaddress

!pip install usaddress
import usaddress

# 2: File upload and processing
# Ask for an upload and load the first uploaded file into `df`.
# `file_name` and `df` are only defined when a supported file was read.
try:
    uploaded = files.upload()
    if not uploaded:
        print("No file uploaded.")
    else:
        file_name = next(iter(uploaded))
        # Compare the extension case-insensitively so that names such as
        # "DATA.CSV" or "Book.XLSX" are accepted too.
        lowered_name = file_name.lower()
        if lowered_name.endswith('.csv'):
            df = pd.read_csv(file_name)
        elif lowered_name.endswith(('.xls', '.xlsx')):
            df = pd.read_excel(file_name)
        else:
            print("Unsupported file type.")
except StopIteration:
    print("No file was found in the uploaded dictionary.")
except Exception as e:
    print(f"Failed to upload or process file: {e}")
# 3: Define the function to parse address using usaddress
def parse_address(address):
    """Tag *address* with usaddress and collect street, city, state and ZIP.

    Args:
        address: The address text passed to ``usaddress.tag``.

    Returns:
        A dict with the keys 'Street_Address', 'City', 'St' and 'Zip'.
        Components that were not found are empty strings. The ZIP code is cut
        at the first '-' so only the part before it is kept. When tagging
        fails, every value is ``None``; failures other than
        ``usaddress.RepeatedLabelError`` are also printed.
    """
    street_markers = ('AddressNumber', 'StreetName', 'StreetNamePostType')
    try:
        tagged = usaddress.tag(address)[0]
        street_pieces = []
        city = ''
        state = ''
        zip_code = ''

        # Sort each tagged component into the output field it belongs to.
        for label, text in tagged.items():
            if any(marker in label for marker in street_markers):
                street_pieces.append(text)
            elif 'PlaceName' in label:
                city = text
            elif 'StateName' in label:
                state = text
            elif 'ZipCode' in label:
                zip_code = text.strip().split('-')[0]

        # Strip stray whitespace from every component before returning.
        return {
            'Street_Address': " ".join(street_pieces).strip(),
            'City': city.strip(),
            'St': state.strip(),
            'Zip': zip_code.strip(),
        }
    except usaddress.RepeatedLabelError:
        return {"Street_Address": None, "City": None, "St": None, "Zip": None}
    except Exception as e:
        # Handle other exceptions
        print(f"Error parsing address: {e}")
        return {"Street_Address": None, "City": None, "St": None, "Zip": None}
# 4: Define Process Address Columns
def process_address_columns(df, column_name, prefix):
    """Parse every non-null address in a column into prefixed component columns.

    Args:
        df: DataFrame that holds the address column.
        column_name: Name of the column with the address text.
        prefix: Text placed, followed by an underscore, in front of each
            component column name.

    Returns:
        A DataFrame with the columns '<prefix>_Street_Address',
        '<prefix>_City', '<prefix>_St' and '<prefix>_Zip', indexed like the
        non-null rows of ``df[column_name]``. It has those columns even when
        there are no addresses to parse.
    """
    # These mirror the keys of the dict returned by parse_address.
    component_fields = ['Street_Address', 'City', 'St', 'Zip']
    addresses = df[column_name].dropna()
    parsed_rows = [parse_address(address) for address in addresses]
    # Building the frame with explicit columns keeps its shape stable when the
    # column holds no addresses at all.
    address_components = pd.DataFrame(
        parsed_rows, index=addresses.index, columns=component_fields
    )
    address_components.columns = [f"{prefix}_{col}" for col in address_components.columns]
    return address_components

# 5: Process and Concatenate Address Columns
# Parse both address columns, then save the result. The file is only written
# when processing finished without an error, so a failed run does not leave
# behind an "Address parse" file that holds no parsed addresses.
try:
    if not df.empty:
        # 5.1: Process primary address columns
        primary_parsed_df = process_address_columns(df, 'primary address', 'primary')

        # 5.2: Process mailing address columns
        mailing_parsed_df = process_address_columns(df, 'mailing address', 'mailing')

        # 5.3: Concatenate the original and processed DataFrames
        df = pd.concat([df, primary_parsed_df, mailing_parsed_df], axis=1)

        # 5.4: Display the first few rows of the DataFrame
        print(df.head())
except Exception as e:
    print(f"Failed to process address columns: {e}")
else:
    # 6: Output file saving
    est = pytz.timezone('US/Eastern')
    current_time = datetime.now(est).strftime('%Y-%m-%d %H-%M-%S')
    # Drop only the final extension so that dots inside the name are kept.
    base_name = file_name.rsplit('.', 1)[0]
    output_file_name = f"Address parse {base_name} {current_time}.csv"
    df.to_csv(output_file_name, index=False)
    print(f"File saved as {output_file_name}")

