<a href="https://colab.research.google.com/github/melekergin/success-with-systems/blob/main/title_Cleaning_Script.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import re

# Read the insurance dataset (a CSV path relative to the working directory) into a DataFrame
df = pd.read_csv(filepath_or_buffer='./insurance last.csv')

# Regex source (a plain string, used with re.IGNORECASE) that recognizes executive titles.
#
# Alternatives are tried left to right at each position, so order matters:
#   * 'Co[- ]?Founder' precedes 'Founder' so the 'Co' prefix is not lost.
#   * The specific multi-word 'Chief ...' titles precede the generic
#     'Chief <words> Officer' form; otherwise the generic form would claim
#     'Chief Financial Officer' out of 'Chief Financial Officer Operations'.
#   * The generic form is non-greedy ('+?') so that a string such as
#     'Chief Executive Officer and Chief Financial Officer' yields two
#     separate matches instead of one long, unmappable match.
_title_alternatives = (
    r'CEO',
    r'Co[- ]?Founder',
    r'CUO',
    r'COO',
    r'Founder',
    r'Chairman',
    r'Chief Financial and Operating Officer',
    r'Chief Finance and Operating Officer',
    r'Chief Financial Officer Operations',
    r'Chief [A-Za-z ]+?Officer',
    r'CMO',
    r'CTO',
    r'CFO',
    r'Founding Partner',
    r'Founding Director',
    r'Financial Director',
    r'founding co-shareholder',
)
pattern = r'\b(?:' + '|'.join(_title_alternatives) + r')\b'

# Collapse every spelling of 'Co-Founder' into one canonical form
def standardize_cofounder(title):
    """Return *title* with each co-founder variant rewritten as 'Co-Founder'.

    Recognized variants are 'cofounder', 'co founder' and 'co-founder' in any
    letter case, matched only as whole words.
    """
    cofounder_variants = re.compile(r'\bCo[- ]?Founder\b', re.IGNORECASE)
    return cofounder_variants.sub('Co-Founder', title)

def clean_title(title):
    """Reduce a free-text job title to at most two standardized executive titles.

    The text is first passed through ``standardize_cofounder``. Every
    case-insensitive occurrence of ``pattern`` is then translated through
    ``title_mapping`` (a match without a mapping entry is kept as written).
    Duplicates are removed *after* translation, so
    'CEO, Chief Executive Officer' yields 'CEO' rather than 'CEO and CEO',
    and synonyms no longer use up the two available slots.

    Args:
        title: Raw title text. Non-string values (for example the NaN that
            stands in for a missing cell) are accepted.

    Returns:
        The first two distinct short forms, in order of appearance, joined
        with ' and '; or 'Other' when the input is not a string or contains
        no recognized title.
    """
    # re.sub / re.findall raise TypeError on non-strings; a missing title is
    # simply unclassifiable, which is what 'Other' already denotes.
    if not isinstance(title, str):
        return 'Other'

    title = standardize_cofounder(title)

    short_forms = []
    seen = set()
    for match in re.findall(pattern, title, re.IGNORECASE):
        short_form = title_mapping.get(match.lower(), match)
        # Compare case-insensitively so unmapped matches such as 'cto' and
        # 'CTO' are still recognized as the same title.
        key = short_form.lower()
        if key in seen:
            continue
        seen.add(key)
        short_forms.append(short_form)
        # Only the first two distinct titles are reported.
        if len(short_forms) == 2:
            break

    return ' and '.join(short_forms) if short_forms else 'Other'

# Lookup from a lower-cased matched title to its canonical short form.
# Keys must be lower case: clean_title looks matches up via match.lower().
# Every bare acronym that `pattern` can match has an entry here so that
# lower-case input (e.g. 'cto') is emitted in its canonical upper-case form.
title_mapping = {
    'ceo': 'CEO',
    'co-founder': 'Co-Founder',
    'cuo': 'CUO',
    'coo': 'COO',
    'founder': 'Founder',
    'cfo': 'CFO',
    'cto': 'CTO',
    'cmo': 'CMO',
    'chairman': 'Chairman',
    'chief executive officer': 'CEO',
    'founding partner': 'Co-Founder',
    'founding director': 'Founder',
    'financial director': 'CFO',
    'chief financial officer': 'CFO',
    'chief financial and operating officer': 'COO',
    'chief finance and operating officer': 'COO',
    'chief financial officer operations': 'COO',
    'chief finance and operations officer': 'COO',
    'chief operating officer': 'COO',
    'chief operations officer': 'COO',
    'chief underwriting officer': 'CUO',
    'chief technical officer': 'CTO',
    'chief technology officer': 'CTO',
    # NOTE(review): 'chief product officer' and 'chief people officer' both
    # map to 'CPO', so the two roles are indistinguishable after cleaning.
    'chief product officer': 'CPO',
    'chief commercial officer': 'CCO',
    'chief marketing officer': 'CMO',
    'chief visionary officer': 'CVO',
    'chief digital officer': 'CDO',
    'founding co-shareholder': 'Co-Founder',
    'chief investment officer': 'CIO',
    'chief people officer': 'CPO',
    # Additional mappings can be added as needed
}

# Run clean_title over each value of the 'Title' column and keep the result in a new column
df['Cleaned_Titles'] = df.loc[:, 'Title'].apply(clean_title)

# Count how often each (original title, cleaned title) pair occurs, for review of the cleaning
df.loc[:, ['Title', 'Cleaned_Titles']].value_counts()