In [28]:
import pandas as pd
import re

# Read the two input tables: the records whose 'Title' column gets cleaned
# below, and the reference list of titles that clean_job_title matches against.
df = pd.read_csv('Company_type.csv')
job_titles_df = pd.read_csv('all_titles.csv')



import re

# Ordered (pattern, canonical title) rules for co-founder / co-owner variants.
# Order matters: the first rule that matches anywhere in the title wins.
# '[ -]*' accepts any run of spaces/hyphens (including none) between the words.
_CO_TITLE_RULES = (
    (re.compile(r'Co-Owner', re.IGNORECASE), 'Co-Owner'),
    (re.compile(r'Co[ -]*Founder', re.IGNORECASE), 'Co-Founder'),
    (re.compile(r'Co[ -]*Owner', re.IGNORECASE), 'Co-Owner'),
)

# Titles written role-first, e.g. "Director, Sales" or "Manager, Operations".
# Anchored at the start of the title (used with .match) and case-sensitive.
_ROLE_COMMA_DESCRIPTOR_RE = re.compile(r'(Director|Manager),\s*(.*)')


def clean_job_title(title, known_titles=None):
    """
    Cleans job titles to a standardized format, handling special cases and variations.

    The rules are tried in this order and the first one that applies wins:

    1. Non-string input (e.g. NaN for a missing cell, or None) is returned unchanged.
    2. Any co-founder / co-owner variant becomes 'Co-Founder' / 'Co-Owner'.
    3. 'Director, X' / 'Manager, X' is reordered to 'X Director' / 'X Manager'.
    4. The first entry of ``known_titles`` that occurs in the title
       (case-insensitive, literal substring) is returned.
    5. A title containing 'Director' or 'Manager' (case-sensitive) collapses
       to that word.
    6. Otherwise the title is returned as-is.

    Parameters:
    - title (str): The original job title to be cleaned.
    - known_titles (iterable of str, optional): Reference titles to match
      against, in priority order. Defaults to the 'Title' column of the
      module-level ``job_titles_df``. Non-string and empty entries are skipped.

    Returns:
    - str: The cleaned job title, or the original title unchanged if no rule applies.
    """
    # Missing cells arrive as NaN/None; there is nothing to clean.
    if not isinstance(title, str):
        return title

    for pattern, canonical in _CO_TITLE_RULES:
        if pattern.search(title):
            return canonical

    match = _ROLE_COMMA_DESCRIPTOR_RE.match(title)
    if match:
        role = match.group(1)
        descriptor = match.group(2).strip()
        # A bare "Director," has no descriptor; avoid returning " Director".
        return f'{descriptor} {role}' if descriptor else role

    # Compare with `is None`: a pandas Series has no unambiguous truth value.
    if known_titles is None:
        known_titles = job_titles_df['Title']

    # NOTE: matching is substring-based, so a short reference title can match
    # inside a longer word; the order of known_titles decides which one wins.
    for known in known_titles:
        if not isinstance(known, str) or not known:
            continue
        # Escape so titles such as "C++ Developer" are matched literally
        # instead of being interpreted as a regular expression.
        if re.search(re.escape(known), title, re.IGNORECASE):
            return known

    # Default handling for titles containing 'Director' or 'Manager'
    if 'Director' in title:
        return 'Director'
    if 'Manager' in title:
        return 'Manager'

    # Nothing matched: keep the original title.
    return title

# Add a 'cleaned_title' column holding the standardized form of every 'Title' entry
df['cleaned_title'] = df['Title'].apply(clean_job_title)

# Tabulate how often each (original, cleaned) title pair occurs
df.value_counts(subset=['Title', 'cleaned_title'])

Title                                                   cleaned_title                         
Director                                                Director                                  4
Managing Director                                       Managing Director                         4
Owner                                                   Owner                                     4
Chief Executive Officer                                 Chief Executive Officer                   3
Business Owner                                          Owner                                     2
CEO/President                                           President                                 2
CEO                                                     CEO                                       2
Sales Director                                          Sales Director                            2
Director of Sales                                       Director of Sales                         2
Found