# Introduction
In this notebook, we analyze name data to identify alternate spellings of names and to understand their frequency and trends over time. We use the fuzzywuzzy package for string matching: it provides functions that score how similar two strings are, which we apply to pairs of names.

In [1]:
#Import necessary libraries
import numpy as np
import pandas as pd
from fuzzywuzzy import fuzz
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

# Load Data

In [2]:
# Read the SSA name files: the national file first, then the per-state file
names_df, states_df = map(
    pd.read_csv,
    ("Data/national_data.csv", "Data/states_data.csv"),
)

# Data Preparation

In [3]:
# Male records: filter on the 'sex' column, then keep one row per distinct name
male_df = names_df.loc[names_df['sex'].eq('M')]
unique_male_df = male_df.drop_duplicates(subset='name')

# Female records: same two steps
female_df = names_df.loc[names_df['sex'].eq('F')]
unique_female_df = female_df.drop_duplicates(subset='name')

# NOTE(review): drop_duplicates keeps only the first row found for each name,
# so the 'count' carried forward is that single row's count, not a total
# across all rows for the name -- confirm this is intended.

# Alternate Spellings Identification

In [4]:
# Define function to identify alternate spellings
def is_alternate_spelling(name1, name2, threshold=80):
    """Return True when two names are similar enough to count as alternate spellings.

    Both names are lower-cased and scored with ``fuzz.ratio``; the pair
    qualifies only when the score is strictly greater than ``threshold``.

    Parameters
    ----------
    name1, name2 : str
        Names to compare; letter case is ignored.
    threshold : int or float, default 80
        Exclusive lower bound on the ``fuzz.ratio`` score (fuzzywuzzy
        documents the score as ranging from 0 to 100).

    Returns
    -------
    bool
        True if the similarity score exceeds ``threshold``.
    """
    similarity_ratio = fuzz.ratio(name1.lower(), name2.lower())
    return similarity_ratio > threshold

In [5]:
# Parts of code were created and optimized for speed and clarity by generative AI
# Create function to find alternate spellings
def find_most_alternate_spellings(dataframe, top_n=10):
    """Rank names that have a similar-looking name by their summed count.

    Years are visited in order of first appearance in ``dataframe``. Within a
    year, every name not already seen in an earlier year is compared against
    all other names of that year with ``is_alternate_spelling``; the matches
    are cached per name. A name is flagged when it has at least one match or
    is itself a match of another name.

    The ranking is by the summed 'count' of the flagged names, not by how
    many alternates a name has.

    NOTE(review): because matches are cached the first time a name is seen,
    a name that appears again in later years is not compared against the
    names of those years -- confirm this is intended.

    Side effect (kept for backward compatibility): a boolean column
    'alternate_spelling' is written onto ``dataframe`` itself.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must provide the columns 'name', 'year' and 'count'.
    top_n : int, default 10
        Number of highest-count flagged names to return.

    Returns
    -------
    most_alternate_spellings : pandas.DataFrame
        Columns 'name' and 'count' for the ``top_n`` flagged names with the
        largest summed count, largest first.
    most_similar_names : dict
        Maps each returned name to the list of alternates cached for it.
    """
    # name -> alternates found in the first year the name was encountered
    alternate_spellings = {}
    # every name that has, or is, an alternate spelling of another name
    flagged_names = set()

    # sort=False keeps the years in order of first appearance in the frame
    for _, same_year_df in dataframe.groupby('year', sort=False):
        # Sorted so each alternates list comes out in alphabetical order
        names_in_year = sorted(same_year_df['name'].dropna().unique())

        for name in names_in_year:
            if name in alternate_spellings:
                # Already resolved in an earlier year; its flags are already set
                continue

            alternate_names = [
                alt_name
                for alt_name in names_in_year
                if alt_name != name and is_alternate_spelling(name, alt_name)
            ]
            alternate_spellings[name] = alternate_names
            if alternate_names:
                flagged_names.add(name)
                flagged_names.update(alternate_names)

    # Flag all rows in one pass instead of rescanning the frame per match
    is_flagged = dataframe['name'].isin(flagged_names)
    dataframe['alternate_spelling'] = is_flagged

    # Total count per flagged name, then keep the largest totals
    alternate_spellings_count = (
        dataframe[is_flagged].groupby('name')['count'].sum().reset_index()
    )
    most_alternate_spellings = alternate_spellings_count.nlargest(top_n, 'count')

    # Cached alternates already passed is_alternate_spelling, so no re-check
    most_similar_names = {
        name: list(alternate_spellings.get(name, []))
        for name in most_alternate_spellings['name']
    }

    return most_alternate_spellings, most_similar_names

# Analysis and Results (Male Names)

In [6]:
# Run the analysis on the de-duplicated male names and print both results
most_alternate_spellings, most_similar_names = find_most_alternate_spellings(unique_male_df)
print("\nTop Ten Names with the most alternate spellings:", most_alternate_spellings, sep="\n")
print("\nMost similar names for each top ten name:", most_similar_names, sep="\n")


Top Ten Names with the most alternate spellings:
         name  count
8003  William   9532
1311  Charles   5348
2849   George   5126
4270   Joseph   2632
7553   Thomas   2534
3031    Henry   2444
7965   Walter   1755
613    Arthur   1599
242    Albert   1493
6948   Samuel   1024

Most similar names for each top ten name:
{'William': ['Willam', 'Williams', 'Willian'], 'Charles': ['Charley', 'Charlie'], 'George': ['Gorge'], 'Joseph': ['Joeseph'], 'Thomas': ['Tomas'], 'Henry': ['Henery'], 'Walter': ['Walker'], 'Arthur': ['Arther', 'Aurthur', 'Authur'], 'Albert': ['Adelbert', 'Albertus', 'Elbert'], 'Samuel': ['Samual']}


# Analysis and Results (Female Names)

In [7]:
# Run the analysis on the de-duplicated female names and print both results
most_alternate_spellings, most_similar_names = find_most_alternate_spellings(unique_female_df)
print("\nTop Ten Names with the most alternate spellings:", most_alternate_spellings, sep="\n")
print("\nMost similar names for each top ten name:", most_similar_names, sep="\n")


Top Ten Names with the most alternate spellings:
            name  count
14399       Mary   7065
2021        Anna   2604
6931        Emma   2003
6727   Elizabeth   1939
15070     Minnie   1746
14081   Margaret   1578
7931         Ida   1472
3330      Bertha   1320
17640      Sarah   1288
2119       Annie   1258

Most similar names for each top ten name:
{'Mary': ['May'], 'Anna': ['Ana', 'Ann', 'Hanna'], 'Emma': ['Lemma'], 'Elizabeth': ['Elisabeth', 'Elizebeth'], 'Minnie': ['Linnie', 'Mintie', 'Vinnie', 'Winnie'], 'Margaret': ['Margarett', 'Margaretta', 'Margarette', 'Margarita', 'Margeret', 'Margret'], 'Ida': ['Lida', 'Vida'], 'Bertha': ['Berta', 'Birtha'], 'Sarah': ['Sara'], 'Annie': ['Anne', 'Fannie', 'Jannie', 'Nannie', 'Sannie', 'Vannie']}
