# Introduction
In this notebook, we are analyzing name data to identify alternate spellings of names and understand their frequency and trends over time. We will use clustering techniques and string similarity.


In [4]:
import numpy as np
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt
import mplcursors
import plotly.express as px
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from fuzzywuzzy import fuzz  # For string similarity computation

import nltk
from nltk.tokenize import word_tokenize

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so it may also hide warnings raised by
# the pandas operations in the cells below; consider narrowing it to specific
# warning categories.
warnings.filterwarnings('ignore')

# Loading Data
We start by loading two datasets from the SSA: the national name data and the state-level name data.

In [5]:
# Locations of the two SSA extracts, relative to the notebook's working directory
national_csv_path = "Data/national_data.csv"
states_csv_path = "Data/states_data.csv"

# National-level name data from SSA
names_df = pd.read_csv(national_csv_path)
# State-level name data from SSA
states_df = pd.read_csv(states_csv_path)

# Data Preparation

In [6]:
# Split the national data into one DataFrame per sex
male_df = names_df[names_df['sex'] == 'M']
female_df = names_df[names_df['sex'] == 'F']

# Reduce each frame to one row per distinct name. drop_duplicates keeps the
# first row it encounters for every name and discards the others.
# .copy() detaches the result from the filtered parent frame so that columns
# can be added to it later without pandas' SettingWithCopyWarning.
# NOTE(review): because only one row per name survives, the 'year' and 'count'
# values in these frames describe that single row, not the name's full
# history -- confirm this is what the downstream analysis expects.
unique_male_df = male_df.drop_duplicates(subset=['name']).copy()
unique_female_df = female_df.drop_duplicates(subset=['name']).copy()

# Alternate Spellings Identification

In [7]:
# Define function to identify alternate spellings
def is_alternate_spelling(name1, name2, threshold=80):
    """Return True when two names are similar enough to be alternate spellings.

    Parameters
    ----------
    name1, name2 : str
        Names to compare. Both are lower-cased before comparison, so the
        check ignores letter case.
    threshold : int or float, default 80
        Cut-off for the ``fuzz.ratio`` similarity score. The names match only
        when the score is strictly greater than this value.

    Returns
    -------
    bool
        True if the similarity score exceeds ``threshold``.

    Notes
    -----
    Two identical names also score above the default threshold, so callers
    that want *different* spellings must exclude equal names themselves.
    """
    similarity_ratio = fuzz.ratio(name1.lower(), name2.lower())
    return similarity_ratio > threshold

# Analysis and Results (Male Names)

In [8]:
# Flag every male name that has at least one similarly spelled name among the
# names sharing its 'year' value in unique_male_df.
# NOTE(review): unique_male_df holds a single row per name (it is built with
# drop_duplicates(subset=['name'])), so 'year' and 'count' here come from that
# one retained row only.
flagged_labels = set()
for _, same_year_df in unique_male_df.groupby('year'):
    # Work on plain (index label, name) pairs: a Python-level comparison per
    # pair is far cheaper than a DataFrame .loc lookup per pair.
    labelled_names = list(same_year_df['name'].items())
    for pos, (label_a, name_a) in enumerate(labelled_names):
        for label_b, name_b in labelled_names[pos + 1:]:
            if name_a == name_b:
                continue  # identical strings are not alternate spellings
            if label_a in flagged_labels and label_b in flagged_labels:
                continue  # both already flagged; another match changes nothing
            # The original loop evaluated every ordered pair, so both argument
            # orders are tested here in case the similarity score is not
            # perfectly symmetric.
            if is_alternate_spelling(name_a, name_b) or is_alternate_spelling(name_b, name_a):
                flagged_labels.update((label_a, label_b))

# assign() returns a new frame, so the flag column is (re)built from scratch
# each time this cell runs.
unique_male_df = unique_male_df.assign(
    alternate_spelling=unique_male_df.index.isin(list(flagged_labels))
)

# Group and aggregate data: summed 'count' per flagged name
# (one retained row per name, see the note at the top of this cell)
alternate_spellings_count = (
    unique_male_df[unique_male_df['alternate_spelling']]
    .groupby('name')['count']
    .sum()
    .reset_index()
)

# Calculate increase over time.
# This must use the full male_df: unique_male_df has only one row per name,
# which would make every previous_year_count 0 and count_increase == count.
name_counts_over_time = male_df.groupby(['name', 'year'])['count'].sum().reset_index()
# "Previous" means the previous row recorded for the same name (rows are
# ordered by name, then year); the first row of each name is compared with 0.
name_counts_over_time['previous_year_count'] = name_counts_over_time.groupby('name')['count'].shift(fill_value=0)
name_counts_over_time['count_increase'] = name_counts_over_time['count'] - name_counts_over_time['previous_year_count']

# Ten flagged names with the largest 'count'. The variable name is kept for
# compatibility, but the ranking is by count, not by number of alternates.
most_alternate_spellings = alternate_spellings_count.nlargest(10, 'count')

print("Names with alternate spellings:")
print(alternate_spellings_count)

print("\nTop 10 names with alternate spellings, ranked by count:")
print(most_alternate_spellings)

Names with alternate spellings:
          name  count
0        Aadan      5
1         Aadi     14
2        Aadit     13
3       Aadith      5
4       Aahaan      6
...        ...    ...
8378    Zyrion      8
8379    Zyshon      8
8380  Zyshonne     26
8381  Zytaveon      9
8382  Zytavion      5

[8383 rows x 2 columns]

Name with the most alternate spellings:
         name  count
8003  William   9532
1311  Charles   5348
2849   George   5126
4270   Joseph   2632
7553   Thomas   2534
3031    Henry   2444
7965   Walter   1755
613    Arthur   1599
242    Albert   1493
6948   Samuel   1024


# Analysis and Results (Female Names)

In [None]:
# Flag every female name that has at least one similarly spelled name among
# the names sharing its 'year' value in unique_female_df.
# NOTE(review): unique_female_df holds a single row per name (it is built with
# drop_duplicates(subset=['name'])), so 'year' and 'count' here come from that
# one retained row only.
flagged_labels = set()
for _, same_year_df in unique_female_df.groupby('year'):
    # Grouping once per year replaces the original per-row boolean mask over
    # the whole frame; plain (index label, name) pairs avoid a DataFrame .loc
    # lookup for every comparison.
    labelled_names = list(same_year_df['name'].items())
    for pos, (label_a, name_a) in enumerate(labelled_names):
        for label_b, name_b in labelled_names[pos + 1:]:
            if name_a == name_b:
                continue  # identical strings are not alternate spellings
            if label_a in flagged_labels and label_b in flagged_labels:
                continue  # both already flagged; another match changes nothing
            # The original loop evaluated every ordered pair, so both argument
            # orders are tested here in case the similarity score is not
            # perfectly symmetric.
            if is_alternate_spelling(name_a, name_b) or is_alternate_spelling(name_b, name_a):
                flagged_labels.update((label_a, label_b))

# assign() returns a new frame, so the flag column is (re)built from scratch
# each time this cell runs.
unique_female_df = unique_female_df.assign(
    alternate_spelling=unique_female_df.index.isin(list(flagged_labels))
)

# NOTE(review): the three result variables below reuse the names from the male
# analysis cell and therefore replace its results once this cell has run.

# Group and aggregate data: summed 'count' per flagged name
# (one retained row per name, see the note at the top of this cell)
alternate_spellings_count = (
    unique_female_df[unique_female_df['alternate_spelling']]
    .groupby('name')['count']
    .sum()
    .reset_index()
)

# Calculate increase over time.
# This must use the full female_df: unique_female_df has only one row per
# name, which would make every previous_year_count 0 and count_increase == count.
name_counts_over_time = female_df.groupby(['name', 'year'])['count'].sum().reset_index()
# "Previous" means the previous row recorded for the same name (rows are
# ordered by name, then year); the first row of each name is compared with 0.
name_counts_over_time['previous_year_count'] = name_counts_over_time.groupby('name')['count'].shift(fill_value=0)
name_counts_over_time['count_increase'] = name_counts_over_time['count'] - name_counts_over_time['previous_year_count']

# Ten flagged names with the largest 'count'. The variable name is kept for
# compatibility, but the ranking is by count, not by number of alternates.
most_alternate_spellings = alternate_spellings_count.nlargest(10, 'count')

print("Names with alternate spellings:")
print(alternate_spellings_count)

print("\nTop 10 names with alternate spellings, ranked by count:")
print(most_alternate_spellings)
