This is a Python notebook for processing the name data.

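It assumes the following data sets have been downloaded and unpacked under the root directory. Each entry below lists the source page, the download URL, and the expected path relative to the root directory: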

https://www.census.gov/topics/population/genealogy/data/2010_surnames.html
 - https://www2.census.gov/topics/genealogy/2010surnames/names.zip
 - names\Names_2010Census.csv

https://www.census.gov/topics/population/genealogy/data/1990_census/1990_census_namefiles.html
  - http://www2.census.gov/topics/genealogy/1990surnames/dist.all.last
  - 1990\dist.all.last.txt

https://catalog.data.gov/dataset/baby-names-from-social-security-card-applications-national-data
   - https://www.ssa.gov/oact/babynames/names.zip
   - names (1)\yob*.txt

In [5]:
from pathlib import Path
import pandas as pd
import numpy as np

In [106]:
root_dir = Path(r"M:\deldir\names")
# Not the last expression of the cell, so nothing is displayed; the listing is still
# evaluated when the cell runs.
list(root_dir.iterdir())

# Three parallel registries, one entry per table that gets read:
#   df_list     - a name -> probability dataframe
#   weight_list - the relative (unnormalized) weight of that dataframe compared to the others
#   file_list   - the name of the file the dataframe came from
df_list, weight_list, file_list = [], [], []

In [4]:
# Sub-directory that holds the 2010 Census surname files.
sur_name_dir = root_dir.joinpath("names")
[*sur_name_dir.iterdir()]

[WindowsPath('M:/deldir/names/names/Names_2010Census.csv'),
 WindowsPath('M:/deldir/names/names/Names_2010Census.xlsx')]

In [7]:
# keep_default_na=False with na_values=[""]: only empty fields are treated as missing.
# pandas' default NA markers include strings such as "NA" and "NULL"; in a table of
# names, a name spelled that way would otherwise be loaded as NaN and could never be
# matched by name afterwards.
sur_name_df_raw = pd.read_csv(
    sur_name_dir / "Names_2010Census.csv",
    keep_default_na=False,
    na_values=[""],
)
sur_name_df_raw

Unnamed: 0,name,rank,count,prop100k,cum_prop100k,pctwhite,pctblack,pctapi,pctaian,pct2prace,pcthispanic
0,SMITH,1,2442977,828.19,828.19,70.9,23.11,0.5,0.89,2.19,2.4
1,JOHNSON,2,1932812,655.24,1483.42,58.97,34.63,0.54,0.94,2.56,2.36
2,WILLIAMS,3,1625252,550.97,2034.39,45.75,47.68,0.46,0.82,2.81,2.49
3,BROWN,4,1437026,487.16,2521.56,57.95,35.6,0.51,0.87,2.55,2.52
4,JONES,5,1425470,483.24,3004.80,55.19,38.48,0.44,1,2.61,2.29
...,...,...,...,...,...,...,...,...,...,...,...
162249,DIETZMANN,160975,100,0.03,90062.93,96,0,0,(S),0,(S)
162250,DOKAS,160975,100,0.03,90062.96,94,(S),0,0,(S),(S)
162251,DONLEA,160975,100,0.03,90062.99,94,0,0,0,0,6
162252,DORIOTT,160975,100,0.03,90063.03,89,0,(S),0,5,(S)


In [107]:
# Reduce the raw table to (name, prob). prop100k is treated here as a rate per
# 100,000, so dividing by 100,000 yields a probability.
sur_name_df = sur_name_df_raw[['name']].assign(
    prob=sur_name_df_raw['prop100k'] / 100_000
)
# Drop the catch-all "ALL OTHER NAMES" row.
sur_name_df = sur_name_df.loc[sur_name_df["name"] != "ALL OTHER NAMES"]

# Register the table together with its weight and source file name.
df_list += [sur_name_df]
weight_list += [1.0]
file_list += ["Names_2010Census.csv"]

sur_name_df

Unnamed: 0,name,prob
0,SMITH,8.281900e-03
1,JOHNSON,6.552400e-03
2,WILLIAMS,5.509700e-03
3,BROWN,4.871600e-03
4,JONES,4.832400e-03
...,...,...
162248,DOBBEN,3.000000e-07
162249,DIETZMANN,3.000000e-07
162250,DOKAS,3.000000e-07
162251,DONLEA,3.000000e-07


In [16]:
# Sub-directory that holds the 1990 Census name distribution files.
names1990_name_dir = root_dir.joinpath("1990")
[*names1990_name_dir.iterdir()]

[WindowsPath('M:/deldir/names/1990/dist.all.last.txt'),
 WindowsPath('M:/deldir/names/1990/dist.female.first.txt'),
 WindowsPath('M:/deldir/names/1990/dist.male.first.txt')]

In [108]:
# Read every file in the 1990 directory as a fixed-width table and register a
# (name, prob) table for each one, all with weight 1.0.
for file in names1990_name_dir.iterdir():
    # keep_default_na=False with na_values=['']: only empty fields become NaN.
    # With pandas' default NA markers, a name spelled like "NA" or "NULL" would be
    # read as NaN and could never be matched by name afterwards.
    df_names1990_raw = pd.read_fwf(
        file,
        names=['name', 'percent', 'cum', 'rank'],
        keep_default_na=False,
        na_values=[''],
    )
    print(df_names1990_raw)
    df_names1990 = df_names1990_raw[['name']].copy()
    # 'percent' is divided by 100 to turn it into a probability.
    df_names1990['prob'] = df_names1990_raw['percent'] / 100
    # Rows whose listed percent is 0 are dropped.
    df_names1990 = df_names1990[df_names1990_raw['percent'] > 0]
    print(df_names1990)
    df_list.append(df_names1990)
    weight_list.append(1.0)
    file_list.append(file.name)

            name  percent     cum  rank
0          SMITH    1.006   1.006     1
1        JOHNSON    0.810   1.816     2
2       WILLIAMS    0.699   2.515     3
3          JONES    0.621   3.136     4
4          BROWN    0.621   3.757     5
...          ...      ...     ...   ...
88794    AARDEMA    0.000  90.483   795
88795     AARANT    0.000  90.483   796
88796   AANDERUD    0.000  90.483   797
88797     AALUND    0.000  90.483   798
88798  AALDERINK    0.000  90.483   799

[88799 rows x 4 columns]
            name     prob
0          SMITH  0.01006
1        JOHNSON  0.00810
2       WILLIAMS  0.00699
3          JONES  0.00621
4          BROWN  0.00621
...          ...      ...
18834      AMEND  0.00001
18835     ALPHIN  0.00001
18836  ALLBRIGHT  0.00001
18837      AIKIN  0.00001
18838      ACRES  0.00001

[18839 rows x 2 columns]
           name  percent     cum  rank
0          MARY    2.629   2.629     1
1      PATRICIA    1.073   3.702     2
2         LINDA    1.035   4.736     3


In [39]:
# Sub-directory that holds the yob*.txt baby-name files; show only its last five entries.
baby_name_dir = root_dir.joinpath("names (1)")
[*baby_name_dir.iterdir()][-5:]

[WindowsPath('M:/deldir/names/names (1)/yob2016.txt'),
 WindowsPath('M:/deldir/names/names (1)/yob2017.txt'),
 WindowsPath('M:/deldir/names/names (1)/yob2018.txt'),
 WindowsPath('M:/deldir/names/names (1)/yob2019.txt'),
 WindowsPath('M:/deldir/names/names (1)/yob2020.txt')]

In [109]:
# Register two (name, prob) tables per yob*.txt file: one for rows with sex 'M' and
# one for all remaining rows. Both tables of a file share the same weight.
#
# sorted(): glob() does not promise any particular order; sorting the paths keeps
# df_list in ascending year order, so positional lookups such as df_list[-2] are
# reproducible.
for file in sorted(baby_name_dir.glob("yob*.txt")):
    df_baby_raw = pd.read_csv(file, names=['name', 'sex', 'count'])
    df_baby_raw = df_baby_raw.sort_values(by='count', ascending=False)

    # assume the most common name (usually "John") is 1%
    # (after the descending sort, row 0 holds the largest count in the whole file)
    one_percent_count = df_baby_raw['count'].iloc[0]
    df_baby = df_baby_raw[['name']].copy()
    # Upper-case the names, matching the casing seen in the census tables.
    df_baby['name'] = df_baby['name'].str.upper()
    df_baby['prob'] = df_baby_raw['count'] / one_percent_count / 100
    # The boolean masks come from df_baby_raw; both frames share the same index.
    df_baby_male = df_baby[df_baby_raw['sex'] == 'M']
    df_baby_female = df_baby[df_baby_raw['sex'] != 'M']

    # The digits after the "yob" prefix of the file name are the year (yob1880 -> 1880).
    year = int(file.stem[3:])
    print(year)
    print(df_baby_male[:5])

    # assume that data from 50 years ago has 1/2 the weight;
    # years after 2020 are clamped to 2020 so the weight never exceeds 1.
    weight = .5 ** ((2020 - min(year, 2020)) / 50)
    df_list.append(df_baby_male)
    weight_list.append(weight)
    file_list.append(file.name)
    df_list.append(df_baby_female)
    weight_list.append(weight)
    file_list.append(file.name)

1880
        name      prob
942     JOHN  0.010000
943  WILLIAM  0.009873
944    JAMES  0.006139
945  CHARLES  0.005539
946   GEORGE  0.005309
1881
        name      prob
938     JOHN  0.010000
939  WILLIAM  0.009721
940    JAMES  0.006205
941   GEORGE  0.005319
942  CHARLES  0.005287
1882
         name      prob
1028     JOHN  0.010000
1029  WILLIAM  0.009729
1030    JAMES  0.006165
1031   GEORGE  0.005434
1032  CHARLES  0.005327
1883
         name      prob
1054     JOHN  0.010000
1055  WILLIAM  0.009430
1056    JAMES  0.005872
1057  CHARLES  0.005426
1058   GEORGE  0.005325
1884
         name      prob
1172     JOHN  0.010000
1173  WILLIAM  0.009477
1174    JAMES  0.006064
1175   GEORGE  0.005284
1176  CHARLES  0.005115
1885
         name      prob
1197     JOHN  0.009592
1198  WILLIAM  0.008812
1199    JAMES  0.005669
1200   GEORGE  0.005121
1201  CHARLES  0.005038
1886
         name      prob
1282     JOHN  0.009127
1283  WILLIAM  0.008345
1284    JAMES  0.005415
1285   GEORGE  0.

In [111]:
def find_name(name):
    """Print the row(s) for ``name`` from every registered table that contains it.

    Walks the module-level ``df_list``, ``weight_list`` and ``file_list`` in
    parallel. For each table with at least one row whose ``name`` column equals
    ``name`` exactly, the matching rows are printed followed by that table's
    weight and source file name. Tables without a match are skipped silently.
    Nothing is returned.
    """
    for table, table_weight, source in zip(df_list, weight_list, file_list):
        matches = table.loc[table['name'] == name]
        if matches.empty:
            continue
        print(matches, table_weight, source)

# Spot check: print the ALICE row(s), weight and source file from every registered table that has one.
find_name('ALICE')


        name      prob
54058  ALICE  0.000001 1.0 Names_2010Census.csv
     name     prob
50  ALICE  0.00357 1.0 dist.female.first.txt
    name      prob
7  ALICE  0.001465 0.1435872943746294 yob1880.txt
       name      prob
1647  ALICE  0.000008 0.145591698308557 yob1881.txt
    name      prob
9  ALICE  0.001492 0.145591698308557 yob1881.txt
    name      prob
7  ALICE  0.001613 0.14762408267869132 yob1882.txt
       name      prob
1855  ALICE  0.000007 0.14968483807736607 yob1883.txt
     name      prob
10  ALICE  0.001673 0.14968483807736607 yob1883.txt
     name      prob
10  ALICE  0.001845 0.15177436054938084 yob1884.txt
       name     prob
1838  ALICE  0.00001 0.1538930516681145 yob1885.txt
     name      prob
10  ALICE  0.001842 0.1538930516681145 yob1885.txt
       name     prob
1887  ALICE  0.00001 0.15604131861270149 yob1886.txt
     name      prob
10  ALICE  0.001831 0.15604131861270149 yob1886.txt
       name     prob
1888  ALICE  0.00001 0.15821957424628497 yob1887.txt


In [113]:
# Spot check: print the VAETH row(s), weight and source file from every registered table that has one.
find_name("VAETH")

        name      prob
35580  VAETH  0.000002 1.0 Names_2010Census.csv


In [102]:
# Show the JOHN row(s) of the second-to-last registered table.
df_list[-2].loc[lambda table: table['name'] == "JOHN"]

Unnamed: 0,name,prob
17386,JOHN,0.004161
