Predicting baby name sex using PyTorch for Logistic Regression

### References

- [Logistic Regression with PyTorch](https://proai.org/pytorch-logistic-regression) by [Denny Loevlie](https://twitter.com/DennisLoevlie)
- [PyTorch OO design compared with SkLearn](https://jeancochrane.com/blog/pytorch-functional-api) by [Jean Cochrane](https://JeanCochrane.com)
- [Building Your First Network in PyTorch](https://t.co/m9I4e0tfrC) by [Ta-Ying Cheng](https://www.linkedin.com/in/tim-ta-ying-cheng-411857139/)
- [data.world US baby names since 1880](https://data.world/ssa/baby-names-for-us-states-territories)



In [1]:
import numpy as np
import pandas as pd
# neither year nor len are statistically significant predictors of sex
from pathlib import Path

In [2]:
pd.options.display.max_rows=7

In [3]:
CWD = Path('.').absolute().resolve()
DATA_DIR_NAME = '.nlpia2-data'
DATA_FILE = 'baby-names-region.csv.gz'
CWD

WindowsPath('C:/Users/maria/Dropbox/Tangible AI/gitlab/nlpia2/src/nlpia2/ch05')

In [4]:
parent = CWD
data_dir = parent / DATA_DIR_NAME 
filepath = data_dir / DATA_FILE
for i in range(10):
    print(filepath)
    if filepath.is_file():
        break
    parent = parent.parent
    data_dir = parent / DATA_DIR_NAME 
    filepath = data_dir / DATA_FILE
filepath

C:\Users\maria\Dropbox\Tangible AI\gitlab\nlpia2\src\nlpia2\ch05\.nlpia2-data\baby-names-region.csv.gz
C:\Users\maria\Dropbox\Tangible AI\gitlab\nlpia2\src\nlpia2\.nlpia2-data\baby-names-region.csv.gz
C:\Users\maria\Dropbox\Tangible AI\gitlab\nlpia2\src\.nlpia2-data\baby-names-region.csv.gz
C:\Users\maria\Dropbox\Tangible AI\gitlab\nlpia2\.nlpia2-data\baby-names-region.csv.gz


WindowsPath('C:/Users/maria/Dropbox/Tangible AI/gitlab/nlpia2/.nlpia2-data/baby-names-region.csv.gz')

In [5]:
def find_dir(dirname=DATA_DIR_NAME, parent=Path('.').absolute().resolve(), max_parents=20):
    for i in range(max_parents):
        data_dir = parent / dirname
        if data_dir.is_dir():
            return data_dir
        parent = parent.parent

DATA_DIR = find_dir()
DATA_DIR

WindowsPath('C:/Users/maria/Dropbox/Tangible AI/gitlab/nlpia2/.nlpia2-data')

In [6]:
def find_file(filename, parent=Path('.').absolute().resolve(), data_dir_name=DATA_DIR_NAME, max_parents=20):
    for i in range(max_parents):
        data_dir = parent / data_dir_name 
        filepath = data_dir / filename
        if filepath.is_file():
            return filepath
        parent = parent.parent

filepath = find_file(DATA_FILE)
filepath

WindowsPath('C:/Users/maria/Dropbox/Tangible AI/gitlab/nlpia2/.nlpia2-data/baby-names-region.csv.gz')

In [7]:
df = pd.read_csv(filepath)

In [8]:
np.random.seed(451)
df = df.sample(10_000)
df

Unnamed: 0,region,sex,year,name,count,freq
6139665,WV,F,1987,Brittani,10,0.000003
2565339,MD,F,1954,Ida,18,0.000005
22297,AK,M,1988,Maxwell,5,0.000001
...,...,...,...,...,...,...
4475894,OK,F,1950,Leah,9,0.000003
5744351,VA,F,2007,Carley,11,0.000003
5583882,TX,M,2019,Kartier,10,0.000003


In [9]:
names = df['name'].unique()
list(names[:10])

['Brittani',
 'Ida',
 'Maxwell',
 'Charlene',
 'Todd',
 'Aubrey',
 'Arianna',
 'Otis',
 'Trenton',
 'Faustino']

In [10]:
len(names) / len(df)

0.4025

In [11]:
# df = pd.get_dummies(df, columns=['region'])
# df.head()

In [12]:
df = df.rename(dict(name='name_', sex='sex_'), axis=1)
df.head()

Unnamed: 0,region,sex_,year,name_,count,freq
6139665,WV,F,1987,Brittani,10,3e-06
2565339,MD,F,1954,Ida,18,5e-06
22297,AK,M,1988,Maxwell,5,1e-06
5114650,TN,F,1972,Charlene,24,8e-06
2126395,KS,M,1954,Todd,11,3e-06


In [13]:
df = df.groupby(['name_', 'sex_']).sum()
df

Unnamed: 0_level_0,Unnamed: 1_level_0,year,count,freq
name_,sex_,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Aaden,M,2008,51,0.000015
Aahana,F,2018,26,0.000009
Aahil,M,2019,5,0.000002
...,...,...,...,...
Zvi,M,2015,5,0.000002
Zya,F,2019,8,0.000003
Zylah,F,2008,5,0.000001


In [108]:
df_with_duplicates = pd.read_csv(filepath)
np.random.seed(451)
df_with_duplicates = df_with_duplicates.sample(10_000)
df_with_duplicates.set_index(['name', 'sex'])
groups = df_with_duplicates.groupby(['name', 'sex'])

In [113]:
groups = df_with_duplicates.groupby(['name', 'sex'])
for group in groups: 
    if len(group) > 1:
        print (group)

(('Aaden', 'M'),         region sex  year   name  count      freq
1258876     FL   M  2008  Aaden     51  0.000015)
(('Aahana', 'F'),        region sex  year    name  count      freq
612877     CA   F  2018  Aahana     26  0.000009)
(('Aahil', 'M'),         region sex  year   name  count      freq
5584635     TX   M  2019  Aahil      5  0.000002)
(('Aaleyah', 'F'),        region sex  year     name  count      freq
581605     CA   F  2010  Aaleyah     17  0.000005)
(('Aalia', 'F'),        region sex  year   name  count      freq
598948     CA   F  2014  Aalia      7  0.000002
618326     CA   F  2019  Aalia      6  0.000002)
(('Aaliyah', 'F'),         region sex  year     name  count      freq
6034503     WI   F  1998  Aaliyah     18  0.000005
2337545     LA   F  1998  Aaliyah     55  0.000017
3589731     NE   F  2016  Aaliyah     22  0.000007)
(('Aalyah', 'F'),        region sex  year    name  count      freq
546322     CA   F  2001  Aalyah      6  0.000002)
(('Aanya', 'F'),         reg

[8 rows x 6 columns])
(('Alana', 'F'),         region sex  year   name  count      freq
5608995     UT   F  1980  Alana      8  0.000003
4884841     RI   F  1983  Alana      5  0.000002
4767488     PA   F  2011  Alana     73  0.000023
529369      CA   F  1997  Alana    124  0.000039)
(('Alanna', 'F'),         region sex  year    name  count      freq
5352671     TX   F  1998  Alanna     28  0.000009
4615666     OR   F  2007  Alanna      5  0.000001
1337027     GA   F  1988  Alanna      6  0.000002
612341      CA   F  2018  Alanna     67  0.000023
940114      CT   F  2016  Alanna      7  0.000002
1742471     IL   F  1977  Alanna      7  0.000002
3926347     NV   F  2020  Alanna      6  0.000002)
(('Alayah', 'F'),         region sex  year    name  count      freq
6043567     WI   F  2008  Alayah      6  0.000002)
(('Alayna', 'F'),         region sex  year    name  count      freq
6146436     WV   F  2007  Alayna      8  0.000002)
(('Albert', 'M'),         region sex  year    name  count 

(('Amalia', 'F'),        region sex  year    name  count      freq
572699     CA   F  2008  Amalia     32  0.000009)
(('Amanda', 'F'),         region sex  year    name  count      freq
3852428     NM   F  1997  Amanda     79  0.000025
1945399     IN   F  1965  Amanda     51  0.000015
5887415     WA   F  1975  Amanda    179  0.000065
5098154     TN   F  1945  Amanda     10  0.000004)
(('Amani', 'F'),        region sex  year   name  count      freq
938830     CT   F  2014  Amani     11  0.000003)
(('Amarah', 'F'),        region sex  year    name  count      freq
553543     CA   F  2003  Amarah      8  0.000002)
(('Amari', 'F'),         region sex  year   name  count      freq
3220988     MS   F  2013  Amari     22  0.000007)
(('Amariah', 'F'),        region sex  year     name  count      freq
549640     CA   F  2002  Amariah      8  0.000002
565407     CA   F  2006  Amariah     10  0.000003)
(('Amarion', 'M'),         region sex  year     name  count      freq
2416564     LA   M  2006  A

(('Annalie', 'F'),         region sex  year     name  count      freq
5419981     TX   F  2017  Annalie      8  0.000003)
(('Annaliese', 'F'),        region sex  year       name  count      freq
332968     AZ   F  2016  Annaliese      9  0.000003)
(('Annalise', 'F'),         region sex  year      name  count      freq
3085029     MO   F  2006  Annalise     14  0.000004)
(('Annalynn', 'F'),         region sex  year      name  count      freq
1989218     IN   F  2013  Annalynn      5  0.000002)
(('Annamae', 'F'),         region sex  year     name  count      freq
4687591     PA   F  1941  Annamae     12  0.000005)
(('Anne', 'F'),         region sex  year  name  count      freq
5147145     TN   F  2010  Anne     15  0.000005
2677266     ME   F  1958  Anne     37  0.000009
426375      CA   F  1956  Anne    366  0.000093
...        ...  ..   ...   ...    ...       ...
92646       AL   F  2009  Anne     19  0.000006
4255468     OH   F  1912  Anne    111  0.000125
1974614     IN   F  2001  An

(('Asher', 'M'),         region sex  year   name  count      freq
3947511     NV   M  2017  Asher     44  0.000015
706574      CA   M  1992  Asher     17  0.000005)
(('Ashlea', 'F'),         region sex  year    name  count      freq
4061394     NY   F  1995  Ashlea      5  0.000002)
(('Ashlee', 'F'),         region sex  year    name  count      freq
2336074     LA   F  1996  Ashlee     22  0.000007
1747943     IL   F  1981  Ashlee     15  0.000005)
(('Ashleigh', 'F'),         region sex  year      name  count      freq
3089763     MO   F  2010  Ashleigh      7  0.000002
2582291     MD   F  1984  Ashleigh     20  0.000006
4612391     OR   F  2003  Ashleigh      9  0.000003)
(('Ashley', 'F'),         region sex  year    name  count      freq
6198288     WY   F  1994  Ashley     57  0.000017
1374466     GA   F  2014  Ashley     92  0.000029
6029055     WI   F  1991  Ashley    827  0.000236
611890      CA   F  2018  Ashley    358  0.000121
1000585     DC   F  1990  Ashley    158  0.000044


(('Belva', 'F'),         region sex  year   name  count      freq
3353534     NC   F  1955  Belva      6  0.000002)
(('Ben', 'M'),         region sex  year name  count      freq
849063      CO   M  1923  Ben     20  0.000009
1668985     ID   M  1983  Ben      6  0.000002
5061914     SD   M  1933  Ben      6  0.000003
...        ...  ..   ...  ...    ...       ...
2525499     MA   M  1982  Ben      7  0.000002
255223      AR   M  1992  Ben      6  0.000002
3866862     NM   M  1927  Ben     11  0.000005

[9 rows x 6 columns])
(('Benedict', 'M'),        region sex  year      name  count      freq
702352     CA   M  1990  Benedict      7  0.000002)
(('Benita', 'F'),         region sex  year    name  count      freq
6127505     WV   F  1955  Benita      8  0.000002
2573949     MD   F  1969  Benita      5  0.000002)
(('Benjamin', 'M'),         region sex  year      name  count      freq
25111       AK   M  2003  Benjamin     37  0.000011
6154797     WV   M  1921  Benjamin     33  0.000015
48

(('Bodhi', 'M'),         region sex  year   name  count      freq
5220176     TN   M  2016  Bodhi     23  0.000007
1525167     HI   M  2015  Bodhi     10  0.000003)
(('Bodie', 'M'),         region sex  year   name  count      freq
6108460     WI   M  2020  Bodie     10  0.000004)
(('Bonita', 'F'),         region sex  year    name  count      freq
4579427     OR   F  1929  Bonita      6  0.000003)
(('Bonni', 'F'),         region sex  year   name  count      freq
4010843     NY   F  1970  Bonni      8  0.000002)
(('Bonnie', 'F'),         region sex  year    name  count      freq
4294155     OH   F  1965  Bonnie    208  0.000061
1632204     ID   F  1956  Bonnie     52  0.000013
5100110     TN   F  1949  Bonnie    147  0.000044
...        ...  ..   ...     ...    ...       ...
5045848     SD   F  1970  Bonnie     13  0.000004
2742403     MI   F  1954  Bonnie    414  0.000109
4607723     OR   F  1996  Bonnie      5  0.000002

[8 rows x 6 columns])
(('Bonny', 'F'),         region sex  year  

(('Brooks', 'M'),         region sex  year    name  count      freq
3488929     NC   M  2008  Brooks     15  0.000004
954353      CT   M  1961  Brooks      5  0.000001)
(('Bruce', 'M'),         region sex  year   name  count      freq
3530748     ND   M  1916  Bruce     12  0.000007
3309674     MT   M  1945  Bruce     31  0.000012
3817826     NJ   M  2016  Bruce     24  0.000008
...        ...  ..   ...    ...    ...       ...
6076830     WI   M  1970  Bruce     89  0.000026
2242569     KY   M  1951  Bruce    184  0.000052
2501788     MA   M  1917  Bruce      7  0.000004

[10 rows x 6 columns])
(('Bruno', 'M'),         region sex  year   name  count      freq
2518787     MA   M  1966  Bruno      5  0.000002
2502021     MA   M  1918  Bruno     26  0.000013
1826276     IL   M  1928  Bruno     29  0.000014)
(('Bryan', 'M'),         region sex  year   name  count      freq
2013240     IN   M  1952  Bryan     21  0.000006
4397965     OH   M  1962  Bryan    321  0.000084
2376560     LA   M  

(('Carson', 'M'),         region sex  year    name  count      freq
3450926     NC   M  1957  Carson     12  0.000003
373081      AZ   M  2003  Carson     88  0.000026)
(('Carsyn', 'F'),         region sex  year    name  count      freq
5422355     TX   F  2018  Carsyn     24  0.000008)
(('Carter', 'M'),         region sex  year    name  count      freq
6183627     WV   M  2011  Carter     60  0.000019
2264640     KY   M  2001  Carter     49  0.000015
265907      AR   M  2013  Carter     89  0.000029
4839921     PA   M  1996  Carter     35  0.000011
2277087     KY   M  2020  Carter    153  0.000055
4396275     OH   M  1959  Carter      8  0.000002)
(('Cary', 'F'),         region sex  year  name  count      freq
2755509     MI   F  1967  Cary      8  0.000003
4004463     NY   F  1966  Cary      5  0.000002)
(('Cary', 'M'),         region sex  year  name  count      freq
5950411     WA   M  1968  Cary     17  0.000005
1200019     FL   M  1942  Cary      7  0.000003
3538222     ND   M  19

(('Cherish', 'F'),         region sex  year     name  count      freq
1953242     IN   F  1975  Cherish      6  0.000002)
(('Cherita', 'F'),         region sex  year     name  count      freq
2758864     MI   F  1970  Cherita      5  0.000001)
(('Cherri', 'F'),         region sex  year    name  count      freq
5304424     TX   F  1976  Cherri      7  0.000003
4485292     OK   F  1968  Cherri      5  0.000002)
(('Cherrie', 'F'),         region sex  year     name  count      freq
5876699     WA   F  1956  Cherrie      5  0.000001)
(('Cherry', 'F'),         region sex  year    name  count      freq
2178435     KY   F  1951  Cherry      7  0.000002)
(('Cheryl', 'F'),         region sex  year    name  count      freq
1045013     DE   F  1975  Cheryl     12  0.000004
1041425     DE   F  1957  Cheryl     91  0.000023)
(('Cheryle', 'F'),        region sex  year     name  count      freq
419831     CA   F  1951  Cheryle     25  0.000007)
(('Cheryll', 'F'),         region sex  year     name  cou

(('Cora', 'F'),         region sex  year  name  count      freq
1316157     GA   F  1964  Cora     13  0.000004
3645024     NH   F  2008  Cora      7  0.000002
2200171     KY   F  1989  Cora      8  0.000002)
(('Coral', 'F'),         region sex  year   name  count      freq
5364789     TX   F  2002  Coral     21  0.000006
2780935     MI   F  1990  Coral      6  0.000002
1140706     FL   F  1999  Coral     25  0.000008)
(('Coraline', 'F'),         region sex  year      name  count      freq
1814084     IL   F  2019  Coraline     20  0.000007
5153245     TN   F  2015  Coraline     15  0.000005)
(('Corbin', 'M'),         region sex  year    name  count      freq
372528      AZ   M  2002  Corbin     15  0.000005
716496      CA   M  1996  Corbin     28  0.000009
2417119     LA   M  2007  Corbin     22  0.000006
5561188     TX   M  2011  Corbin    189  0.000061)
(('Cordelia', 'F'),         region sex  year      name  count      freq
5296206     TX   F  1971  Cordelia      6  0.000002
5241819

[10 rows x 6 columns])
(('Daniela', 'F'),         region sex  year     name  count      freq
5727152     VA   F  1991  Daniela      7  0.000002
1140346     FL   F  1999  Daniela    134  0.000041
320475      AZ   F  2007  Daniela    114  0.000033)
(('Daniella', 'F'),         region sex  year      name  count      freq
1802650     IL   F  2013  Daniella     47  0.000015
2474878     MA   F  1994  Daniella      9  0.000003)
(('Danielle', 'F'),         region sex  year      name  count     freq
1568786     IA   F  2004  Danielle     33  0.00001)
(('Danielle', 'M'),         region sex  year      name  count      freq
5505397     TX   M  1981  Danielle      5  0.000002)
(('Danika', 'F'),         region sex  year    name  count      freq
5148948     TN   F  2011  Danika      5  0.000002)
(('Danish', 'M'),        region sex  year    name  count      freq
710448     CA   M  1993  Danish      5  0.000001)
(('Danny', 'M'),         region sex  year   name  count      freq
1880733     IL   M  1996  

(('Dedra', 'F'),         region sex  year   name  count      freq
4945968     SC   F  1979  Dedra     13  0.000004)
(('Dedrick', 'M'),         region sex  year     name  count      freq
5506346     TX   M  1982  Dedrick     11  0.000003)
(('Dee', 'F'),         region sex  year name  count      freq
3054836     MO   F  1968  Dee      8  0.000003
2756479     MI   F  1968  Dee      9  0.000003
2079489     KS   F  1960  Dee     24  0.000006)
(('Dee', 'M'),         region sex  year name  count      freq
4532146     OK   M  1935  Dee      6  0.000003)
(('Deion', 'M'),         region sex  year   name  count      freq
4422904     OH   M  1994  Deion     15  0.000005)
(('Deirdre', 'F'),         region sex  year     name  count      freq
3978043     NY   F  1947  Deirdre     40  0.000012)
(('Dejah', 'F'),         region sex  year   name  count      freq
4747199     PA   F  1997  Dejah     10  0.000003)
(('Dejane', 'F'),        region sex  year    name  count      freq
535722     CA   F  1998  De

(('Dia', 'F'),         region sex  year name  count      freq
1805944     IL   F  2014  Dia      5  0.000002)
(('Diamond', 'F'),        region sex  year     name  count      freq
597397     CA   F  2014  Diamond     26  0.000008)
(('Dian', 'F'),        region sex  year  name  count      freq
430343     CA   F  1958  Dian     12  0.000003)
(('Diana', 'F'),         region sex  year   name  count      freq
397286      CA   F  1924  Diana     18  0.000008
4756503     PA   F  2004  Diana     43  0.000013
1378231     GA   F  2016  Diana     30  0.000010
1640611     ID   F  1985  Diana      6  0.000002
4283552     OH   F  1955  Diana    619  0.000162
3204629     MS   F  1985  Diana     10  0.000003)
(('Diane', 'F'),         region sex  year   name  count      freq
5113256     TN   F  1970  Diane     35  0.000010
5899881     WA   F  1991  Diane     11  0.000003
4943120     SC   F  1974  Diane     16  0.000006
...        ...  ..   ...    ...    ...       ...
2324330     LA   F  1982  Diane     

(('Eduard', 'M'),        region sex  year    name  count      freq
882025     CO   M  2004  Eduard      5  0.000002)
(('Eduardo', 'F'),        region sex  year     name  count      freq
499018     CA   F  1988  Eduardo      6  0.000002)
(('Eduardo', 'M'),         region sex  year     name  count      freq
2851795     MI   M  1965  Eduardo      6  0.000002
2148003     KS   M  2007  Eduardo     22  0.000006)
(('Edward', 'M'),         region sex  year    name  count      freq
5066416     SD   M  1956  Edward     39  0.000010
2416190     LA   M  2006  Edward     36  0.000011
4542298     OK   M  1961  Edward     92  0.000023
...        ...  ..   ...     ...    ...       ...
4388072     OH   M  1947  Edward   1115  0.000325
5184931     TN   M  1964  Edward    204  0.000056
104298      AL   M  1913  Edward    125  0.000122

[11 rows x 6 columns])
(('Edwin', 'M'),         region sex  year   name  count      freq
352168      AZ   M  1966  Edwin     20  0.000006
1017711     DC   M  1959  Edwin  

(('Ervin', 'M'),         region sex  year   name  count      freq
2254724     KY   M  1978  Ervin      5  0.000002
5769253     VA   M  1926  Ervin     17  0.000008
4156906     NY   M  1958  Ervin      9  0.000002
3450737     NC   M  1957  Ervin     48  0.000012
5567416     TX   M  2013  Ervin     13  0.000004)
(('Erwin', 'M'),         region sex  year   name  count      freq
3761016     NJ   M  1929  Erwin     10  0.000005)
(('Eryn', 'F'),         region sex  year  name  count      freq
2784521     MI   F  1993  Eryn     11  0.000003
5905534     WA   F  1997  Eryn     11  0.000003)
(('Esequiel', 'M'),         region sex  year      name  count      freq
3865878     NM   M  1922  Esequiel      6  0.000003)
(('Esme', 'F'),         region sex  year  name  count      freq
2613088     MD   F  2018  Esme      6  0.000002)
(('Esmeralda', 'F'),         region sex  year       name  count      freq
5901736     WA   F  1993  Esmeralda     12  0.000004)
(('Esmirna', 'F'),        region sex  year   

4260218     OH   F  1920  Francis      8  0.000004)
(('Francis', 'M'),         region sex  year     name  count      freq
2664514     MD   M  2019  Francis     14  0.000005
6169750     WV   M  1961  Francis      6  0.000002
4385553     OH   M  1942  Francis    125  0.000049)
(('Francisco', 'M'),         region sex  year       name  count      freq
4845999     PA   M  2002  Francisco     30  0.000009
2860148     MI   M  1977  Francisco     13  0.000005
3616626     NE   M  2000  Francisco     17  0.000005)
(('Franco', 'M'),         region sex  year    name  count      freq
6237147     PR   M  2017  Franco     11  0.000004)
(('Frank', 'F'),         region sex  year   name  count      freq
1295246     GA   F  1931  Frank      6  0.000003)
(('Frank', 'M'),         region sex  year   name  count      freq
2894007     MI   M  2012  Frank     14  0.000005
1848520     IL   M  1964  Frank    526  0.000143
3535718     ND   M  1939  Frank     22  0.000011
...        ...  ..   ...    ...    ...    

5392205     TX   F  2010  Gia     67  0.000021)
(('Gian', 'M'),         region sex  year  name  count      freq
3810323     NJ   M  2009  Gian      9  0.000003)
(('Giancarlos', 'M'),        region sex  year        name  count      freq
752182     CA   M  2009  Giancarlos      7  0.000002)
(('Gianna', 'F'),         region sex  year    name  count      freq
3748464     NJ   F  2016  Gianna    180  0.000058
6151408     WV   F  2020  Gianna     28  0.000010)
(('Gideon', 'M'),         region sex  year    name  count      freq
2883137     MI   M  2003  Gideon     15  0.000005)
(('Gil', 'M'),         region sex  year name  count      freq
6229902     PR   M  2003  Gil     11  0.000003)
(('Gilbert', 'M'),         region sex  year     name  count      freq
1024556     DC   M  1981  Gilbert      5  0.000002
2388739     LA   M  1968  Gilbert     23  0.000007
3426810     NC   M  1917  Gilbert     37  0.000020)
(('Gilberto', 'M'),        region sex  year      name  count      freq
676318     CA   M

(('Hassie', 'F'),         region sex  year    name  count      freq
5090768     TN   F  1930  Hassie      6  0.000003)
(('Hattie', 'F'),         region sex  year    name  count      freq
3037932     MO   F  1941  Hattie      8  0.000003
4618646     OR   F  2011  Hattie      8  0.000003
1988515     IN   F  2013  Hattie     21  0.000007
3671597     NJ   F  1925  Hattie     11  0.000005)
(('Haven', 'M'),         region sex  year   name  count      freq
6182544     WV   M  2007  Haven      5  0.000001)
(('Hawa', 'F'),         region sex  year  name  count      freq
2955665     MN   F  2006  Hawa      7  0.000002)
(('Haydee', 'F'),        region sex  year    name  count      freq
552721     CA   F  2003  Haydee     15  0.000005)
(('Hayden', 'F'),         region sex  year    name  count      freq
1381681     GA   F  2018  Hayden     33  0.000011
5754774     VA   F  2015  Hayden     43  0.000014
2102668     KS   F  2006  Hayden      8  0.000002)
(('Hayden', 'M'),         region sex  year    n

(('Ina', 'F'),         region sex  year name  count      freq
5686387     VA   F  1925  Ina     19  0.000009
903351      CT   F  1940  Ina      5  0.000002)
(('Inaaya', 'F'),         region sex  year    name  count      freq
4103055     NY   F  2012  Inaaya      7  0.000002)
(('Indiana', 'F'),         region sex  year     name  count      freq
1382749     GA   F  2018  Indiana      6  0.000002)
(('Indigo', 'F'),         region sex  year    name  count      freq
2612047     MD   F  2017  Indigo      6  0.000002)
(('Inell', 'F'),         region sex  year   name  count      freq
5229367     TX   F  1915  Inell      7  0.000004)
(('Inez', 'F'),         region sex  year  name  count      freq
400672      CA   F  1929  Inez     27  0.000013
1143899     FL   F  2000  Inez      5  0.000001
1299821     GA   F  1940  Inez     51  0.000024
2565461     MD   F  1954  Inez      8  0.000002
52159       AL   F  1951  Inez     16  0.000005)
(('Ingrid', 'F'),         region sex  year    name  count     

(('Jaiden', 'M'),         region sex  year    name  count      freq
5577204     TX   M  2017  Jaiden    127  0.000042
744440      CA   M  2007  Jaiden    208  0.000060
2658981     MD   M  2013  Jaiden     31  0.000010)
(('Jaidyn', 'F'),         region sex  year    name  count      freq
4345995     OH   F  2006  Jaidyn     18  0.000005
4964447     SC   F  2007  Jaidyn      8  0.000002
835818      CO   F  2010  Jaidyn      7  0.000002
6146578     WV   F  2007  Jaidyn      5  0.000001)
(('Jailynn', 'F'),         region sex  year     name  count      freq
4769874     PA   F  2012  Jailynn      9  0.000003)
(('Jaime', 'F'),         region sex  year   name  count      freq
5607702     UT   F  1978  Jaime     38  0.000013)
(('Jaime', 'M'),         region sex  year   name  count      freq
1843225     IL   M  1957  Jaime     23  0.000006
4559562     OK   M  1998  Jaime      6  0.000002
2848398     MI   M  1960  Jaime     11  0.000003)
(('Jaimee', 'F'),         region sex  year    name  count   

(('Jasmyne', 'F'),         region sex  year     name  count      freq
1974968     IN   F  2001  Jasmyne      8  0.000002)
(('Jason', 'F'),         region sex  year   name  count      freq
2770873     MI   F  1981  Jason      5  0.000002)
(('Jason', 'M'),         region sex  year   name  count      freq
6173622     WV   M  1975  Jason    568  0.000207
4429580     OH   M  2001  Jason    330  0.000100
2023320     IN   M  1972  Jason   1354  0.000468
2032689     IN   M  1989  Jason    263  0.000076
123993      AL   M  1950  Jason      5  0.000002
5009812     SC   M  1988  Jason    176  0.000053)
(('Jasper', 'M'),         region sex  year    name  count      freq
4866479     PA   M  2018  Jasper     51  0.000017
1393533     GA   M  1924  Jasper     25  0.000011
1242060     FL   M  1997  Jasper      8  0.000003
5184293     TN   M  1962  Jasper      7  0.000002
1874079     IL   M  1990  Jasper      6  0.000002
2373983     LA   M  1943  Jasper     11  0.000004
5492734     TX   M  1971  Jasper 

(('Jerald', 'M'),         region sex  year    name  count      freq
4229419     NY   M  2008  Jerald      7  0.000002
2249638     KY   M  1966  Jerald      7  0.000002
1838928     IL   M  1951  Jerald     36  0.000010)
(('Jeramiah', 'M'),         region sex  year      name  count      freq
2890468     MI   M  2009  Jeramiah     11  0.000003)
(('Jereme', 'M'),         region sex  year    name  count      freq
2863485     MI   M  1981  Jereme      5  0.000002)
(('Jeremiah', 'M'),         region sex  year      name  count      freq
4203576     NY   M  1995  Jeremiah     93  0.000029
2638726     MD   M  1984  Jeremiah     33  0.000010
3612085     NE   M  1985  Jeremiah     31  0.000010)
(('Jeremias', 'M'),         region sex  year      name  count      freq
737304      CA   M  2004  Jeremias     11  0.000003
4194571     NY   M  1989  Jeremias      6  0.000002)
(('Jeremie', 'M'),         region sex  year     name  count      freq
6228448     PR   M  2000  Jeremie     11  0.000003)
(('Jeremy

(('Jorge', 'M'),         region sex  year   name  count      freq
1068051     DE   M  2014  Jorge      7  0.000002
1023669     DC   M  1978  Jorge      5  0.000002)
(('Jorja', 'F'),         region sex  year   name  count      freq
4509340     OK   F  2006  Jorja      5  0.000001)
(('Jose', 'M'),         region sex  year  name  count      freq
4638559     OR   M  1965  Jose      9  0.000003
2636678     MD   M  1979  Jose     14  0.000005
853238      CO   M  1940  Jose     36  0.000017
2834368     MI   M  1935  Jose     16  0.000008
5823763     VA   M  2014  Jose     64  0.000020)
(('Josefina', 'F'),        region sex  year      name  count      freq
425439     CA   F  1955  Josefina     19  0.000005)
(('Joseline', 'F'),        region sex  year      name  count      freq
511644     CA   F  1992  Joseline     32  0.000009)
(('Joseph', 'F'),         region sex  year    name  count      freq
1744842     IL   F  1979  Joseph     20  0.000007)
(('Joseph', 'M'),         region sex  year    nam

(('Kara', 'F'),         region sex  year  name  count      freq
5403211     TX   F  2013  Kara     57  0.000018
3069551     MO   F  1989  Kara     87  0.000025
2109363     KS   F  2015  Kara      5  0.000002
2958265     MN   F  2009  Kara     21  0.000006
331534      AZ   F  2015  Kara     14  0.000004
810982      CO   F  1980  Kara     38  0.000012)
(('Karalee', 'F'),         region sex  year     name  count      freq
5607023     UT   F  1976  Karalee      6  0.000002)
(('Kareem', 'M'),         region sex  year    name  count      freq
1860409     IL   M  1977  Kareem     17  0.000006)
(('Karen', 'F'),         region sex  year   name  count      freq
3705805     NJ   F  1980  Karen    189  0.000060
6331        AK   F  1980  Karen     16  0.000005
2106499     KS   F  2011  Karen      5  0.000002
1042663     DE   F  1963  Karen    135  0.000036
1047123     DE   F  1988  Karen     12  0.000004
3045462     MO   F  1955  Karen    837  0.000219)
(('Kari', 'F'),         region sex  year  nam

583408     CA   F  2010  Kelis      6  0.000002)
(('Kellan', 'M'),         region sex  year    name  count      freq
4253462     NY   M  2020  Kellan     21  0.000008)
(('Kellee', 'F'),         region sex  year    name  count      freq
4318706     OH   F  1986  Kellee      7  0.000002)
(('Kelley', 'F'),         region sex  year    name  count      freq
2574875     MD   F  1971  Kelley     14  0.000004
3730575     NJ   F  2002  Kelley      5  0.000002
1547577     IA   F  1960  Kelley     19  0.000005)
(('Kelli', 'F'),         region sex  year   name  count      freq
4959701     SC   F  2001  Kelli      8  0.000002
282783      AZ   F  1957  Kelli      6  0.000001
194066      AR   F  1967  Kelli     20  0.000006
2094366     KS   F  1992  Kelli     23  0.000007
4006507     NY   F  1968  Kelli     94  0.000030)
(('Kellie', 'F'),         region sex  year    name  count      freq
1639688     ID   F  1982  Kellie      8  0.000003
2095985     KS   F  1995  Kellie     11  0.000003)
(('Kelly', 'F

5362863     TX   F  2001  Kinzie      7  0.000002)
(('Kip', 'M'),         region sex  year name  count      freq
648683      CA   M  1954  Kip     21  0.000006
1859681     IL   M  1976  Kip      8  0.000003)
(('Kira', 'F'),         region sex  year  name  count      freq
3095016     MO   F  2015  Kira     14  0.000004
6148930     WV   F  2013  Kira      6  0.000002
3208645     MS   F  1992  Kira      5  0.000001)
(('Kiran', 'M'),         region sex  year   name  count      freq
5584531     TX   M  2019  Kiran      6  0.000002)
(('Kirby', 'M'),         region sex  year   name  count      freq
362945      AZ   M  1988  Kirby      6  0.000002
3123727     MO   M  1961  Kirby     12  0.000003)
(('Kirk', 'M'),         region sex  year  name  count      freq
706094      CA   M  1992  Kirk     56  0.000016
5586862     TX   M  2020  Kirk      8  0.000003)
(('Kirstin', 'F'),         region sex  year     name  count      freq
1339161     GA   F  1990  Kirstin      7  0.000002)
(('Kirsty', 'F'),  

(('Lawanda', 'F'),         region sex  year     name  count      freq
5273550     TX   F  1956  Lawanda     18  0.000005
2313465     LA   F  1969  Lawanda     10  0.000003)
(('Lawrence', 'M'),         region sex  year      name  count      freq
4628604     OR   M  1927  Lawrence     41  0.000019
2155907     KS   M  2019  Lawrence      8  0.000003
6175112     WV   M  1980  Lawrence     21  0.000007
6152414     WV   M  1914  Lawrence     62  0.000048
2119112     KS   M  1931  Lawrence     90  0.000046
1905715     IL   M  2013  Lawrence     18  0.000006)
(('Lawson', 'M'),         region sex  year    name  count      freq
3477540     NC   M  1997  Lawson      5  0.000002
5582663     TX   M  2019  Lawson     79  0.000027
3159415     MO   M  2016  Lawson     21  0.000007)
(('Layla', 'F'),         region sex  year   name  count      freq
5718325     VA   F  1980  Layla      5  0.000002
1004564     DC   F  2003  Layla      8  0.000002
2965972     MN   F  2017  Layla     85  0.000028
3586715   

639671     CA   M  1943  Lon      9  0.000003)
(('Lona', 'F'),         region sex  year  name  count      freq
3349442     NC   F  1949  Lona      5  0.000002)
(('London', 'F'),         region sex  year    name  count      freq
5624114     UT   F  2005  London     21  0.000006
2343223     LA   F  2004  London      7  0.000002
936910      CT   F  2011  London     24  0.000008)
(('London', 'M'),         region sex  year    name  count      freq
2151351     KS   M  2012  London      9  0.000003
3491307     NC   M  2010  London     27  0.000009
3941109     NV   M  2005  London      7  0.000002)
(('Londyn', 'F'),         region sex  year    name  count      freq
2108927     KS   F  2015  Londyn     14  0.000004)
(('Londynn', 'F'),         region sex  year     name  count      freq
2357937     LA   F  2019  Londynn      6  0.000002)
(('Long', 'M'),        region sex  year  name  count      freq
719914     CA   M  1997  Long      8  0.000003)
(('Loni', 'F'),         region sex  year  name  co

6147983     WV   F  2011  Madilyn     11  0.000004)
(('Madilynn', 'F'),         region sex  year      name  count      freq
5159431     TN   F  2020  Madilynn     24  0.000009)
(('Madison', 'F'),         region sex  year     name  count      freq
2691995     ME   F  2014  Madison     41  0.000013
5051355     SD   F  1996  Madison     34  0.000011
6150246     WV   F  2017  Madison     38  0.000013
2332765     LA   F  1992  Madison     30  0.000009
1565396     IA   F  1998  Madison    260  0.000079)
(('Madison', 'M'),         region sex  year     name  count      freq
4250388     NY   M  2018  Madison      6  0.000002)
(('Madisyn', 'F'),         region sex  year     name  count      freq
1972601     IN   F  1999  Madisyn     23  0.000007
3925264     NV   F  2018  Madisyn      5  0.000002)
(('Madyson', 'F'),         region sex  year     name  count      freq
2688993     ME   F  2002  Madyson      5  0.000002)
(('Mae', 'F'),         region sex  year name  count      freq
5625526     UT   F

2531165     MA   M  1993  Markus      8  0.000002)
(('Marla', 'F'),         region sex  year   name  count      freq
2310273     LA   F  1965  Marla     16  0.000005
1711735     IL   F  1952  Marla     87  0.000024
4581791     OR   F  1940  Marla      5  0.000002
1765445     IL   F  1992  Marla      5  0.000001
436610      CA   F  1962  Marla    142  0.000037
4481438     OK   F  1961  Marla     23  0.000006)
(('Marlee', 'F'),         region sex  year    name  count      freq
3094866     MO   F  2015  Marlee     22  0.000007
928072      CT   F  1995  Marlee      5  0.000002)
(('Marlen', 'F'),        region sex  year    name  count      freq
580926     CA   F  2010  Marlen     41  0.000013)
(('Marlena', 'F'),        region sex  year     name  count      freq
569448     CA   F  2007  Marlena     12  0.000003)
(('Marlene', 'F'),         region sex  year     name  count      freq
2335734     LA   F  1995  Marlene      6  0.000002
4368635     OH   F  2019  Marlene      6  0.000002
3514469   

(('Meaghan', 'F'),         region sex  year     name  count      freq
474943      CA   F  1980  Meaghan     16  0.000005
4331358     OH   F  1996  Meaghan     12  0.000004)
(('Mechelle', 'F'),         region sex  year      name  count      freq
4296702     OH   F  1967  Mechelle     11  0.000003)
(('Medha', 'F'),         region sex  year   name  count      freq
5417512     TX   F  2016  Medha      5  0.000002)
(('Megan', 'F'),        region sex  year   name  count      freq
316463     AZ   F  2004  Megan    124  0.000037)
(('Megha', 'F'),         region sex  year   name  count      freq
3728984     NJ   F  2001  Megha      8  0.000002)
(('Meghan', 'F'),         region sex  year    name  count      freq
2939568     MN   F  1985  Meghan    104  0.000032
3641792     NH   F  1996  Meghan     37  0.000011
5898660     WA   F  1990  Meghan     49  0.000014
6196638     WY   F  1982  Meghan      9  0.000003)
(('Meghann', 'F'),         region sex  year     name  count      freq
5895913     WA   

(('Mitchel', 'M'),         region sex  year     name  count      freq
4650281     OR   M  1995  Mitchel     11  0.000003
4404100     OH   M  1970  Mitchel      5  0.000001)
(('Mitchell', 'M'),         region sex  year      name  count      freq
3127032     MO   M  1968  Mitchell     45  0.000014
2637491     MD   M  1981  Mitchell     20  0.000006
3250639     MS   M  1965  Mitchell     30  0.000009
1594154     IA   M  1953  Mitchell     15  0.000004
1025338     DC   M  1984  Mitchell      8  0.000003
973544      CT   M  2009  Mitchell      9  0.000003
1854553     IL   M  1971  Mitchell     48  0.000015)
(('Mittie', 'F'),         region sex  year    name  count      freq
3330082     NC   F  1917  Mittie     17  0.000009
5087665     TN   F  1924  Mittie      6  0.000003)
(('Mitzi', 'F'),       region sex  year   name  count      freq
57225     AL   F  1959  Mitzi     17  0.000004)
(('Moana', 'F'),        region sex  year   name  count      freq
334344     AZ   F  2017  Moana      8  0.000

(('Nidia', 'F'),         region sex  year   name  count      freq
5378911     TX   F  2006  Nidia     11  0.000003)
(('Niels', 'M'),        region sex  year   name  count      freq
671724     CA   M  1973  Niels      5  0.000002)
(('Nija', 'F'),         region sex  year  name  count      freq
5363624     TX   F  2001  Nija      5  0.000002)
(('Nika', 'F'),        region sex  year  name  count      freq
593659     CA   F  2013  Nika     20  0.000006)
(('Niki', 'F'),         region sex  year  name  count      freq
2463597     MA   F  1979  Niki      7  0.000002)
(('Nikki', 'F'),         region sex  year   name  count     freq
2777763     MI   F  1988  Nikki     34  0.00001)
(('Niklaus', 'M'),        region sex  year     name  count      freq
389510     AZ   M  2019  Niklaus      6  0.000002)
(('Niko', 'M'),         region sex  year  name  count      freq
5577656     TX   M  2017  Niko     31  0.000010
3806202     NJ   M  2005  Niko      5  0.000001)
(('Nikolas', 'M'),         region sex 

(('Paulette', 'F'),         region sex  year      name  count      freq
5293677     TX   F  1970  Paulette     38  0.000011
4062911     NY   F  1996  Paulette     11  0.000003
5599060     UT   F  1956  Paulette      5  0.000001)
(('Paulina', 'F'),         region sex  year     name  count      freq
2436846     MA   F  1927  Paulina      5  0.000002)
(('Pauline', 'F'),         region sex  year     name  count      freq
5996196     WI   F  1928  Pauline     57  0.000027
4472770     OK   F  1944  Pauline     30  0.000012
482363      CA   F  1983  Pauline     55  0.000017
...        ...  ..   ...      ...    ...       ...
277101      AZ   F  1941  Pauline     13  0.000006
3067953     MO   F  1986  Pauline      5  0.000002
1074203     FL   F  1925  Pauline     70  0.000032

[8 rows x 6 columns])
(('Paulo', 'M'),        region sex  year   name  count      freq
751123     CA   M  2009  Paulo     21  0.000006)
(('Paxton', 'M'),         region sex  year    name  count      freq
5216978     TN   

(('Randall', 'M'),         region sex  year     name  count      freq
2138992     KS   M  1988  Randall      8  0.000002
6067815     WI   M  1947  Randall     97  0.000028
1058257     DE   M  1958  Randall     13  0.000003
4813754     PA   M  1962  Randall    225  0.000059
4545557     OK   M  1969  Randall     56  0.000017)
(('Randee', 'F'),         region sex  year    name  count      freq
5313726     TX   F  1981  Randee     10  0.000003)
(('Randel', 'M'),        region sex  year    name  count      freq
654014     CA   M  1959  Randel     13  0.000003)
(('Randell', 'M'),         region sex  year     name  count      freq
131496      AL   M  1964  Randell      6  0.000002
4815633     PA   M  1964  Randell      6  0.000002)
(('Randi', 'F'),         region sex  year   name  count      freq
5732888     VA   F  1997  Randi      7  0.000002
1327104     GA   F  1977  Randi      5  0.000002)
(('Randolph', 'M'),         region sex  year      name  count      freq
5433047     TX   M  1912  Ra

(('Riley', 'F'),         region sex  year   name  count      freq
1004343     DC   F  2002  Riley      7  0.000002
1648776     ID   F  2009  Riley     27  0.000008
930897      CT   F  2001  Riley     48  0.000015
1967165     IN   F  1993  Riley     16  0.000005
6201005     WY   F  2015  Riley      8  0.000003)
(('Riley', 'M'),         region sex  year   name  count      freq
5457217     TX   M  1938  Riley     11  0.000005
2360891     LA   M  1915  Riley      5  0.000003
1520389     HI   M  2000  Riley     17  0.000005
2875351     MI   M  1996  Riley     84  0.000026
2270850     KY   M  2011  Riley     63  0.000020
1890013     IL   M  2003  Riley    169  0.000051)
(('Rishi', 'M'),         region sex  year   name  count      freq
780074      CA   M  2019  Rishi     26  0.000009
2539920     MA   M  2006  Rishi      5  0.000001)
(('Rita', 'F'),         region sex  year  name  count      freq
5247376     TX   F  1933  Rita    107  0.000058
2485414     MA   F  2006  Rita     10  0.000003
10

5036034     SD   F  1927  Rosella      6  0.000003)
(('Roselyn', 'F'),         region sex  year     name  count      freq
330690      AZ   F  2014  Roselyn      7  0.000002
4059315     NY   F  1994  Roselyn      5  0.000002)
(('Rosemarie', 'F'),         region sex  year       name  count      freq
593550      CA   F  2013  Rosemarie     23  0.000007
4689553     PA   F  1944  Rosemarie    149  0.000059
1715808     IL   F  1956  Rosemarie     37  0.000009
3668811     NJ   F  1918  Rosemarie     10  0.000005)
(('Rosemary', 'F'),         region sex  year      name  count      freq
6011054     WI   F  1961  Rosemary     30  0.000008
3901166     NV   F  1943  Rosemary      6  0.000002
2282549     LA   F  1920  Rosemary     47  0.000022
449767      CA   F  1969  Rosemary    100  0.000031)
(('Rosetta', 'F'),         region sex  year     name  count      freq
3031866     MO   F  1929  Rosetta     16  0.000008
2744786     MI   F  1956  Rosetta     10  0.000003
4935831     SC   F  1961  Rosetta  

4003982     NY   F  1966  Sabina     11  0.000003)
(('Sabrina', 'F'),         region sex  year     name  count      freq
6021938     WI   F  1980  Sabrina     33  0.000011
3579320     NE   F  1992  Sabrina      7  0.000002
941963      CT   F  2019  Sabrina      7  0.000002
2317134     LA   F  1974  Sabrina     28  0.000010)
(('Sade', 'F'),         region sex  year  name  count      freq
1757700     IL   F  1988  Sade     43  0.000013)
(('Sadie', 'F'),         region sex  year   name  count      freq
421711      CA   F  1952  Sadie      7  0.000002
3187741     MS   F  1956  Sadie     29  0.000007
4958206     SC   F  1999  Sadie     13  0.000004
...        ...  ..   ...    ...    ...       ...
1531652     IA   F  1922  Sadie     10  0.000005
3745775     NJ   F  2014  Sadie     98  0.000031
395994      CA   F  1922  Sadie     18  0.000008

[8 rows x 6 columns])
(('Saeed', 'M'),        region sex  year   name  count      freq
723110     CA   M  1998  Saeed      5  0.000002)
(('Sage', 'F'),

(('Shae', 'F'),         region sex  year  name  count      freq
4337548     OH   F  2000  Shae      6  0.000002)
(('Shai', 'M'),         region sex  year  name  count      freq
4219868     NY   M  2003  Shai      5  0.000002)
(('Shaina', 'F'),         region sex  year    name  count      freq
5312480     TX   F  1980  Shaina      5  0.000002)
(('Shakeya', 'F'),         region sex  year     name  count      freq
4049018     NY   F  1990  Shakeya     10  0.000003)
(('Shakia', 'F'),         region sex  year    name  count      freq
1145809     FL   F  2001  Shakia      6  0.000002)
(('Shakina', 'F'),         region sex  year     name  count      freq
4734476     PA   F  1987  Shakina      5  0.000002)
(('Shakira', 'F'),         region sex  year     name  count      freq
6216598     PR   F  1999  Shakira    113  0.000034)
(('Shakur', 'M'),         region sex  year    name  count      freq
4212052     NY   M  1999  Shakur      6  0.000002
3479516     NC   M  1999  Shakur      5  0.000002)
(

3581928     NE   F  1999  Shyanne      8  0.000002)
(('Shyla', 'F'),         region sex  year   name  count      freq
5893645     WA   F  1983  Shyla      6  0.000002)
(('Sianna', 'F'),        region sex  year    name  count      freq
614024     CA   F  2018  Sianna     10  0.000003)
(('Sid', 'M'),         region sex  year name  count      freq
2848648     MI   M  1960  Sid      5  0.000001)
(('Sidney', 'F'),         region sex  year    name  count      freq
1997874     IN   F  2020  Sidney      5  0.000002
5422763     TX   F  2018  Sidney     16  0.000005)
(('Sidney', 'M'),         region sex  year    name  count      freq
3460355     NC   M  1973  Sidney     18  0.000007
4399687     OH   M  1964  Sidney     14  0.000004
1433614     GA   M  1991  Sidney     25  0.000007
3478029     NC   M  1998  Sidney     15  0.000005
1446556     GA   M  2003  Sidney     17  0.000005)
(('Sierra', 'F'),         region sex  year    name  count      freq
5733179     VA   F  1998  Sierra    147  0.000045

(('Sylvia', 'F'),         region sex  year    name  count      freq
3177879     MS   F  1938  Sylvia     84  0.000041
6191804     WY   F  1952  Sylvia      7  0.000002
4683191     PA   F  1935  Sylvia    304  0.000156
5303263     TX   F  1976  Sylvia    205  0.000074)
(('Sylvie', 'F'),        region sex  year    name  count      freq
914186     CT   F  1967  Sylvie      6  0.000002)
(('Symone', 'F'),         region sex  year    name  count      freq
1372215     GA   F  2012  Symone      7  0.000002)
(('Symphony', 'F'),         region sex  year      name  count      freq
4968179     SC   F  2011  Symphony      6  0.000002)
(('Tabatha', 'F'),         region sex  year     name  count      freq
2197749     KY   F  1985  Tabatha     28  0.000009
5712996     VA   F  1973  Tabatha     23  0.000008
3209232     MS   F  1993  Tabatha      5  0.000001)
(('Tabetha', 'F'),         region sex  year     name  count      freq
1126462     FL   F  1990  Tabetha      5  0.000001)
(('Tabitha', 'F'),      

(('Thelma', 'F'),         region sex  year    name  count      freq
4704340     PA   F  1960  Thelma     27  0.000007
6132497     WV   F  1967  Thelma      9  0.000003
3675434     NJ   F  1935  Thelma     29  0.000015
1471908     HI   F  1916  Thelma     21  0.000012)
(('Theo', 'M'),         region sex  year  name  count      freq
5984136     WA   M  2016  Theo     43  0.000014)
(('Theodora', 'F'),         region sex  year      name  count      freq
2915984     MN   F  1939  Theodora      6  0.000003
1686717     IL   F  1917  Theodora     10  0.000005
1086546     FL   F  1952  Theodora      5  0.000001)
(('Theodore', 'M'),         region sex  year      name  count      freq
5933552     WA   M  1913  Theodore     12  0.000012
5062264     SD   M  1935  Theodore     10  0.000005
104381      AL   M  1913  Theodore     26  0.000025
...        ...  ..   ...       ...    ...       ...
5811103     VA   M  2002  Theodore     33  0.000010
5953903     WA   M  1976  Theodore     25  0.000009
19982

(('Troy', 'M'),         region sex  year  name  count      freq
3477782     NC   M  1998  Troy     45  0.000014
3877228     NM   M  1963  Troy     33  0.000009
3665789     NH   M  2019  Troy      6  0.000002
3488627     NC   M  2008  Troy     50  0.000015
1905524     IL   M  2013  Troy     34  0.000011
3004421     MN   M  1999  Troy     21  0.000006
5950659     WA   M  1969  Troy    202  0.000062)
(('Trudy', 'F'),         region sex  year   name  count      freq
1545600     IA   F  1956  Trudy     16  0.000004
5103301     TN   F  1954  Trudy     10  0.000003)
(('Truman', 'M'),        region sex  year    name  count      freq
716971     CA   M  1996  Truman     13  0.000004
768609     CA   M  2015  Truman     21  0.000007)
(('Trystan', 'M'),        region sex  year     name  count      freq
756788     CA   M  2011  Trystan     27  0.000009)
(('Tucker', 'M'),         region sex  year    name  count      freq
2711320     ME   M  1996  Tucker      9  0.000003
2275700     KY   M  2018  Tuck

4988951     SC   M  1942  Wardell      6  0.000002)
(('Waris', 'M'),        region sex  year   name  count      freq
775653     CA   M  2017  Waris      7  0.000002)
(('Warner', 'M'),         region sex  year    name  count      freq
1387464     GA   M  1912  Warner      5  0.000006)
(('Warren', 'M'),         region sex  year    name  count      freq
5933897     WA   M  1915  Warren     20  0.000012
3244485     MS   M  1952  Warren     26  0.000007
3016934     MN   M  2015  Warren     15  0.000005
...        ...  ..   ...     ...    ...       ...
944350      CT   M  1919  Warren     23  0.000012
5197391     TN   M  1988  Warren      9  0.000003
5522726     TX   M  1993  Warren     50  0.000015

[9 rows x 6 columns])
(('Waunita', 'F'),         region sex  year     name  count      freq
6111464     WV   F  1917  Waunita      5  0.000003)
(('Wayland', 'M'),         region sex  year     name  count      freq
5457110     TX   M  1938  Wayland     16  0.000008)
(('Waylon', 'M'),         regi

(('Zakai', 'M'),         region sex  year   name  count      freq
1452893     GA   M  2007  Zakai      5  0.000001)
(('Zakary', 'M'),         region sex  year    name  count      freq
1672177     ID   M  1995  Zakary      6  0.000002
775951      CA   M  2017  Zakary      6  0.000002)
(('Zamora', 'F'),         region sex  year    name  count      freq
1380905     GA   F  2017  Zamora      7  0.000002)
(('Zander', 'M'),         region sex  year    name  count      freq
2716700     ME   M  2018  Zander      8  0.000003)
(('Zandra', 'F'),        region sex  year    name  count      freq
492850     CA   F  1986  Zandra      6  0.000002)
(('Zane', 'M'),         region sex  year  name  count      freq
5013017     SC   M  1993  Zane      8  0.000002
2029141     IN   M  1982  Zane     11  0.000003
1668528     ID   M  1981  Zane      5  0.000002)
(('Zaniyah', 'F'),         region sex  year     name  count      freq
4348762     OH   F  2007  Zaniyah      5  0.000001
3410130     NC   F  2012  Zani

In [14]:
df['name'] = df.index.get_level_values('name_')
df['sex'] = df.index.get_level_values('sex_')
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,year,count,freq,name,sex
name_,sex_,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Aaden,M,2008,51,1.5e-05,Aaden,M
Aahana,F,2018,26,9e-06,Aahana,F
Aahil,M,2019,5,2e-06,Aahil,M
Aaleyah,F,2010,17,5e-06,Aaleyah,F
Aalia,F,4033,13,4e-06,Aalia,F


In [15]:
df.query('name == "Chris"')

Unnamed: 0_level_0,Unnamed: 1_level_0,year,count,freq,name,sex
name_,sex_,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Chris,F,1983,5,2e-06,Chris,F
Chris,M,7850,239,6.9e-05,Chris,M


In [114]:
counts[('Timothy',)]

sex
F       5
M    3538
Name: count, dtype: int64

In [16]:
df['istrain'] = np.random.rand(len(df)) < .9
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,year,count,freq,name,sex,istrain
name_,sex_,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Aaden,M,2008,51,1.5e-05,Aaden,M,True
Aahana,F,2018,26,9e-06,Aahana,F,True
Aahil,M,2019,5,2e-06,Aahil,M,True
Aaleyah,F,2010,17,5e-06,Aaleyah,F,True
Aalia,F,4033,13,4e-06,Aalia,F,True


In [17]:
# A list of dicts or a dict of dicts is fastest way to create dataframe from groups of rows
# https://stackoverflow.com/users/8727339/mohit-motwani
# https://stackoverflow.com/a/57001947/623735

df_most_common = {}
for name, group in df.groupby('name'):
    row_dict = group.iloc[group['count'].argmax()].to_dict()
    df_most_common[(name, row_dict['sex'])] = row_dict
df_most_common = pd.DataFrame(df_most_common).T
df_most_common

Unnamed: 0,Unnamed: 1,year,count,freq,name,sex,istrain
Aaden,M,2008,51,0.000015,Aaden,M,True
Aahana,F,2018,26,0.000009,Aahana,F,True
Aahil,M,2019,5,0.000002,Aahil,M,True
...,...,...,...,...,...,...,...
Zvi,M,2015,5,0.000002,Zvi,M,True
Zya,F,2019,8,0.000003,Zya,F,True
Zylah,F,2008,5,0.000001,Zylah,F,True


In [71]:
df = df_with_duplicates;
for name, group in df_with_duplicates.groupby('name'):
    if len(group) > 1: 
        df.drop(group['count'].idxmin(), inplace=True)

In [66]:
df = df_with_duplicates
group = df.groupby('name').get_group('Chris')
group['count'].idxmax()

('Chris', 'M')

In [72]:

df.query('name == "Chris"')

Unnamed: 0_level_0,Unnamed: 1_level_0,year,count,freq,name,sex,istrain
name_,sex_,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Chris,M,7850,239,6.9e-05,Chris,M,True


In [19]:
df_most_common['istest'] = ~df_most_common['istrain'].astype(bool)
print(df_most_common['name sex count istrain istest'.split(' ')])

            name sex count istrain  istest
Aaden  M   Aaden   M    51    True   False
Aahana F  Aahana   F    26    True   False
Aahil  M   Aahil   M     5    True   False
...          ...  ..   ...     ...     ...
Zvi    M     Zvi   M     5    True   False
Zya    F     Zya   F     8    True   False
Zylah  F   Zylah   F     5    True   False

[4025 rows x 5 columns]


In [77]:
df_most_common[['istest', 'istrain']].sum() / len(df_most_common)

istest     0.095652
istrain    0.904348
dtype: float64

In [21]:
istest = df_most_common['istest']
istest

Aaden   M    False
Aahana  F    False
Aahil   M    False
             ...  
Zvi     M    False
Zya     F    False
Zylah   F    False
Name: istest, Length: 4025, dtype: bool

In [22]:
istest.sum()

385

In [23]:
istest_idx = df_most_common[istest].index
istest_idx[:4]

MultiIndex([('Abelardo', 'M'),
            ( 'Adaline', 'F'),
            ( 'Adalynn', 'F'),
            ( 'Addelyn', 'F')],
           )

In [24]:
df['istrain'].sum() / len(df)

0.9043478260869565

In [25]:
df['istest'] = df_most_common['istest']
df['istest'] = df['istest'].fillna(False)
df['istrain'] = ~df['istest']
df['istrain'].sum() / len(df)

0.9043478260869565

In [26]:
df[~df['istrain'] & ~df['istest']]

Unnamed: 0,Unnamed: 1,year,count,freq,name,sex,istrain,istest


In [80]:
istrain = df['istrain']
istrain.sum() / len(istrain)

0.9040993788819875

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(1, 3), lowercase=False)
vectorizer

In [81]:
unique_names = df['name'][istrain].unique()
vectorizer.fit(unique_names)
vecs = vectorizer.transform(df['name'])
vecs

<4025x3657 sparse matrix of type '<class 'numpy.float64'>'
	with 57656 stored elements in Compressed Sparse Row format>

In [82]:
vecs = pd.DataFrame(vecs.toarray())
vecs.columns = vectorizer.get_feature_names_out()
vecs.index = df.index
print(vecs.iloc[:,:5])

                    A        Aa       Aad       Aah  Aal
name_  sex_                                             
Aaden  M     0.193989  0.393903  0.505031  0.000000  0.0
Aahana F     0.183496  0.372597  0.000000  0.454943  0.0
Aahil  M     0.186079  0.377841  0.000000  0.461346  0.0
...               ...       ...       ...       ...  ...
Zvi    M     0.000000  0.000000  0.000000  0.000000  0.0
Zya    F     0.000000  0.000000  0.000000  0.000000  0.0
Zylah  F     0.000000  0.000000  0.000000  0.000000  0.0

[4025 rows x 5 columns]


In [31]:
import torch
torch

<module 'torch' from 'C:\\Users\\maria\\.conda\\envs\\nlpia2\\lib\\site-packages\\torch\\__init__.py'>

In [32]:
class LogisticRegressionNN(torch.nn.Module):

    def __init__(self, num_features, num_outputs=1):
         super().__init__()
         self.linear = torch.nn.Linear(num_features, num_outputs)

    def forward(self, X):
        return torch.sigmoid(self.linear(X))

In [33]:
def make_tensor(X):
    """ Convert numpy ndarray to torch.Tensor """
    X = getattr(X, 'values', X)
    return X if isinstance(X, torch.Tensor) else torch.Tensor(X)

def make_array(x):
    """ Convert torch.Tensor to numpy 1-D array """
    if hasattr(x, 'detach'):
        return torch.squeeze(x).detach().numpy()
    return x

In [34]:
num_features = vecs.shape[1]  # number of unique n-grams in our "vocabulary"
num_outputs = 1    # number of nesses (sexes) to predict, we're predicting only femaleness

In [35]:
from tqdm import tqdm
import time
import json
import copy

# Fraction of the tensors y_pred and y that are the same 
# (y_pred == y).sum() / len(y)
def measure_binary_accuracy(y_pred, y):
    """ Round y_pred and y then count the preds that are equal to the truth to compute fraction correct """
    y_pred = make_array(y_pred).round()
    y = make_array(y).round()
    num_correct = (y_pred == y).sum()
    return num_correct / len(y)

In [36]:
def measure_performance(model, X_train, X_test, y_train, y_test, criterion):
    with torch.no_grad():
        # Calculating the loss and accuracy for the train dataset
        accuracy_train = measure_binary_accuracy(model(X_train), y_train)
        outputs_test = torch.squeeze(model(X_test))
        accuracy_test = measure_binary_accuracy(outputs_test, y_test)
        loss_test = criterion(outputs_test, y_test)
        return dict(i=i, 
                    # loss_train=loss.item(),
                    accuracy_train=accuracy_train,
                    loss_test=loss_test.item(),
                    accuracy_test=accuracy_test)

In [37]:
model = LogisticRegressionNN(num_features=vecs.shape[1], num_outputs=1)
model

LogisticRegressionNN(
  (linear): Linear(in_features=3657, out_features=1, bias=True)
)

In [38]:
learning_rate = 0.05
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
optimizer

SGD (
Parameter Group 0
    dampening: 0
    lr: 0.05
    momentum: 0
    nesterov: False
    weight_decay: 0
)

In [44]:
df[['count']].columns

Index(['count'], dtype='object')

In [73]:
# BCE: Binary Cross Entropy
criterion = torch.nn.BCELoss(weight=torch.Tensor(df[['count']].values))
criterion

BCELoss()

In [84]:
X = vecs.values
y = (df[['sex']] == 'F').values
X_train = torch.Tensor(X[istrain])
X_test = torch.Tensor(X[~istrain])
y_train = torch.Tensor(y[istrain])
y_test = torch.Tensor(y[~istrain])

In [95]:
num_epochs = 200
loss_func_train = torch.nn.BCELoss(weight=torch.Tensor(df[['count']][istrain].values))
hyperparam_values ={'lr': 0.02,'momentum': 0.001}
model = LogisticRegressionNN(num_features=vecs.shape[1], num_outputs=1)
optimizer = torch.optim.SGD(model.parameters(), **hyperparam_values)
pbar_epochs = tqdm(range(num_epochs), desc='Epoch:', total=num_epochs)

for epoch in pbar_epochs:
    optimizer.zero_grad() # Setting our stored gradients equal to zero
    outputs = model(X_train)
    loss_train = loss_func_train(outputs, y_train) 
    loss_train.backward() # Computes the gradient of the given tensor w.r.t. the weights/bias
    optimizer.step() # Updates weights and biases with the optimizer (SGD)

Epoch:: 100%|██████████| 200/200 [00:01<00:00, 138.24it/s]


In [118]:
model = LogisticRegressionNN(num_features=vecs.shape[1], num_outputs=1)
optimizer = torch.optim.SGD(model.parameters(), **hyperparam_values)
loss_func_test = torch.nn.BCELoss(weight=torch.Tensor(df[['count']][~istrain].values)) 

for epoch in range(num_epochs):
    optimizer.zero_grad() 
    outputs = model(X_train)
    loss_train = loss_func_train(outputs, y_train) 
    loss_train.backward() 
    optimizer.step() 
    outputs_test = model(X_test)
    loss_test = loss_func_test(outputs_test, y_test)
    accuracy_test = measure_binary_accuracy(outputs_test, y_test)
    if epoch % 20 == 19:
        print((f'Epoch {epoch}: loss_train/test: {loss_train.item():.4f}/{loss_test.item():.4f}, accuracy_test: {accuracy_test:.4f}'))

Epoch 19: loss_train/test: 80.1816/75.3989, accuracy_test: 0.4275
Epoch 39: loss_train/test: 75.0748/74.4430, accuracy_test: 0.5933
Epoch 59: loss_train/test: 71.0529/73.7784, accuracy_test: 0.6503
Epoch 79: loss_train/test: 67.7637/73.2873, accuracy_test: 0.6839
Epoch 99: loss_train/test: 64.9957/72.9028, accuracy_test: 0.6891
Epoch 119: loss_train/test: 62.6145/72.5862, accuracy_test: 0.6995
Epoch 139: loss_train/test: 60.5302/72.3139, accuracy_test: 0.7073
Epoch 159: loss_train/test: 58.6803/72.0716, accuracy_test: 0.7073
Epoch 179: loss_train/test: 57.0198/71.8502, accuracy_test: 0.7202
Epoch 199: loss_train/test: 55.5152/71.6437, accuracy_test: 0.7280


In [105]:
num_epochs = 100
#hyperparam_values = {'lr': 0.289}
hyperparam_values ={'lr': 0.02,'momentum': 0.001}
model = LogisticRegressionNN(num_features=vecs.shape[1], num_outputs=1)
optimizer = torch.optim.SGD(model.parameters(), **hyperparam_values)
criterion_train = torch.nn.BCELoss(weight=torch.Tensor(df[['count']][istrain].values))  # BCE: Binary Cross Entropy
criterion_test = torch.nn.BCELoss(weight=torch.Tensor(df[['count']][~istrain].values))  # BCE: Binary Cross Entropy
X = vecs.values
y = (df[['sex']] == 'F').values
X_train = torch.Tensor(X[istrain])
X_test = torch.Tensor(X[~istrain])
y_train = torch.Tensor(y[istrain])
y_test = torch.Tensor(y[~istrain])

pbar_epochs = tqdm(range(num_epochs), desc='Epoch:', total=num_epochs)
results = [None] * num_epochs
for epoch in range(num_epochs):
    optimizer.zero_grad() # Setting our stored gradients equal to zero
    outputs = model(X_train)
    loss_train = criterion_train(outputs, y_train) 
    loss_train.backward() # Computes the gradient of the given tensor w.r.t. the weights/bias
    epoch_loss_train = loss_train.item()
    optimizer.step() # Updates weights and biases with the optimizer (SGD)
    #print(f'Train loss: {np.round(loss_train.detach().numpy(), 4):0.4f}')
    outputs_test = model(X_test)
    loss_test = criterion_test(outputs_test, y_test).item()
    accuracy_test = measure_binary_accuracy(outputs_test, y_test)
    #results[epoch] = dict(loss_train=loss_train, loss_test=loss_test, accuracy_test=accuracy_test)
    if epoch % 20 == 19:
        print((f'Epoch {epoch}: loss_train/test: {loss_train:.4f}/{loss_test:.4f}, accuracy_test: {accuracy_test}'))
    #pbar_epochs.set_description(f'loss_train/test: {loss_train:.4f}/{loss_test:.4f}')
print((f'loss_train/test: {loss_train:.4f}/{loss_test:.4f}, accuracy_test: {accuracy_test}'))

Epoch::   0%|          | 0/100 [28:01<?, ?it/s]


Epoch 19: loss_train/test: 80.2628/75.2845, accuracy_test: 0.43005181347150256
Epoch 39: loss_train/test: 75.1420/74.3274, accuracy_test: 0.6139896373056994
Epoch 59: loss_train/test: 71.1092/73.6625, accuracy_test: 0.6580310880829016
Epoch 79: loss_train/test: 67.8115/73.1716, accuracy_test: 0.6839378238341969
Epoch 99: loss_train/test: 65.0367/72.7877, accuracy_test: 0.6839378238341969
Epoch 119: loss_train/test: 62.6499/72.4720, accuracy_test: 0.6968911917098446
Epoch 139: loss_train/test: 60.5611/72.2008, accuracy_test: 0.7020725388601037
Epoch 159: loss_train/test: 58.7074/71.9598, accuracy_test: 0.7124352331606217
Epoch 179: loss_train/test: 57.0437/71.7398, accuracy_test: 0.7202072538860104
Epoch 199: loss_train/test: 55.5364/71.5347, accuracy_test: 0.727979274611399
loss_train/test: 55.5364/71.5347, accuracy_test: 0.727979274611399


In [None]:
# Create new majority sex column to hold the most common sex for names used for both sexes
df['majority_sex'] = df['sex']

# Overwrite the minority sex with the majority.
for name_, sex_ in df_most_common.index:
    opposite_sex = 'F' if sex_ == 'M' else 'M'
    try:
        # if you put majority_sex after .iloc this fails to overwrite data and generates warning
        df['majority_sex'].loc[(name_, opposite_sex)] = sex_
        print(f'overwrote ({name_}, {opposite_sex}) with {sex_}')
    except KeyError:
        pass


In [None]:
dupe_names = df[df['name'].duplicated()]['name'].values
df[df['name'].isin(dupe_names)]

In [None]:
weight_test = torch.Tensor(df[['count']].loc[df_most_common.index].values)
criterion_test = torch.nn.BCELoss(weight=weight_test)
criterion_test

In [None]:
def rand_range(min_value=0.001, max_value=1):
    scale = max_value - min_value
    return scale * np.random.rand() + min_value

In [None]:
def rand_range_log(min_value=0.001, max_value=1):
    min_log = np.log(min_value)
    max_log = np.log(max_value)
    return np.exp(rand_range(np.log(min_value), np.log(max_value)))

Create random hyperparameter table for optimizer learning_rate and momentum

In [None]:
# lr: learning_rate
hyperparam_ranges = dict(lr=[0.02, 1.0], momentum=[0.00001, 1.0])
hyperparam_table = []
num_attempts = 30
for i in range(num_attempts):
    hyperparam_values = dict()
    for k, v in hyperparam_ranges.items():
        hyperparam_values[k] = rand_range_log(*hyperparam_ranges[k])
    hyperparam_table.append(hyperparam_values)
pd.DataFrame(hyperparam_table)

In [None]:
model = LogisticRegressionNN(num_features=vecs.shape[1], num_outputs=1)
model

In [None]:
optimizer = torch.optim.SGD(model.parameters(), **hyperparam_table[0])
optimizer

In [None]:
weight_train = torch.Tensor(df['count'][istrain].values.reshape(-1, 1) / df['count'][istrain].mean())
weight_train.sum()

In [None]:
df['count'][istrain].values.reshape(sum(istrain), 1)

In [None]:
weight_train.shape

In [None]:
weight_test = torch.Tensor(df['count'][~istrain].values.reshape(-1, 1) / df['count'][~istrain].mean())
weight_test.sum()

In [None]:
# pbar = tqdm(hyperparam_table, desc='Training attempt', total=len(hyperparam_table))
num_epochs=200

t0 = time.time()
for i, hyperparam_values in enumerate(hyperparam_table):
    t1 = time.time()
    model = LogisticRegressionNN(num_features=vecs.shape[1], num_outputs=1)
    optimizer = torch.optim.SGD(model.parameters(), lr=hyperparam_values['lr'])
    # BCE: Binary Cross Entropy weighted by the number of babies with that first name and sex
    criterion_train = torch.nn.BCELoss(weight=weight_train)
    criterion_test = torch.nn.BCELoss(weight=weight_test)
    X = vecs.values
    X_train = torch.Tensor(X[istrain])
    X_test = torch.Tensor(X[~istrain])
    y_train = torch.Tensor((df[['sex']] == 'F').astype(int).values[istrain])
    y_test = torch.Tensor((df[['majority_sex']] == 'F').astype(int).values[~istrain])
    
    pbar_epochs = tqdm(range(num_epochs), desc='Epoch:', total=num_epochs)
    results = [None] * num_epochs
    for epoch in pbar_epochs:
        optimizer.zero_grad() # Setting our stored gradients equal to zero
        outputs = model(X_train)
        loss_train = criterion_train(outputs, y_train) 
        loss_train.backward() # Computes the gradient of the given tensor w.r.t. the weights/bias
        loss_train = loss_train.item()
        optimizer.step() # Updates weights and biases with the optimizer (SGD)
        # print(f'Train loss: {np.round(loss_train.detach().numpy(), 4):0.4f}')
        outputs_test = model(X_test)
        loss_test = criterion_test(outputs_test, y_test).item()
        accuracy_test = measure_binary_accuracy(outputs_test, y_test)
        results[epoch] = dict(loss_train=loss_train, loss_test=loss_test, accuracy_test=accuracy_test)
        # pbar_epochs.set_description(f'loss_train/test: {loss_train:.4f}/{loss_test:.4f}')
    t2 = time.time()
    results[-1]['time_per_attempt'] = t2 - t1
    results[-1]['total_time'] = t2 - t0
    hyperparam_table[i].update(results[-1])
    print(f'attempt {i}/{len(hyperparam_table)}')
    for k, v in hyperparam_table[i].items():
        print(f'{k}: {v:04f}')

In [None]:
istest = df['istest']
df['name'][istest].unique()


In [None]:
hyperparam_table[-1]




In [None]:
hyperparam_df = pd.DataFrame(hyperparam_table).sort_values('accuracy_test')
hyperparam_df

In [None]:
def fit(model=model, X=vecs.values, y=(df[['sex']] == 'F').values, optimizer=None,
        num_epochs=30, learning_rate=.1, criterion=criterion, optimizer=optimizer):
    pbar = tqdm(range(num_epochs), desc='Epoch', total=num_epochs)
    X_train = torch.Tensor(X[istrain])
    X_test = torch.Tensor(X[~istrain])
    y_train = torch.Tensor(y[istrain])
    y_test = torch.Tensor(y[~istrain])

    results = []
    for i in pbar:
        optimizer.zero_grad() # Setting our stored gradients equal to zero
        outputs = model(X_train)
        loss_train = criterion(outputs, y_train) 
        loss_train.backward() # Computes the gradient of the given tensor w.r.t. the weights/bias
        optimizer.step() # Updates weights and biases with the optimizer (SGD)
    return results

In [None]:
test_loss = hyperparam_df['loss_test']
test_accuracy = hyperparam_df['accuracy_test']
hyperparam_df.plot(x='lr', y='momentum', kind='scatter', logy=True, logx=True, grid=True, marker='o', s=100*test_accuracy**2, alpha=test_loss*.7)

In [None]:
results = fit()

In [None]:
pd.DataFrame(results)

In [None]:
# model.score(vecs[~istrain], y[~istrain], sample_weight=df['count'][~istrain])

In [None]:
# model.classes_


In [None]:
names = ['Dewey', 'Kemal', 'Copeland', 'Vishvesh']
ourvecs = vectorizer.transform(names)
ourvecs = pd.DataFrame.sparse.from_spmatrix(ourvecs)
ourvecs.columns = vectorizer.get_feature_names_out()
ourvecs.index = list(zip(names, 'M'*len(names)))
ourvecs

In [None]:
ourtensors = 

In [None]:
names = ['Maria', 'Syndee', 'Aditi', 'Constance']
vecs = vectorizer.transform(names)
vecs = pd.DataFrame.sparse.from_spmatrix(vecs)
vecs.columns = vectorizer.get_feature_names_out()
vecs.index = list(zip(names, 'M'*len(names)))
pd.DataFrame(model.predict_proba(vecs)[:,0], index=vecs.index)

In [None]:
class LogisticRegressionNumpyNN(LogisticRegressionNN):

    def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)

    def predict_proba(self, X):
        return self.forward(make_tensor(X))
    
    def predict(self, X):
        return (np.array(self.forward(make_tesnor(X))) > 0.5).astype(int)
    
# ', '.join([v for v in dir(LogisticRegression) if v[0] != '_'])