# Setup

In [1]:
!pip install -r requirements.txt

Collecting pandas==1.4.3
  Downloading pandas-1.4.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.7 MB)
[K     |████████████████████████████████| 11.7 MB 10.0 MB/s eta 0:00:01
[?25hCollecting matplotlib==3.5.2
  Downloading matplotlib-3.5.2-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl (11.3 MB)
[K     |████████████████████████████████| 11.3 MB 65.7 MB/s eta 0:00:01
[?25hCollecting scikit-learn==1.1.1
  Downloading scikit_learn-1.1.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (31.2 MB)
[K     |████████████████████████████████| 31.2 MB 35.9 MB/s eta 0:00:01
[?25hCollecting cycler>=0.10
  Downloading cycler-0.11.0-py3-none-any.whl (6.4 kB)
Collecting fonttools>=4.22.0
  Downloading fonttools-4.34.4-py3-none-any.whl (944 kB)
[K     |████████████████████████████████| 944 kB 35.6 MB/s eta 0:00:01
[?25hCollecting kiwisolver>=1.0.1
  Downloading kiwisolver-1.4.4-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl (1.2 MB)
[K     |█████████████████

In [104]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.linear_model import LogisticRegression

seed = 0

data = pd.read_csv('name_gender.csv')
print(f'Size of dataset: {len(data)}')

Size of dataset: 95025


In [105]:
data.describe()

Unnamed: 0,name,gender
count,95025,95025
unique,95025,2
top,Aaban&&,F
freq,1,60304


In [106]:
data.isnull().values.any()

False

In [107]:
data['gender'].value_counts()

F    60304
M    34721
Name: gender, dtype: int64

# Data cleaning

Remove non-alphabetic characters

In [108]:
names = data['name'].str.contains('\W|\d|_').values.sum()

# List of non-alphabetic names
print(data[data['name'].str.contains('\W|\d|_')])

            name gender
0        Aaban&&      M
1         Aabha*      F
4          Aada_      F
10       Aadhav+      M
13      Aadhira4      F
...          ...    ...
94826   Zyair770      M
94874  Zyheir887      M
94915    Zykir24      M
94957  Zymirah11      F
94995     Zyri*&      F

[65 rows x 2 columns]


In [109]:
data['name'] = data['name'].str.replace('\W|\d|_','',regex=True)

In [110]:
data['gender'].value_counts()

F    60304
M    34721
Name: gender, dtype: int64

In [111]:
# Split dataset stratified by gender before tokenizing to avoid leakage of test set info into training features
X_train, X_test, y_train, y_test = train_test_split(data['name'], data['gender'], test_size=0.1, random_state=seed, stratify=data['gender'])

# Modelling

https://arxiv.org/pdf/2102.03692.pdf

What’s in a Name? – Gender Classification of Names with
Character Based Machine Learning Models

In [23]:
error analysis on names based on frequency
maybe upsample rarer names on training set, keep test set the same

naive bayes
random forest
lstm
char-bert

SyntaxError: invalid syntax (<ipython-input-23-9ec9268595f4>, line 1)

## Logistic regression

Different features
1) BOW of ngrams

2) TF-IDF of ngrams
scale down the impact of tokens that are common in a corpus and hence less informative (does it apply here?)

3) Class scaling

In [98]:
ngram_upper = int(np.floor(np.mean(data['name'].apply(len))))
print(f'Average word length: {ngram_upper}')

Average word length: 6


A minimum of 2 characters and a maximum of 6 (mean word length) is used for constructing ngrams.

### BOW

For hyerparameter tuning, stratified 5-fold CV will be used. However, tokenization will only be performed after splitting the dataset in order to prevent the validation set from having any info on the features from the other folds. This procedure will be combined with gridsearchcv

In [None]:
# 5 fold CV
# Tokenization
# Grid search

pipeline = make_pipeline(
        CountVectorizer(analyzer='char_wb', ngram_range=(2,ngram_upper)),
        LogisticRegression(max_iter=300)
        )

parameters = {'logisticregression__C':[0.8, 1, 2]}
lr = GridSearchCV(pipeline, param_grid=parameters, cv=5, verbose=2)
lr.fit(X_train, y_train)

# counts = ngram_vectorizer.fit_transform(test['name'])
# counts.toarray().astype(int)
# ngram_vectorizer.get_feature_names_out()

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV] END ..........................logisticregression__C=0.8; total time=  17.1s


In [128]:
lr.best_params_

{'logisticregression__C': 1}

### tfidf