In [None]:
# Libs
from sklearn.model_selection import GridSearchCV, RepeatedKFold
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
import pandas as pd
import numpy as np

# Load the names table and normalise first names (lower-cased, surrounding whitespace removed)
nomes = (
    pd.read_csv("../raw_data/nomes.csv")
    .assign(first_name=lambda raw: raw["first_name"].str.lower().str.strip())
)


In [None]:
# Create function to sample and prepare data
def draw_names_sample(nomes, n):
    """Draw ``n`` random rows from ``nomes`` and turn them into a labelled sample.

    Parameters
    ----------
    nomes : DataFrame
        Must provide the columns ``first_name`` and ``prop_female``.
    n : int
        Number of rows to draw with ``DataFrame.sample``.

    Returns
    -------
    tuple
        ``(y, x)`` where ``y`` holds one single-trial binomial draw per sampled
        row, using that row's ``prop_female`` as the success probability, and
        ``x`` is the sampled ``first_name`` column with an underscore added to
        both ends of every name.
    """
    drawn = nomes.sample(n)
    # Underscores mark the start and end of each name in the returned text.
    padded_names = "_" + drawn["first_name"] + "_"
    labels = np.random.binomial(1, drawn["prop_female"], n)
    return labels, padded_names

In [None]:
# Define pipelines
def _make_pipe(clf):
    """Build a char n-gram -> chi2 feature selection -> classifier pipeline.

    All three models share the same two preprocessing steps, so only the final
    estimator varies. The step names ("vect", "feat", "clf") are the prefixes
    used by the keys of the grid-search parameter dictionaries.

    Parameters
    ----------
    clf : estimator
        Classifier instance placed in the final "clf" step.

    Returns
    -------
    Pipeline
        A new, unfitted pipeline.
    """
    return Pipeline([("vect", CountVectorizer(analyzer="char_wb")),
                     ("feat", SelectKBest(chi2)),
                     ("clf", clf)
                     ])


nb_pipe = _make_pipe(MultinomialNB())

# SGD shuffles the training data between epochs; without a fixed random_state
# the SVM grid-search scores change from run to run even though the sample and
# the CV splitter are seeded. 222 matches the seed used elsewhere in this notebook.
svm_pipe = _make_pipe(SGDClassifier(random_state=222))

xgb_pipe = _make_pipe(XGBClassifier())

In [None]:
# Grid search params
ngram = [(1, 3), (1, 4), (1, 5)]
k = np.array(range(800, 1601, 200))

# Vectorizer and feature-selection settings searched for every model
_shared_grid = {"vect__ngram_range": ngram, "feat__k": k}

# Naive Bayes: smoothing strength
grid_nb = [{**_shared_grid, "clf__alpha": (0.5, 0.75, 1)}]

# Linear SVM via SGD: regularisation strength and iteration cap
grid_svm = [{
    **_shared_grid,
    "clf__alpha": [1e-4, 1e-3, 1e-2, 1e-1],
    "clf__max_iter": [10, 20, 50, 100, 200, 300, 1000],
}]

# XGBoost: tree depth, minimum child weight and gamma
grid_xgb = [{
    **_shared_grid,
    "clf__max_depth": list(range(3, 11)),
    "clf__min_child_weight": [0.1, 1, 100, 1000],
    "clf__gamma": [0.1, 1, 100, 1000],
}]


In [None]:
# A sample for testing: seed NumPy's global generator, then draw the names and labels
SAMPLE_SEED = 222
SAMPLE_SIZE = 10000

np.random.seed(SAMPLE_SEED)
Y, X = draw_names_sample(nomes, SAMPLE_SIZE)

In [None]:
# Run grid search with repeated kfold (5 splits x 5 repeats, one splitter shared by all searches)
rcv = RepeatedKFold(n_splits=5, n_repeats=5, random_state=222)


def _fit_search(pipe, grid):
    """Grid-search ``pipe`` over ``grid`` on (X, Y) using the shared splitter ``rcv``.

    Returns the fitted GridSearchCV object.
    """
    search = GridSearchCV(estimator=pipe, param_grid=grid, cv=rcv, n_jobs=-1)
    return search.fit(X, Y)


gs_nb = _fit_search(nb_pipe, grid_nb)
gs_svm = _fit_search(svm_pipe, grid_svm)
gs_xgb = _fit_search(xgb_pipe, grid_xgb)

In [None]:
# Best results NB: first five rows of the Naive Bayes search results, ordered by rank
nb_results = pd.DataFrame(gs_nb.cv_results_)
nb_results.sort_values(by="rank_test_score").head()

In [None]:
# Best results SVM: first five rows of the SVM search results, ordered by rank
svm_results = pd.DataFrame(gs_svm.cv_results_)
svm_results.sort_values(by="rank_test_score").head()

In [None]:
# Best results XGB: first five rows of the XGBoost search results, ordered by rank
pd.DataFrame(gs_xgb.cv_results_).sort_values("rank_test_score").head()