In [None]:
# Libs
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers
import matplotlib.pyplot as plt
from tensorflow import keras
import pandas as pd
import numpy as np

# Default figure size (width, height) for every plot in this notebook
plt.rcParams["figure.figsize"] = [10, 5]

# Read the names table and normalise `first_name`:
# lower-case first, then strip surrounding whitespace
nomes = pd.read_csv("../raw_data/nomes.csv")
normalized_first_names = nomes.first_name.str.lower().str.strip()
nomes.loc[:, "first_name"] = normalized_first_names

In [None]:
# Bar chart: number of rows in `nomes` for each name length (in characters)
name_lengths = nomes.first_name.str.len()
nomes.assign(tam=name_lengths).groupby("tam").size().plot.bar()

In [None]:
# Create function to sample and prepare data
def draw_names_sample(nomes, n):
    """Draw `n` random rows from `nomes` and simulate a binary label for each.

    Parameters
    ----------
    nomes : DataFrame with `first_name` and `prop_female` columns.
    n : number of rows to sample (without replacement, pandas default).

    Returns
    -------
    (labels, names) : `labels` is an array of `n` Bernoulli draws whose
        success probability is the sampled row's `prop_female`; `names` is
        the matching `first_name` Series, in the same order.
    """
    sampled_rows = nomes.sample(n)
    # One single-trial binomial draw per sampled row, i.e. a Bernoulli label
    labels = np.random.binomial(n=1, p=sampled_rows.prop_female, size=n)
    return labels, sampled_rows.first_name

In [None]:
# A sample for testing: fix NumPy's global seed so the draw below is repeatable
np.random.seed(222)
Y, first_names = draw_names_sample(nomes, n=50_000)

In [None]:
# Valid letters: every character seen in the sampled names, plus a "DUMMY"
# token that one_hot uses to pad short names.
chars = set("".join(first_names))
chars.add("DUMMY")

# The iteration order of a set of strings is not stable between interpreter
# sessions (string hashing is randomised), so enumerate a sorted copy to get
# the same char <-> column mapping on every kernel restart.
ordered_chars = sorted(chars)
chars_index = {c: i for i, c in enumerate(ordered_chars)}
index_chars = dict(enumerate(ordered_chars))

In [None]:
# Function to one-hot encode names
def one_hot(nome, chars_index, pad=20):
    """One-hot encode `nome` as a matrix with one row per character position.

    The name is cut to at most `pad` characters; shorter names are padded at
    the end with rows that encode the "DUMMY" token.

    Parameters
    ----------
    nome : string to encode.
    chars_index : mapping from character (and "DUMMY") to column index.
    pad : number of rows in the result.

    Returns
    -------
    Float array of shape (pad, len(chars_index)) with exactly one 1 per row.

    Raises
    ------
    KeyError if a kept character is missing from `chars_index`.
    """
    vocab_size = len(chars_index)

    # Column to switch on for each character that survives truncation
    positions = [chars_index[letter] for letter in nome[:pad]]

    # "DUMMY" is only looked up when padding is actually required
    n_padding = pad - len(positions)
    if n_padding > 0:
        positions.extend([chars_index["DUMMY"]] * n_padding)

    # Row k of the identity matrix is the one-hot vector for column k
    identity = np.eye(vocab_size)
    return np.vstack([identity[p] for p in positions])

# Transform names: encode every sampled name and stack the results in one array
X_first_names = np.asarray([one_hot(name, chars_index) for name in first_names])


In [None]:
# Split the data into a training part and a held-out part (fixed seed so the
# split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X_first_names, Y, random_state=222
)

In [None]:
# Build a test model: bidirectional LSTM over the one-hot character sequence,
# followed by dropout and a single sigmoid unit.
model = keras.Sequential()
# return_sequences=False: only the final LSTM state is passed on, so the
# network emits one value per name, matching the one-label-per-name targets.
# With return_sequences=True the Dense layer would be applied at each of the
# 20 positions and the output would have shape (batch, 20, 1).
model.add(layers.Bidirectional(layers.LSTM(128, return_sequences=False),
                               input_shape=(20, len(chars_index))))
model.add(layers.Dropout(0.3))
model.add(layers.Dense(1, activity_regularizer=keras.regularizers.l2(0.002)))
model.add(layers.Activation("sigmoid"))
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
model.summary()

In [None]:
# Define early stopping: halt training once the monitored validation loss has
# gone 3 epochs without improving
cb = EarlyStopping(monitor="val_loss", patience=3)

In [None]:
# Training: up to 50 epochs in batches of 256, evaluating on the held-out
# split after each epoch; the early-stopping callback may end the run sooner.
history = model.fit(
    X_train,
    y_train,
    batch_size=256,
    epochs=50,
    verbose=1,
    validation_data=(X_test, y_test),
    callbacks=[cb],  # `callbacks` is documented as a list of Callback instances
)

In [None]:
# Training vs. validation accuracy, one point per epoch
for metric in ("accuracy", "val_accuracy"):
    plt.plot(history.history[metric])
plt.legend(["Train", "Validation"])
plt.show()