In [1]:
import pandas as pd
import numpy as np
import re
import random
import time

In [3]:
# Colab-only: mount Google Drive so the /content/gdrive/... CSV paths read below resolve.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [4]:
# Load the name/gender tables (already processed, judging by the file names) from the mounted Drive.
# NOTE(review): the directory in every path is literally "Data " with a trailing space.
full_name_df=pd.read_csv("/content/gdrive/MyDrive/Data /ML datasets/processed-full-name-dup.csv")
full_name_no_dup_df=pd.read_csv("/content/gdrive/MyDrive/Data /ML datasets/processed-full-name.csv")
first_name_df = pd.read_csv("/content/gdrive/MyDrive/Data /ML datasets/processed-first-name.csv")
last_name_df = pd.read_csv("/content/gdrive/MyDrive/Data /ML datasets/processed-last-name.csv")
middle_name_df = pd.read_csv("/content/gdrive/MyDrive/Data /ML datasets/processed-middle-name.csv")
middle_first_name_df = pd.read_csv("/content/gdrive/MyDrive/Data /ML datasets/processed-middle-first-name.csv")

In [5]:
def lowerize(text):
  """Return `text` lower-cased with Vietnamese diacritics folded to ASCII letters.

  The input is normalized to NFC first, so letters written as a base letter
  followed by combining marks (decomposed form) are folded exactly like their
  precomposed equivalents instead of leaving stray combining marks behind.

  Args:
    text: the string to fold.

  Returns:
    The lower-case string with every accented Vietnamese letter (and 'đ')
    replaced by its unaccented base letter.
  """
  # Local import keeps this cell self-contained (the notebook's import cell does not load unicodedata).
  import unicodedata
  patterns = {
  '[àáảãạăắằẵặẳâầấậẫẩ]': 'a',
  '[đ]': 'd',
  '[èéẻẽẹêềếểễệ]': 'e',
  '[ìíỉĩị]': 'i',
  '[òóỏõọôồốổỗộơờớởỡợ]': 'o',
  '[ùúủũụưừứửữự]': 'u',
  '[ỳýỷỹỵ]': 'y'
  }
  # Lower-casing up front makes a separate pass over the upper-case variants unnecessary:
  # the result is lower-cased anyway, and every upper-case accented letter lower-cases
  # to a member of the classes above.
  output = unicodedata.normalize('NFC', text).lower()
  for regex, replace in patterns.items():
    output = re.sub(regex, replace, output)
  return output

In [10]:
class TF_IDF():
  """Dense bag-of-words TF-IDF vectorizer for whitespace-separated documents.

  Documents are lower-cased and split on whitespace. The vocabulary is either
  supplied (`dictionary`) or built from the corpus, optionally keeping only
  words whose total corpus count lies within [`min_count`, `max_count`].

  Args:
    corpus: sized iterable of document strings.
    dictionary: optional sequence of vocabulary words; column j of every
      matrix corresponds to dictionary[j].
    max_count: drop words occurring more than this many times in the corpus.
    min_count: drop words occurring fewer than this many times in the corpus.
    normalize_tf: if true, divide each row of counts by the document length.
    smooth: if true, use idf = ln((N + 1) / (df + 1)) + 1, else ln(N / df) + 1.
    normalize_tfidf: None, "l1" or "l2" row normalization of the final matrix.
  """
  def __init__(self, corpus, dictionary=None, max_count=None, min_count=None, normalize_tf=False, smooth=True, normalize_tfidf=None):
    self.corpus=corpus
    self.max_count=max_count
    self.min_count=min_count
    self.normalize_tf=normalize_tf
    self.smooth=smooth
    self.normalize_tfidf=normalize_tfidf
    # `is not None` (rather than `!= None`) also works when the dictionary is a NumPy array.
    self.dictionary = dictionary if dictionary is not None else self.create_dictionary()
    self.num_word=len(self.dictionary)
    self.word_to_index = self.map_word_to_index()
    self.num_document = len(self.corpus)
    self.matrix_word_count = self.create_count_matrix()

  def retrieve_word(self, index):
    """Return the dictionary word stored at column `index`."""
    return self.dictionary[index]

  def create_dictionary(self):
    """Build the sorted vocabulary, applying the min_count / max_count filters if set."""
    kept_words = set()
    for word, count in self.map_word_to_count().items():
      if self.min_count is not None and count < self.min_count:
        continue
      if self.max_count is not None and count > self.max_count:
        continue
      kept_words.add(word)
    return sorted(kept_words)

  def retrieve_index(self, word):
    """Return the column index of `word` (case-insensitive); raises KeyError if unknown."""
    return self.word_to_index[word.lower()]

  def word_extraction(self, document):
    """Split a document into its whitespace-separated tokens."""
    return document.split()

  def map_word_to_count(self):
    """Return {word: total number of occurrences across the whole corpus}."""
    dict_word_count = dict()
    for doc in self.corpus:
      for word in self.word_extraction(doc.lower()):
        dict_word_count[word] = dict_word_count.get(word, 0) + 1
    return dict_word_count

  def map_word_to_index(self):
    """Return {word: column index} for the current dictionary."""
    return {word: index for index, word in enumerate(self.dictionary)}

  def create_count_matrix(self):
    """Return the (num_document, num_word) matrix of raw term counts.

    Words that are not in the dictionary (filtered out by min_count/max_count,
    or absent from a user-supplied dictionary) are skipped instead of raising.
    """
    mat = np.zeros((self.num_document, self.num_word))
    for i, doc in enumerate(self.corpus):
      for word in self.word_extraction(doc.lower()):
        try:
          ind = self.retrieve_index(word)
        except KeyError:
          continue  # out-of-vocabulary word
        mat[i, ind]+=1
    return mat

  def compute_tf(self):
    """Return term frequencies: raw counts, or counts / document length if normalize_tf."""
    if not self.normalize_tf:
      return self.matrix_word_count
    length_name = np.sum(self.matrix_word_count, axis=1, keepdims=True)
    # Documents with no in-vocabulary word keep an all-zero row instead of producing NaN.
    length_name[length_name == 0] = 1
    return self.matrix_word_count / length_name

  def compute_idf(self):
    """Return the (1, num_word) inverse-document-frequency row vector."""
    num_doc_having_word = np.count_nonzero(self.matrix_word_count, axis=0)
    if self.smooth:
      idf = np.log((self.num_document+1) / (num_doc_having_word+1)) + 1
    else:
      with np.errstate(divide="ignore", invalid="ignore"):
        idf = np.log(self.num_document / num_doc_having_word) + 1
      # A word present in no document has zero counts everywhere; give it zero weight
      # so that tf * idf stays 0 rather than 0 * inf = NaN.
      idf[num_doc_having_word == 0] = 0.0

    return np.reshape(idf, (1, self.num_word))

  def compute_tf_idf(self):
    """Return the (num_document, num_word) TF-IDF matrix, row-normalized if requested.

    Raises:
      ValueError: if normalize_tfidf is not None, "l1" or "l2".
    """
    tfidf = self.compute_tf() * self.compute_idf()
    if self.normalize_tfidf is None:
      return tfidf
    if self.normalize_tfidf == "l2":
      row_norm = np.sqrt(np.sum(tfidf * tfidf, axis=1, keepdims=True))
    elif self.normalize_tfidf == "l1":
      row_norm = np.sum(np.abs(tfidf), axis=1, keepdims=True)
    else:
      raise ValueError("normalize_tfidf must be None, 'l1' or 'l2', got %r" % (self.normalize_tfidf,))
    # keepdims gives shape (num_document, 1) so the division is per row; all-zero rows stay zero.
    row_norm[row_norm == 0] = 1
    return tfidf / row_norm

In [11]:
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras import regularizers

In [12]:
# Feed-forward binary classifier on 100 input features: five L2-regularized ReLU layers
# (100-256-64-16-8) followed by a single sigmoid output unit.
model = Sequential([
    Dense(100, input_dim=100, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
    Dense(256, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
    Dense(64, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
    Dense(16, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
    Dense(8, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
    Dense(1, activation='sigmoid'),
])

In [13]:
# RMSprop with binary cross-entropy for the sigmoid output; accuracy is reported during training.
optimizer = tf.keras.optimizers.RMSprop(rho=0.9, learning_rate=0.001)
model.compile(
    loss='binary_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy'],
)


Test with the full-name dataset (duplicates kept), using truncated-SVD features

In [14]:
# Inputs: the "Full_Name" strings as a Python list; targets: the "Gender" column as a NumPy array.
full_name_data = full_name_df["Full_Name"].tolist()
full_name_label = full_name_df["Gender"].to_numpy()

In [15]:
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
tfidf_full_name_data=TF_IDF(full_name_data).compute_tf_idf()
# TruncatedSVD's default solver is randomized; a fixed seed makes the projection reproducible.
svd = TruncatedSVD(n_components=100, random_state=42)
# copy=False L2-normalizes the rows of tfidf_full_name_data IN PLACE and returns that same array,
# so every later use of tfidf_full_name_data also sees the normalized rows.
normalizer = Normalizer(copy=False)
full_name_normalized = normalizer.fit_transform(tfidf_full_name_data)
# Fit on the normalized matrix by name so the data flow no longer relies on the in-place side effect.
full_name_svd = svd.fit_transform(full_name_normalized)
full_name_svd.shape

(26373, 100)

In [16]:
# 80/20 train/test split with a fixed seed; the labels are reshaped to a column vector of shape (n, 1).
fullname_train_X, fullname_test_X, fullname_train_y, fullname_test_y = train_test_split(full_name_svd, full_name_label.reshape(-1,1), test_size=0.2, random_state=42)

In [17]:
# Train for 150 epochs (batches of 200, 20% of the training split held out for validation)
# and print the elapsed wall-clock seconds.
start_time = time.time()
model.fit(
    fullname_train_X,
    fullname_train_y,
    batch_size=200,
    epochs=150,
    validation_split=0.2,
)
print(f"Time: {time.time() - start_time}")

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78

In [18]:
# Score the held-out test split; evaluate() yields the loss followed by the compiled accuracy metric.
loss, accuracy = model.evaluate(fullname_test_X, fullname_test_y)

print("Test Loss:", loss)
print("Test Accuracy:", accuracy)

Test Loss: 0.2716262936592102
Test Accuracy: 0.9309952855110168


Test with the full-name dataset (duplicates kept), without truncated SVD (full TF-IDF features)

In [26]:
# 80/20 split (seed 42) of the full TF-IDF matrix, i.e. without the SVD projection.
# NOTE(review): tfidf_full_name_data was L2-normalized in place by the Normalizer(copy=False) cell.
raw_fullname_train_X, raw_fullname_test_X, raw_fullname_train_y, raw_fullname_test_y = train_test_split(tfidf_full_name_data, full_name_label, test_size=0.2, random_state=42)

In [None]:
# Same 100-256-64-16-8 ReLU stack with L2 regularization and a sigmoid output,
# with the input layer sized to the full TF-IDF vocabulary.
model1 = Sequential([
    Dense(100, input_dim=tfidf_full_name_data.shape[1], activation='relu', kernel_regularizer=regularizers.l2(0.01)),
    Dense(256, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
    Dense(64, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
    Dense(16, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
    Dense(8, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
    Dense(1, activation='sigmoid'),
])

In [None]:
# RMSprop with binary cross-entropy for the sigmoid output; accuracy is reported during training.
optimizer = tf.keras.optimizers.RMSprop(rho=0.9, learning_rate=0.001)
model1.compile(
    loss='binary_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy'],
)

In [None]:
# Train on the full TF-IDF features (150 epochs, batches of 200, 20% validation hold-out)
# and print the elapsed wall-clock seconds.
start_time = time.time()
model1.fit(
    raw_fullname_train_X,
    raw_fullname_train_y,
    batch_size=200,
    epochs=150,
    validation_split=0.2,
)
print(f"Time: {time.time() - start_time}")

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78

In [None]:
# Score model1 on its held-out test split (loss, then the compiled accuracy metric).
loss, accuracy = model1.evaluate(raw_fullname_test_X, raw_fullname_test_y)

print("Test Loss:", loss)
print("Test Accuracy:", accuracy)

Test Loss: 0.23045121133327484
Test Accuracy: 0.9560189843177795


Test with the de-duplicated full-name dataset, using truncated-SVD features

In [28]:
# Inputs: the "Full_Name" strings as a Python list; targets: the "Gender" column as a NumPy array.
fullname_no_dup_data = full_name_no_dup_df["Full_Name"].tolist()
fullname_no_dup_label = full_name_no_dup_df["Gender"].to_numpy()

In [29]:
from sklearn.decomposition import TruncatedSVD
tfidf_fullname_no_dup_data=TF_IDF(fullname_no_dup_data).compute_tf_idf()
# TruncatedSVD's default solver is randomized; a fixed seed makes the 100-d projection reproducible.
svd = TruncatedSVD(n_components=100, random_state=42)
full_name_no_dup_svd = svd.fit_transform(tfidf_fullname_no_dup_data)

In [None]:
# 80/20 train/test split of the SVD features with a fixed seed.
fn_no_dup_train_X, fn_no_dup_test_X, fn_no_dup_train_y, fn_no_dup_test_y = train_test_split(full_name_no_dup_svd, fullname_no_dup_label, test_size=0.2, random_state=42)

In [None]:
# Same 100-256-64-16-8 ReLU stack with L2 regularization and a sigmoid output,
# with the input layer sized to the SVD feature count.
model2 = Sequential([
    Dense(100, input_dim=full_name_no_dup_svd.shape[1], activation='relu', kernel_regularizer=regularizers.l2(0.01)),
    Dense(256, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
    Dense(64, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
    Dense(16, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
    Dense(8, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
    Dense(1, activation='sigmoid'),
])

In [None]:
# RMSprop with binary cross-entropy for the sigmoid output; accuracy is reported during training.
optimizer = tf.keras.optimizers.RMSprop(rho=0.9, learning_rate=0.001)
model2.compile(
    loss='binary_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy'],
)

In [None]:
# Train on the de-duplicated SVD features (150 epochs, batches of 200, 20% validation hold-out)
# and print the elapsed wall-clock seconds.
start_time = time.time()
model2.fit(
    fn_no_dup_train_X,
    fn_no_dup_train_y,
    batch_size=200,
    epochs=150,
    validation_split=0.2,
)
print(f"Time: {time.time() - start_time}")

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78

In [None]:
# Score model2 on its held-out test split (loss, then the compiled accuracy metric).
loss, accuracy = model2.evaluate(fn_no_dup_test_X, fn_no_dup_test_y)

print("Test Loss:", loss)
print("Test Accuracy:", accuracy)

Test Loss: 0.2641112208366394
Test Accuracy: 0.9304515719413757


Test with the de-duplicated full-name dataset, without truncated SVD (full TF-IDF features)

In [None]:
# 80/20 split (seed 42) of the full TF-IDF matrix of the de-duplicated names, without SVD.
raw_no_dup_train_X, raw_no_dup_test_X, raw_no_dup_train_y, raw_no_dup_test_y = train_test_split(tfidf_fullname_no_dup_data, fullname_no_dup_label, test_size=0.2, random_state=42)

In [None]:
# Same 100-256-64-16-8 ReLU stack with L2 regularization and a sigmoid output,
# with the input layer sized to the full TF-IDF vocabulary.
model3 = Sequential([
    Dense(100, input_dim=tfidf_fullname_no_dup_data.shape[1], activation='relu', kernel_regularizer=regularizers.l2(0.01)),
    Dense(256, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
    Dense(64, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
    Dense(16, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
    Dense(8, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
    Dense(1, activation='sigmoid'),
])

In [None]:
# RMSprop with binary cross-entropy for the sigmoid output; accuracy is reported during training.
optimizer = tf.keras.optimizers.RMSprop(rho=0.9, learning_rate=0.001)
model3.compile(
    loss='binary_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy'],
)

In [None]:
# Train on the de-duplicated full TF-IDF features (150 epochs, batches of 200, 20% validation
# hold-out) and print the elapsed wall-clock seconds.
start_time = time.time()
model3.fit(
    raw_no_dup_train_X,
    raw_no_dup_train_y,
    batch_size=200,
    epochs=150,
    validation_split=0.2,
)
print(f"Time: {time.time() - start_time}")

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78

In [None]:
# Score model3 on its held-out test split (loss, then the compiled accuracy metric).
loss, accuracy = model3.evaluate(raw_no_dup_test_X, raw_no_dup_test_y)

print("Test Loss:", loss)
print("Test Accuracy:", accuracy)

Test Loss: 0.21395626664161682
Test Accuracy: 0.9413185119628906


Test some metrics

In [9]:

import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import log_loss, precision_recall_curve, PrecisionRecallDisplay, roc_curve, RocCurveDisplay, roc_auc_score

class Metric:
    def __init__(self, y_true:np.ndarray, y_pred:np.ndarray, y_proba:np.ndarray=None):
        self.y_true = y_true
        self.y_pred = y_pred
        self.y_proba = y_proba

    # Accuracy
    def getAccuracy(self):
        return accuracy_score(self.y_true, self.y_pred)

    # Recall
    def getRecallClass0(self):
        return recall_score(self.y_true, self.y_pred, pos_label=0, average="binary")
    def getRecallClass1(self):
        return recall_score(self.y_true, self.y_pred, pos_label=1, average="binary")
    def getMacroAvgRecall(self):
        return recall_score(self.y_true, self.y_pred, average="macro")
    def getWeightedAvgRecall(self):
        return recall_score(self.y_true, self.y_pred, average="weighted")

    # Precision
    def getPrecisionClass0(self):
        return precision_score(self.y_true, self.y_pred, pos_label=0, average="binary")
    def getPrecisionClass1(self):
        return precision_score(self.y_true, self.y_pred, pos_label=1, average="binary")
    def getMacroAvgPrecision(self):
        return precision_score(self.y_true, self.y_pred, average="macro")
    def getWeightedAvgPrecision(self):
        return precision_score(self.y_true, self.y_pred, average="weighted")

    # F1
    def getF1Class0(self):
        return f1_score(self.y_true, self.y_pred, pos_label=0, average="binary")
    def getF1Class1(self):
        return f1_score(self.y_true, self.y_pred, pos_label=1, average="binary")
    def getMacroAvgF1(self):
        return f1_score(self.y_true, self.y_pred, average="macro")
    def getWeightedAvgF1(self):
        return f1_score(self.y_true, self.y_pred, average="weighted")

    # Confusion matrix
    def getConfusionMatrix(self):
        cm = confusion_matrix(self.y_true, self.y_pred)
        cm_disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[0,1])
        return cm, cm_disp

    # Classification report
    def getClassificationReport(self):
        return classification_report(self.y_true, self.y_pred)

    ##########

    # Log loss
    def getLogLoss(self):
        return log_loss(self.y_true, self.y_proba)

    # Precision-recall curve
    def getPrecisionRecallCurveClass0(self):
        prec, recall, _ = precision_recall_curve(self.y_true, 1 - self.y_proba, pos_label=0)
        pr_disp = PrecisionRecallDisplay(prec, recall)
        return pr_disp

    def getPrecisionRecallCurveClass1(self):
        prec, recall, _ = precision_recall_curve(self.y_true, self.y_proba, pos_label=1)
        pr_disp = PrecisionRecallDisplay(prec, recall)
        return pr_disp

    # ROC curve
    def getRocCurveClass0(self):
        fpr, tpr, _ = roc_curve(self.y_true, 1 - self.y_proba, pos_label=0)
        roc_disp = RocCurveDisplay(fpr=fpr, tpr=tpr)
        return roc_disp

    def getRocCurveClass1(self):
        fpr, tpr, _ = roc_curve(self.y_true, self.y_proba, pos_label=1)
        roc_disp = RocCurveDisplay(fpr=fpr, tpr=tpr)
        return roc_disp

    def getRocAucScore(self):
        return roc_auc_score(self.y_true, self.y_proba)

Test on fullname dataset

In [19]:
# Sigmoid outputs are used as class-1 probabilities and thresholded at 0.5 into hard 0/1 labels.
fullname_y_pred_prob = model.predict(fullname_test_X)
fullname_y_pred = (fullname_y_pred_prob > 0.5).astype(int)
fullname_metric = Metric(fullname_test_y, fullname_y_pred, fullname_y_pred_prob)



In [20]:
# Precision, recall and F1 with class 0 as the positive label.
print(fullname_metric.getPrecisionClass0())
print(fullname_metric.getRecallClass0())
print(fullname_metric.getF1Class0())

0.9664328657314629
0.866576819407008
0.9137849360492657


In [21]:
# Precision, recall and F1 with class 1 as the positive label.
print(fullname_metric.getPrecisionClass1())
print(fullname_metric.getRecallClass1())
print(fullname_metric.getF1Class1())

0.909423604757548
0.9780255821580847
0.9424778761061946


In [22]:
# Overall accuracy, log loss of the predicted probabilities, and ROC AUC.
print(fullname_metric.getAccuracy())
print(fullname_metric.getLogLoss())
print(fullname_metric.getRocAucScore())

0.9309952606635071
0.17900650175697372
0.9798640769203343


Test on first name dataset

In [24]:
# Inputs: the "Full_Name" column of the first-name table as a list; targets: its "Gender" column.
first_name_data = first_name_df["Full_Name"].tolist()
first_name_label = first_name_df["Gender"].to_numpy()

In [31]:
first_name_X = TF_IDF(first_name_data).compute_tf_idf()
# TruncatedSVD's default solver is randomized; a fixed seed makes the 100-d projection reproducible.
svd = TruncatedSVD(n_components=100, random_state=42)
first_name_svd = svd.fit_transform(first_name_X)

In [37]:
# 80/20 train/test split of the SVD features with a fixed seed.
firstname_train_X, firstname_test_X, firstname_train_y, firstname_test_y = train_test_split(first_name_svd, first_name_label, test_size=0.2, random_state=42)

In [38]:
# Same 100-256-64-16-8 ReLU stack with L2 regularization and a sigmoid output,
# with the input layer sized to the first-name training features.
firstname_model = Sequential([
    Dense(100, input_dim=firstname_train_X.shape[1], activation='relu', kernel_regularizer=regularizers.l2(0.01)),
    Dense(256, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
    Dense(64, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
    Dense(16, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
    Dense(8, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
    Dense(1, activation='sigmoid'),
])

In [40]:
# RMSprop with binary cross-entropy for the sigmoid output; accuracy is reported during training.
optimizer = tf.keras.optimizers.RMSprop(rho=0.9, learning_rate=0.001)
firstname_model.compile(
    loss='binary_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy'],
)

In [41]:
# Train for 150 epochs (batches of 200), holding out 20% of the training split for validation.
firstname_model.fit(firstname_train_X,firstname_train_y,epochs=150,validation_split=0.2, batch_size=200)

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78

<keras.callbacks.History at 0x7ff79036b160>

In [42]:
# Sigmoid outputs are used as class-1 probabilities and thresholded at 0.5 into hard 0/1 labels.
firstname_y_pred_prob = firstname_model.predict(firstname_test_X)
firstname_y_pred = (firstname_y_pred_prob>0.5).astype(int)
firstname_metric = Metric(firstname_test_y, firstname_y_pred, firstname_y_pred_prob)



In [43]:
# Precision, recall and F1 with class 0 as the positive label.
print(firstname_metric.getPrecisionClass0())
print(firstname_metric.getRecallClass0())
print(firstname_metric.getF1Class0())

0.8822571893651655
0.7265415549597856
0.7968635138446459


In [44]:
# Precision, recall and F1 with class 1 as the positive label.
print(firstname_metric.getPrecisionClass1())
print(firstname_metric.getRecallClass1())
print(firstname_metric.getF1Class1())

0.8264814289764673
0.9307151979565773
0.8755068328577865


In [45]:
# Overall accuracy, log loss of the predicted probabilities, and ROC AUC.
print(firstname_metric.getAccuracy())
print(firstname_metric.getLogLoss())
print(firstname_metric.getRocAucScore())

0.8456238361266294
0.350798346820495
0.9189351866118376


Test with middle name dataset

In [49]:
# Inputs: the "Full_Name" column of the middle-name table as a list; targets: its "Gender" column.
middle_name_data = middle_name_df["Full_Name"].tolist()
middle_name_label = middle_name_df["Gender"].to_numpy()

In [47]:
middle_name_X = TF_IDF(middle_name_data).compute_tf_idf()
# TruncatedSVD's default solver is randomized; a fixed seed makes the 100-d projection reproducible.
svd = TruncatedSVD(n_components=100, random_state=42)
middle_name_svd = svd.fit_transform(middle_name_X)

In [51]:
# 80/20 train/test split of the SVD features with a fixed seed.
middle_train_X, middle_test_X, middle_train_y, middle_test_y = train_test_split(middle_name_svd, middle_name_label, test_size=0.2, random_state=42)

In [55]:
# Five L2-regularized ReLU layers (100-256-64-16-8) and a single sigmoid output unit.
middle_model = Sequential()
# Size the input layer from this dataset's own training matrix, so the cell does not depend on
# the first-name cells having been run or on both feature sets having the same width.
middle_model.add(Dense(100, input_dim=middle_train_X.shape[1], activation='relu', kernel_regularizer=regularizers.l2(0.01)))
middle_model.add(Dense(256, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
middle_model.add(Dense(64, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
middle_model.add(Dense(16, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
middle_model.add(Dense(8, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
middle_model.add(Dense(1, activation='sigmoid'))

# RMSprop with binary cross-entropy for the sigmoid output; accuracy is reported during training.
optimizer = tf.keras.optimizers.RMSprop(learning_rate=0.001, rho=0.9)
middle_model.compile(optimizer=optimizer,
              loss='binary_crossentropy',
              metrics=['accuracy'])


In [56]:
# Train for 150 epochs (batches of 200), holding out 20% of the training split for validation.
middle_model.fit(middle_train_X,middle_train_y,epochs=150,validation_split=0.2, batch_size=200)

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78

<keras.callbacks.History at 0x7ff72a9989a0>

In [57]:
# Sigmoid outputs are used as class-1 probabilities and thresholded at 0.5 into hard 0/1 labels.
middlename_y_pred_prob = middle_model.predict(middle_test_X)
middlename_y_pred = (middlename_y_pred_prob>0.5).astype(int)
middlename_metric = Metric(middle_test_y, middlename_y_pred, middlename_y_pred_prob)



In [58]:
# Precision, recall and F1 with class 0 as the positive label.
print(middlename_metric.getPrecisionClass0())
print(middlename_metric.getRecallClass0())
print(middlename_metric.getF1Class0())

0.9558521560574949
0.8148796498905908
0.8797543113630995


In [59]:
# Precision, recall and F1 with class 1 as the positive label.
print(middlename_metric.getPrecisionClass1())
print(middlename_metric.getRecallClass1())
print(middlename_metric.getF1Class1())

0.8733912002394493
0.9713715046604527
0.919779353821907


In [60]:
# Overall accuracy, log loss of the predicted probabilities, and ROC AUC.
print(middlename_metric.getAccuracy())
print(middlename_metric.getLogLoss())
print(middlename_metric.getRocAucScore())

0.903762525997353
0.23518215810629775
0.9610463218990288


Test on last name dataset

In [61]:
# Inputs: the "Full_Name" column of the last-name table as a list; targets: its "Gender" column.
last_name_data = last_name_df["Full_Name"].tolist()
last_name_label = last_name_df["Gender"].to_numpy()

In [62]:
last_name_X = TF_IDF(last_name_data).compute_tf_idf()
# TruncatedSVD's default solver is randomized; a fixed seed makes the 100-d projection reproducible.
svd = TruncatedSVD(n_components=100, random_state=42)
last_name_svd = svd.fit_transform(last_name_X)

In [63]:
# 80/20 train/test split of the SVD features with a fixed seed.
last_train_X, last_test_X, last_train_y, last_test_y = train_test_split(last_name_svd, last_name_label, test_size=0.2, random_state=42)

In [68]:
# Five L2-regularized ReLU layers (100-256-64-16-8) and a single sigmoid output unit.
lastname_model = Sequential()
# Size the input layer from this dataset's own training matrix, so the cell does not depend on
# the first-name cells having been run or on both feature sets having the same width.
lastname_model.add(Dense(100, input_dim=last_train_X.shape[1], activation='relu', kernel_regularizer=regularizers.l2(0.01)))
lastname_model.add(Dense(256, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
lastname_model.add(Dense(64, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
lastname_model.add(Dense(16, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
lastname_model.add(Dense(8, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
lastname_model.add(Dense(1, activation='sigmoid'))

# RMSprop with binary cross-entropy for the sigmoid output; accuracy is reported during training.
optimizer = tf.keras.optimizers.RMSprop(learning_rate=0.001, rho=0.9)
lastname_model.compile(optimizer=optimizer,
              loss='binary_crossentropy',
              metrics=['accuracy'])


In [69]:
# Train for 150 epochs (batches of 200), holding out 20% of the training split for validation.
lastname_model.fit(last_train_X,last_train_y,epochs=150,validation_split=0.2, batch_size=200)

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78

<keras.callbacks.History at 0x7ff7202573d0>

In [76]:
# Sigmoid outputs are used as class-1 probabilities and thresholded at 0.5 into hard 0/1 labels.
lastname_y_pred_prob = lastname_model.predict(last_test_X)
lastname_y_pred = (lastname_y_pred_prob>0.5).astype(int)
lastname_metric = Metric(last_test_y, lastname_y_pred, lastname_y_pred_prob)




In [73]:
# Precision, recall and F1 with class 0 as the positive label.
# NOTE(review): the recorded output is 0.0 for all three (with an sklearn warning) while class-1
# recall is 1.0, i.e. the last-name model predicted class 1 for every test sample.
print(lastname_metric.getPrecisionClass0())
print(lastname_metric.getRecallClass0())
print(lastname_metric.getF1Class0())

0.0
0.0
0.0


  _warn_prf(average, modifier, msg_start, len(result))


In [74]:
# Precision, recall and F1 with class 1 as the positive label.
print(lastname_metric.getPrecisionClass1())
print(lastname_metric.getRecallClass1())
print(lastname_metric.getF1Class1())

0.5767225325884544
1.0
0.7315460021259006


In [75]:
# Overall accuracy, log loss of the predicted probabilities, and ROC AUC.
print(lastname_metric.getAccuracy())
print(lastname_metric.getLogLoss())
print(lastname_metric.getRocAucScore())

0.5767225325884544
0.681336264571845
0.5


Test on the middle + first name dataset


In [77]:
# Inputs: the "Full_Name" column of the middle+first-name table as a list; targets: its "Gender" column.
middle_first_name_data = middle_first_name_df["Full_Name"].tolist()
middle_first_name_label = middle_first_name_df["Gender"].to_numpy()

In [78]:
middle_first_name_X = TF_IDF(middle_first_name_data).compute_tf_idf()
# TruncatedSVD's default solver is randomized; a fixed seed makes the 100-d projection reproducible.
svd = TruncatedSVD(n_components=100, random_state=42)
middle_first_name_svd = svd.fit_transform(middle_first_name_X)

In [84]:
# 80/20 train/test split of the SVD features with a fixed seed.
midfirst_train_X, midfirst_test_X, midfirst_train_y, midfirst_test_y = train_test_split(middle_first_name_svd, middle_first_name_label, test_size=0.2, random_state=42)

In [85]:
# Five L2-regularized ReLU layers (100-256-64-16-8) and a single sigmoid output unit.
midfirstname_model = Sequential()
# Size the input layer from this dataset's own training matrix, so the cell does not depend on
# the first-name cells having been run or on both feature sets having the same width.
midfirstname_model.add(Dense(100, input_dim=midfirst_train_X.shape[1], activation='relu', kernel_regularizer=regularizers.l2(0.01)))
midfirstname_model.add(Dense(256, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
midfirstname_model.add(Dense(64, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
midfirstname_model.add(Dense(16, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
midfirstname_model.add(Dense(8, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
midfirstname_model.add(Dense(1, activation='sigmoid'))

# RMSprop with binary cross-entropy for the sigmoid output; accuracy is reported during training.
optimizer = tf.keras.optimizers.RMSprop(learning_rate=0.001, rho=0.9)
midfirstname_model.compile(optimizer=optimizer,
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [86]:
# Train for 150 epochs (batches of 200), holding out 20% of the training split for validation.
midfirstname_model.fit(midfirst_train_X, midfirst_train_y,epochs=150,validation_split=0.2, batch_size=200)

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78

<keras.callbacks.History at 0x7ff720481de0>

In [90]:
# Sigmoid outputs are used as class-1 probabilities and thresholded at 0.5 into hard 0/1 labels.
midfirstname_y_pred_prob = midfirstname_model.predict(midfirst_test_X)
midfirstname_y_pred = (midfirstname_y_pred_prob>0.5).astype(int)
midfirstname_metric = Metric(midfirst_test_y, midfirstname_y_pred, midfirstname_y_pred_prob)



In [91]:
# Precision, recall and F1 with class 0 as the positive label.
print(midfirstname_metric.getPrecisionClass0())
print(midfirstname_metric.getRecallClass0())
print(midfirstname_metric.getF1Class0())

0.9547785547785548
0.9187976671152983
0.9364426154549612


In [92]:
# Precision, recall and F1 with class 1 as the positive label.
print(midfirstname_metric.getPrecisionClass1())
print(midfirstname_metric.getRecallClass1())
print(midfirstname_metric.getF1Class1())

0.943875968992248
0.9691181152499204
0.9563305058121269


In [93]:
# Overall accuracy, log loss of the predicted probabilities, and ROC AUC.
print(midfirstname_metric.getAccuracy())
print(midfirstname_metric.getLogLoss())
print(midfirstname_metric.getRocAucScore())

0.9482309124767225
0.1441853367701607
0.9846669663257723


Test on raw data set

In [94]:
# Load name_full.csv from the top of the Drive "Data " folder (not the "ML datasets" subfolder).
raw_df = pd.read_csv("/content/gdrive/MyDrive/Data /name_full.csv")

In [95]:
# Inputs: the "Full_Name" strings as a Python list; targets: the "Gender" column as a NumPy array.
raw_data = raw_df["Full_Name"].tolist()
raw_label = raw_df["Gender"].to_numpy()

In [96]:
raw_X = TF_IDF(raw_data).compute_tf_idf()
# TruncatedSVD's default solver is randomized; a fixed seed makes the 100-d projection reproducible.
svd = TruncatedSVD(n_components=100, random_state=42)
raw_svd = svd.fit_transform(raw_X)

In [97]:
# 80/20 train/test split of the SVD features with a fixed seed.
raw_train_X, raw_test_X, raw_train_y, raw_test_y = train_test_split(raw_svd, raw_label, test_size=0.2, random_state=42)

In [98]:
# Same 100-256-64-16-8 ReLU stack with L2 regularization and a sigmoid output,
# with the input layer sized to the raw-dataset training features.
test_model = Sequential([
    Dense(100, input_dim=raw_train_X.shape[1], activation='relu', kernel_regularizer=regularizers.l2(0.01)),
    Dense(256, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
    Dense(64, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
    Dense(16, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
    Dense(8, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
    Dense(1, activation='sigmoid'),
])

# RMSprop with binary cross-entropy for the sigmoid output; accuracy is reported during training.
optimizer = tf.keras.optimizers.RMSprop(rho=0.9, learning_rate=0.001)
test_model.compile(
    loss='binary_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy'],
)

In [103]:
# Train for 150 epochs with larger batches (1000) and a 15% validation hold-out,
# then print the elapsed wall-clock seconds.
start_time = time.time()
test_model.fit(
    raw_train_X,
    raw_train_y,
    batch_size=1000,
    epochs=150,
    validation_split=0.15,
)
print(f"Time: {time.time() - start_time}")

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78

In [104]:
# Score test_model on its held-out test split (loss, then the compiled accuracy metric).
loss, accuracy = test_model.evaluate(raw_test_X, raw_test_y)

print("Test Loss:", loss)
print("Test Accuracy:", accuracy)

Test Loss: 0.2211216390132904
Test Accuracy: 0.9476820230484009
