# Project scope
Model design and training for Singaporean nationality, gender and race prediction

## 1. Data preprocessing
install required packages

In [1]:
# ! pip install pandas

observe data using VScode

In [2]:
import pandas as pd
raw_data=pd.read_csv('../data/training.csv')
raw_data.head()

Unnamed: 0,Name,Gender,Race,Nationality
0,Ho Siew Lai,Female,Chinese,Singaporean
1,Ng Beng Nam,Male,Chinese,Singaporean
2,Esther Tan,Female,Chinese,Singaporean
3,Pg Gwee,Male,Chinese,Singaporean
4,Nayli Qistina,Female,Chinese,Singaporean


## 1.1 Data cleaning
remove missing values

In [3]:
def clean_data(raw_data):
    # Drop rows with missing data across all columns
    raw_data = raw_data.dropna()
    # Drop duplicate rows across all columns
    raw_data = raw_data.drop_duplicates()
    # Filter rows based on columns: 'Gender', 'Race', 'Nationality'
    raw_data = raw_data[(raw_data['Gender'] != "Unknown") & (raw_data['Race'] != "Unknown") & (raw_data['Nationality'] != "Unknown")]
    # Filter rows based on columns: 'Gender', 'Race', 'Nationality'
    raw_data = raw_data[(raw_data['Gender'] != "Not Specified") & (raw_data['Race'] != "Not Specified") & (raw_data['Nationality'] != "Not Specified")]
    # Filter rows based on columns: 'Race', 'Nationality'
    raw_data = raw_data[(raw_data['Race'] != "Not specified") & (raw_data['Nationality'] != "Not specified")]
    # Replace 'Unknown (Non-specific)' with 'Non-specific'
    raw_data['Race'] = raw_data['Race'].replace('Unknown (Non-specific)', 'Non-specific')
    return raw_data

raw_data_clean = clean_data(raw_data.copy())
raw_data_clean.head()

Unnamed: 0,Name,Gender,Race,Nationality
0,Ho Siew Lai,Female,Chinese,Singaporean
1,Ng Beng Nam,Male,Chinese,Singaporean
2,Esther Tan,Female,Chinese,Singaporean
3,Pg Gwee,Male,Chinese,Singaporean
4,Nayli Qistina,Female,Chinese,Singaporean


In [4]:
raw_data_clean.shape

(8862, 4)

deletes titles in name and remove leading/trailing space

In [5]:
# Regular expression pattern for common titles (only at the beginning of the string)
pattern = r'^\b(Mr\.?|Mrs\.?|Ms\.?|Dr\.?|Doc\.?|Prof\.?|Sir|Madam|Miss)\b'

# Remove the titles from the "name" column
raw_data_clean['Name'] = raw_data_clean['Name'].replace(pattern, '', regex=True)

# Strip leading/trailing whitespace
raw_data_clean['Name'] = raw_data_clean['Name'].str.strip()
raw_data_clean.shape

(8862, 4)

export backup

In [6]:
# Export DataFrame to a CSV file
raw_data_clean.to_csv('../data/data_cleaned.csv', index=False)

## 1.2 Text data encoding

In [7]:
def clean_data(raw_data_clean):
    # Multi-label encode column 'Gender' using delimiter 'Female'
    loc_0 = raw_data_clean.columns.get_loc('Gender')
    raw_data_clean_encoded = raw_data_clean['Gender'].str.get_dummies(sep='Female').add_prefix('Gender_')
    raw_data_clean = pd.concat([raw_data_clean.iloc[:,:loc_0], raw_data_clean_encoded, raw_data_clean.iloc[:,loc_0+1:]], axis=1)
    # One-hot encode column: 'Race'
    raw_data_clean = pd.get_dummies(raw_data_clean, columns=['Race'])
    # One-hot encode column: 'Nationality'
    raw_data_clean = pd.get_dummies(raw_data_clean, columns=['Nationality'])
    return raw_data_clean

data_encoded = clean_data(raw_data_clean.copy())
data_encoded.head()

Unnamed: 0,Name,Gender_Male,Race_African,Race_Arabic,Race_Bangladeshi,Race_Bengali,Race_British,Race_Burmese,Race_Caucasian,Race_Chinese,...,Nationality_Nepali,Nationality_Other,Nationality_Russian,Nationality_Singapore,Nationality_Singaporean,Nationality_Spanish,Nationality_Sri Lankan,Nationality_Thai,Nationality_Turkish,Nationality_Vietnamese
0,Ho Siew Lai,0,False,False,False,False,False,False,False,True,...,False,False,False,False,True,False,False,False,False,False
1,Ng Beng Nam,1,False,False,False,False,False,False,False,True,...,False,False,False,False,True,False,False,False,False,False
2,Esther Tan,0,False,False,False,False,False,False,False,True,...,False,False,False,False,True,False,False,False,False,False
3,Pg Gwee,1,False,False,False,False,False,False,False,True,...,False,False,False,False,True,False,False,False,False,False
4,Nayli Qistina,0,False,False,False,False,False,False,False,True,...,False,False,False,False,True,False,False,False,False,False


In [8]:
def merge_race(data_encoded):
     # Define the race columns to keep
    races_to_keep = ['Race_Chinese', 'Race_Indian', 'Race_Malay']

    # Identify all race columns
    race_columns = [col for col in data_encoded.columns if col.startswith('Race_')]

    # Columns to be summed into 'Race_Other'
    other_race_columns = [col for col in race_columns if col not in races_to_keep]

    # Create 'Race_Other' column by summing the values of the other race columns
    data_encoded['Race_Other'] = data_encoded[other_race_columns].sum(axis=1) > 0

    # Ensure 'Race_Other' is not in the list of columns to drop
    other_race_columns = [col for col in other_race_columns if col != 'Race_Other']
    
    # Drop the original other race columns
    data_encoded.drop(columns=other_race_columns, inplace=True)

    return data_encoded

# Apply the function to your encoded data
data_encoded = merge_race(data_encoded)

In [9]:
def merge_nationality(data_encoded):
     # Define the race columns to keep
    nationality_to_keep = ['Nationality_Singaporean', 'Nationality_Singapore']

    # Identify all race columns
    nationality_columns = [col for col in data_encoded.columns if col.startswith('Nationality_')]

    # Columns to be summed into 'Race_Other'
    other_nationality_columns = [col for col in nationality_columns if col not in nationality_to_keep]

    # Create 'Race_Other' column by summing the values of the other race columns
    data_encoded['Nationality_Foreigner'] = data_encoded[other_nationality_columns].sum(axis=1) > 0

    # Ensure 'Race_Other' is not in the list of columns to drop
    other_nationality_columns = [col for col in other_nationality_columns if col != 'Nationality_Foreigner']
    
    # Drop the original other race columns
    data_encoded.drop(columns=other_nationality_columns, inplace=True)

    data_encoded['Nationality_Singaporean']= data_encoded['Nationality_Singaporean']+ data_encoded['Nationality_Singapore']
    data_encoded.drop(columns=['Nationality_Singapore'], inplace=True)
    return data_encoded

# Apply the function to your encoded data
data_encoded = merge_nationality(data_encoded)

export backup

In [10]:
# Export DataFrame to a CSV file
data_encoded.to_csv('../data/data_encoded.csv', index=False)

## Model structure design

In [11]:
import os
os.environ["KERAS_BACKEND"] = "tensorflow"
import keras

2024-01-05 09:00:55.312948: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-01-05 09:00:55.334432: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-05 09:00:55.334456: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-05 09:00:55.335004: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-01-05 09:00:55.338947: I tensorflow/core/platform/cpu_feature_guar

In [12]:
# bert_base_en_uncased
# 12-layer BERT model where all input is lowercased. Trained on English Wikipedia + BooksCorpus.

In [13]:
# from keras_nlp.models import BertTokenizer

# tokenizer = BertTokenizer.from_preset("bert_base_en")
# tokenized_names = [tokenizer(name) for name in data_encoded['Name']]

In [14]:
import keras_nlp
preprocessor = keras_nlp.models.BertPreprocessor.from_preset("bert_base_en")
import tensorflow as tf
names = tf.constant(data_encoded['Name'].to_list())
preprocessed_data = preprocessor(names)
token_ids = preprocessed_data['token_ids']

2024-01-05 09:00:58.686509: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-01-05 09:00:58.689782: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-01-05 09:00:58.689810: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-01-05 09:00:58.693177: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-01-05 09:00:58.693204: I external/local_xla/xla/stream_executor

In [15]:
import numpy as np

# Determine the maximum length from the token IDs
max_length = token_ids.shape[1]
# segment_ids are all zeros, as there's only one segment (single sentence/names)
segment_ids = np.zeros_like(token_ids)

# padding_mask is 1 where token_ids are not zero (actual tokens), and 0 where they are zero (padding)
padding_mask = np.where(token_ids != 0, 1, 0)

In [16]:
# Counting the number of race-related columns
num_race_classes = len([col for col in data_encoded.columns if col.startswith('Race_')])

# Counting the number of nationality-related columns
num_nationality_classes = len([col for col in data_encoded.columns if col.startswith('Nationality_')])

gender_labels = data_encoded['Gender_Male'].values
gender_labels = gender_labels.reshape(-1, 1)

race_columns = [col for col in data_encoded.columns if col.startswith('Race_')]
race_labels = data_encoded[race_columns].values

nationality_columns = [col for col in data_encoded.columns if col.startswith('Nationality_')]
nationality_labels = data_encoded[nationality_columns].values

In [17]:
# ! pip install -U scikit-learn

In [18]:
from sklearn.model_selection import train_test_split

# Stack the labels along the second axis (axis=1)
labels = np.concatenate([gender_labels, race_labels, nationality_labels], axis=1)

# Convert the token_ids (Tensor) to a NumPy array
token_ids_np = token_ids.numpy()

inputs = np.concatenate([token_ids_np, segment_ids, padding_mask], axis=-1)
# Now use train_test_split with the NumPy array
train_inputs, test_inputs, train_labels, test_labels = train_test_split(
    inputs, 
    labels,
    test_size=0.2, 
    random_state=42
)

In [19]:
# Assuming each component has the same length
length = max_length  # replace with actual length of each component

train_token_ids = train_inputs[:, :length]
train_segment_ids = train_inputs[:, length:2*length]
train_padding_mask = train_inputs[:, 2*length:]

test_token_ids = test_inputs[:, :length]
test_segment_ids = test_inputs[:, length:2*length]
test_padding_mask = test_inputs[:, 2*length:]

In [20]:
# Assuming the first column is gender, the next 'num_race_classes' columns are for race, and the rest are for nationality
train_gender_labels = train_labels[:, 0]
train_race_labels = train_labels[:, 1:1+num_race_classes]
train_nationality_labels = train_labels[:, 1+num_race_classes:]

test_gender_labels = test_labels[:, 0]
test_race_labels = test_labels[:, 1:1+num_race_classes]
test_nationality_labels = test_labels[:, 1+num_race_classes:]


In [21]:
def smooth_labels(labels, smoothing_factor=0.1):
    num_classes = labels.shape[1]  # Assuming one-hot encoded labels
    smooth_value = smoothing_factor / num_classes
    new_labels = labels * (1 - smoothing_factor) + smooth_value
    return new_labels

# Apply label smoothing
alpha = 0.1  # Example smoothing factor

# Apply smoothing to race and nationality labels only (as gender labels are binary)
train_race_labels = smooth_labels(train_race_labels, alpha)
train_nationality_labels = smooth_labels(train_nationality_labels, alpha)

test_race_labels = smooth_labels(test_race_labels, alpha)
test_nationality_labels = smooth_labels(test_nationality_labels, alpha)

In [22]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# Assuming train_gender_labels, train_race_labels, train_nationality_labels are your training labels

# Compute class weights for Gender
weights_gender = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(train_gender_labels),
    y=train_gender_labels
)
weights_gender = {i: weight for i, weight in enumerate(weights_gender)}

# Compute class weights for Race
weights_race = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(np.argmax(train_race_labels, axis=1)),
    y=np.argmax(train_race_labels, axis=1)
)
weights_race = {i: weight for i, weight in enumerate(weights_race)}

# Compute class weights for Nationality
weights_nationality = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(np.argmax(train_nationality_labels, axis=1)),
    y=np.argmax(train_nationality_labels, axis=1)
)
weights_nationality = {i: weight for i, weight in enumerate(weights_nationality)}


In [23]:
# Create sample weights for gender
sample_weights_gender = np.array([weights_gender[label] for label in train_gender_labels])

# Create sample weights for race
sample_weights_race = np.array([weights_race[label] for label in np.argmax(train_race_labels, axis=1)])

# Create sample weights for nationality
sample_weights_nationality = np.array([weights_nationality[label] for label in np.argmax(train_nationality_labels, axis=1)])

In [24]:
combined_sample_weights = (sample_weights_gender + sample_weights_race + sample_weights_nationality) / 3

In [25]:
from keras_nlp.models import BertBackbone
from keras.layers import Input, Dense
from keras.models import Model
from keras.layers import Dropout
from keras.regularizers import L1,L2
from keras.layers import BatchNormalization
from keras.initializers import RandomUniform

# Define the inputs
input_ids = Input(shape=(max_length,), dtype='int32', name='input_ids')
segment_ids = Input(shape=(max_length,), dtype='int32', name='segment_ids')
padding_mask = Input(shape=(max_length,), dtype='int32', name='padding_mask')

backbone = BertBackbone.from_preset("bert_base_en")
backbone_output = backbone({'token_ids': input_ids, 'segment_ids': segment_ids, 'padding_mask': padding_mask})
# Assuming 'pooled_output' is the pooled output from BERT
cls_output = backbone_output['pooled_output']


# Add Dropout and L2 Regularization in the classification layers
initializer = RandomUniform(minval=0.0, maxval=1.0)
classifier_layer = Dense(128, activation='relu', 
                         kernel_regularizer=L1(0.01),
                        activity_regularizer=L2(0.01),
                        kernel_initializer=initializer)(cls_output)
batch_norm = BatchNormalization()(classifier_layer)
dropout = Dropout(0.3)(batch_norm)
gender_output = Dense(1, 
                      activation='sigmoid', 
                      name='gender_output',
                      kernel_regularizer=L1(0.01),
                        activity_regularizer=L2(0.01),
                        kernel_initializer=initializer)(dropout)
# Define output layers for each classification task
race_output = Dense(num_race_classes, activation='softmax', name='race_output',kernel_regularizer=L1(0.01),
                        activity_regularizer=L2(0.01),
                        kernel_initializer=initializer)(dropout)
nationality_output = Dense(num_nationality_classes, activation='softmax', name='nationality_output',kernel_regularizer=L1(0.01),
                        activity_regularizer=L2(0.01),
                        kernel_initializer=initializer)(dropout)

In [26]:
model = Model(inputs=[input_ids,segment_ids,padding_mask], outputs=[gender_output, race_output, nationality_output])

In [27]:
from keras.callbacks import EarlyStopping

callback =EarlyStopping(monitor="val_loss",
    patience=0,
    mode="auto",
    restore_best_weights=True,
    start_from_epoch=5,
)

lr_schedule = keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=1e-2,
    decay_steps=10000,
    decay_rate=0.9)

model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=lr_schedule),
    loss={
        'gender_output': 'binary_crossentropy',
        'race_output': 'categorical_crossentropy',
        'nationality_output': 'categorical_crossentropy'
    },
    metrics={
        'gender_output': ['accuracy'],
        'race_output': ['accuracy'],
        'nationality_output': ['accuracy']
    }
)

In [28]:
model.fit(
    {'input_ids': train_token_ids, 'segment_ids': train_segment_ids, 'padding_mask': train_padding_mask},
    {'gender_output': train_gender_labels, 'race_output': train_race_labels, 'nationality_output': train_nationality_labels},
    # class_weight={
    #     'gender_output': weights_gender,
    #     'race_output': weights_race,
    #     'nationality_output': weights_nationality
    # },
    sample_weight=combined_sample_weights,
    epochs=30,
    batch_size=8,
    shuffle=True,
    verbose="auto",
    validation_split=0.2,
)

Epoch 1/30


2024-01-05 09:01:25.211290: I external/local_xla/xla/service/service.cc:168] XLA service 0x7f080801e990 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2024-01-05 09:01:25.211320: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce RTX 4090 Laptop GPU, Compute Capability 8.9
2024-01-05 09:01:25.704210: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-01-05 09:01:32.706758: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8907
I0000 00:00:1704445332.582687     521 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.
W0000 00:00:1704445332.690372     521 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update


[1m708/709[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 132ms/step - gender_output_accuracy: 0.6467 - loss: 7529.7876 - nationality_output_accuracy: 0.8670 - race_output_accuracy: 0.6881

W0000 00:00:1704445470.284520     518 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update


[1m709/709[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m221s[0m 211ms/step - gender_output_accuracy: 0.6468 - loss: 7513.3574 - nationality_output_accuracy: 0.8670 - race_output_accuracy: 0.6881 - val_gender_output_accuracy: 0.6671 - val_loss: 413.0852 - val_nationality_output_accuracy: 0.8822 - val_race_output_accuracy: 0.6911
Epoch 2/30
[1m  1/709[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m1:41[0m 143ms/step - gender_output_accuracy: 0.8750 - loss: 413.1082 - nationality_output_accuracy: 0.8750 - race_output_accuracy: 0.6250

W0000 00:00:1704445482.578834     517 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update


[1m709/709[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m100s[0m 141ms/step - gender_output_accuracy: 0.6599 - loss: 398.3272 - nationality_output_accuracy: 0.8936 - race_output_accuracy: 0.7050 - val_gender_output_accuracy: 0.6671 - val_loss: 346.7770 - val_nationality_output_accuracy: 0.8822 - val_race_output_accuracy: 0.6911
Epoch 3/30
[1m709/709[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m100s[0m 141ms/step - gender_output_accuracy: 0.6718 - loss: 325.2985 - nationality_output_accuracy: 0.9010 - race_output_accuracy: 0.7213 - val_gender_output_accuracy: 0.6671 - val_loss: 255.1484 - val_nationality_output_accuracy: 0.8822 - val_race_output_accuracy: 0.6911
Epoch 4/30
[1m709/709[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 142ms/step - gender_output_accuracy: 0.6654 - loss: 228.7589 - nationality_output_accuracy: 0.8953 - race_output_accuracy: 0.7084 - val_gender_output_accuracy: 0.6671 - val_loss: 148.0195 - val_nationality_output_accuracy: 0.8822 - val_race_out