# PREDICT VALUE FOR MISSING QUESTION TOPIC WITH RELABELLING V2

## Import Libraries

In [1]:
import os
os.environ['TF_MIN_GPU_MULTIPROCESSOR_COUNT'] = '6' # Needed so I can use my old GPU with the new one
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0' # Turns off oneDNN custom operations
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1' # Hides message regarding TensorFlow optimization
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

2025-12-07 22:06:34.494721: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1765105594.581336   16397 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1765105594.609180   16397 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1765105594.670207   16397 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1765105594.670335   16397 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1765105594.670338   16397 computation_placer.cc:177] computation placer alr

## Import CSV File & Remove Duplicates
NB:

'question_topic_valid.csv' represents records from the original dataset where 'question_topic' was not empty while 'question_topic_null.csv' are records with missing values for 'question_topic'.

In [2]:
# Import patitioned libraries of original dataset
# These datasets were created in another notebook (see 'data_wrangling_new_features.ipynb')
df_topic_exists = pd.read_csv('../data/question_topic_valid_relabel_r2.csv', usecols=[0,2,3,4,13,14]) # Import only essential columns
df_topic_null = pd.read_csv('../data/question_topic_null.csv')

# Drops duplicate 'question_content' due to multiple 'question_id' in the dataset
df_topic_exists.drop_duplicates(subset='question_id',inplace=True)
df_topic_exists.reset_index(drop=True, inplace=True)

## Tokenize The Questions

In [3]:
def list_of_sentences(col):
    sentence_list = []
    for text in col:
        splitted_text = text.lower().split()
        sentence_list.append(splitted_text)
    return sentence_list

sentences_topic_exist = list_of_sentences(df_topic_exists.question_content)
sentences_topic_null = list_of_sentences(df_topic_null.question_content)

# Initiate the tokenizer with a out_of_vocabulary token 
tokenizer_X = Tokenizer(oov_token="<OOV>")

# Generate word indexes for all sentences 
tokenizer_X.fit_on_texts(sentences_topic_exist+sentences_topic_null)

# Generate separate sequences for both with topic values and missing values
X = tokenizer_X.texts_to_sequences(sentences_topic_exist)
X_topic_null = tokenizer_X.texts_to_sequences(sentences_topic_null)

## Determine Word Counts & Maximum Sentence Length

In [4]:
print(f'The total number of words from all questions is {len(tokenizer_X.word_counts)}.')

max_len = 0
for l in X + X_topic_null: # Include questions from the entire dataset
    if len(l) > max_len:
        max_len = len(l)

print(f'The highest number of words in any sentence is {max_len}.')


The total number of words from all questions is 1292953.
The highest number of words in any sentence is 197.


In [5]:
max_features = 40000     # Use 40000 most frequent words from the total of 1292953 words

## Create Train & Test Datasets & Prepare For Model

In [6]:
# Split data into training and test sets
X_train_df, X_test_df, y_train_df, y_test_df = train_test_split(X, df_topic_exists.question_topic, test_size=0.2,
                                                                stratify=df_topic_exists.question_topic, random_state=42)

# Format X and y for model
X_train = np.array(sequence.pad_sequences(X_train_df, maxlen=max_len))
X_test = np.array(sequence.pad_sequences(X_test_df, maxlen=max_len))

y_train_one_hot = pd.get_dummies(y_train_df)
y_train = y_train_one_hot.to_numpy()
y_test_one_hot = pd.get_dummies(y_test_df)
y_test = y_test_one_hot.to_numpy()

## Configure Model

In [7]:
# Create transformer block class
class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.01):
        super(TransformerBlock, self).__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = tf.keras.Sequential([
            layers.Dense(ff_dim, activation='relu'),
            layers.Dense(embed_dim)
        ])
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=None):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

In [8]:
# Define the model with an embedding layer, transformer block, and output layer
embed_dim = 32 # Embedding dimension for each word vector
num_heads = 4  # The number of attention heads in the multi-head attention layer
ff_dim = 64    # Number of units in the feed forward layer

inputs = layers.Input(shape=(max_len,))

embedding_layer = layers.Embedding(input_dim=max_features, output_dim=embed_dim)
out = embedding_layer(inputs)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
out = transformer_block(out, training=True)
out = layers.GlobalAveragePooling1D()(out)
out = layers.Dropout(0.1)(out)
out = layers.Dense(20, activation='relu')(out)
out = layers.Dropout(0.1)(out)
outputs = layers.Dense(148, activation='softmax')(out)

model = tf.keras.Model(inputs=inputs, outputs=outputs)

I0000 00:00:1765105989.222164   16397 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 5518 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 4060 Ti, pci bus id: 0000:01:00.0, compute capability: 8.9
I0000 00:00:1765105989.224889   16397 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 2857 MB memory:  -> device: 1, name: NVIDIA GeForce GTX 1050 Ti, pci bus id: 0000:07:00.0, compute capability: 6.1


## Compile & Train Model

In [9]:
# Compile model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train model
model.fit(X_train, y_train, epochs=5, batch_size=512, validation_split=0.2)

2025-12-07 22:13:12.133844: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 2115021944 exceeds 10% of free system memory.


Epoch 1/5


I0000 00:00:1765105998.814131   16696 service.cc:152] XLA service 0x705480007360 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1765105998.814198   16696 service.cc:160]   StreamExecutor device (0): NVIDIA GeForce RTX 4060 Ti, Compute Capability 8.9
I0000 00:00:1765105998.814203   16696 service.cc:160]   StreamExecutor device (1): NVIDIA GeForce GTX 1050 Ti, Compute Capability 6.1
I0000 00:00:1765105999.387329   16696 cuda_dnn.cc:529] Loaded cuDNN version 91002
I0000 00:00:1765106019.016249   16696 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m5243/5243[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m256s[0m 44ms/step - accuracy: 0.8485 - loss: 0.6461 - val_accuracy: 0.9186 - val_loss: 0.2867
Epoch 2/5
[1m5243/5243[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m205s[0m 39ms/step - accuracy: 0.9127 - loss: 0.3183 - val_accuracy: 0.9206 - val_loss: 0.2619
Epoch 3/5
[1m5243/5243[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m205s[0m 39ms/step - accuracy: 0.9161 - loss: 0.2865 - val_accuracy: 0.9209 - val_loss: 0.2537
Epoch 4/5
[1m5243/5243[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m205s[0m 39ms/step - accuracy: 0.9184 - loss: 0.2677 - val_accuracy: 0.9218 - val_loss: 0.2465
Epoch 5/5
[1m5243/5243[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m205s[0m 39ms/step - accuracy: 0.9203 - loss: 0.2546 - val_accuracy: 0.9218 - val_loss: 0.2445


<keras.src.callbacks.history.History at 0x705661a26e90>

## Evaluate Model

In [10]:
test_loss, test_acc = model.evaluate(X_test, y_test)
print("Test Accuracy:", test_acc)

[1m26212/26212[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m180s[0m 7ms/step - accuracy: 0.9218 - loss: 0.2461
Test Accuracy: 0.9217656254768372


## Extract Failed Predictions

In [11]:
# Store in a list the column names for one-hot encoding (question_topic)
one_hot_columns = list(y_test_one_hot.columns)

# Store predictions for X_test
y_pred = model.predict(X_test)

# Add predictions column to y_test_df
y_test_df = y_test_df.to_frame()
y_test_df['predictions'] = [one_hot_columns[i] for i in np.argmax(y_pred, axis=1)]

# Merge index associated rows from the original source dataset along with the predictions 
test_df = pd.merge(df_topic_exists, y_test_df, left_index=True, right_index=True)

# Create new dataframe that stores rows from test df where predictions were incorrect plus adds the predictions column
false_predictions = pd.DataFrame()
for i,v in test_df.iterrows():
    if v.question_topic_x != v.predictions:
        row = pd.DataFrame({'question_language' : [v.question_language], 'question_content' : [v.question_content],
                            'question_user_status' : [v.question_user_status], 'question_user_country_code' : [v.question_user_country_code],
                            'question_topic': [v.question_topic_x], 'predictions' : [v.predictions]
                            })
        false_predictions = pd.concat([false_predictions, row], ignore_index=True)

[1m26212/26212[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m107s[0m 4ms/step


# Export Test df For Predictions Versus Actual Analysis

In [12]:
test_df.to_csv('../data/prediction_vs_actual_topic_r2.csv', index=False)

## Check For Any Indicators For Failure Rate

In [13]:
print(f'The test data % failure rate by {(false_predictions.question_language.value_counts() / test_df.question_language.value_counts()) * 100}\n')
print(f'The test data % failure rate by {(false_predictions.question_user_country_code.value_counts() / test_df.question_user_country_code.value_counts()) * 100}\n')
print(f'The test data % failure rate by {(false_predictions.question_user_status.value_counts() / test_df.question_user_status.value_counts()) * 100}')

The test data % failure rate by question_language
eng     6.288004
lug     9.215363
nyn     6.350424
swa    10.070384
Name: count, dtype: float64

The test data % failure rate by question_user_country_code
ke    9.148681
ug    6.301055
tz    7.397424
gb    6.666667
Name: count, dtype: float64

The test data % failure rate by question_user_status
live         7.505945
zombie       8.530012
destroyed    8.345033
blocked      8.276858
Name: count, dtype: float64


In [15]:
question_topic_failed = false_predictions.question_topic.value_counts().rename_axis('question_topic').reset_index(name='failed_prediction')
question_topic_total = test_df.question_topic_x.value_counts().rename_axis('question_topic').reset_index(name='total')
question_topic = pd.merge(question_topic_failed, question_topic_total, how='inner')
question_topic['percentage_failed'] = (question_topic['failed_prediction'] / question_topic['total']) * 100
question_topic = question_topic.sort_values(by=['percentage_failed'],ascending=False).reset_index(drop=True)

print(f'The top 60 failure rates \n {question_topic.head(60)}\n')
print(f'The middle 28 failure rates \n {question_topic[60:88]}\n')
print(f'The bottom 60 failure rates \n {question_topic.tail(60)}')

The top 60 failure rates 
         question_topic  failed_prediction  total  percentage_failed
0            courgette                 40     40         100.000000
1                 flax                 53     53         100.000000
2           gooseberry                 27     27         100.000000
3           guinea-pig                 33     33         100.000000
4          castor-bean                  6      6         100.000000
5                  rye                  6      6         100.000000
6           blackberry                  4      4         100.000000
7            cranberry                  1      1         100.000000
8              setaria                  3      3         100.000000
9                vetch                  4      4         100.000000
10            mulberry                  4      4         100.000000
11            leucaena                  4      4         100.000000
12        purple-vetch                  1      1         100.000000
13              celer

# Free Memory For Next Step

NB: Optional step if system resources are limited

In [16]:
# %xdel sentences_topic_exist
# %xdel X 
# %xdel X_train_df
# %xdel X_test_df
# %xdel y_train_df
# %xdel y_test_df
# %xdel X_train
# %xdel X_test
# %xdel y_train_one_hot
# %xdel y_train
# %xdel y_test_one_hot
# %xdel y_test
# %xdel y_pred
# %xdel false_predictions
# %xdel df_topic_exists

## Make Predictions For Missing question_topic Values

In [17]:
# Create X input and make predictions
X_topic_null_predict = np.array(sequence.pad_sequences(X_topic_null, maxlen=max_len))
y_pred_topic_null = model.predict(X_topic_null_predict)

# Convert predictions to labels
topic_null_predictions = [one_hot_columns[i] for i in np.argmax(y_pred_topic_null, axis=1)]

# Insert predictions into 'question_topic' column for null dataframe
df_topic_null['question_topic'] = topic_null_predictions

[1m     2/110555[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m2:53:01[0m 94ms/step

2025-12-07 22:52:05.196307: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 2787730452 exceeds 10% of free system memory.


[1m110555/110555[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m466s[0m 4ms/step


2025-12-07 23:00:45.392459: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 2094335568 exceeds 10% of free system memory.


# Free Memory For Next Step

NB: Optional step if system resources are limited

In [18]:
# %xdel X_topic_null_predict
# %xdel y_pred_topic_null
# %xdel topic_null_predictions

# Export To CSV File

In [19]:
# Import full datset without missing values for 'question_topic'
# NB: Could not do this before due to resource limit on my computer
chunks = pd.read_csv('../data/question_topic_valid.csv',
                     dtype={'question_user_gender': str, 'response_user_gender': str}, # Removes mixed dtypes error message
                     chunksize=100000
                    )
df_topic_exists = pd.DataFrame()

for chunk in chunks:
    df_topic_exists = pd.concat([df_topic_exists,chunk], axis=0)


# Combine dataset without missing values with the predicted values to recreate the full dataset
df_no_missing = pd.concat([df_topic_exists, df_topic_null], axis=0)


# Export the predicted values only and the full dataset now with no missing values
df_topic_null.to_csv('../data/question_topic_predicted_r2.csv', index=False)
df_no_missing.to_csv('../data/question_topic_no_missing_r2.csv', index=False)