In [21]:
import pandas as pd
import random
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, Input
from keras.layers.merge import concatenate
from keras.models import Model
from sklearn.metrics import classification_report

# Load the Excel file and read the first sheet into a DataFrame
df = pd.read_excel(r'Dataset.xlsx', sheet_name=0, usecols='A:E', header=None)

# Concatenate all the columns into a single column
df = pd.DataFrame(df.apply(lambda x: ''.join(x.dropna().astype(str)), axis=1))

first_col = df.columns[0]
df.rename(columns={first_col: 'data'}, inplace=True)

import re
rows = []
for index, row in df.iloc[1:].iterrows():
    words = re.split(r"\s{2,}", row[0])
    new_list = [item.split('\t') for item in words]
    if len(new_list[0]) == 5:
        rows.append({'item_type': new_list[0][0], 'content': new_list[0][2]})

new_df = pd.DataFrame(rows)

new_df['content']=new_df['content'].str.lower()
new_df['item_type']=new_df['item_type'].str.lower()

train_size = int(new_df.shape[0]*0.8)
train_new_df = new_df[:train_size]
val_df = new_df[train_size:]  

item_type_dict = {item_typ: i for i, item_typ in enumerate(new_df['item_type'].unique())}
new_df['item_type_id'] = new_df['item_type'].apply(lambda x: item_type_dict[x])

tokenizer = Tokenizer()
tokenizer.fit_on_texts(new_df['content'])
sequences = tokenizer.texts_to_sequences(new_df['content'])

max_length = max(len(seq) for seq in sequences)
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')

labels = to_categorical(new_df['item_type_id'])

# Define the model architecture
embedding_dim = 50

content_input = Input(shape=(max_length,))
embedding_layer = Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=embedding_dim, input_length=max_length)
embedded_sequences = embedding_layer(content_input)
lstm_layer = LSTM(50)(embedded_sequences)

dense_layer = Dense(10, activation='relu')(lstm_layer)
output_layer = Dense(len(item_type_dict), activation='softmax')(dense_layer)

model = Model(inputs=content_input, outputs=output_layer)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

history = model.fit(padded_sequences, labels, epochs=5, batch_size=32, validation_split=0.2)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [25]:
# LSTM without NER confusion matrix
import numpy as np
from sklearn.metrics import confusion_matrix

val_df['item_type_id'] = val_df['item_type'].apply(lambda x: item_type_dict[x])

# Get the predicted labels for the validation set
val_sequences = tokenizer.texts_to_sequences(val_df['content'])
val_padded_sequences = pad_sequences(val_sequences, maxlen=max_length, padding='post')
y_pred = np.argmax(model.predict(val_padded_sequences), axis=-1)

# Get the true labels for the validation set
y_true = val_df['item_type_id']

# Print the confusion matrix
class_names = list(item_type_dict.keys())
print(confusion_matrix(y_true, y_pred, labels=list(item_type_dict.values())))
print("Class names:", class_names)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


[[    0  1614     0     0     0     0     0]
 [    0 16105     0     0     0     0     0]
 [    0  8414     0     0     0     0     0]
 [    0  1601     0     0     0     0     0]
 [    0   431     0     0     0     0     0]
 [    0    40     0     0     0     0     0]
 [    0     1     0     0     0     0     0]]
Class names: ['conditional', 'codelist', 'question', 'statement', 'instruction', 'loop', '']


In [32]:
# Get the predicted labels and ground truth labels for the validation set
val_padded_sequences = padded_sequences[int(len(padded_sequences)*0.8):]
val_labels = labels[int(len(labels)*0.8):]

val_pred_probs = model.predict(val_padded_sequences)
val_pred_labels = val_pred_probs.argmax(axis=1)
val_true_labels = val_labels.argmax(axis=1)

# Compute evaluation metrics
report = classification_report(val_true_labels, val_pred_labels, target_names=item_type_dict)
print(report)

              precision    recall  f1-score   support

 conditional       0.98      0.98      0.98      1614
    codelist       1.00      0.99      1.00     16105
    question       0.96      0.97      0.96      8414
   statement       0.83      0.76      0.79      1601
 instruction       0.68      0.78      0.73       431
        loop       0.54      0.93      0.68        40
                   0.00      0.00      0.00         1

    accuracy                           0.97     28206
   macro avg       0.71      0.77      0.73     28206
weighted avg       0.97      0.97      0.97     28206



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [31]:
from sklearn.metrics import classification_report

# Compute evaluation metrics
report = classification_report(val_true_labels, val_pred_labels, target_names=item_type_dict.keys())
print(report)

              precision    recall  f1-score   support

 conditional       0.00      0.00      0.00      1614
    codelist       0.57      1.00      0.73     16105
    question       0.00      0.00      0.00      8414
   statement       0.00      0.00      0.00      1601
 instruction       0.00      0.00      0.00       431
        loop       0.00      0.00      0.00        40
                   0.00      0.00      0.00         1

    accuracy                           0.57     28206
   macro avg       0.08      0.14      0.10     28206
weighted avg       0.33      0.57      0.42     28206



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [15]:
import numpy as np
from sklearn.metrics import classification_report

# Get the true labels
true_labels = new_df['item_type_id']

# Get the predicted labels
predicted_labels = model.predict([padded_sequences, padded_label_sequences])
predicted_labels = np.argmax(predicted_labels, axis=1)

# Get the class names
class_names = list(item_type_dict.keys())

# Compute the classification report
report = classification_report(true_labels, predicted_labels, target_names=class_names)

# Print the report
print(report)

              precision    recall  f1-score   support

 conditional       0.54      0.03      0.05      8031
    codelist       0.86      0.92      0.89     83866
    question       0.71      0.91      0.80     38974
   statement       0.57      0.00      0.01      7024
 instruction       0.00      0.00      0.00      2823
        loop       0.00      0.00      0.00       304
                   0.00      0.00      0.00         5

    accuracy                           0.80    141027
   macro avg       0.38      0.27      0.25    141027
weighted avg       0.76      0.80      0.75    141027



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# BiLSTM without NER

In [28]:
import pandas as pd
import re
import random
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, Input, Bidirectional, concatenate
from keras.models import Model

# Load the Excel file and read the first sheet into a DataFrame
df = pd.read_excel(r'Dataset.xlsx', sheet_name=0, usecols='A:E', header=None)

# Concatenate all the columns into a single column
df = pd.DataFrame(df.apply(lambda x: ''.join(x.dropna().astype(str)), axis=1))

first_col = df.columns[0]
df.rename(columns={first_col: 'data'}, inplace=True)
print(df)

rows = []
for index, row in df.iloc[1:].iterrows():
    words = re.split(r"\s{2,}", row[0])
    new_list = [item.split('\t') for item in words]
    if len(new_list[0]) == 5:
        rows.append({'item_type': new_list[0][0], 'content': new_list[0][2]})

new_df = pd.DataFrame(rows)
print(new_df)

new_df['content']=new_df['content'].str.lower()

new_df['item_type']=new_df['item_type'].str.lower()

train_size = int(new_df.shape[0]*0.8)
train_new_df = new_df[:train_size]
val_df = new_df[train_size:]  

item_type_dict = {item_typ: i for i, item_typ in enumerate(new_df['item_type'].unique())}
new_df['item_type_id'] = new_df['item_type'].apply(lambda x: item_type_dict[x])

tokenizer = Tokenizer()
tokenizer.fit_on_texts(new_df['content'])
sequences = tokenizer.texts_to_sequences(new_df['content'])

max_length = max(len(seq) for seq in sequences)
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')

labels = to_categorical(new_df['item_type_id'])

content_input = Input(shape=(max_length,))
embedding_layer = Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100, input_length=max_length)(content_input)
lstm_layer = Bidirectional(LSTM(50))(embedding_layer)

dense_layer = Dense(10, activation='relu')(lstm_layer)
output_layer = Dense(len(item_type_dict), activation='softmax')(dense_layer)

model = Model(inputs=content_input, outputs=output_layer)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(padded_sequences, labels, epochs=2, batch_size=32, validation_split=0.2)

                                                     data
0       item_type\titem_urn\tcontent\tinstrument_name\...
1       conditional\turn:ddi:uk.lha:bc358498-7085-4422...
2       conditional\turn:ddi:uk.lha:b7ec8f60-a0b7-4dbb...
3       codelist\turn:ddi:uk.lha:bdb9ae15-1848-41d7-8e...
4       codelist\turn:ddi:uk.lha:5fc4c7c5-2713-4d76-80...
...                                                   ...
186255  statement\turn:ddi:uk.lha:7817cb4c-99da-4676-a...
186256  instruction\turn:ddi:uk.lha:ba7dfd64-72ff-4988...
186257  instruction\turn:ddi:uk.lha:c93e8e55-26d3-474f...
186258  instruction\turn:ddi:uk.lha:ebf4e882-9fac-4a72...
186259  instruction\turn:ddi:uk.lha:ff7a6892-04af-4c41...

[186260 rows x 1 columns]
          item_type                                            content
0       conditional  and then go straight on to Question 8]qc_1_i =...
1       conditional                         "[If ""YES""]qc_21_a == 1"
2          codelist                                   1 a club a

<tensorflow.python.keras.callbacks.History at 0x225784b70c8>

In [29]:
val_df['item_type_id'] = val_df['item_type'].apply(lambda x: item_type_dict[x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [20]:
predicted_labels = model.predict(val_padded_sequences)
predicted_labels = np.argmax(predicted_labels, axis=1)

class_names = list(item_type_dict.keys())
true_labels = val_df['item_type_id']
conf_matrix = confusion_matrix(true_labels, predicted_labels, labels=list(item_type_dict.values()))
print("Confusion matrix:\n", conf_matrix)
print("Class names:", class_names)

Confusion matrix:
 [[ 1559     0    10    20    25     0     0]
 [    0 15984    67    38    16     0     0]
 [    1     7  8167   145    92     2     0]
 [   19    29   339  1147    67     0     0]
 [    5     1    43    32   349     1     0]
 [    0     2     5     2     0    31     0]
 [    1     0     0     0     0     0     0]]
Class names: ['conditional', 'codelist', 'question', 'statement', 'instruction', 'loop', '']


In [None]:
# Predict on validation data
val_sequences = tokenizer.texts_to_sequences(val_df['content'])
val_padded_sequences = pad_sequences(val_sequences, maxlen=max_length, padding='post')
val_labels = to_categorical(val_df['item_type_id'])
y_pred = model.predict(val_padded_sequences)
y_pred_classes = np.argmax(y_pred, axis=1)
y_true = np.argmax(val_labels, axis=1)

# Print classification report
print(classification_report(y_true, y_pred_classes))

In [12]:
val_df['item_type_id'] = val_df['item_type'].apply(lambda x: item_type_dict[x])

val_sequences = tokenizer.texts_to_sequences(val_df['content'])
val_padded_sequences = pad_sequences(val_sequences, maxlen=max_length, padding='post')

val_labels = to_categorical(val_df['item_type_id'])

loss, accuracy = model.evaluate(val_padded_sequences, val_labels)
print("Validation loss:", loss)
print("Validation accuracy:", accuracy)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Validation loss: 0.031851109117269516
Validation accuracy: 0.9908175468444824
