In [1]:
import pandas as pd
# Load the Excel file and read the first sheet into a DataFrame
df = pd.read_excel(r'Dataset.xlsx', sheet_name=0, usecols='A:E', header=None)

# Concatenate all the columns into a single column
df = pd.DataFrame(df.apply(lambda x: ''.join(x.dropna().astype(str)), axis=1))

In [2]:
first_col = df.columns[0]
df.rename(columns={first_col: 'data'}, inplace=True)
print(df)

                                                     data
0       item_type\titem_urn\tcontent\tinstrument_name\...
1       conditional\turn:ddi:uk.lha:bc358498-7085-4422...
2       conditional\turn:ddi:uk.lha:b7ec8f60-a0b7-4dbb...
3       codelist\turn:ddi:uk.lha:bdb9ae15-1848-41d7-8e...
4       codelist\turn:ddi:uk.lha:5fc4c7c5-2713-4d76-80...
...                                                   ...
186255  statement\turn:ddi:uk.lha:7817cb4c-99da-4676-a...
186256  instruction\turn:ddi:uk.lha:ba7dfd64-72ff-4988...
186257  instruction\turn:ddi:uk.lha:c93e8e55-26d3-474f...
186258  instruction\turn:ddi:uk.lha:ebf4e882-9fac-4a72...
186259  instruction\turn:ddi:uk.lha:ff7a6892-04af-4c41...

[186260 rows x 1 columns]


In [2]:
import re
rows = []
for index, row in df.iloc[1:].iterrows():
    words = re.split(r"\s{2,}", row[0])
    new_list = [item.split('\t') for item in words]
    if len(new_list[0]) == 5:
        rows.append({'item_type': new_list[0][0], 'content': new_list[0][2]})

new_df = pd.DataFrame(rows)
print(new_df)

          item_type                                            content
0       conditional  and then go straight on to Question 8]qc_1_i =...
1       conditional                         "[If ""YES""]qc_21_a == 1"
2          codelist                                   1 a club at work
3          codelist                           2 an outside sports club
4          codelist                             3 you and your friends
...             ...                                                ...
141022    statement  (N.B. By friends we mean people who you meet o...
141023  instruction                Please tick the appropriate column.
141024  instruction               (PLEASE TICK THE APPROPRIATE COLUMN)
141025  instruction         (Circle more than one number if necessary)
141026  instruction         (If necessary circle more than one number)

[141027 rows x 2 columns]


In [6]:
print (df.columns)

Index(['data'], dtype='object')


In [3]:
new_df['content']=new_df['content'].str.lower()

In [4]:
new_df['item_type']=new_df['item_type'].str.lower()

In [6]:
print (new_df.columns)

Index(['item_type', 'content'], dtype='object')


In [10]:
print (new_df)

          item_type                                            content  \
0       conditional  and then go straight on to question 8]qc_1_i =...   
1       conditional                         "[if ""yes""]qc_21_a == 1"   
2          codelist                                   1 a club at work   
3          codelist                           2 an outside sports club   
4          codelist                             3 you and your friends   
...             ...                                                ...   
141022    statement  (n.b. by friends we mean people who you meet o...   
141023  instruction                please tick the appropriate column.   
141024  instruction               (please tick the appropriate column)   
141025  instruction         (circle more than one number if necessary)   
141026  instruction         (if necessary circle more than one number)   

        item_type_id  
0                  0  
1                  0  
2                  1  
3                  

In [11]:
print (type(new_df))

<class 'pandas.core.frame.DataFrame'>


In [5]:
import spacy

nlp = spacy.load("en_core_web_sm")

In [6]:
def extract_entities(text):

    doc = nlp(text)
    entities = []
    for ent in doc.ents:
        entities.append((str(ent.text), str(ent.label_)))
    return entities
new_df = new_df.astype(str)

new_df['entities'] = new_df['content'].apply(extract_entities)

In [7]:
unique_labels = set()

for entities in new_df['entities']:
    try:
        unique_labels.update([entity[1] for entity in entities])
    except TypeError:
        continue

print(unique_labels)

{'ORDINAL', 'TIME', 'GPE', 'MONEY', 'FAC', 'CARDINAL', 'LAW', 'PERSON', 'DATE', 'PERCENT', 'LOC', 'QUANTITY', 'PRODUCT', 'NORP', 'EVENT', 'WORK_OF_ART', 'LANGUAGE', 'ORG'}


In [8]:
import random

def choose_entity_label(entities):
    if not entities:
        return 'none'
    else:
        return random.choice(entities)[1]

new_df['entities_label'] = new_df['entities'].apply(choose_entity_label)

In [9]:
train_size = int(new_df.shape[0]*0.8)
train_new_df = new_df[:train_size]
val_df = new_df[train_size:]  

In [10]:
#LSTM With NER
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, Input
from keras.layers.merge import concatenate
from keras.models import Model

item_type_dict = {item_typ: i for i, item_typ in enumerate(new_df['item_type'].unique())}
new_df['item_type_id'] = new_df['item_type'].apply(lambda x: item_type_dict[x])

tokenizer = Tokenizer()
tokenizer.fit_on_texts(new_df['content'])
sequences = tokenizer.texts_to_sequences(new_df['content'])

max_length = max(len(seq) for seq in sequences)
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')

label_tokenizer = Tokenizer()
label_tokenizer.fit_on_texts(new_df['entities_label'])
label_sequences = label_tokenizer.texts_to_sequences(new_df['entities_label'])

max_label_length = max(len(seq) for seq in label_sequences)
padded_label_sequences = pad_sequences(label_sequences, maxlen=max_label_length, padding='post')

labels = to_categorical(new_df['item_type_id'])

# Define the model architecture
embedding_dim = 50

content_input = Input(shape=(max_length,))
embedding_layer = Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=embedding_dim, input_length=max_length)
embedded_sequences = embedding_layer(content_input)
lstm_layer = LSTM(50)(embedded_sequences)

label_input = Input(shape=(max_label_length,))
label_embedding_layer = Embedding(input_dim=len(label_tokenizer.word_index) + 1, output_dim=embedding_dim, input_length=max_label_length)
embedded_label_sequences = label_embedding_layer(label_input)
lstm_label_layer = LSTM(50)(embedded_label_sequences)

merged = concatenate([lstm_layer, lstm_label_layer])
dense_layer = Dense(10, activation='relu')(merged)
output_layer = Dense(len(item_type_dict), activation='softmax')(dense_layer)

model = Model(inputs=[content_input, label_input], outputs=output_layer)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

history = model.fit([padded_sequences, padded_label_sequences], labels, epochs=5, batch_size=32, validation_split=0.2)

# Get the predicted labels and ground truth labels for the validation set
val_padded_sequences = padded_sequences[int(len(padded_sequences)*0.8):]
val_padded_label_sequences = padded_label_sequences[int(len(padded_label_sequences)*0.8):]
val_labels = labels[int(len(labels)*0.8):]

val_pred_probs = model.predict([val_padded_sequences, val_padded_label_sequences])
val_pred_labels = val_pred_probs.argmax(axis=1)
val_true_labels = val_labels.argmax(axis=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [12]:
from sklearn.metrics import classification_report

# Compute evaluation metrics
report = classification_report(val_true_labels, val_pred_labels, target_names=item_type_dict.keys())
print(report)

              precision    recall  f1-score   support

 conditional       0.65      0.02      0.04      1614
    codelist       0.85      0.92      0.88     16105
    question       0.72      0.91      0.80      8414
   statement       0.70      0.00      0.01      1601
 instruction       0.00      0.00      0.00       431
        loop       0.00      0.00      0.00        40
                   0.00      0.00      0.00         1

    accuracy                           0.80     28206
   macro avg       0.42      0.27      0.25     28206
weighted avg       0.77      0.80      0.75     28206



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [26]:
import numpy as np
from sklearn.metrics import confusion_matrix

val_df['item_type_id'] = val_df['item_type'].apply(lambda x: item_type_dict[x])

val_sequences = tokenizer.texts_to_sequences(val_df['content'])
val_padded_sequences = pad_sequences(val_sequences, maxlen=max_length, padding='post')

val_label_sequences = label_tokenizer.texts_to_sequences(val_df['entities_label'])
val_padded_label_sequences = pad_sequences(val_label_sequences, maxlen=max_label_length, padding='post')

val_labels = to_categorical(val_df['item_type_id'])

loss, accuracy = model.evaluate([val_padded_sequences, val_padded_label_sequences], val_labels)
print("Validation loss:", loss)
print("Validation accuracy:", accuracy)

predicted_labels = model.predict([val_padded_sequences, val_padded_label_sequences])
predicted_labels = np.argmax(predicted_labels, axis=1)

class_names = list(item_type_dict.keys())
true_labels = val_df['item_type_id']
conf_matrix = confusion_matrix(true_labels, predicted_labels, labels=list(item_type_dict.values()))
print("Confusion matrix:\n", conf_matrix)
print("Class names:", class_names)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Validation loss: 0.6876544952392578
Validation accuracy: 0.7968871593475342
Confusion matrix:
 [[   38  1351   225     0     0     0     0]
 [   15 14666  1421     3     0     0     0]
 [    0   641  7772     1     0     0     0]
 [    2   328  1270     1     0     0     0]
 [    0   142   289     0     0     0     0]
 [    0    25    14     1     0     0     0]
 [    0     0     1     0     0     0     0]]
Class names: ['conditional', 'codelist', 'question', 'statement', 'instruction', 'loop', '']


In [None]:
# Training BiLSTM with NER

In [16]:
from keras.layers import Bidirectional

In [17]:
content_input = Input(shape=(max_length,))
embedding_layer = Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=embedding_dim, input_length=max_length)(content_input)
lstm_layer = Bidirectional(LSTM(20))(embedding_layer)

label_input = Input(shape=(max_label_length,))
label_embedding_layer = Embedding(input_dim=len(label_tokenizer.word_index) + 1, output_dim=embedding_dim, input_length=max_label_length)(label_input)
lstm_label_layer = Bidirectional(LSTM(20))(label_embedding_layer)

merged = concatenate([lstm_layer, lstm_label_layer])
dense_layer = Dense(10, activation='relu')(merged)
output_layer = Dense(len(item_type_dict), activation='softmax')(dense_layer)

model = Model(inputs=[content_input, label_input], outputs=output_layer)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit([padded_sequences, padded_label_sequences], labels, epochs=2, batch_size=32)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x2448484b5c8>

In [19]:
import numpy as np
from sklearn.metrics import confusion_matrix

val_df['item_type_id'] = val_df['item_type'].apply(lambda x: item_type_dict[x])

val_sequences = tokenizer.texts_to_sequences(val_df['content'])
val_padded_sequences = pad_sequences(val_sequences, maxlen=max_length, padding='post')

val_label_sequences = label_tokenizer.texts_to_sequences(val_df['entities_label'])
val_padded_label_sequences = pad_sequences(val_label_sequences, maxlen=max_label_length, padding='post')

val_labels = to_categorical(val_df['item_type_id'])

loss, accuracy = model.evaluate([val_padded_sequences, val_padded_label_sequences], val_labels)
print("Validation loss:", loss)
print("Validation accuracy:", accuracy)

predicted_labels = model.predict([val_padded_sequences, val_padded_label_sequences])
predicted_labels = np.argmax(predicted_labels, axis=1)

class_names = list(item_type_dict.keys())
true_labels = val_df['item_type_id']
conf_matrix = confusion_matrix(true_labels, predicted_labels, labels=list(item_type_dict.values()))
print("Confusion matrix:\n", conf_matrix)
print("Class names:", class_names)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Validation loss: 0.032366495579481125
Validation accuracy: 0.9911366105079651
Confusion matrix:
 [[ 1597     0     8     4     3     2     0]
 [    1 16094     3     4     3     0     0]
 [    2     6  8323    54    29     0     0]
 [    7     9    47  1507    30     1     0]
 [    3     1    14    15   398     0     0]
 [    0     2     1     0     0    37     0]
 [    0     0     0     1     0     0     0]]
Class names: ['conditional', 'codelist', 'question', 'statement', 'instruction', 'loop', '']


In [20]:
import numpy as np
from sklearn.metrics import classification_report

# Get the class names
class_names = list(item_type_dict.keys())

# Compute the classification report
report = classification_report(true_labels, predicted_labels, target_names=class_names)

# Print the report
print(report)

              precision    recall  f1-score   support

 conditional       0.99      0.99      0.99      1614
    codelist       1.00      1.00      1.00     16105
    question       0.99      0.99      0.99      8414
   statement       0.95      0.94      0.95      1601
 instruction       0.86      0.92      0.89       431
        loop       0.93      0.93      0.93        40
                   0.00      0.00      0.00         1

    accuracy                           0.99     28206
   macro avg       0.82      0.82      0.82     28206
weighted avg       0.99      0.99      0.99     28206



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
