#### DEPENDENCIES ####

In [1]:
import numpy as np
import keras
import nltk
nltk.download('punkt')
import pyconll
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.layers import Bidirectional, Embedding, LSTM, Dense, Input, TimeDistributed
from keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.callbacks import CSVLogger
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\marci\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


#### VERSION ####

In [2]:
print("Tensorflow ", tf.__version__)
print("Keras ", keras.__version__)

Tensorflow  2.13.0
Keras  2.13.1


Tensorflow  2.13.0

Keras  2.13.1

#### LANGUAGE AND TRAINING CHOICE ####

In [3]:
################### DO YOU WANT TO RETRAIN THE MODEL? True/False #######################

retrain = True

########################################################################################


my_train_conll_file_location = "languages/en_ewt-ud-train.conllu"
my_dev_conll_file_location = "languages/en_ewt-ud-dev.conllu"
my_test_conll_file_location = "languages/en_ewt-ud-test.conllu"

#### LOAD DATA ####

In [4]:
train_file = pyconll.load_from_file(my_train_conll_file_location)
test_file = pyconll.load_from_file(my_dev_conll_file_location)
dev_file = pyconll.load_from_file(my_test_conll_file_location)

#print the first sentence
#print(train_file[0].text)

#look for sentences with more than 128 words or empty words
for sentence in train_file:
    if len(sentence) > 128:
        print("\nSentence in train: ", sentence.text)
        print("Sentence length: ", len(sentence))
        train_file.remove(sentence)
        print("Sentence removed, too long")
    
    for word in sentence:
        if word.form == None:
            train_file.remove(sentence)
            # print("Sentence removed, None existed")

for sentence in test_file:
    if len(sentence) > 128:
        print("\nSentence in test: ", sentence.text)
        print("Sentence length: ", len(sentence))
        test_file.remove(sentence)
        print("Sentence removed, too long")
    
    for word in sentence:
        if word.form == None:
            test_file.remove(sentence)
            # print("Sentence removed, None existed")

for sentence in dev_file:
    if len(sentence) > 128:
        print("\nSentence in dev: ", sentence.text)
        print("Sentence length: ", len(sentence))
        dev_file.remove(sentence)
        print("Sentence removed, too long")
    
    for word in sentence:
        if word.form == None:
            dev_file.remove(sentence)
            # print("Sentence removed, None existed")


Sentence in train:  Antichrist John Lennon wanted to compete with Jesus Christ, and so he grew a beard and started to make a bogus role of Christ together with Yoko Ono at the Amsterdam Hilton hotel proclaiming "Peace", being then when he was visited by the Canadian journalist who ridiculized and admonished him wanting to know about what Lennon meant when he wrote in the lyrics of "The ballad of John and Yoko": "the way things are going, they're going to crucify me...", The CURSE OF GOD upon John Lennon carried on with all type of miseries and distresses which made Lennon give the interview to the "Rolling Stone" magazine (today condensed in the "Lennon remembers" book) where he speaks about how bad thing were going for him blaming "whatever is up there" for it (referring to God).
Sentence length:  160
Sentence removed, too long

Sentence in train:  my name is Josalyn Leainne Creek and i'm 19 years old and i graduated from high in May adn of this year and i love animals and i'm great 

#### SPLIT DATASETS AND CREATE LIST OF PoS TAGS ####

In [5]:
# X_train, y_train: Training data
# X_dev, y_dev: Development (validation) data
# X_test, y_test: Test data

# create a list of sentences
X_train = []
y_train = []
for sentence in train_file:
    X_train.append([word.form for word in sentence])
    y_train.append([word.upos for word in sentence])
    
X_dev = []
y_dev = []
for sentence in dev_file:
    X_dev.append([word.form for word in sentence])
    y_dev.append([word.upos for word in sentence])
    
X_test = []
y_test = []
for sentence in test_file:
    X_test.append([word.form for word in sentence])
    y_test.append([word.upos for word in sentence])
    
# Create a list of all part-of-speech tags
pos_tags = set()
for sentence in y_train:
    for tag in sentence:
        pos_tags.add(tag)
num_pos_tags = len(pos_tags)

print("POS tags: ", pos_tags)

# Create a mapping from part-of-speech tags to integers
pos_tag_names = list(pos_tags)
pos_tag_to_idx = {t: i for i, t in enumerate(pos_tag_names)}

print("POS tag to index: ", pos_tag_to_idx)

POS tags:  {'PUNCT', 'VERB', 'ADP', 'SCONJ', 'INTJ', 'SYM', 'PART', 'AUX', 'DET', 'ADV', 'X', 'NUM', None, 'ADJ', 'PROPN', 'PRON', 'CCONJ', 'NOUN'}
POS tag to index:  {'PUNCT': 0, 'VERB': 1, 'ADP': 2, 'SCONJ': 3, 'INTJ': 4, 'SYM': 5, 'PART': 6, 'AUX': 7, 'DET': 8, 'ADV': 9, 'X': 10, 'NUM': 11, None: 12, 'ADJ': 13, 'PROPN': 14, 'PRON': 15, 'CCONJ': 16, 'NOUN': 17}


#### TOKENIZER ####

In [6]:
# Creates a world-level tokenizer
tokenizer = Tokenizer(oov_token="<unk>", filters='\t\n')

tokenizer.fit_on_texts(X_train)

vocabulary_size = len(tokenizer.word_index) + 1

max_sequence_length = 128

X_train_OHE = tokenizer.texts_to_sequences(X_train)
X_train_OHE = pad_sequences(X_train_OHE, padding='post', maxlen=max_sequence_length)

X_dev_OHE = tokenizer.texts_to_sequences(X_dev)
X_dev_OHE = pad_sequences(X_dev_OHE, padding='post', maxlen=max_sequence_length)

X_test_OHE = tokenizer.texts_to_sequences(X_test)
X_test_OHE = pad_sequences(X_test_OHE, padding='post', maxlen=max_sequence_length)

#### TRANSFORMING LABELS ####

In [7]:
label_encoder = LabelEncoder()
label_encoder.fit(pos_tag_names)

# Convert the labels from strings to integers
y_train_int = [label_encoder.transform(sequence) for sequence in y_train]
y_dev_int = [label_encoder.transform(sequence) for sequence in y_dev]
y_test_int = [label_encoder.transform(sequence) for sequence in y_test]

y_train_int = pad_sequences(y_train_int, maxlen=max_sequence_length, padding='post', value=0)
y_dev_int = pad_sequences(y_dev_int, maxlen=max_sequence_length, padding='post', value=0)
y_test_int = pad_sequences(y_test_int, maxlen=max_sequence_length, padding='post', value=0)

#### ONE-HOT ENCODING ####

In [8]:
# Convert the integer labels to one-hot encodings
vector_train = [to_categorical(sequence, num_classes=num_pos_tags) for sequence in y_train_int]
vector_dev = [to_categorical(sequence, num_classes=num_pos_tags) for sequence in y_dev_int]
vector_test = [to_categorical(sequence, num_classes=num_pos_tags) for sequence in y_test_int]

vector_train = np.array(vector_train)
vector_dev = np.array(vector_dev)
vector_test = np.array(vector_test)

#### MODEL TRAINING ####

In [10]:
if retrain == True:
    embedding_size = 128

    input_layer = Input(shape=(max_sequence_length,))
    embedding_layer = Embedding(input_dim=vocabulary_size, output_dim=embedding_size, mask_zero=True)(input_layer)
    bidirectional_lstm = Bidirectional(LSTM(units=64, return_sequences=True))(embedding_layer)
    output_layer = TimeDistributed(Dense(num_pos_tags, activation='softmax'))(bidirectional_lstm)

    model = Model(inputs=input_layer, outputs=output_layer)

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    print(model.summary())

    # Save the model's performance
    csv_logger = CSVLogger('training_' + 'English' + '.log')

    model.fit(X_train_OHE, vector_train, batch_size=64, epochs=5, validation_data=(X_dev_OHE, vector_dev), callbacks=[csv_logger])

    # Save the model
    model.save('model.' + 'English')
else:
    # Load the model
    model = keras.models.load_model('model.' + 'English')
    print(model.summary())


Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 128)]             0         
                                                                 
 embedding_1 (Embedding)     (None, 128, 128)          2187136   
                                                                 
 bidirectional_1 (Bidirecti  (None, 128, 128)          98816     
 onal)                                                           
                                                                 
 time_distributed_1 (TimeDi  (None, 128, 18)           2322      
 stributed)                                                      
                                                                 
Total params: 2288274 (8.73 MB)
Trainable params: 2288274 (8.73 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None
Epoch 1/5
Epoch 

INFO:tensorflow:Assets written to: model.English\assets


#### EVALUATE MODEL ####

In [11]:
# Evaluate the model
loss, accuracy = model.evaluate(X_test_OHE, vector_test, verbose=1)
print('Loss: %.2f, Accuracy: %.2f' % (loss*100,accuracy*100))

Loss: 30.03, Accuracy: 91.57


#### PoS TAGGING FUNCTION FOR UNKNOWN SENTENCES ####

In [12]:
def pos_tag_sentence(sentence, tokenizer, model, max_sequence_length):
    # Convert sentence to a list of words and punctuation symbols
    sentences = nltk.word_tokenize(sentence)
    print(sentences)

    X = tokenizer.texts_to_sequences(sentences)

    X = pad_sequences(X, maxlen=max_sequence_length, value=0)

    predictions = model.predict(X)

    # Convert predictions to part-of-speech tags
    predicted_tags = np.argmax(predictions, axis=2)
    
    # Remove padding
    values = predicted_tags[:, -1]
    tag_names = label_encoder.inverse_transform(values)

    return tag_names

#### TEST ON A SAMPLE SENTENCE ####

In [13]:
# Dictionary of sentences in each language
sentence = ("I'm very happy # today, because! it's sunny outside.")

result = pos_tag_sentence(sentence, tokenizer, model, max_sequence_length)

print(result)


['I', "'m", 'very', 'happy', '#', 'today', ',', 'because', '!', 'it', "'s", 'sunny', 'outside', '.']


['PRON' 'AUX' 'ADV' 'ADJ' 'SYM' 'NOUN' 'PUNCT' 'SCONJ' 'PUNCT' 'PRON'
 'PART' 'ADJ' 'ADP' 'PUNCT']
