# **2. Елементи синтаксису**
Сірікова Ольга МІ-41

# Додаємо необхідні бібліотеки


In [2]:
import nltk
import random
import numpy as np
from nltk.corpus import brown
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dense, Flatten
from tensorflow.keras.utils import to_categorical

In [3]:
# Fetch the NLTK resources this notebook relies on: the Brown corpus and the
# mapping to the universal tagset first, then the punkt tokenizer data.
for resource in ('brown', 'universal_tagset'):
    nltk.download(resource)
# Kept as the cell's last expression so its return value is still displayed.
nltk.download('punkt')

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [4]:
# Seed Python's global RNG first so the shuffle below is repeatable.
random.seed(42)
# Materialise the corpus view as a plain list of sentences, each a sequence of
# (word, universal_tag) pairs, so it can be shuffled in place.
tagged_sentences = [sent for sent in brown.tagged_sents(tagset='universal')]
random.shuffle(tagged_sentences)

# Розділяємо дані

80% на тренування і 20% для тестування

In [5]:
# Index of the first test sentence: the leading 80% of the (already shuffled)
# sentences are used for training, the remainder for testing.
split = int(0.8 * len(tagged_sentences))
train_sentences, test_sentences = tagged_sentences[:split], tagged_sentences[split:]

# Виділяємо слова і теги з речень у списки


In [6]:
def get_words_and_tags(sentences):
    """Split tagged sentences into parallel word and tag lists.

    Args:
        sentences: iterable of sentences, each an iterable of (word, tag) pairs.

    Returns:
        A tuple ``(sentences_words, sentences_tags)``. Both are lists with one
        inner list per input sentence; words are lower-cased, tags are kept
        unchanged.
    """
    sentences_words, sentences_tags = [], []
    for tagged in sentences:
        # Lower-casing folds "The" and "the" into a single vocabulary entry.
        sentences_words.append([token.lower() for token, _ in tagged])
        sentences_tags.append([label for _, label in tagged])
    return sentences_words, sentences_tags

# Apply the same word/tag separation to both splits (train first, then test).
(train_words, train_tags), (test_words, test_tags) = (
    get_words_and_tags(part) for part in (train_sentences, test_sentences)
)

In [7]:
# Vocabulary and tag inventory are built from the training split only, so a
# word that occurs solely in the test split is encoded as 'UNK'.
words = set(w for s in train_words for w in s)
tags = set(t for s in train_tags for t in s)

# Enumerate a *sorted* view of each set. The iteration order of a set of
# strings depends on Python's per-process hash randomization, so enumerating
# the raw set would assign different ids after every kernel restart and make
# the mappings (and any model trained on them) irreproducible.
# Ids 0 and 1 are reserved for the special tokens, so real words start at 2.
word2idx = {w: i + 2 for i, w in enumerate(sorted(words))}
# The special tokens are upper-case while every training word was lower-cased,
# so they do not clash with real vocabulary entries.
word2idx['PAD'] = 0
word2idx['UNK'] = 1

tag2idx = {t: i for i, t in enumerate(sorted(tags))}

# Reverse lookups, used to turn predicted ids back into readable labels.
idx2word = {i: w for w, i in word2idx.items()}
idx2tag = {i: t for t, i in tag2idx.items()}

n_words = len(word2idx)  # vocabulary size, including 'PAD' and 'UNK'
n_tags = len(tag2idx)    # number of distinct tags seen in training


In [8]:
def encode_sentences(sentences, word2idx):
    """Replace every word with its integer id.

    Args:
        sentences: iterable of sentences, each an iterable of word strings.
        word2idx: mapping from word to id; must contain the key 'UNK', whose
            id is used for any word missing from the mapping.

    Returns:
        A list of lists of ids with the same nesting as ``sentences``.
    """
    return [
        [word2idx.get(token, word2idx['UNK']) for token in sent]
        for sent in sentences
    ]

def encode_tags(sentences_tags, tag2idx):
    """Replace every tag with its integer id.

    Args:
        sentences_tags: iterable of per-sentence tag sequences.
        tag2idx: mapping from tag to id.

    Returns:
        A list of lists of ids with the same nesting as ``sentences_tags``.

    Raises:
        KeyError: if a tag is not present in ``tag2idx``.
    """
    return [[tag2idx[label] for label in labels] for labels in sentences_tags]

In [9]:
# Integer-encode words and tags for both splits.
X_train, y_train = encode_sentences(train_words, word2idx), encode_tags(train_tags, tag2idx)
X_test, y_test = encode_sentences(test_words, word2idx), encode_tags(test_tags, tag2idx)

# Drop the sentence structure: every token becomes one independent sample,
# giving 1-D arrays of word ids and of the matching tag ids.
X_train_flat = np.array([idx for sent in X_train for idx in sent])
y_train_flat = np.array([idx for sent in y_train for idx in sent])

X_test_flat = np.array([idx for sent in X_test for idx in sent])
y_test_flat = np.array([idx for sent in y_test for idx in sent])

# One-hot targets, one column per tag, as expected by categorical cross-entropy.
y_train_cat = to_categorical(y_train_flat, num_classes=n_tags)
y_test_cat = to_categorical(y_test_flat, num_classes=n_tags)

# Context-free tagger: the network receives a single word id per sample and
# predicts that word's tag, so a given word always gets the same tag
# regardless of its neighbours.
inputs = Input(shape=(1,))
# `input_length` is deliberately not passed: the argument is deprecated in
# Keras 3 (it only triggers a warning), and the sequence length is already
# fixed by the Input shape above.
embedding = Embedding(input_dim=n_words, output_dim=64)(inputs)
flatten = Flatten()(embedding)  # (batch, 1, 64) -> (batch, 64)
dense = Dense(64, activation='relu')(flatten)
# One probability per tag; pairs with the one-hot targets and the
# categorical cross-entropy loss below.
output = Dense(n_tags, activation='softmax')(dense)

model = Model(inputs=inputs, outputs=output)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()



# Тренуємо модель


In [10]:
# Keep the History object instead of letting it surface as a bare repr in the
# cell output: `history.history` holds the per-epoch loss/accuracy values for
# later inspection or plotting.
history = model.fit(
    X_train_flat,
    y_train_cat,
    batch_size=128,
    epochs=5,
    validation_split=0.1,  # 10% of the training arrays are held out for validation
    verbose=1
)

Epoch 1/5
[1m6533/6533[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m170s[0m 26ms/step - accuracy: 0.8816 - loss: 0.4159 - val_accuracy: 0.9424 - val_loss: 0.1546
Epoch 2/5
[1m6533/6533[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m194s[0m 30ms/step - accuracy: 0.9521 - loss: 0.1192 - val_accuracy: 0.9430 - val_loss: 0.1512
Epoch 3/5
[1m6533/6533[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m176s[0m 26ms/step - accuracy: 0.9536 - loss: 0.1113 - val_accuracy: 0.9398 - val_loss: 0.1531
Epoch 4/5
[1m6533/6533[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m221s[0m 29ms/step - accuracy: 0.9538 - loss: 0.1099 - val_accuracy: 0.9441 - val_loss: 0.1528
Epoch 5/5
[1m6533/6533[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m201s[0m 29ms/step - accuracy: 0.9547 - loss: 0.1076 - val_accuracy: 0.9443 - val_loss: 0.1531


<keras.src.callbacks.history.History at 0x7d565e8dc1f0>

# Перевіряємо модель на тестових даних

In [11]:
# Score the trained model on the held-out test tokens; with the single
# 'accuracy' metric compiled in, evaluate() yields the pair (loss, accuracy).
test_metrics = model.evaluate(x=X_test_flat, y=y_test_cat, verbose=1)
loss, accuracy = test_metrics
accuracy_percent = 100 * accuracy
print("Test accuracy: {:.2f}%".format(accuracy_percent))

[1m7256/7256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 1ms/step - accuracy: 0.9452 - loss: 0.1547
Test accuracy: 94.54%


# Пояснення до результатів


| Tag   | Description                  | Examples                      |
|-------|------------------------------|-------------------------------|
| **ADJ**   | Adjective                    | *big*, *old*, *blue*            |
| **ADV**   | Adverb                       | *quickly*, *very*, *well*       |
| **NOUN**  | Noun                         | *dog*, *city*, *car*            |
| **PRON**  | Pronoun                      | *I*, *you*, *he*, *she*         |
| **VERB**  | Verb                         | *run*, *eat*, *be*              |
| **.**     | Punctuation                  | *.*, *,*, *?*, *!*              |


# Речення користувача і результат


In [16]:
import builtins

from nltk.tokenize import TreebankWordTokenizer, word_tokenize

# Call the builtin explicitly so the prompt still works if the name `input`
# was shadowed earlier in the kernel session, without rebinding any global.
user_input = builtins.input("\nEnter a sentence to analyze: ")

tokenizer = TreebankWordTokenizer()
sentence = tokenizer.tokenize(user_input)

if not sentence:
    # Empty or whitespace-only input yields no tokens; predicting on an empty
    # array would fail, so report it and skip the model call.
    predicted_tags = []
    print("\nNo tokens found in the input; nothing to tag.")
else:
    # Lower-case before lookup to mirror how the training vocabulary was
    # built; words missing from the vocabulary fall back to the 'UNK' id.
    encoded_sentence = [word2idx.get(word.lower(), word2idx['UNK']) for word in sentence]
    # One row per token, matching the model's Input(shape=(1,)).
    encoded_sentence = np.array(encoded_sentence).reshape(-1, 1)

    predictions = model.predict(encoded_sentence)
    # Most probable tag id for every token.
    predicted_tags_indices = np.argmax(predictions, axis=1)
    predicted_tags = [idx2tag[idx] for idx in predicted_tags_indices]

    # The model tags every token in isolation, so ambiguous words (e.g. a
    # word that can be a noun or a verb) always get the same tag.
    print("\nPredicted tags:")
    for word, tag in zip(sentence, predicted_tags):
        print(f"{word}\t{tag}")


Enter a sentence to analyze: I love it
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step

Predicted tags:
I	PRON
love	NOUN
it	PRON
