In [1]:
!pip install conllu

Collecting conllu
  Downloading conllu-4.5.3-py2.py3-none-any.whl (16 kB)
Installing collected packages: conllu
Successfully installed conllu-4.5.3


In [2]:
import pandas as pd
import conllu

def extract_sentences(file_path):
    sentences = []
    current_sentence = []

    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            if line.startswith('# sent_id'):
                if current_sentence:
                    sentences.append(' '.join(current_sentence))
                    current_sentence = []
            elif line.startswith('#') or line == '\n':
                continue
            else:
                token = conllu.parse(line)[0][0]['form']
                current_sentence.append(token)

    if current_sentence:
        sentences.append(' '.join(current_sentence))

    return sentences

train_sentences = extract_sentences('hr500k-train.conllu')
test_sentences = extract_sentences('hr500k-test.conllu')
dev_sentences = extract_sentences('hr500k-dev.conllu')

train_df = pd.DataFrame({'sentence': train_sentences})
test_df = pd.DataFrame({'sentence': test_sentences})
dev_df = pd.DataFrame({'sentence': dev_sentences})

train_df.to_csv('train_sentences.csv', index=False)
test_df.to_csv('test_sentences.csv', index=False)
dev_df.to_csv('dev_sentences.csv', index=False)

print('Train data sentences DataFrame:')
print(train_df.head())

print('\nTest data sentences DataFrame:')
print(test_df.head())

print('\nDev data sentences DataFrame:')
print(dev_df.head())

Train data sentences DataFrame:
                                            sentence
0  Kazna medijskom mogulu obnovila raspravu u Mak...
1  Neki tvrde da je presuda Veliji Ramkovskom nap...
2  Medijski mogul Velija Ramkovski osuđen je na 1...
3  Kaznena presuda i zatvorska kazna medijskom mo...
4  Ramkovski , bivši vlasnik televizijske postaje...

Test data sentences DataFrame:
                                            sentence
0  Beograd i Priština postigli dogovor o slobodi ...
1  Pregovarački timovi Beograda i Prištine usugla...
2  Neki tvrde kako su sporazumi korak prema konač...
3  Dok vlasti u Beogradu pokušavaju predstaviti p...
4  Nakon završetka razgovora u Bruxellesu , šef i...

Dev data sentences DataFrame:
                                            sentence
0       Proces privatizacije na Kosovu pod povećalom
1  Kosovo ozbiljno analizira proces privatizacije...
2  Feronikel je privatiziran prije pet godina i j...
3      Barem na papiru , izgleda kao odlična ideja .
4  V

In [3]:
import conllu

def extract_ner_tags(file_path):
    sentences = []
    ner_tags = []

    with open(file_path, 'r', encoding='utf-8') as file:
        current_sentence = []
        current_ner_tags = []

        for tokenlist in conllu.parse_incr(file):
            for token in tokenlist:
                if 'misc' in token and token['misc'] is not None and 'NamedEntity' in token['misc'] and token['misc']['NamedEntity'] == 'Yes':
                    current_sentence.append(token['form'])
                    current_ner_tags.append(token['misc']['NER'])
            # End of the sentence
            if current_sentence:
                sentences.append(" ".join(current_sentence))
                ner_tags.append(current_ner_tags)
                current_sentence = []
                current_ner_tags = []

    return sentences, ner_tags


In [4]:
train_sentences, train_ner_tags = extract_ner_tags('hr500k-train.conllu')

test_sentences, test_ner_tags = extract_ner_tags('hr500k-test.conllu')
dev_sentences, dev_ner_tags = extract_ner_tags('hr500k-dev.conllu')


In [5]:
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, Bidirectional, LSTM, Dense

In [6]:
# Combining sentences and NER tags for training data
train_data = list(zip(train_sentences, train_ner_tags))

In [7]:
# Tokenize and pad the sentences
max_words = 10000  
tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')
tokenizer.fit_on_texts(train_sentences)

In [8]:
train_sequences = tokenizer.texts_to_sequences(train_sentences)
test_sequences = tokenizer.texts_to_sequences(test_sentences)
dev_sequences = tokenizer.texts_to_sequences(dev_sentences)

In [9]:
maxlen = max(len(seq) for seq in train_sequences)
train_data = pad_sequences(train_sequences, maxlen=maxlen)
test_data = pad_sequences(test_sequences, maxlen=maxlen)
dev_data = pad_sequences(dev_sequences, maxlen=maxlen)

In [10]:
# Converting NER tags to one-hot encoding
ner_tags = set(tag for tags in train_ner_tags for tag in tags)
tag_to_index = {tag: i for i, tag in enumerate(ner_tags)}

In [11]:
train_labels = [[tag_to_index[tag] for tag in tags] for tags in train_ner_tags]
train_labels = pad_sequences(train_labels, maxlen=maxlen, padding='post')

test_labels = [[tag_to_index[tag] for tag in tags] for tags in test_ner_tags]
test_labels = pad_sequences(test_labels, maxlen=maxlen, padding='post')

dev_labels = [[tag_to_index[tag] for tag in tags] for tags in dev_ner_tags]
dev_labels = pad_sequences(dev_labels, maxlen=maxlen, padding='post')

In [12]:
# Bidirectional LSTM model
embedding_dim = 100 
output_dim = len(ner_tags)  # Number of NER tags

In [13]:
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=maxlen))
model.add(Bidirectional(LSTM(100, return_sequences=True)))
model.add(Dense(output_dim, activation='softmax'))

In [14]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])


In [15]:
# Train the model
model.fit(train_data, train_labels, epochs=5, validation_data=(dev_data, dev_labels))


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7f9b38167ee0>

In [19]:
# Create reverse mapping from index to tag
index_to_tag = {i: tag for tag, i in tag_to_index.items()}

In [20]:
from sklearn.metrics import classification_report

import numpy as np
# Predict on test data
test_predictions = model.predict(test_data)

# Convert predictions to labels
test_pred_labels = [[index_to_tag[i] for i in np.argmax(pred, axis=-1)] for pred in test_predictions]
test_true_labels = [[index_to_tag[i] for i in seq] for seq in test_labels]

# Flatten the lists for classification_report
flat_test_true_labels = [tag for sublist in test_true_labels for tag in sublist]
flat_test_pred_labels = [tag for sublist in test_pred_labels for tag in sublist]

# Print classification report
print(classification_report(flat_test_true_labels, flat_test_pred_labels))



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

 B-deriv-per       0.00      0.00      0.00        38
       B-loc       0.96      0.99      0.98     36208
      B-misc       0.00      0.00      0.00       350
       B-org       0.27      0.21      0.24       740
       B-per       0.31      0.22      0.25       692
       I-loc       0.00      0.00      0.00       165
      I-misc       0.00      0.00      0.00       509
       I-org       0.26      0.47      0.34       556
       I-per       0.08      0.00      0.00       441

    accuracy                           0.92     39699
   macro avg       0.21      0.21      0.20     39699
weighted avg       0.89      0.92      0.90     39699



  _warn_prf(average, modifier, msg_start, len(result))
