# Automatic animacy classification for Romanian nouns

Import necessary libraries and modules:

In [14]:
from nltk.corpus import wordnet as wn
import nltk
import gensim.downloader as api
from gensim.models import KeyedVectors
from gensim.test.utils import datapath
import gensim
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score
import re
import csv
import spacy
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

### Data preprocessing

Our proposed classifier distinguishes between two classes of Romanian nouns, human and non-human, by working with lemmas from a word list as opposed to tokens from a large annotated corpus. We use a seed set of nouns labeled with animacy information derived from Romanian WordNet and make use of the associations encoded in a pretrained word embedding model to train a classifier that can generalize beyond the labeled seed nouns.

We first derived two sets of Romanian nouns from WordNet, attempting to encompass as many tokens that can be labelled as either human or non-human. To this end, we first identified two high-order hypernyms in the WordNet hierarchy that can act as either animate or inanimate targets for each subsequent set of words (namely *ființă umană* 'human being' and *artefact* 'artefact'). Then, we used Open Multilingual WordNet (OMW) to generate lists of hyponyms for each of the two high-order target synsets, thus resulting in two different sets of words, one containing nouns with the semantic feature [+Human] and the other containing the semantic feature [+Non-Human].

In [43]:
# define function provided by RoWordNet API author
# https://github.com/dumitrescustefan/RoWordNet/tree/master
def get_hyponyms(synset):
    hyponyms = set()
    for hyponym in synset.hyponyms():
        hyponyms |= set(get_hyponyms(hyponym))
    return hyponyms | set(synset.hyponyms())

In [44]:
# get sysnsets | lemmas below target hypernyms
human_target = wn.synsets('făptură', lang = 'ron', pos = wn.NOUN)
non_animate_target = wn.synsets('obiect', lang = 'ron', pos = wn.NOUN)
list_of_animates = []
list_of_inanimates = []
for synset in wn.synsets('om', lang = 'ron', pos = wn.NOUN):
    hyponyms = get_hyponyms(synset)
    for item in hyponyms: #item is a synset object
        for lemma in item.lemmas():
            list_of_animates.append(lemma.name())

for synset in wn.synsets('artifact', lang = 'ron', pos = wn.NOUN):
    inanimate_hyponyms = get_hyponyms(synset)
    for item in inanimate_hyponyms:
        for lemma in item.lemmas():
            list_of_inanimates.append(lemma.name())

Secondly, we extracted corresponding vectors for each word in this seed set using a set of pre-trained word embeddings for Romanian.

In [46]:
# load pre-trained vectors
# http://89.38.230.23/word_embeddings/
wv_from_text = KeyedVectors.load_word2vec_format('.\corola.300.20.vec', binary=False)

In [47]:
# create a list with all tokens (animate + inanimate)
list_of_lines = []
all_tokens = list_of_animates + list_of_inanimates
newline = '\n'

# create an np array containing tokens, vectors and animacy labels
for token in all_tokens:
    if token in wv_from_text:
        if token in list_of_animates:
            animacy_label = 'animate'
        else:
            animacy_label = 'inanimate'
        vector = wv_from_text.__getitem__(token)
        line_str = animacy_label + ',' + np.array2string(vector, max_line_width=np.inf) + ',' + token
        line_list = line_str.split(',')
        list_of_lines.append(line_list)
    else:
        continue

# writing the list of lists to a csv file
outfilename = "vectors1.csv"
with open(outfilename, "w") as outfile:
    for row in list_of_lines:
        line = f'{",".join(row)}{newline}'
        outfile.write(line)
        
words_data = pd.read_csv('vectors1.csv', encoding='latin-1', header = None)
words_data.columns = ['animacy label', 'vector str', 'word']
words_data.head()
words_data.shape[0]

# convert column 1 to np.ndarray again
words_data['vector'] = words_data['vector str'].apply(lambda x: np.fromstring(x[1:-1], sep=' '))

# encoding the label column
words_data['animacy label'] = words_data['animacy label'].map({'animate':1,'inanimate':0})

# train-test split
X_train, X_test, y_train, y_test = train_test_split (words_data['vector'], words_data['animacy label'] , test_size=0.2)
vector_length = words_data['vector'][0].shape[0]

# create the X_train_vectors array
X_train_vectors = np.zeros((len(X_train), vector_length))
i=0
for index, row in X_train.iteritems():
    vector = row
    X_train_vectors[i] = vector
    i=i+1

### Training the classifiers

We applied three different classifiers to the task: Random Forest (RF), Multi-layer Perceptron (MLP) and k-nearest neighbours (KNN).

In [60]:
rf = RandomForestClassifier()
rf_model = rf.fit(X_train_vectors, y_train)

mlp = MLPClassifier()
mlp_model = mlp.fit(X_train_vectors, y_train)

knn_model = KNeighborsClassifier().fit(X_train_vectors, y_train)

# feature vectors from testing data
X_test_vectors = np.zeros((len(X_test), vector_length))
for i, vector in enumerate(X_test):
    X_test_vectors[i] = vector

### Predict animacy labels using trained classifiers

In [61]:
y_pred = rf_model.predict(X_test_vectors)
y_pred_mlp = mlp_model.predict(X_test_vectors)
y_pred_knn = knn_model.predict(X_test_vectors)

### Compute evaluation metrics

In [62]:
precision = precision_score(y_test, y_pred)

recall = recall_score(y_test, y_pred)
print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision, 3), round(recall, 3), round((y_pred==y_test).sum()/len(y_pred), 3)))

precision_mlp = precision_score(y_test, y_pred_mlp)

recall_mlp = recall_score(y_test, y_pred_mlp)
print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision_mlp, 3), round(recall_mlp, 3), round((y_pred_mlp == y_test).sum()/len(y_pred_mlp), 3)))

precision_knn = precision_score(y_test, y_pred_knn)
recall_knn = recall_score(y_test, y_pred_knn)
print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision_knn, 3), round(recall_knn, 3), round((y_pred_knn == y_test).sum()/len(y_pred_knn), 3)))

Precision: 0.921 / Recall: 0.797 / Accuracy: 0.873
Precision: 0.863 / Recall: 0.866 / Accuracy: 0.873
Precision: 0.742 / Recall: 0.827 / Accuracy: 0.784


### Evaluation on natural data

In [63]:
def preprocess_text_file(input_file, output_file):
    with open(input_file, 'r', encoding='utf-8') as infile:
        text = infile.read()

        # Remove punctuation and numbers, keep words and spaces
        preprocessed_text = ''.join(char for char in text if char.isalpha() or char.isspace())

    with open(output_file, 'w', encoding='utf-8') as outfile:
        outfile.write(preprocessed_text)

def tokenize_preprocessed_text(preprocessed_text):
    tokens = nltk.word_tokenize(preprocessed_text)
    return tokens

In [64]:
preprocess_text_file('evaluation.txt', 'evaluation_clean.txt')
with open('evaluation_clean.txt', 'r', encoding = 'utf8') as infile:
    clean_text = infile.read()
    tokens = tokenize_preprocessed_text(clean_text)

ro_model = spacy.load("ro_core_news_sm")

# list of tokens into single string
tokenized_text = " ".join(tokens)

# extract nouns
doc = ro_model(tokenized_text)
nouns = [token.text for token in doc if token.pos_ == "NOUN"]
#len(nouns)

# process list of nouns to expected data format for rf_model (2D np array)

# extract vectors for the list of nouns
novel_noun_vectors = []
for noun in nouns:
    if noun in wv_from_text:
        word_vector = wv_from_text[noun]
        novel_noun_vectors.append(word_vector)

# join the nouns and vectors
novel_noun_vectors = np.array(novel_noun_vectors)

# pass nouns to RF classifier
novel_noun_predictions = rf_model.predict(novel_noun_vectors)

In [66]:
df = pd.read_csv('noun_prediction_annotation.csv', encoding = 'utf8')

label_counts = df['annotated'].value_counts(normalize=True) * 100

print("Percentage of rows for each label:")
print(label_counts)

Percentage of rows for each label:
0     87.211740
1      7.337526
oc     5.450734
Name: annotated, dtype: float64


In [67]:
df['annotated'].value_counts()

0     832
1      70
oc     52
Name: annotated, dtype: int64

In [68]:
# Drop lines labelled as other class (not nouns, i.e. POS-tagging errors)
df = df[df.annotated != 'oc']

In [69]:
df['annotated'].value_counts()

0    832
1     70
Name: annotated, dtype: int64

In [70]:
# # Load the CSV file
# file_path = 'predicted_lables_novel_nouns.csv'  
# df = pd.read_csv(file_path)

# Clean up NaN values
df = df.dropna()
# Generate the confusion matrix
y_true = df['annotated'].astype('int')
y_pred = df['predicted']

# Define the labels for confusion matrix
labels = ['False Negative', 'Correct Prediction', 'False Positive']

# Create the confusion matrix
confusion = confusion_matrix(y_true, y_pred)

# Extract values from the confusion matrix
false_positive = confusion[0, 1]
correct_prediction = int(confusion[1, 1]) + int(confusion[0,0])
false_negative = confusion[1, 0]
true_negative = confusion[1,1]
true_positive = confusion[0,0]

accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)

# Print the results
print("Confusion Matrix:")
print(confusion)
print(f"False Negatives: {false_negative}")
print(f'True negatives: {true_negative}')
print(f"True positives: {true_positive}")
print(f"Correct Predictions: {correct_prediction}")
print(f"False Positives: {false_positive}")
print("\nAccuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Confusion Matrix:
[[514 318]
 [ 43  27]]
False Negatives: 43
True negatives: 27
True positives: 514
Correct Predictions: 541
False Positives: 318

Accuracy: 0.5997782705099778
Precision: 0.0782608695652174
Recall: 0.38571428571428573
F1 Score: 0.13012048192771086
