In [1]:
import numpy as np
from numpy.random import RandomState

from src.extraction.jsonl_data_reader import JsonlDataReader

In [2]:
seed = 7
random_state = RandomState(seed=seed)
np.random.seed(seed)

In [3]:
train_data = JsonlDataReader(file_name='train.jsonl').read()
dev_data = JsonlDataReader(file_name='dev.jsonl').read()
test_data = JsonlDataReader(file_name='test.jsonl').read()

In [4]:
from src.preprocessing.simple_preprocessor import SimplePreprocessor

preprocessor = SimplePreprocessor(remove_citations=False, remove_duplicates=False)
preprocessed_train = preprocessor.preprocess(train_data)
preprocessed_dev = preprocessor.preprocess(dev_data)
preprocessed_test = preprocessor.preprocess(test_data)

In [5]:
from src.tokenize.spacy_tokenizer import SpacyTokenizer

tokenizer = SpacyTokenizer(replace_numbers=True, remove_stopwords=False, merge_nouns=False, merge_entities=False,
                           lemmatize=False)
tokenized_train = tokenizer.tokenize(train_data)
tokenized_dev = tokenizer.tokenize(preprocessed_dev)
tokenized_test = tokenizer.tokenize(test_data)

In [6]:

from src.vectorizer.sk_tfidf_vectorizer import SkTfidfVectorizer

vectorizer = SkTfidfVectorizer(
    ngram_range=(1, 2), ignore_preprocessing=False,
    analyzer='word', binary=True
)
vectorizer.fit(tokenized_train)
vectorized_train = vectorizer.transform(tokenized_train)
vectorized_dev = vectorizer.transform(tokenized_dev)
vectorized_test = vectorizer.transform(tokenized_test)

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, confusion_matrix, precision_recall_fscore_support

model = LogisticRegression(max_iter=2000, C=10)
model.fit(vectorized_train.vectors, vectorized_train.label_indices)
y_pred_train = model.predict(vectorized_train.vectors)
training_score = f1_score(vectorized_train.label_indices, y_pred_train, average='macro')
print(f'{training_score=}')
print(confusion_matrix(vectorized_train.label_indices, y_pred_train))
print(precision_recall_fscore_support(vectorized_train.label_indices, y_pred_train, average='macro'))

y_pred_dev = model.predict(vectorized_dev.vectors)
dev_score = f1_score(vectorized_dev.label_indices, y_pred_dev, average='macro')
print(f'{dev_score=}')
print(confusion_matrix(vectorized_dev.label_indices, y_pred_dev))
print(precision_recall_fscore_support(vectorized_dev.label_indices, y_pred_dev, average='macro'))

y_pred_test = model.predict(vectorized_test.vectors)
testing_score = f1_score(vectorized_test.label_indices, y_pred_test, average='macro')
print(f'{testing_score=}')
print(confusion_matrix(vectorized_test.label_indices, y_pred_test))
print(precision_recall_fscore_support(vectorized_test.label_indices, y_pred_test, average='macro'))

training_score=0.9978193880384293
[[4834    5    1]
 [   5 2289    0]
 [   5    0 1104]]
(0.9982836713639273, 0.9973573884495986, 0.9978193880384293, None)
dev_score=0.8182162969395542
[[490  33  15]
 [ 62 187   6]
 [ 23   5  95]]
(0.8340835137986562, 0.8054905753518501, 0.8182162969395542, None)
testing_score=0.8241645391110555
[[870  62  65]
 [102 479  24]
 [ 33   7 219]]
(0.8169327313569607, 0.8366377454368701, 0.8241645391110555, None)
