In [1]:
import numpy as np
from numpy.random import RandomState

from src.extraction.jsonl_data_reader import JsonlDataReader

In [2]:
# Single seed shared by every random source configured in this notebook.
seed = 7

# Seed NumPy's legacy global generator (used by bare np.random.* calls).
np.random.seed(seed)

# Standalone generator object, seeded with the same value.
random_state = RandomState(seed)

In [3]:
# Load the three dataset splits, one JSONL file per split.
# Each reader is built and consumed before the next one is created.
train_reader = JsonlDataReader(file_name='train.jsonl')
train_data = train_reader.read()

dev_reader = JsonlDataReader(file_name='dev.jsonl')
dev_data = dev_reader.read()

test_reader = JsonlDataReader(file_name='test.jsonl')
test_data = test_reader.read()

In [4]:
from src.preprocessing.null_preprocessor import NullPreprocessor

# One preprocessor instance is applied to every split, in train -> dev -> test order.
# NOTE(review): the class name suggests a pass-through baseline — confirm in
# src.preprocessing.null_preprocessor.
preprocessor = NullPreprocessor()
preprocessed_train, preprocessed_dev, preprocessed_test = (
    preprocessor.preprocess(raw_split)
    for raw_split in (train_data, dev_data, test_data)
)

In [5]:
from src.tokenize.null_tokenizer import NullTokenizer

# Tokenize the *preprocessed* splits so that every split goes through the same
# preprocess -> tokenize chain. Previously train and test were tokenized from
# the raw reader output while dev used the preprocessed data, which would
# silently skip preprocessing for two of the three splits.
# NOTE(review): with NullPreprocessor the results are presumably unchanged, but
# the mismatch would surface as soon as a real preprocessor is swapped in.
tokenizer = NullTokenizer()
tokenized_train = tokenizer.tokenize(preprocessed_train)
tokenized_dev = tokenizer.tokenize(preprocessed_dev)
tokenized_test = tokenizer.tokenize(preprocessed_test)

In [6]:

from src.vectorizer.sk_count_vectorizer import SkCountVectorizer

# NOTE(review): ngram_range=(1, 1) presumably means unigrams only, as in
# scikit-learn's CountVectorizer — confirm against the SkCountVectorizer wrapper.
vectorizer = SkCountVectorizer(ngram_range=(1, 1))

# fit() sees the training split only; dev and test are only transformed.
vectorizer.fit(tokenized_train)
vectorized_train, vectorized_dev, vectorized_test = (
    vectorizer.transform(tokenized_split)
    for tokenized_split in (tokenized_train, tokenized_dev, tokenized_test)
)

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, confusion_matrix, precision_recall_fscore_support

def _evaluate_split(score_label, fitted_model, vectorized_split):
    """Predict on one split and print its macro-averaged metrics.

    Prints, in order: ``<score_label>=<macro F1>``, the confusion matrix, and
    the macro-averaged (precision, recall, F1, support) tuple.

    Args:
        score_label: Text printed in front of the macro F1 value.
        fitted_model: An already fitted estimator exposing ``predict``.
        vectorized_split: Object exposing ``vectors`` (model input) and
            ``label_indices`` (ground-truth labels).

    Returns:
        A ``(predictions, macro_f1)`` tuple for the split.
    """
    y_true = vectorized_split.label_indices
    y_pred = fitted_model.predict(vectorized_split.vectors)
    macro_f1 = f1_score(y_true, y_pred, average='macro')
    # Same text that f'{name=}' produces: the label followed by repr(value).
    print(f'{score_label}={macro_f1!r}')
    print(confusion_matrix(y_true, y_pred))
    print(precision_recall_fscore_support(y_true, y_pred, average='macro'))
    return y_pred, macro_f1


# random_state only matters for the 'sag', 'saga' and 'liblinear' solvers; it is
# passed so the notebook stays reproducible if the solver is changed later.
model = LogisticRegression(max_iter=2000, random_state=seed)
model.fit(vectorized_train.vectors, vectorized_train.label_indices)

# The gap between the training score and the dev/test scores shows how much
# the model overfits the training split.
y_pred_train, training_score = _evaluate_split('training_score', model, vectorized_train)

y_pred_dev, dev_score = _evaluate_split('dev_score', model, vectorized_dev)

y_pred_test, testing_score = _evaluate_split('testing_score', model, vectorized_test)

training_score=0.9979293560266734
[[4832    7    1]
 [   3 2290    1]
 [   2    1 1106]]
(0.9978933197327637, 0.9979660961698094, 0.9979293560266734, None)
dev_score=0.7976097570260805
[[488  37  13]
 [ 73 178   4]
 [ 30   4  89]]
(0.8260423832575504, 0.7762265494948849, 0.7976097570260805, None)
testing_score=0.7859460492913773
[[865  70  62]
 [122 462  21]
 [ 61  11 187]]
(0.7896010004212245, 0.7844156313564539, 0.7859460492913773, None)
