In [1]:
import numpy as np
from numpy.random import RandomState

from src.extraction.jsonl_data_reader import JsonlDataReader

In [2]:
# Single seed value shared by every seeded component set up in this cell.
seed = 7
# Seed NumPy's global legacy generator (the one behind bare np.random.* calls).
np.random.seed(seed)
# Stand-alone generator object initialised from the same seed.
random_state = RandomState(seed=seed)

In [6]:
# Training split in the two forms consumed further down: the `read()` result
# goes through the preprocessing pipeline, the `read_jsonl()` result goes
# through TrainDuplicateRemover.
train_data = JsonlDataReader(file_name='train.jsonl').read()
train_jsonl = JsonlDataReader(file_name='train.jsonl').read_jsonl()

# The preprocessing cell reads `dev_data` and `test_data`; without these two
# assignments a fresh-kernel "Run All" stops there with a NameError.
# NOTE(review): the file names are assumed to mirror 'train.jsonl' -- confirm
# against the contents of the data directory.
dev_data = JsonlDataReader(file_name='dev.jsonl').read()
test_data = JsonlDataReader(file_name='test.jsonl').read()

In [8]:
from src.preprocessing.train_duplicate_remover import TrainDuplicateRemover

# Pass the raw training records through TrainDuplicateRemover; the result is
# written to disk by the next cell.
deduplicated_jsonl = TrainDuplicateRemover().remove_if_train(train_jsonl)
# Preview only the first few records: echoing the whole list bloats the
# notebook file and buries the rest of the narrative.
deduplicated_jsonl[:3]

[{'source': 'explicit',
  'citeEnd': 175,
  'sectionName': 'Introduction',
  'citeStart': 168,
  'string': 'However, how frataxin interacts with the Fe-S cluster biosynthesis components remains unclear as direct one-to-one interactions with each component were reported (IscS [12,22], IscU/Isu1 [6,11,16] or ISD11/Isd11 [14,15]).',
  'label': 'background',
  'label_confidence': 1.0,
  'citingPaperId': '1872080baa7d30ec8fb87be9a65358cd3a7fb649',
  'citedPaperId': '894be9b4ea46a5c422e81ef3c241072d4c73fdc0',
  'isKeyCitation': True,
  'id': '1872080baa7d30ec8fb87be9a65358cd3a7fb649>894be9b4ea46a5c422e81ef3c241072d4c73fdc0',
  'unique_id': '1872080baa7d30ec8fb87be9a65358cd3a7fb649>894be9b4ea46a5c422e81ef3c241072d4c73fdc0_11',
  'excerpt_index': 11},
 {'source': 'explicit',
  'citeStart': 16,
  'sectionName': 'Novel Quantitative Trait Loci for Seminal Root Traits in Barley',
  'string': 'In the study by Hickey et al. (2012), spikes were sampled from the field at the point of physiological\nro

In [9]:
import json
from src.utils.path_getter import PathGetter

# Persist the deduplicated records as JSON Lines: one serialized record per
# line, each terminated by a newline.
deduplicated_path = PathGetter.get_data_directory() / 'deduplicated_train.jsonl'
with open(deduplicated_path, 'w') as out_file:
    out_file.writelines(json.dumps(record) + '\n' for record in deduplicated_jsonl)

In [4]:
from src.preprocessing.simple_preprocessor import SimplePreprocessor

# One preprocessor configuration is applied to all three splits, in the order
# train -> dev -> test.
preprocessor = SimplePreprocessor(remove_duplicates=True, remove_citations=False)
preprocessed_train, preprocessed_dev, preprocessed_test = [
    preprocessor.preprocess(split) for split in (train_data, dev_data, test_data)
]

In [5]:
# Bare expression: renders the preprocessed training split as this cell's output.
preprocessed_train



In [5]:
from src.tokenize.spacy_tokenizer import SpacyTokenizer

# Stand-alone tokenizer run with the same configuration as the first entry of
# the multi-tokenizer pipeline further down.
tokenizer1 = SpacyTokenizer(replace_numbers=True, remove_stopwords=False, merge_nouns=False, merge_entities=False,
                            lemmatize=False)
# Every split is tokenized from its *preprocessed* form so that train, dev and
# test go through the same pipeline (feeding the raw train/test data here while
# dev is preprocessed would make the splits incomparable).
# NOTE(review): the pipeline cell below calls `fit()` on its tokenizers before
# `tokenize()`; confirm whether `tokenizer1` needs the same.
tokenized_train1 = tokenizer1.tokenize(preprocessed_train)
tokenized_dev1 = tokenizer1.tokenize(preprocessed_dev)
tokenized_test1 = tokenizer1.tokenize(preprocessed_test)

In [26]:
from src.tokenize.spacy_tokenizer import SpacyTokenizer
from src.tokenize.spacy_dep_tokenizer import SpacyDepTokenizer
from src.tokenize.spacy_pos_tokenizer import SpacyPosTokenizer
from src.tokenize.spacy_tag_tokenizer import SpacyTagTokenizer

# Active tokenizers. Order matters: the vectorizer cell pairs its vectorizers
# with these token streams by position.
tokenizers = [
    SpacyTokenizer(
        replace_numbers=True, remove_stopwords=False, merge_nouns=False,
        merge_entities=False, lemmatize=False,
    ),
    # Currently disabled:
    # SpacyPosTokenizer(),
    # SpacyTagTokenizer(),
    SpacyDepTokenizer(),
]

# Preprocessed splits keyed by environment name.
preprocessed_data = {
    'train': preprocessed_train,
    'dev': preprocessed_dev,
    'test': preprocessed_test,
}

# Tokenizers are fitted on the training split only.
for tokenizer in tokenizers:
    tokenizer.fit(preprocessed_data['train'])

# tokenized_data[env] is a list with one tokenized dataset per tokenizer,
# in the same order as `tokenizers`.
tokenized_data = {}
for env, dataset in preprocessed_data.items():
    streams = []
    for tokenizer in tokenizers:
        streams.append(tokenizer.tokenize(dataset))
    tokenized_data[env] = streams

In [27]:

from src.vectorizer.sk_count_vectorizer import SkCountVectorizer
from src.vectorizer.sk_tfidf_vectorizer import SkTfidfVectorizer

# vectorizers[i] is fitted on, and applied to, the i-th token stream of each
# split in `tokenized_data`, so this list must stay in the same order and have
# the same length as the list of tokenizers.
# NOTE(review): the two commented-out entries appear to be placeholders for
# tokenizers that are currently disabled -- confirm before re-enabling either.
vectorizers = [
    SkTfidfVectorizer(
        ngram_range=(1, 2), ignore_preprocessing=False,
        analyzer='word', binary=True
    ),
    # SkCountVectorizer(
    #     ignore_preprocessing=False,
    #     ngram_range=(2, 4), analyzer='word',
    #     binary=False
    # ),
    # SkCountVectorizer(
    #     ignore_preprocessing=False,
    #     ngram_range=(2, 4), analyzer='word',
    #     binary=False
    # ),
    SkCountVectorizer(
        ignore_preprocessing=False,
        ngram_range=(2, 4), analyzer='word',
        binary=False
    ),
]

# zip() stops at the shorter input, so a length mismatch would silently drop a
# token stream (or a vectorizer) instead of failing. Fail loudly instead.
for env, dataset in tokenized_data.items():
    assert len(dataset) == len(vectorizers), (
        f'{len(vectorizers)} vectorizers configured but split {env!r} has '
        f'{len(dataset)} token streams'
    )

# Vectorizers are fitted on the training split only.
for vectorizer, data in zip(vectorizers, tokenized_data['train']):
    vectorizer.fit(data)

# vectorized_data[env] holds one transformed dataset per (vectorizer, stream) pair.
vectorized_data = dict()
for env, dataset in tokenized_data.items():
    vectorized_data[env] = [vectorizer.transform(data) for vectorizer, data in zip(vectorizers, dataset)]

In [28]:
from src.schema.vectorized_data import VectorizedData
# Import the submodule explicitly: a bare `import scipy` does not guarantee
# that `scipy.sparse` has been loaded. This still binds the name `scipy`.
import scipy.sparse


def stack_feature_blocks(blocks):
    """Combine the per-vectorizer outputs of one split into a single VectorizedData.

    Args:
        blocks: non-empty list of vectorized datasets for one split, one per
            vectorizer. Each must expose `.vectors`, `.id` and `.labels`.

    Returns:
        A VectorizedData whose vectors are the horizontal concatenation of
        every block's `.vectors`; ids and labels are taken from the first block.
    """
    # NOTE(review): ids/labels come from blocks[0] only, which assumes every
    # block lists the same rows in the same order -- confirm.
    stacked = scipy.sparse.hstack([block.vectors for block in blocks])
    return VectorizedData(stacked, blocks[0].id, blocks[0].labels)


vectorized_train = stack_feature_blocks(vectorized_data['train'])
vectorized_dev = stack_feature_blocks(vectorized_data['dev'])
vectorized_test = stack_feature_blocks(vectorized_data['test'])

In [29]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, confusion_matrix, precision_recall_fscore_support


def report_macro_scores(classifier, split, score_name):
    """Predict one split and print its macro-averaged metrics.

    Prints, in order: `<score_name>=<macro F1>`, the confusion matrix, and the
    macro-averaged precision/recall/F-score/support tuple.

    Args:
        classifier: fitted estimator exposing `predict`.
        split: dataset exposing `.vectors` (features) and `.label_indices` (targets).
        score_name: label printed in front of the macro F1 value.

    Returns:
        Tuple `(predictions, macro_f1)` for the split.
    """
    y_true = split.label_indices
    y_pred = classifier.predict(split.vectors)
    score = f1_score(y_true, y_pred, average='macro')
    # `!r` reproduces what the f-string `=` specifier prints (repr of the value).
    print(f'{score_name}={score!r}')
    print(confusion_matrix(y_true, y_pred))
    print(precision_recall_fscore_support(y_true, y_pred, average='macro'))
    return y_pred, score


# Fit on the stacked training features only; dev and test are evaluation-only.
model = LogisticRegression(max_iter=2000, C=10)
model.fit(vectorized_train.vectors, vectorized_train.label_indices)

# Same report for every split, so train/dev/test numbers are directly comparable.
y_pred_train, training_score = report_macro_scores(model, vectorized_train, 'training_score')
y_pred_dev, dev_score = report_macro_scores(model, vectorized_dev, 'dev_score')
y_pred_test, testing_score = report_macro_scores(model, vectorized_test, 'testing_score')

training_score=1.0
[[4320    0    0]
 [   0 2191    0]
 [   0    0 1038]]
(1.0, 1.0, 1.0, None)
dev_score=0.6281904970097546
[[439  70  29]
 [ 90 153  12]
 [ 57  14  52]]
(0.651285387626194, 0.6129164525846003, 0.6281904970097546, None)
testing_score=0.6432170195723672
[[791 134  72]
 [214 361  30]
 [ 87  39 133]]
(0.6554487945714338, 0.6345292896036034, 0.6432170195723672, None)
