In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
from numpy.random import RandomState

from src.extraction.jsonl_data_reader import JsonlDataReader

In [2]:
seed = 7
random_state = RandomState(seed=seed)
np.random.seed(seed)

In [3]:
train_data = JsonlDataReader(file_name='train.jsonl').read()
test_data = JsonlDataReader(file_name='test.jsonl').read()

In [4]:
from src.preprocessing.simple_preprocessor import SimplePreprocessor

preprocessor = SimplePreprocessor(remove_citations=True, remove_duplicates=True)
preprocessed_train = preprocessor.preprocess(train_data)
preprocessed_test = preprocessor.preprocess(test_data)

In [5]:
# from src.tokenize.spacy_tokenizer import SpacyTokenizer
# 
# tokenizer = SpacyTokenizer(replace_numbers=True)
# tokenized_train = tokenizer.tokenize(train_data)
# tokenized_test = tokenizer.tokenize(test_data)

In [5]:
from src.tokenize.null_tokenizer import NullTokenizer

tokenizer = NullTokenizer()
tokenized_train = tokenizer.tokenize(preprocessed_train)
tokenized_test = tokenizer.tokenize(preprocessed_test)

In [12]:
from src.vectorizer.sk_count_vectorizer import SkCountVectorizer

vectorizer = SkCountVectorizer(ngram_range=(1, 2), ignore_preprocessing=False)
vectorizer.fit(tokenized_train)
vectorized_train = vectorizer.transform(tokenized_train)
vectorized_test = vectorizer.transform(tokenized_test)

In [9]:
vectorizer.model.vocabulary_

{'however': 12681,
 'how': 12679,
 'frataxin': 10685,
 'interacts': 13783,
 'with': 28554,
 'the': 26315,
 'fe-s': 10080,
 'cluster': 5761,
 'biosynthesis': 4006,
 'components': 6112,
 'remains': 22406,
 'unclear': 27392,
 'as': 2948,
 'direct': 7978,
 'one-to-one': 18914,
 'interactions': 13778,
 'each': 8625,
 'component': 6111,
 'were': 28395,
 'reported': 22485,
 'iscs': 14112,
 'iscu': 14113,
 'isu1': 14175,
 '11': 146,
 '16': 270,
 'or': 19025,
 'isd11': 14115,
 '14': 227,
 '15': 249,
 'in': 13338,
 'study': 25390,
 'by': 4575,
 'spikes': 24892,
 'sampled': 23345,
 'from': 10755,
 'field': 10237,
 'at': 3078,
 'point': 20574,
 'of': 18768,
 'physiological': 20276,
 'robinson': 22939,
 'et': 9496,
 'al': 2040,
 'genomic': 11173,
 'regions': 22296,
 'influencing': 13551,
 'root': 23011,
 'traits': 26818,
 'barley': 3527,
 '13': 199,
 'maturity': 16245,
 'dried': 8394,
 'grain': 11586,
 'threshed': 26464,
 'hand': 11953,
 'and': 2408,
 'stored': 25268,
 '20c': 608,
 'to': 26649,
 'p

In [10]:
[term for term in vectorizer.model.vocabulary_ if term.isdigit()]

['11',
 '16',
 '14',
 '15',
 '13',
 '1982',
 '1988',
 '12',
 '1964',
 '2007',
 '340',
 '1995',
 '1999',
 '2001',
 '2015',
 '2016',
 '2004',
 '2006',
 '1984',
 '1986',
 '1720',
 '1993',
 '1998',
 '2000',
 '2005',
 '54',
 '55',
 '2010',
 '2013',
 '1979',
 '24',
 '21',
 '1997',
 '2009',
 '2008',
 '2011',
 '2003',
 '2014',
 '1991',
 '1996',
 '1989',
 '20',
 '000',
 '200304',
 '10',
 '36',
 '42',
 '397',
 '2002',
 '383',
 '697',
 '41',
 '38',
 '64',
 '39',
 '40',
 '19',
 '23',
 '0100',
 '34',
 '1987',
 '1992',
 '1983',
 '44',
 '17',
 '003',
 '18',
 '30',
 '28',
 '4452',
 '25',
 '26',
 '27',
 '80',
 '105',
 '66',
 '95',
 '3060',
 '60',
 '158',
 '812',
 '043',
 '100',
 '07',
 '1821',
 '880',
 '31',
 '32',
 '33',
 '0111',
 '1990',
 '200300',
 '50',
 '65',
 '00075',
 '127',
 '129',
 '61',
 '130',
 '102',
 '2017',
 '008',
 '2012',
 '201214',
 '99',
 '85',
 '43',
 '1379',
 '51',
 '13751387',
 '1980',
 '200607',
 '0019',
 '1994',
 '600',
 '550',
 '1127',
 '05',
 '02',
 '01',
 '59',
 '180',
 '68',


In [17]:
def containing_digit(sentence: str) -> list[str]:
    return [word for word in sentence.split() if word.isdigit()]
digit_occurrence = [containing_digit(sentence) for sentence in preprocessed_train.texts]
digit_occurrence

[[],
 ['11', '13'],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 ['340'],
 [],
 [],
 [],
 [],
 [],
 ['2'],
 [],
 [],
 [],
 ['4'],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 ['2013', '2016'],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 ['24'],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 ['2003', '2006'],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 ['10'],
 [],
 ['24'],
 ['3'],
 [],
 [],
 [],
 [],
 ['38', '64'],
 [],
 [],
 [],
 [],
 [],
 ['19', '19', '23'],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 ['34'],
 [],
 ['4'],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 ['10'],
 [],
 [],
 ['2005'],
 [],
 [],
 [],
 [],
 [],
 [],
 ['5'],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 ['1', '2', '5', '18', '2', '3', '1', '2'],
 [],
 [],
 [],
 ['30'],
 [],
 [],
 ['28', '3'],
 ['4452'],
 [],
 [],
 [],
 [],
 [],
 [],
 ['2', '5', '80', '105'],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 

In [19]:
compile_digit_occurrences = list(zip(digit_occurrence, preprocessed_train.label_indices))
counter = dict()
for digits, y_value in compile_digit_occurrences:
    for digit in digits:
        counter.setdefault(digit, {0: 0, 1: 0, 2: 0})
        counter[digit][y_value] += 1

In [21]:
digit_class_counts = pd.DataFrame(counter).transpose()
digit_class_counts

Unnamed: 0,0,1,2
11,11,4,5
13,11,5,0
340,0,1,0
2,98,55,9
4,31,29,7
...,...,...,...
512,0,1,0
341,0,1,0
014,1,0,0
93,1,0,0


In [25]:
digit_counts = digit_class_counts.sum(axis=1).reset_index()
filtered_counts = digit_counts[digit_counts[0] > 10]
filtered_counts

Unnamed: 0,index,0
0,11,20
1,13,16
3,2,162
4,4,67
7,24,19
10,10,68
11,3,88
18,5,74
19,1,176
20,18,23


In [27]:
digit_class_counts.merge(filtered_counts.set_index('index'), left_index=True, right_index=True)

Unnamed: 0,0_x,1,2,0_y
11,11,4,5,20
13,11,5,0,16
2,98,55,9,162
4,31,29,7,67
24,10,7,2,19
10,31,31,6,68
3,47,29,12,88
5,40,29,5,74
1,103,55,18,176
18,12,8,3,23


In [None]:
from sklearn.metrics import f1_score
from sklearn.svm import SVC


model = SVC(C=10.0, kernel='rbf')
model.fit(vectorized_train.vectors, vectorized_train.label_indices)
y_pred_train = model.predict(vectorized_train.vectors)
training_score = f1_score(vectorized_train.label_indices, y_pred_train, average='macro')
print(f'{training_score=}')

y_pred_test = model.predict(vectorized_test.vectors)
testing_score = f1_score(vectorized_test.label_indices, y_pred_test, average='macro')
print(f'{testing_score=}')

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

model = LogisticRegression()
model.fit(vectorized_train.vectors, vectorized_train.label_indices)
y_pred_train = model.predict(vectorized_train.vectors)
training_score = f1_score(vectorized_train.label_indices, y_pred_train, average='macro')
print(f'{training_score=}')

y_pred_test = model.predict(vectorized_test.vectors)
testing_score = f1_score(vectorized_test.label_indices, y_pred_test, average='macro')
print(f'{testing_score=}')

training_score=0.9998007816475948
testing_score=0.8098938705998758
