In [3]:
import string

import numpy as np
import pandas as pd
import plotly.express as px
from numpy.random import RandomState

from src.extraction.jsonl_data_reader import JsonlDataReader

In [4]:
from src.preprocessing.simple_preprocessor import SimplePreprocessor

# Reproducibility: seed a dedicated RandomState as well as NumPy's global RNG.
seed = 7
random_state = RandomState(seed=seed)
np.random.seed(seed)

# Read the three dataset splits, in train / dev / test order.
train_data, dev_data, test_data = (
    JsonlDataReader(file_name=split_file).read()
    for split_file in ('train.jsonl', 'dev.jsonl', 'test.jsonl')
)

# One preprocessor instance, configured with citation removal and
# de-duplication switched on, is applied to every split.
preprocessor = SimplePreprocessor(remove_citations=True, remove_duplicates=True)
preprocessed_train, preprocessed_dev, preprocessed_test = (
    preprocessor.preprocess(split)
    for split in (train_data, dev_data, test_data)
)

In [5]:
from src.tokenize.null_tokenizer import NullTokenizer

# The same tokenizer instance handles all three preprocessed splits
# (train, dev, test — in that order).
# NOTE(review): the name suggests NullTokenizer is a pass-through — confirm.
tokenizer = NullTokenizer()
tokenized_train, tokenized_dev, tokenized_test = (
    tokenizer.tokenize(split)
    for split in (preprocessed_train, preprocessed_dev, preprocessed_test)
)

In [6]:
from src.vectorizer.sk_count_vectorizer import SkCountVectorizer

# Word-level unigram vectorizer with binary=True.
# NOTE(review): presumably wraps scikit-learn's CountVectorizer (binary
# presence/absence features) — confirm against SkCountVectorizer.
vectorizer = SkCountVectorizer(
    ignore_preprocessing=False,
    ngram_range=(1, 1),
    analyzer='word',
    binary=True,
)

# Fit on the training split only, then transform train and test with it.
vectorizer.fit(tokenized_train)
vectorized_train, vectorized_test = map(
    vectorizer.transform, (tokenized_train, tokenized_test)
)

# Cell output: number of entries in the fitted vocabulary.
len(vectorizer.model.vocabulary_)

28986

In [None]:
# Vocabulary terms made up exclusively of digit characters.
list(filter(str.isdigit, vectorizer.model.vocabulary_))

In [13]:
def containing_digit(sentence: str) -> list[str]:
    """Return the whitespace-separated tokens of ``sentence`` that are all digits.

    Despite the name, a token is kept only when ``str.isdigit`` is true for the
    whole token, so mixed tokens such as ``"b2b"`` or ``"3.5"`` are dropped.
    Order and duplicates are preserved.

    Args:
        sentence: Text to scan; it is split on any run of whitespace.

    Returns:
        The all-digit tokens, in order of appearance (empty list if none).
    """
    numeric_tokens = []
    for token in sentence.split():
        if token.isdigit():
            numeric_tokens.append(token)
    return numeric_tokens


# All-digit tokens found in each training text, aligned with the texts.
digit_occurrence = list(map(containing_digit, preprocessed_train.texts))

# Pair every text's tokens with that text's label index.
compile_digit_occurrences = list(zip(digit_occurrence, preprocessed_train.label_indices))

# counter[token][label] -> number of occurrences of `token` in texts with that
# label. Only label indices 0, 1 and 2 are pre-registered; any other label
# would raise KeyError.
counter = {}
for numeric_tokens, label_index in compile_digit_occurrences:
    for token in numeric_tokens:
        per_label = counter.setdefault(token, {0: 0, 1: 0, 2: 0})
        per_label[label_index] += 1

# Outer dict keys become columns, so transpose to get one row per token and
# one column per label.
digit_class_counts = pd.DataFrame(counter).transpose()
digit_class_counts

Unnamed: 0,0,1,2
11,11,4,5
13,11,5,0
340,0,1,0
2,100,58,9
4,31,29,7
...,...,...,...
93,1,0,0
688,1,0,0
300330,0,1,0
520,1,0,0


In [14]:
# Rebuild the raw per-label counts from `counter` rather than editing the
# existing `digit_class_counts` frame in place. This leaves the frame shown by
# the previous cell untouched and makes this cell safe to re-run (the old
# version summed 'total' into itself and stacked extra index columns on a
# second run).
raw_counts = pd.DataFrame(counter).transpose()
row_totals = raw_counts.sum(axis=1)

# Columns 0, 1, 2 become each label's share of the token's occurrences;
# 'total' keeps the absolute occurrence count; the token itself moves from the
# index into an 'index' column.
digit_class_counts = (
    raw_counts
    .div(row_totals, axis=0)
    .assign(total=row_totals)
    .reset_index()
)
digit_class_counts

Unnamed: 0,index,0,1,2,total
0,11,0.550000,0.200000,0.250000,20
1,13,0.687500,0.312500,0.000000,16
2,340,0.000000,1.000000,0.000000,1
3,2,0.598802,0.347305,0.053892,167
4,4,0.462687,0.432836,0.104478,67
...,...,...,...,...,...
365,93,1.000000,0.000000,0.000000,1
366,688,1.000000,0.000000,0.000000,1
367,300330,0.000000,1.000000,0.000000,1
368,520,1.000000,0.000000,0.000000,1


In [15]:
# One point per all-digit token; the x/y/z coordinates are the values of
# columns 0, 1 and 2 of `digit_class_counts` (per-label shares).
fig = px.scatter_3d(
    digit_class_counts,
    x=0,
    y=1,
    z=2,
    # Pass a list: bare-string support for hover_data appears to be
    # version-dependent in plotly.express, while the list form is always accepted.
    hover_data=['index'],
    title='Per-label share of each all-digit token (training split)',
)
fig.show()