In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
from numpy.random import RandomState

from src.extraction.jsonl_data_reader import JsonlDataReader


KeyboardInterrupt



In [None]:
# Single seed shared by the notebook's random sources.
seed = 7

# Seed NumPy's legacy global generator, then build a dedicated RandomState
# instance from the same seed.
np.random.seed(seed)
random_state = RandomState(seed)

In [None]:
# Read the train and test splits (in that order) with one reader per file.
train_data, test_data = (
    JsonlDataReader(file_name=split_file).read()
    for split_file in ('train.jsonl', 'test.jsonl')
)

In [None]:
from src.preprocessing.simple_preprocessor import SimplePreprocessor

# One preprocessor instance, applied to the train split first and then the
# test split.
preprocessor = SimplePreprocessor(remove_duplicates=True, remove_citations=True)
preprocessed_train, preprocessed_test = (
    preprocessor.preprocess(split) for split in (train_data, test_data)
)

In [None]:
from src.tokenize.spacy_tokenizer import SpacyTokenizer

# Tokenizer option: SpacyTokenizer.
# Tokenize the SimplePreprocessor output rather than the raw `train_data` /
# `test_data`, so this cell consumes the same inputs as the NullTokenizer
# cell and the preprocessing step is not silently bypassed.
# NOTE(review): assumes SpacyTokenizer.tokenize accepts the preprocessor's
# output type — confirm against the tokenizer implementation.
tokenizer = SpacyTokenizer()
tokenized_train = tokenizer.tokenize(preprocessed_train)
tokenized_test = tokenizer.tokenize(preprocessed_test)

In [None]:
from src.tokenize.null_tokenizer import NullTokenizer

# Tokenizer option: NullTokenizer, applied to the preprocessed splits
# (train first, then test). Rebinds `tokenizer` / `tokenized_*`.
tokenizer = NullTokenizer()
tokenized_train, tokenized_test = map(
    tokenizer.tokenize, (preprocessed_train, preprocessed_test)
)

In [None]:
from src.vectorizer.fasttext_w2v_vectorizer import FastTextW2vVectorizer

# Vectorizer option: FastTextW2vVectorizer.
# Fit on the training tokens only, then transform train and test in order.
vectorizer = FastTextW2vVectorizer()
vectorizer.fit(tokenized_train)
vectorized_train, vectorized_test = (
    vectorizer.transform(tokens) for tokens in (tokenized_train, tokenized_test)
)

In [None]:
from src.vectorizer.sk_count_vectorizer import SkCountVectorizer

# Vectorizer option: SkCountVectorizer with ngram_range=(1, 2).
# Fit on the training tokens only, then transform train and test in order.
vectorizer = SkCountVectorizer(ngram_range=(1, 2))
vectorizer.fit(tokenized_train)
vectorized_train, vectorized_test = (
    vectorizer.transform(tokens) for tokens in (tokenized_train, tokenized_test)
)

In [None]:
from src.vectorizer.tfidf_vectorizer import TfidfVectorizer

# Vectorizer option: the project's TfidfVectorizer with default settings.
# Fit on the training tokens only, then transform train and test in order.
vectorizer = TfidfVectorizer()
vectorizer.fit(tokenized_train)
vectorized_train, vectorized_test = (
    vectorizer.transform(tokens) for tokens in (tokenized_train, tokenized_test)
)

In [None]:
from src.vectorizer.lsi_vectorizer import LsiVectorizer

# Vectorizer option: LsiVectorizer with num_topics=100.
# Fit on the training tokens only, then transform train and test in order.
vectorizer = LsiVectorizer(num_topics=100)
vectorizer.fit(tokenized_train)
vectorized_train, vectorized_test = (
    vectorizer.transform(tokens) for tokens in (tokenized_train, tokenized_test)
)

In [None]:
from sklearn.manifold import TSNE

# Project the training vectors to 2-D for visual inspection.
# - `n_iter=1000` is dropped: 1000 iterations is scikit-learn's default, and
#   the `n_iter` keyword was deprecated in scikit-learn 1.5 in favour of
#   `max_iter`, so relying on the default works across versions.
# - `random_state=seed` pins the stochastic embedding to the notebook's seed,
#   so re-running this cell yields the same layout regardless of how much of
#   the global RNG stream earlier cells consumed.
tsne_model = TSNE(n_components=2, perplexity=80, random_state=seed)
tsne_embedding = tsne_model.fit_transform(vectorized_train.vectors)
tsne_embedding.shape

In [None]:
# Two-column frame of t-SNE coordinates plus a `label` column taken from the
# training set, rendered as a scatter plot coloured by label.
plot_tsne = pd.DataFrame(tsne_embedding, columns=['tsne1', 'tsne2']).assign(
    label=vectorized_train.labels
)
fig = px.scatter(plot_tsne, x='tsne1', y='tsne2', color='label')
fig.show()

In [None]:
from sklearn.metrics import f1_score
from sklearn.svm import SVC


# Classifier option: RBF-kernel SVC fitted on the training vectors.
model = SVC(kernel='rbf', C=10.0, gamma=0.001)
model.fit(vectorized_train.vectors, vectorized_train.label_indices)

# Score the training split with macro-averaged F1.
y_pred_train = model.predict(vectorized_train.vectors)
training_score = f1_score(
    y_true=vectorized_train.label_indices, y_pred=y_pred_train, average='macro'
)
print(f'{training_score=}')

# Score the test split the same way.
y_pred_test = model.predict(vectorized_test.vectors)
testing_score = f1_score(
    y_true=vectorized_test.label_indices, y_pred=y_pred_test, average='macro'
)
print(f'{testing_score=}')

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

# Classifier option: LogisticRegression with scikit-learn's default settings.
model = LogisticRegression()
model.fit(vectorized_train.vectors, vectorized_train.label_indices)

# Macro-averaged F1 on the data the model was fitted on.
y_pred_train = model.predict(vectorized_train.vectors)
training_score = f1_score(
    y_true=vectorized_train.label_indices, y_pred=y_pred_train, average='macro'
)
print(f'{training_score=}')

# Macro-averaged F1 on the held-out test vectors.
y_pred_test = model.predict(vectorized_test.vectors)
testing_score = f1_score(
    y_true=vectorized_test.label_indices, y_pred=y_pred_test, average='macro'
)
print(f'{testing_score=}')

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

# Classifier option: random forest.
# `random_state=seed` ties the forest's bootstrap and feature sampling to the
# notebook's seed; without it the scores below depend on whatever state the
# global NumPy RNG is in when this cell happens to run.
model = RandomForestClassifier(
    n_estimators=2000,
    max_depth=100,
    min_samples_leaf=5,
    n_jobs=-1,
    random_state=seed,
)
model.fit(vectorized_train.vectors, vectorized_train.label_indices)

# Macro-averaged F1 on the training split.
y_pred_train = model.predict(vectorized_train.vectors)
training_score = f1_score(vectorized_train.label_indices, y_pred_train, average='macro')
print(f'{training_score=}')

# Macro-averaged F1 on the test split.
y_pred_test = model.predict(vectorized_test.vectors)
testing_score = f1_score(vectorized_test.label_indices, y_pred_test, average='macro')
print(f'{testing_score=}')