In [1]:
import pandas as pd 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from nltk.corpus import stopwords
import nltk
import tensorflow as tf

In [2]:
from tensorflow.keras import layers
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
import os
import torch
from sklearn.metrics import classification_report
from helper_functions import calculate_results
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
import matplotlib.pyplot as plt
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from keras.models import Sequential
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
import random

In [4]:
# Read ./data/cleaned_df_arxiv.parquet into a DataFrame.
# NOTE(review): `df_papers` is not referenced by the training cells visible
# below; presumably it is used later in the notebook — confirm.
df_papers = pd.read_parquet("./data/cleaned_df_arxiv.parquet")

## Training

In [5]:
# Give TensorFlow an empty list of visible GPU devices, i.e. hide every GPU so
# the work below is placed on the CPU.
# NOTE(review): this presumably has to run before TensorFlow initializes its
# devices — keep this cell ahead of any cell that builds tensors or layers.
tf.config.set_visible_devices([], "GPU")

In [6]:
# Encoders for the claim labels. `sparse=False` asks OneHotEncoder for a dense
# array instead of a SciPy sparse matrix.
# NOTE(review): scikit-learn 1.2 renamed `sparse` to `sparse_output`; switch
# the keyword if the environment is upgraded — confirm the installed version.
label_encoder = LabelEncoder()
one_hot_encoder = OneHotEncoder(sparse=False)

claim_data_dir = "./Claim-Extraction-dataset/"
# os.listdir() returns entries in arbitrary, platform-dependent order. The
# splits below are picked by position, so sort the listing to make the
# selection reproducible across machines and runs.
claim_filenames = [
    os.path.join(claim_data_dir, filename)
    for filename in sorted(os.listdir(claim_data_dir))
]

# Positional mapping after sorting: [0] = test, [1] = train, [2] = validation.
# NOTE(review): this assumes the directory holds exactly the three split files
# and that their names sort as test < train < val — verify against the folder.
claim_train_data = pd.read_json(claim_filenames[1], lines=True)
claim_test_data = pd.read_json(claim_filenames[0], lines=True)
claim_val_data = pd.read_json(claim_filenames[2], lines=True)


def sentences_label(claim_train_data):
    """Flatten per-paper sentence/label lists into one row per sentence.

    Args:
        claim_train_data: DataFrame with the columns ``"paper_id"``,
            ``"sentences"`` and ``"labels"``. Each ``"sentences"`` cell is a
            sequence of sentences and the matching ``"labels"`` cell holds one
            label per sentence, in the same order.

    Returns:
        A DataFrame with the columns ``"Paper Ids"``, ``"Label"`` and
        ``"Sentences"`` containing one row per sentence, in input order.

    Raises:
        IndexError: if a paper has fewer labels than sentences.
    """
    rows = []
    # zip() walks the three columns by position, so the function also works
    # when the frame does not carry a default 0..n-1 index (e.g. after
    # filtering or shuffling), where label-based ``series[i]`` would fail.
    per_paper = zip(
        claim_train_data["paper_id"],
        claim_train_data["sentences"],
        claim_train_data["labels"],
    )
    for paper_id, paper_sentences, paper_labels in per_paper:
        for position, sentence in enumerate(paper_sentences):
            rows.append([paper_id, paper_labels[position], sentence])
    return pd.DataFrame(rows, columns=["Paper Ids", "Label", "Sentences"])


# --- Training split: the only split the encoders are fitted on -------------
claim_train_sentences_df = sentences_label(claim_train_data)
claim_train_sentences = claim_train_sentences_df["Sentences"].tolist()
# OneHotEncoder expects a 2-D column, hence the reshape(-1, 1).
claim_train_labels_one_hot = one_hot_encoder.fit_transform(
    claim_train_sentences_df["Label"].to_numpy().reshape(-1, 1)
)
claim_label_encoder = LabelEncoder()
claim_train_labels_encoded = claim_label_encoder.fit_transform(
    claim_train_sentences_df["Label"].to_numpy()
)

# --- Test split --------------------------------------------------------------
# Use transform(), not fit_transform(): re-fitting on held-out data would
# rebuild the label -> index mapping from that split, so it is not guaranteed
# to match the mapping the model is trained with.
claim_test_sentences_df = sentences_label(claim_test_data)
claim_test_sentences = claim_test_sentences_df["Sentences"].tolist()
claim_test_labels_one_hot = one_hot_encoder.transform(
    claim_test_sentences_df["Label"].to_numpy().reshape(-1, 1)
)
claim_test_labels_encoded = claim_label_encoder.transform(
    claim_test_sentences_df["Label"].to_numpy()
)

# --- Validation split (same training-fitted encoders) -----------------------
claim_val_sentences_df = sentences_label(claim_val_data)
claim_val_sentences = claim_val_sentences_df["Sentences"].tolist()
claim_val_labels_one_hot = one_hot_encoder.transform(
    claim_val_sentences_df["Label"].to_numpy().reshape(-1, 1)
)
claim_val_labels_encoded = claim_label_encoder.transform(
    claim_val_sentences_df["Label"].to_numpy()
)


# The fitted LabelEncoder exposes the class names; their count is the number
# of output classes.
class_names = claim_label_encoder.classes_
num_classes = len(class_names)
print(num_classes, class_names)

# Average number of whitespace-separated tokens per training sentence.
sen_len = [len(sentence.split()) for sentence in claim_train_sentences]
avg_sen_len = np.mean(sen_len)
print(avg_sen_len)


def count_words(claim_train_sentences):
    """Return the number of distinct tokens found across all sentences.

    Every sentence is split with ``word_tokenize`` and the tokens are compared
    exactly as produced: no lower-casing and no stop-word filtering is applied.

    Args:
        claim_train_sentences: sequence of sentence strings.

    Returns:
        The size of the set of unique tokens (0 for an empty input).
    """
    vocabulary = set()
    for sentence in claim_train_sentences:
        vocabulary.update(word_tokenize(sentence))
    return len(vocabulary)


# Vocabulary cap for the vectorizer: the number of distinct tokens that
# count_words() finds in the training sentences.
claim_max_token = count_words(claim_train_sentences)

# String -> integer-id layer. It lower-cases the text, strips punctuation and
# emits exactly 55 ids per sentence.
# NOTE(review): 55 is a magic number here; presumably chosen from the sentence
# length distribution (the mean printed above is far lower) — confirm.
claim_text_vectorizer = TextVectorization(
    max_tokens=claim_max_token,
    standardize="lower_and_strip_punctuation",
    output_sequence_length=55,
)
# Adapt text vectorizer to training sentences
claim_text_vectorizer.adapt(claim_train_sentences)

# Show one randomly chosen training sentence next to its vectorized form.
# `random` is not seeded in the cells shown, so the sample changes between runs.
target_sentence = random.choice(claim_train_sentences)
print(f"Text:\n{target_sentence}")
print(f"\nLength of text: {len(target_sentence.split())}")
print(f"\nVectorized text:\n{claim_text_vectorizer([target_sentence])}")


# Get the vocabulary and show its most frequent and least frequent words
claim_text_vocab = claim_text_vectorizer.get_vocabulary()
most_common = claim_text_vocab[:5]
least_common = claim_text_vocab[-5:]
# (A stray trailing comma after this print used to build a throwaway tuple.)
print(f"Number of words in vocabulary: {len(claim_text_vocab)}")
print(f"Most common words in the vocabulary: {most_common}")
print(f"Least common words in the vocabulary: {least_common}")

# Get the config of our text vectorizer. The return value is discarded because
# this is not the last expression of the cell, so nothing is displayed.
claim_text_vectorizer.get_config()

# Embedding layer with one 128-dimensional vector per vocabulary entry.
# mask_zero=True marks id 0 (the '' padding entry at the front of the
# vocabulary) as padding; input_length matches the vectorizer's 55-id output.
token_embed = layers.Embedding(
    input_dim=len(claim_text_vocab), output_dim=128, mask_zero=True, input_length=55
)


# Walk the sample sentence through vectorization and then embedding, printing
# each intermediate result.
print(f"Sentence before Vectorization : \n{target_sentence}\n")
vec_sentence = claim_text_vectorizer([target_sentence])
print(f"Sentence After vectorization :\n {vec_sentence}\n")
embed_sentence = token_embed(vec_sentence)
print(f"Embedding Sentence :\n{embed_sentence}\n")

# Build one tf.data pipeline per split in a single expression:
# (sentences, one-hot labels) slices -> batches of 32 -> prefetch with AUTOTUNE.
train_dataset = (
    tf.data.Dataset.from_tensor_slices(
        (claim_train_sentences, claim_train_labels_one_hot)
    )
    .batch(32)
    .prefetch(tf.data.AUTOTUNE)
)
valid_dataset = (
    tf.data.Dataset.from_tensor_slices(
        (claim_val_sentences, claim_val_labels_one_hot)
    )
    .batch(32)
    .prefetch(tf.data.AUTOTUNE)
)
test_dataset = (
    tf.data.Dataset.from_tensor_slices(
        (claim_test_sentences, claim_test_labels_one_hot)
    )
    .batch(32)
    .prefetch(tf.data.AUTOTUNE)
)

# Conv1D sentence classifier built with the Keras functional API.
# Input: one raw string per example.
inputs = layers.Input(shape=(1,), dtype=tf.string)
# Raw string -> integer token ids -> learned token embeddings.
text_vector = claim_text_vectorizer(inputs)
embed = token_embed(text_vector)
# 64 filters spanning 5 tokens each; "same" padding keeps the sequence length,
# and the kernel carries an L2 penalty with factor 0.01.
x = layers.Conv1D(
    filters=64,
    kernel_size=5,
    padding="same",
    activation="relu",
    kernel_regularizer=tf.keras.regularizers.L2(0.01),
)(embed)
# Keep the maximum activation of every filter over the sequence axis.
x = layers.GlobalMaxPool1D()(x)
x = layers.Dropout(0.2)(x)
# One softmax probability per class.
outputs = layers.Dense(num_classes, activation="softmax")(x)

model = tf.keras.Model(inputs, outputs)
model.summary()

# categorical_crossentropy matches the one-hot labels carried by the datasets.
model.compile(optimizer="Adam", loss="categorical_crossentropy", metrics=["accuracy"])

# Every epoch trains on 10% of the training batches and validates on 10% of
# the validation batches. Over 15 epochs that is about 1.5 passes over the
# training data, and a non-repeating dataset is exhausted part-way through
# (Keras warns "Your input ran out of data" and stops training early), so the
# training pipeline is repeated. The step counts are taken from the finite
# datasets, and max(1, ...) avoids a zero step count on very small splits.
model_1_history = model.fit(
    train_dataset.repeat(),
    steps_per_epoch=max(1, int(0.1 * len(train_dataset))),
    epochs=15,
    validation_data=valid_dataset,
    validation_steps=max(1, int(0.1 * len(valid_dataset))),
)

2 ['0' '1']
24.318397827562798


2024-12-08 15:33:23.792043: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Text:
These results show that coevolution between class I and TAP genes can explain the presence of a single dominantly expressed class I molecule in common chicken MHC haplotypes.

Length of text: 28

Vectorized text:
[[  23   34   65    8 5452   27  392  298    4 4190  101   48 1856    2
   236    3    7  174 3715  243  392  298 1301    5  215 1042  730 1008
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0]]
Number of words in vocabulary: 11218
Most common words in the vocabulary: ['', '[UNK]', 'the', 'of', 'and']
Least common words in the vocabulary: ['0027', '0021', '001p009', '00174', '0006']
Sentence before Vectorization : 
These results show that coevolution between class I and TAP genes can explain the presence of a single dominantly expressed class I molecule in common chicken MHC haplotypes.

Sentence After vectorization :
 [[  23   34   65    8 5452   27  392  298    4 4190  101   48 18