In [1]:
import pandas as pd
pd.options.plotting.backend = "plotly"

In [2]:
df_translation_papers = pd.read_parquet('data/translation_papers.parquet')

In [3]:
def find_index_conclusion(text_list):
    for i, text in enumerate(text_list):
        if 'conclusion' in text[0].lower():
            return i
    return -1

In [4]:
# Function to split text into title, abstract, and body
def split_text(text_list):
    index_conclusion = find_index_conclusion(text_list)
    title = " ".join(text_list[0])  # First list, first element
    abstract = " ".join(text_list[1])  # Second list, first element
    conclusion = " ".join(text_list[index_conclusion])

    return title, abstract, conclusion


# Apply function to create new columns
new_translation_df = df_translation_papers.copy()
new_translation_df[["title", "abstract", "conclusion"]] = new_translation_df["text"].apply(
    lambda x: pd.Series(split_text(x))
)

In [5]:
# Remove the first 10 characters from the 'abstract' column
new_translation_df["abstract"] = new_translation_df["abstract"].apply(
    lambda x: x[10:] if isinstance(x, str) else x
)

# Remove the first 13 characters from the 'body' column
new_translation_df["conclusion"] = new_translation_df["conclusion"].apply(
    lambda x: x[11:] if isinstance(x, str) else x
)

## Training

In [6]:
import os
import torch
from sklearn.metrics import classification_report
from helper_functions import calculate_results
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
import matplotlib.pyplot as plt
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from keras.models import Sequential
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
import random

In [7]:
tf.config.set_visible_devices([], "GPU")

In [8]:
label_encoder = LabelEncoder()
one_hot_encoder = OneHotEncoder(sparse=False)

claim_data_dir = "./Claim-Extraction-dataset/"
claim_filenames = [claim_data_dir + filename for filename in os.listdir(claim_data_dir)]

claim_train_data = pd.read_json(claim_filenames[1], lines=True)
claim_test_data = pd.read_json(claim_filenames[0], lines=True)
claim_val_data = pd.read_json(claim_filenames[2], lines=True)


def sentences_label(claim_train_data):
    ID = []
    sentences = []
    labels = []
    list_df = []
    claim_sentences = claim_train_data["sentences"]
    claim_labels = claim_train_data["labels"]
    paper_id = claim_train_data["paper_id"]
    for i in range(len(claim_sentences)):
        for j in range(len(claim_sentences[i])):
            sentences.append(claim_sentences[i][j])
            labels.append(claim_labels[i][j])
            ID.append(paper_id[i])
            list_df.append([paper_id[i], claim_labels[i][j], claim_sentences[i][j]])
    df = pd.DataFrame(list_df, columns=["Paper Ids", "Label", "Sentences"])
    return df


claim_train_sentences_df = sentences_label(claim_train_data)
claim_train_sentences = claim_train_sentences_df["Sentences"].tolist()
claim_train_labels_one_hot = one_hot_encoder.fit_transform(
    claim_train_sentences_df["Label"].to_numpy().reshape(-1, 1)
)
claim_label_encoder = LabelEncoder()
claim_train_labels_encoded = claim_label_encoder.fit_transform(
    claim_train_sentences_df["Label"].to_numpy()
)

claim_test_sentences_df = sentences_label(claim_test_data)
claim_test_sentences = claim_test_sentences_df["Sentences"].tolist()
claim_test_labels_one_hot = one_hot_encoder.fit_transform(
    claim_test_sentences_df["Label"].to_numpy().reshape(-1, 1)
)
claim_test_labels_encoded = claim_label_encoder.fit_transform(
    claim_test_sentences_df["Label"].to_numpy()
)

claim_val_sentences_df = sentences_label(claim_val_data)
claim_val_sentences = claim_val_sentences_df["Sentences"].tolist()
claim_val_labels_one_hot = one_hot_encoder.fit_transform(
    claim_val_sentences_df["Label"].to_numpy().reshape(-1, 1)
)
claim_val_labels_encoded = claim_label_encoder.fit_transform(
    claim_val_sentences_df["Label"].to_numpy()
)


# Get class names and number of classes from LabelEncoder instance
num_classes = len(claim_label_encoder.classes_)
class_names = claim_label_encoder.classes_
print(num_classes, class_names)

sen_len = [len(sentences.split()) for sentences in claim_train_sentences]
avg_sen_len = np.mean(sen_len)
print(avg_sen_len)


def count_words(claim_train_sentences):
    total = []
    total_unique = []
    for i in range(len(claim_train_sentences)):
        text_tokens = word_tokenize(claim_train_sentences[i])
        # tokens_without_sw = [word for word in text_tokens if not word in stopwords.words()]
        total.append(text_tokens)
    for i in range(len(total)):
        for j in range(len(total[i])):
            total_unique.append(total[i][j])
    words = list(set((total_unique)))
    n_words = len(words)
    return n_words


claim_max_token = count_words(claim_train_sentences)

claim_text_vectorizer = TextVectorization(
    max_tokens=claim_max_token,
    standardize="lower_and_strip_punctuation",
    output_sequence_length=55,
)
# Adapt text vectorizer to training sentences
claim_text_vectorizer.adapt(claim_train_sentences)

# viewing vectorize training sentences
target_sentence = random.choice(claim_train_sentences)
print(f"Text:\n{target_sentence}")
print(f"\nLength of text: {len(target_sentence.split())}")
print(f"\nVectorized text:\n{claim_text_vectorizer([target_sentence])}")


# Getting the vocabulary and showing most frequent and least frequest words in the vocabulary
claim_text_vocab = claim_text_vectorizer.get_vocabulary()
most_common = claim_text_vocab[:5]
least_common = claim_text_vocab[-5:]
print(f"Number of words in vocabulary: {len(claim_text_vocab)}"),
print(f"Most common words in the vocabulary: {most_common}")
print(f"Least common words in the vocabulary: {least_common}")

# Get the config of our text vectorizer
claim_text_vectorizer.get_config()

token_embed = layers.Embedding(
    input_dim=len(claim_text_vocab), output_dim=128, mask_zero=True, input_length=55
)


print(f"Sentence before Vectorization : \n{target_sentence}\n")
vec_sentence = claim_text_vectorizer([target_sentence])
print(f"Sentence After vectorization :\n {vec_sentence}\n")
embed_sentence = token_embed(vec_sentence)
print(f"Embedding Sentence :\n{embed_sentence}\n")

# Turn our data into TensorFlow Dataset
train_dataset = tf.data.Dataset.from_tensor_slices(
    (claim_train_sentences, claim_train_labels_one_hot)
)
valid_dataset = tf.data.Dataset.from_tensor_slices(
    (claim_val_sentences, claim_val_labels_one_hot)
)
test_dataset = tf.data.Dataset.from_tensor_slices(
    (claim_test_sentences, claim_test_labels_one_hot)
)

# Take the TensorSliceDataset's and turn them into prefetched batches
train_dataset = train_dataset.batch(32).prefetch(tf.data.AUTOTUNE)
valid_dataset = valid_dataset.batch(32).prefetch(tf.data.AUTOTUNE)
test_dataset = test_dataset.batch(32).prefetch(tf.data.AUTOTUNE)

inputs = layers.Input(shape=(1,), dtype=tf.string)
text_vector = claim_text_vectorizer(inputs)
embed = token_embed(text_vector)
x = layers.Conv1D(
    filters=64,
    kernel_size=5,
    padding="same",
    activation="relu",
    kernel_regularizer=tf.keras.regularizers.L2(0.01),
)(embed)
x = layers.GlobalMaxPool1D()(x)
x = layers.Dropout(0.2)(x)
outputs = layers.Dense(num_classes, activation="softmax")(x)

model = tf.keras.Model(inputs, outputs)
model.summary()

model.compile(optimizer="Adam", loss="categorical_crossentropy", metrics=["accuracy"])
model_1_history = model.fit(
    train_dataset,
    steps_per_epoch=int(0.1 * len(train_dataset)),
    epochs=15,
    validation_data=valid_dataset,
    validation_steps=int(0.1 * len(valid_dataset)),
)

2 ['0' '1']
24.318397827562798


2024-12-11 16:16:32.393331: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Text:
To address this inconsistency, and to shed light on the potential role of carabao in the transmission of S. japonicum in the Philippines, we undertook a pilot survey, collecting fecal samples from animals in Western Samar Province and we used a combination of molecular and copro-parasitological techniques to determine the prevalence and intensity of S. japonicum.

Length of text: 56

Vectorized text:
[[   6 1055   16 8624    4    6 4308  405   19    2   99   50    3 1925
     5    2  712    3  518 1152    5    2 2602   12 6115    7 4571 2021
  9965 5168  371   20  284    5 1230 6863 3235    4   12   64    7  391
     3  134    4 2898  958    6  628    2  605    4 1157    3  518]]
Number of words in vocabulary: 11218
Most common words in the vocabulary: ['', '[UNK]', 'the', 'of', 'and']
Least common words in the vocabulary: ['0027', '0021', '001p009', '00174', '0006']
Sentence before Vectorization : 
To address this inconsistency, and to shed light on the potential role of carabao

## Claim Extraction

In [9]:
data = new_translation_df


# Function to extract claims from a text block
def extract_claims(text):
    sentences = text.split(". ")  # Split into sentences
    results = model.predict(sentences)
    claims = [sentence for sentence, result in zip(sentences, results) if result[0] > 0.75]
    return claims


# Process each abstract and body
results = []
for idx, row in data.iterrows():
    id_paper = row["id"]
    title = row["title"]
    abstract = row["abstract"]
    conclusion = row["conclusion"]
    # body = row["body"]

    # Extract claims
    abstract_claims = extract_claims(abstract)
    conclusion_claims = extract_claims(conclusion)
    # body_claims = extract_claims(body)

    results.append({"id": id_paper,"title": title, "abstract_claims": abstract_claims, "conclusion_claims": conclusion_claims})

# Convert results to a DataFrame
results_df = pd.DataFrame(results)



## Author and insitution extraction :

In [10]:
import arxiv


def get_main_author(arxiv_id):
    # Search for the paper using the arXiv ID
    search = arxiv.Search(id_list=[arxiv_id])

    # Fetch the result
    result = next(search.results())

    # Get the main author (first author)
    main_author = result.authors[0]

    return main_author.name

In [11]:
results_df["main_author"] = results_df["id"].apply(get_main_author)

  result = next(search.results())


In [12]:
from pyalex import Works
import time
from tqdm import tqdm

def get_main_author_institutions(df):
    """Extract first author institution for each paper using OpenAlex"""
    results = []

    for _, row in tqdm(df.iterrows(), total=len(df)):
        try:
            # Extract title from nested list structure
            title = row["title"]

            # Search OpenAlex by title
            work = Works().search_filter(title=title).get()

            if work and len(work) > 0:
                paper = work[0]

                # Get first author's institution
                if paper['authorships'] and len(paper['authorships']) > 0:
                    first_author = paper['authorships'][0]
                    institution = (
                        first_author['institutions'][0]['display_name']
                        if first_author['institutions']
                        else "No institution found"
                    )
                    author_name = first_author['author']['display_name']
                else:
                    institution = "No authors found"
                    author_name = "Unknown"

                results.append(
                    {
                        "title": title,
                        "main_author": author_name,
                        "institution": institution,
                        "year": paper['publication_year'],
                    }
                )
            else:
                results.append(
                    {
                        "title": title,
                        "main_author": "Not found",
                        "institution": "Paper not found in OpenAlex",
                        "year": None,
                    }
                )

            time.sleep(0.1)  # Rate limiting

        except Exception as e:
            results.append(
                {
                    "title": title,
                    "main_author": "Error",
                    "institution": f"Error: {str(e)}",
                    "year": None,
                }
            )

    return pd.DataFrame(results)


# Process your results_df
institutions_df = get_main_author_institutions(results_df)

100%|██████████| 355/355 [05:00<00:00,  1.18it/s]


In [13]:
institutions_df_cleaned = institutions_df[institutions_df["institution"] != "No institution found"]

In [14]:
def academic_or_industry(institution):
    if "university" in institution.lower() or "college" in institution.lower() or "institute" in institution.lower():
        return "Academic"
    else:
        return "Industry"

In [15]:
institutions_df_cleaned["institution_type"] = institutions_df_cleaned["institution"].apply(academic_or_industry)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  institutions_df_cleaned["institution_type"] = institutions_df_cleaned["institution"].apply(academic_or_industry)


In [16]:
industry_df = institutions_df_cleaned[
    (institutions_df_cleaned["institution_type"] == "Industry")
    & (institutions_df_cleaned["year"].notnull())
    & ~(institutions_df_cleaned["institution"].str.lower().str.contains("univ"))
    & ~(institutions_df_cleaned["institution"].str.lower().str.contains("instit"))
    & ~(institutions_df_cleaned["institution"].str.lower().str.contains("eth"))
    & ~(
        institutions_df_cleaned["institution"].str.lower().str.contains("polytechnique")
    )
    & ~(institutions_df_cleaned["institution"].str.lower().str.contains("mit"))
    & ~(institutions_df_cleaned["institution"].str.lower().str.contains("instit"))
    & ~(institutions_df_cleaned["institution"].str.lower().str.contains("national"))
    & ~(institutions_df_cleaned["institution"].str.lower().str.contains("beijing"))
    & ~(institutions_df_cleaned["institution"].str.lower().str.contains("commonwealth"))
    & ~(institutions_df_cleaned["institution"].str.lower().str.contains("bruno"))
    & ~(institutions_df_cleaned["institution"].str.lower().str.contains("laboratoire d'"))
]

## Saving

In [17]:
industry_df_complete = results_df.merge(industry_df, on=["title","main_author"], how="inner")
academic_df_complete = results_df.merge(
    institutions_df_cleaned[
        (institutions_df_cleaned["institution_type"] == "Academic")
        & (institutions_df_cleaned["year"].notnull())
    ]
    .drop_duplicates(subset=["title"]),
    on=["title","main_author"],
    how="inner",
)

In [18]:
df_industry_really_complete =industry_df_complete.merge(new_translation_df, on=["title","id"], how="inner")
df_academic_really_complete =academic_df_complete.merge(new_translation_df, on=["title","id"], how="inner")

In [19]:
df_academic_really_complete.to_parquet("data/academic_really_complete.parquet")
df_industry_really_complete.to_parquet("data/industry_really_complete.parquet")

In [None]:
from collections import Counter
import nltk
from nltk.corpus import stopwords
import re


def get_word_frequencies(titles):
    # Combine all titles into one string and convert to lowercase
    text = " ".join([title.lower() for title in titles])

    # Remove special characters and numbers
    text = re.sub(r"[^\w\s]", "", text)
    text = re.sub(r"\d+", "", text)

    # Split into words
    words = text.split()

    # Get stopwords and add custom ones
    stop_words = set(stopwords.words("english"))
    custom_stops = {"using", "based", "via", "towards", "approach", "study", "analysis"}
    stop_words.update(custom_stops)

    # Remove stopwords
    words = [word for word in words if word not in stop_words]

    # Count frequencies
    return Counter(words).most_common(10)


# Get frequencies for both categories
industry_freq = get_word_frequencies(df_industry_really_complete["title"])
academic_freq = get_word_frequencies(df_academic_really_complete["title"])

# Display results
print("Top 10 words in industry titles:")
for word, count in industry_freq:
    print(f"{word}: {count}")

print("\nTop 10 words in academic titles:")
for word, count in academic_freq:
    print(f"{word}: {count}")

Top 10 words in industry titles:
neural: 6
translation: 5
language: 4
machine: 3
show: 3
models: 3
pretraining: 2
generation: 2
fully: 2
embedding: 2

Top 10 words in academic titles:
language: 29
neural: 23
translation: 17
machine: 16
natural: 13
processing: 11
networks: 10
word: 9
survey: 8
learning: 8


In [168]:
# Get frequencies for both categories
industry_freq = get_word_frequencies(df_industry_really_complete["abstract"])
academic_freq = get_word_frequencies(df_academic_really_complete["abstract"])

# Display results
print("Top 10 words in industry abstracts:")
for word, count in industry_freq:
    print(f"{word}: {count}")

print("\nTop 10 words in academic abstracts:")
for word, count in academic_freq:
    print(f"{word}: {count}")

Top 10 words in industry abstracts:
language: 49
models: 38
translation: 33
natural: 32
processing: 29
model: 28
tasks: 27
machine: 24
show: 18
sentence: 18

Top 10 words in academic abstracts:
language: 301
translation: 194
models: 186
natural: 164
model: 149
processing: 148
tasks: 146
machine: 128
languages: 109
data: 107


In [169]:
# Get frequencies for both categories
industry_freq = get_word_frequencies(df_industry_really_complete["conclusion"])
academic_freq = get_word_frequencies(df_academic_really_complete["conclusion"])

# Display results
print("Top 10 words in industry conclusions:")
for word, count in industry_freq:
    print(f"{word}: {count}")

print("\nTop 10 words in academic conclusions:")
for word, count in academic_freq:
    print(f"{word}: {count}")

Top 10 words in industry conclusions:
model: 33
image: 33
models: 27
translation: 24
method: 22
results: 20
mathcal: 19
language: 17
languages: 16
neural: 16

Top 10 words in academic conclusions:
models: 156
model: 149
language: 131
tasks: 112
translation: 110
work: 106
future: 105
data: 95
different: 81
languages: 81
