In [None]:
import pandas as pd
import json
import os
from tqdm import tqdm
import string
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
def _read_split_records(dataset_path, split_name):
    """Return the parsed content of ``<dataset_path>/<split_name>.json``."""
    split_file = os.path.join(dataset_path, split_name + ".json")
    # JSON is read as UTF-8 explicitly so the result does not depend on the
    # platform's default text encoding.
    with open(split_file, encoding="utf-8") as f_obj:
        return json.load(f_obj)


def load_articles(dataset_path, articles_dir="articles"):
    """Load the sentences of every premise article referenced by the dataset.

    The train, valid and test splits are read from ``dataset_path``; each
    record's ``record["metadata"]["premise_articles"]`` values are treated as
    article file paths relative to ``articles_dir``.

    Args:
        dataset_path: Directory containing ``train.json``, ``valid.json`` and
            ``test.json``.
        articles_dir: Directory the article paths are resolved against.
            Defaults to ``"articles"`` (relative to the current working
            directory).

    Returns:
        A tuple ``(rel_article_path_to_sents, all_rel_sentences)``:

        * ``rel_article_path_to_sents`` maps each referenced article path to
          the JSON content of that article file.
        * ``all_rel_sentences`` is the concatenation of the article contents
          in reference order. An article referenced several times contributes
          its content once per reference.

    Raises:
        FileNotFoundError: If a split file or a referenced article file does
            not exist.
    """
    train_records = _read_split_records(dataset_path, "train")
    valid_records = _read_split_records(dataset_path, "valid")
    test_records = _read_split_records(dataset_path, "test")
    all_records = train_records + valid_records + test_records

    relevant_article_paths = [
        article_path
        for record in all_records
        for article_path in record["metadata"]["premise_articles"].values()
    ]

    rel_article_path_to_sents = {}
    all_rel_sentences = []
    for rel_article_path in tqdm(relevant_article_paths):
        # Read each article file only once; repeated references reuse the
        # already parsed content.
        if rel_article_path not in rel_article_path_to_sents:
            article_file = os.path.join(articles_dir, rel_article_path)
            with open(article_file, encoding="utf-8") as f_obj:
                rel_article_path_to_sents[rel_article_path] = json.load(f_obj)
        all_rel_sentences.extend(rel_article_path_to_sents[rel_article_path])

    return (
        rel_article_path_to_sents,
        all_rel_sentences
    )

In [None]:
def preprocess_sentence(sentence):
    """Normalise a sentence, or return ``""`` when it should be discarded.

    The sentence is lower-cased. It is discarded when it contains ``link:``
    or ``http``, or when it has four or fewer whitespace-separated tokens.
    Otherwise ASCII punctuation is removed and every run of whitespace is
    collapsed to a single space.

    Args:
        sentence: The raw sentence text.

    Returns:
        The cleaned sentence, without leading or trailing whitespace, or
        ``""`` if the sentence was discarded.
    """
    sentence = sentence.lower()
    # remove link, website or short keywords
    if ('link:' in sentence) or ('http' in sentence) or len(sentence.split()) <= 4:
        return ""
    # remove punctuation
    sentence = sentence.translate(str.maketrans('', '', string.punctuation))
    # remove additional space: str.replace() only matches literal text, so a
    # regex pattern such as r'\s+' never matched anything; split/join
    # collapses every whitespace run instead.
    return " ".join(sentence.split())

In [None]:
def _claim_text(metadata):
    """Return the stripped claim, followed by the claimant when one is given."""
    claim = metadata["claim"].strip()
    claimant = metadata["claimant"]
    if claimant is None:
        return claim
    return (claim + " " + claimant.strip()).strip()


def _collect_evidence(premise_articles, article_path_to_sents):
    """Return one list of cleaned sentences per known, non-empty article."""
    evidence_sentences = []
    for article_path in premise_articles.values():
        if article_path not in article_path_to_sents:
            continue

        article_sentences = []
        for sent in article_path_to_sents[article_path]:
            sent = preprocess_sentence(sent).strip()
            if sent != "":
                article_sentences.append(sent)

        # Articles whose sentences were all discarded contribute nothing.
        if article_sentences:
            evidence_sentences.append(article_sentences)
    return evidence_sentences


def _build_split_frame(dataset_path, split_name, article_path_to_sents, labelled):
    """Build the DataFrame for ``<dataset_path>/<split_name>.json``.

    When ``labelled`` is false the record's label is not read and the rating
    is set to 0.
    """
    split_file = os.path.join(dataset_path, split_name + ".json")
    with open(split_file, encoding="utf-8") as f_obj:
        records = json.load(f_obj)

    rows = []
    for record in tqdm(records):
        metadata = record["metadata"]
        row = {
            "claim_id": metadata["id"],
            "text": _claim_text(metadata),
        }

        evidence_sentences = _collect_evidence(
            metadata["premise_articles"], article_path_to_sents
        )
        # Records without any usable evidence are dropped from the split.
        if len(evidence_sentences) == 0:
            continue

        row["evidence_sents"] = evidence_sentences
        row["rating"] = record["label"]["rating"] if labelled else 0
        rows.append(row)

    return pd.DataFrame.from_records(rows)


def load_data(dataset_path, article_path_to_sents, tfidf_vectorizer):
    """Build the train, valid and test DataFrames with their evidence sentences.

    Each kept record becomes a row with the columns ``claim_id``, ``text``
    (claim plus claimant, when present), ``evidence_sents`` (one list of
    preprocessed sentences per premise article) and ``rating``. Premise
    articles missing from ``article_path_to_sents`` are skipped, and records
    left with no evidence are dropped. The train and valid frames are printed.

    Args:
        dataset_path: Directory containing ``train.json``, ``valid.json`` and
            ``test.json``.
        article_path_to_sents: Mapping from article path to its sentences.
        tfidf_vectorizer: Unused. Earlier code called ``transform`` on every
            article's sentences and discarded the result; the parameter is
            kept so existing calls continue to work.

    Returns:
        A tuple ``(train_df, valid_df, test_df)``. ``rating`` comes from
        ``record["label"]["rating"]`` for train and valid, and is always 0
        for test.
    """
    train_df = _build_split_frame(
        dataset_path, "train", article_path_to_sents, labelled=True
    )
    print(train_df)

    valid_df = _build_split_frame(
        dataset_path, "valid", article_path_to_sents, labelled=True
    )
    print(valid_df)

    test_df = _build_split_frame(
        dataset_path, "test", article_path_to_sents, labelled=False
    )
    return train_df, valid_df, test_df

In [None]:
# Driver: load the articles, fit a TF-IDF vectorizer on their sentences,
# build the three split frames and pickle them next to the dataset.
dataset_path = "./data"

print("Loading articles...")
rel_article_path_to_sents, all_rel_sentences = load_articles(dataset_path)

print("Generating tf idf vectorizer...")
vectorizer = TfidfVectorizer()
vectorizer.fit(all_rel_sentences)

print("Generating dataframes with relevant articles...")
train_w, valid_w, test_w = load_data(
    dataset_path,
    rel_article_path_to_sents,
    vectorizer,
)

# Write each split frame to <dataset_path>/<split>.pkl.
for _pickle_name, _split_frame in (
    ("train.pkl", train_w),
    ("valid.pkl", valid_w),
    ("test.pkl", test_w),
):
    _split_frame.to_pickle(os.path.join(dataset_path, _pickle_name))