In [1]:
# ! python -m spacy download en_core_web_sm

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import spacy
import numpy as np
import os
import json

# Load the small English spaCy pipeline. In the cells visible here it is used by
# split_abstract to segment abstracts into sentences.
# NOTE(review): the "sm" packages reportedly ship without static word vectors —
# confirm before relying on vector-based features from this pipeline.
nlp = spacy.load("en_core_web_sm")

In [65]:
# Upper bound on the TF-IDF vocabulary size (passed to TfidfVectorizer as max_features).
MAX_FEATURES = 1000

In [209]:
# Load the papers table that the cells below read as `all_papers`. With this line
# commented out, a fresh kernel (Restart & Run All) fails with NameError.
# NOTE(review): unpickling can execute arbitrary code — only load pickles produced by this project.
all_papers = pd.read_pickle("../Pickles/all_papers_with_splits_embeddings_and_firstauthor.pkl")

In [67]:
def split_abstract(abstract_text):
    """Split an abstract into a list of whitespace-stripped sentence strings.

    Sentence boundaries come from the module-level spaCy pipeline ``nlp``.

    Args:
        abstract_text: The abstract as a single string.

    Returns:
        list[str]: One stripped string per sentence in ``doc.sents``.
    """
    doc = nlp(abstract_text)
    # ``Span.text`` is used instead of ``Span.string``: the latter was removed in
    # spaCy v3. Because every sentence is stripped, the result is the same.
    return [sent.text.strip() for sent in doc.sents]

In [68]:
def merge_sents(row):
    """Return the paper title followed by its abstract sentences.

    Args:
        row: Record exposing ``title`` and ``abstract_sents`` attributes
            (e.g. a DataFrame row passed by ``DataFrame.apply(..., axis=1)``).

    Returns:
        list: A new list ``[title, sentence_1, sentence_2, ...]``.
    """
    # Build a fresh list instead of calling ``insert`` on ``row.abstract_sents``:
    # inserting in place would also prepend the title to the stored abstract
    # sentences and would add it again every time the cell is re-run.
    return [row.title, *row.abstract_sents]

In [69]:
# all_papers["abstract_sents"] = all_papers["abstract"].apply(lambda x: split_abstract(x))

In [70]:
# all_papers["abstract_and_title_sents"] = all_papers.apply(lambda x: merge_sents(x), axis=1)

In [71]:
# all_papers.to_pickle("../Pickles/all_papers_with_splits_and_embeddings.pkl")

In [72]:
# pd.read_pickle("../Pickles/all_papers_with_splits_and_embeddings.pkl")

In [73]:
# TODO: tune the vectorizer parameters, and possibly pre-process the titles
# (e.g. filter out formulas and similar noise) before vectorizing.
tfidf_vectorizer = TfidfVectorizer(
    stop_words="english",
    max_features=MAX_FEATURES,
)

In [74]:
# Display a random sample row as a quick look at the available columns.
all_papers.sample()

Unnamed: 0,title,abstract,date,authors,format,subjects,za_id,uri,full_sentences_path,full_scibert_embedding_path,full_sbert_embedings_abstract_title_merged_path,full_sbert_embedings_abstract_title_separate_path,abstract_sents,abstract_and_title_sents,first_author
206,GarNet: A Two-Stream Network for Fast and Accu...,While Physics-Based Simulation (PBS) can acc...,2018-11-27T00:00:00+00:00,"[{'first_name': 'Erhan', 'last_name': 'Gundogd...",scientific paper,[cs.CV],24a9a111571f1d2080446b48a416db6b7389fc39,https://arxiv.org/abs/1811.10983,../zetaobjects_v1/24a9/24a9a111571f1d2080446b4...,../zetaobjects_v1/embeddings/24a9/24a9a111571f...,../zetaobjects_v1/sbert_embedings_abstract_tit...,../zetaobjects_v1/sbert_embedings_abstract_tit...,[While Physics-Based Simulation (PBS) can accu...,[GarNet: A Two-Stream Network for Fast and Acc...,"{'first_name': 'Erhan', 'last_name': 'Gundogdu..."


In [75]:
# Flatten the per-paper sentence lists into one flat list of sentences; this
# list is what the vectorizer is fitted on.
# NOTE(review): titles are not part of this corpus although they are vectorized
# later — confirm this is intended.
CORPUS_MERGE = [sentence for sublist in all_papers.abstract_sents for sentence in sublist]

In [76]:
# Fit the vectorizer on the flattened sentence corpus (each sentence is one document).
tfidf_vectorizer.fit(CORPUS_MERGE)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=1000,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words='english', strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [77]:
def calculate_average_tfidf_vector(list_of_sentences, vectorizer=tfidf_vectorizer):
    """Average the TF-IDF rows of the given sentences.

    Args:
        list_of_sentences: Sentence strings to vectorize.
        vectorizer: Fitted vectorizer exposing ``transform``. Defaults to the
            module-level ``tfidf_vectorizer`` as it existed when this cell ran.

    Returns:
        numpy.ndarray: Mean over axis 0 of the transformed sentence rows.
    """
    sentence_matrix = vectorizer.transform(list_of_sentences)
    column_means = np.mean(sentence_matrix, axis=0)
    return np.array(column_means)

In [79]:
def calculate_average_tfidf_vector_abstract_and_title(sentences, strategy="merge", model=tfidf_vectorizer):
    """Average sentence-level TF-IDF vectors of a title and its abstract.

    Args:
        sentences: Sequence whose first element is treated as the title and
            whose remaining elements are treated as the abstract sentences.
        strategy: ``"merge"`` averages the title row and all abstract sentence
            rows together. ``"separate"`` first averages the abstract sentence
            rows, then averages that result with the title row. Any other value
            prints a warning and falls back to ``"merge"``.
        model: Fitted vectorizer exposing ``transform``. Defaults to the
            module-level ``tfidf_vectorizer`` as it existed when this cell ran.

    Returns:
        numpy.ndarray: The averaged vector.
        NOTE(review): the "separate" result appears to keep a leading axis of
        length 1 while "merge" is reduced to 1-D — confirm that downstream
        consumers expect this difference.
    """
    title = sentences[0]
    abstract_sents = sentences[1:]
    title_vector = model.transform([title]).toarray()

    if strategy not in ("merge", "separate"):
        print("Warning: wrong strategy is used. Use either 'merge' or 'separate'. Proceeding using the 'merge' strategy.")
        strategy = "merge"

    if strategy == "separate":
        # Forward ``model`` so a caller-supplied vectorizer is used for the
        # abstract as well; previously this branch always used the default one.
        average_abstract = calculate_average_tfidf_vector(abstract_sents, vectorizer=model)
        return np.mean([average_abstract, title_vector], axis=0)

    # "merge": stack the title row on top of the abstract rows and average all rows.
    abstract_embeddings = model.transform(abstract_sents).toarray()
    merged = np.concatenate((title_vector, abstract_embeddings), axis=0)
    return np.mean(merged, axis=0)

In [80]:
def store_abstract_tfidf_vector(row, emb_strategy="merge"):
    """Compute the averaged TF-IDF vector of one paper and write it to disk.

    For the ``"merge"`` and ``"separate"`` strategies this saves
    ``embedding.npy`` into a folder derived from
    ``row["full_scibert_embedding_path"]`` and writes a JSON pointer file whose
    path is derived from ``row["full_sentences_path"]``. For any other strategy
    the vector is computed but nothing is stored. Rows with no sentences, or
    whose first sentence is the string ``"None"``, are printed and skipped.

    Args:
        row: Paper record (e.g. a DataFrame row) providing ``abstract_sents``,
            ``za_id``, ``full_sentences_path`` and
            ``full_scibert_embedding_path``.
        emb_strategy: ``"merge"`` or ``"separate"``.

    Returns:
        None
    """
    # NOTE(review): the vector helper treats sentences[0] as the title, but this
    # passes ``abstract_sents`` rather than ``abstract_and_title_sents`` —
    # confirm which column is intended.
    sentences = row.abstract_sents

    if len(sentences) == 0:
        # Nothing to vectorize; indexing sentences[0] below would raise IndexError.
        print("Sentences:", sentences)
        return
    if sentences[0] == "None":
        print("Sentences:", sentences, sentences[0], sentences[0] == "None")
        return

    za_id = row["za_id"]
    za_id_short = za_id[:4]

    # Drops the last 22 characters of the sentences path — presumably the file
    # name, leaving the paper's directory prefix; verify against the data layout.
    path_to_tfidf_representation = row["full_sentences_path"][:-22]

    embedding = calculate_average_tfidf_vector_abstract_and_title(sentences, strategy=emb_strategy)

    if emb_strategy not in ("merge", "separate"):
        # Unknown strategy: the helper has already printed a warning; nothing is stored.
        return

    variant = "merged" if emb_strategy == "merge" else "separate"
    collection = f"tfidf_vector_abstract_title_{variant}"
    folder = (
        row["full_scibert_embedding_path"]
        .replace("zetaobjects_v1/embeddings/", f"zetaobjects_v1/{collection}/")
        .replace("/document_embedding.npy", "")
    )

    # exist_ok avoids the check-then-create race of os.path.exists + os.makedirs.
    os.makedirs(folder, exist_ok=True)
    np.save(f"{folder}/embedding.npy", embedding)

    pointer = {
        "version": "1.0.0",
        "value": {"pointer": f"/data/zetaobjects_v1/{collection}/{za_id_short}/{za_id}/embedding.npy"},
        "type": "pointer",
    }
    with open(f"{path_to_tfidf_representation}tfidf_vector_{variant}", 'w', encoding='utf-8') as f:
        json.dump(pointer, f, ensure_ascii=False, indent=4)

In [81]:
# calculate_average_tfidf_vector_abstract_and_title(all_papers.iloc[265].abstract_sents, "separate")

### Rework: compute the TF-IDF for the whole abstract / abstract + title instead of averaging.
#### Note that the "merge" and "separate" strategies here differ from the "merge" and "separate" strategies used for the BERT-based embeddings! Here, for the "merge" strategy we concatenate the title and the abstract into a single string and vectorize that whole piece of text.
#### For the "separate" strategy, we first calculate the vector for the **whole** abstract text, then the vector for the title alone, and then take the element-wise average of those two vectors.

In [172]:
def calculate_tfidf_vector_paper(row, strategy="merge", vectorizer=tfidf_vectorizer):
    """Compute one TF-IDF vector for a paper from its full title and abstract.

    Args:
        row: Paper record exposing ``title`` and ``abstract`` string attributes.
        strategy: ``"merge"`` vectorizes ``title + " " + abstract`` as a single
            text. ``"separate"`` vectorizes the title and the abstract
            independently and returns their element-wise mean. Any other value
            prints a warning and falls back to ``"merge"``.
        vectorizer: Fitted vectorizer exposing ``transform``. Defaults to the
            module-level ``tfidf_vectorizer`` as it existed when this cell ran.

    Returns:
        numpy.ndarray: The first (only) row of the dense transform output, or
        the mean of two such rows for ``"separate"``.
    """
    abstract = row.abstract
    title = row.title

    if strategy == "separate":
        title_vector = vectorizer.transform([title]).toarray()[0]
        abstract_vector = vectorizer.transform([abstract]).toarray()[0]
        return np.mean([abstract_vector, title_vector], axis=0)

    if strategy != "merge":
        print("Warning: wrong strategy is used. Use either 'merge' or 'separate'. Proceeding using the 'merge' strategy.")

    together = title + " " + abstract
    # Take row 0 on the fallback path as well: it previously returned the whole
    # 2-D ``toarray()`` result, unlike the regular "merge" branch.
    return vectorizer.transform([together]).toarray()[0]

In [176]:
# for i, row in all_papers.iterrows():
#     print(calculate_tfidf_vector_paper(row, "merge"))
#     break

In [180]:
def store_abstract_tfidf_vector_new(row, emb_strategy="merge"):
    """Compute the whole-text TF-IDF vector of one paper and write it to disk.

    For the ``"merge"`` and ``"separate"`` strategies this saves
    ``embedding.npy`` into a folder derived from
    ``row["full_scibert_embedding_path"]`` and writes a JSON pointer file whose
    path is derived from ``row["full_sentences_path"]``. For any other strategy
    the vector is computed but nothing is stored.

    Args:
        row: Paper record (e.g. a DataFrame row) providing ``za_id``,
            ``full_sentences_path``, ``full_scibert_embedding_path`` and the
            fields read by ``calculate_tfidf_vector_paper``.
        emb_strategy: ``"merge"`` or ``"separate"``.

    Returns:
        None
    """
    za_id = row["za_id"]
    za_id_short = za_id[:4]

    # Drops the last 22 characters of the sentences path — presumably the file
    # name, leaving the paper's directory prefix; verify against the data layout.
    path_to_tfidf_representation = row["full_sentences_path"][:-22]

    vector = calculate_tfidf_vector_paper(row, strategy=emb_strategy)

    if emb_strategy not in ("merge", "separate"):
        # Unknown strategy: the helper has already printed a warning; nothing is stored.
        return

    variant = "merged" if emb_strategy == "merge" else "separate"
    collection = f"tfidf_vector_abstract_title_{variant}"
    folder = (
        row["full_scibert_embedding_path"]
        .replace("zetaobjects_v1/embeddings/", f"zetaobjects_v1/{collection}/")
        .replace("/document_embedding.npy", "")
    )

    # exist_ok avoids the check-then-create race of os.path.exists + os.makedirs.
    os.makedirs(folder, exist_ok=True)
    np.save(f"{folder}/embedding.npy", vector)

    pointer = {
        "version": "1.0.0",
        "value": {"pointer": f"/data/zetaobjects_v1/{collection}/{za_id_short}/{za_id}/embedding.npy"},
        "type": "pointer",
    }
    with open(f"{path_to_tfidf_representation}tfidf_vector_{variant}", 'w', encoding='utf-8') as f:
        json.dump(pointer, f, ensure_ascii=False, indent=4)

In [182]:
# Compute and store the "separate"-strategy TF-IDF vector for every paper row.
# NOTE(review): only "separate" is run here; run with "merge" as well if the
# merged vectors are still needed.
for i, row in all_papers.iterrows():
    store_abstract_tfidf_vector_new(row, "separate")

In [242]:
# Read the CSV and keep only the two TF-IDF path columns plus the paper id.
other_colls = pd.read_csv("tfidf_columns.csv").loc[
    :,
    [
        "tfidf_embeddings_abstract_title_merged_path",
        "tfidf_embeddings_abstract_title_separate_path",
        "za_id",
    ],
]

In [243]:
# Join the TF-IDF path columns onto the papers table via the shared paper id.
mrg = all_papers.merge(other_colls, on="za_id")

In [244]:
# Keep a single row per paper id and renumber the index from 0.
mrg = (
    mrg
    .drop_duplicates(subset="za_id")
    .reset_index(drop=True)
)

In [255]:
# mrg.to_pickle("../Pickles/all_papers_with_tfidf.pkl")