**In this notebook, we embed the abstracts of the papers into a low-dimensional space (using either the sentence-transformers library or Doc2Vec from Gensim) and associate each author with the embeddings of their abstracts.**

In [None]:
!pip install -U sentence-transformers

In [None]:
from tqdm import tqdm_notebook as tqdm
from sentence_transformers import SentenceTransformer
import pandas as pd
import gzip
import pickle
import numpy as np
import torch
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from string import digits, ascii_letters, punctuation, printable
import nltk
from nltk.corpus import stopwords 
nltk.download('stopwords')

In [1]:
def save(object, filename, protocol = 0):
    """Pickle ``object`` and write it, gzip-compressed, to ``filename``.

    Args:
        object: Any picklable Python object.
        filename: Destination path of the gzip file (overwritten if present).
        protocol: Pickle protocol number passed to ``pickle.dump`` (default 0).

    The file is opened in a ``with`` block so the handle is closed even when
    pickling or writing raises, and the pickle is streamed into the gzip
    stream instead of being fully materialized in memory first.
    """
    with gzip.open(filename, 'wb') as file:
        pickle.dump(object, file, protocol)
def load_dataset_file(filename):
    """Read a gzip-compressed pickle from ``filename`` and return the object.

    NOTE: unpickling can execute arbitrary code, so only use this on files
    you trust.
    """
    handle = gzip.open(filename, "rb")
    try:
        return pickle.load(handle)
    finally:
        # Always release the file handle, whether or not unpickling succeeded.
        handle.close()

# Load Abstracts

In [None]:
# Load the preprocessed abstracts; the object is used below as a mapping
# whose keys are paper ids and whose values are iterated character by character.
tmp = load_dataset_file('/content/drive/MyDrive/altegrad_datachallenge/files_generated/preprocess_abstracts.txt')
## Cleaning V2: keep only the characters listed in `valid`
## (the previous version filtered on words, using word.isalpha() as the condition)
valid = ascii_letters + digits + punctuation + printable
# `string.printable` already contains the letters, digits and punctuation, so the
# concatenation holds many duplicates; a frozenset gives O(1) membership tests
# with exactly the same accepted characters.
valid_chars = frozenset(valid)
paper_id = []
text = []
for key, abstract in tqdm(tmp.items()):
    txt = ''.join(char for char in abstract if char in valid_chars)
    # Abstracts that are empty after filtering are dropped, so paper_id[i]
    # always corresponds to text[i].
    if txt:
        paper_id.append(key)
        text.append(txt)

# Abstract Embedding

## STSB Roberta Base

In [None]:
# Use the GPU when one is available and fall back to the CPU otherwise; an
# unconditional model.cuda() raises on a runtime without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = SentenceTransformer('stsb-roberta-base', device=device)
# One embedding per entry of `text`, in the same order.
embeddings = model.encode(text)

In [None]:
# Map every paper id to the embedding at the same position (paper_id[i] <-> embeddings[i]).
# A comprehension keeps the loop variables local, so the builtin `id` is no
# longer shadowed in the notebook's global namespace.
emb_per_paper = {pid: embeddings[idx] for idx, pid in enumerate(paper_id)}
save(emb_per_paper, '/content/drive/MyDrive/altegrad_datachallenge/embedding_per_paper_clean.txt')

## Doc2Vec

In [None]:
# English stop words, held in a set for fast membership tests.
stop_words = set(stopwords.words('english'))
# Split every abstract on whitespace and drop the stop-word tokens.
doc = [
    [token for token in abstract.split() if token not in stop_words]
    for abstract in tqdm(text)
]
# Free the raw strings. NOTE: after this the cell cannot be re-run until
# `text` is rebuilt by the loading cell.
del text

# Tag each token list with its position in `doc` so vectors can be looked up by index.
tagged_data = [
    TaggedDocument(words=tokens, tags=[position])
    for position, tokens in enumerate(doc)
]
model = Doc2Vec(
    tagged_data,
    vector_size=256,
    window=5,
    min_count=2,
    epochs=100,
    workers=10,
)

In [None]:
# Save the embedding
# gensim >= 4 exposes the document vectors as `model.dv`; older releases only
# have `model.docvecs` (deprecated in gensim 4), so support both.
doc_vectors = model.dv if hasattr(model, 'dv') else model.docvecs
emb_per_paper = {}
# Wrap the sized list (not the enumerate iterator) so tqdm can show a total.
for idx, id_ in enumerate(tqdm(paper_id)):
    # Documents were tagged with their position, so index idx belongs to paper_id[idx].
    emb_per_paper[id_] = doc_vectors[idx]
model.save('/content/drive/MyDrive/altegrad_datachallenge/word2vec.model') # Saving the model
save(emb_per_paper, '/content/drive/MyDrive/altegrad_datachallenge/doc2vec_paper_embedding.txt') # Saving the embedding

# Abstract Per Author Embedding
Associate each author with his articles

In [None]:
# read the file to create a dictionary with author key and paper list as value
papers_set = set()
d = {}
# `with` closes the file once parsing is done (or if it fails part-way).
with open("/content/drive/MyDrive/altegrad_datachallenge/author_papers.txt","r") as f:
    for line in f:
        # Skip blank lines (e.g. a trailing newline at the end of the file),
        # which have no ":" separator to split on.
        if not line.strip():
            continue
        # Text before the first ":" is the author key, the next field is the paper list.
        fields = line.split(":")
        raw_ids = fields[1]
        # Strip the list brackets, the newline and any quoting around the ids.
        for unwanted in ("[", "]", "\n", "'", "\""):
            raw_ids = raw_ids.replace(unwanted, "")
        d[fields[0]] = [pid.strip() for pid in raw_ids.split(",")]

## Using Roberta Embedding

In [None]:
emb_per_paper = load_dataset_file('/content/drive/MyDrive/altegrad_datachallenge/embedding_per_paper_clean.txt')
ROBERTA_DIM = 768  # length of the accumulator the per-paper embeddings are summed into
# Each CSV row is: author id, mean embedding (ROBERTA_DIM values), number of papers averaged.
# `with` guarantees the file is flushed and closed even if a row fails.
with open("/content/drive/MyDrive/altegrad_datachallenge/author_embedding_clean.csv","w") as out_file:
    for id_author in tqdm(d.keys()):
        tot_embedding = np.zeros(ROBERTA_DIM)
        c = 0
        for id_paper in d[id_author]:
            paper_embedding = emb_per_paper.get(id_paper)
            if paper_embedding is None:
                # No stored embedding for this paper id: leave it out of the average.
                continue
            tot_embedding += paper_embedding
            c += 1
        # Avoid dividing by zero for authors without any embedded paper; such
        # authors get an all-zero vector and a count of 1.
        if c == 0:
            c = 1
        # Write the averaged vector with the count appended. The averaged row used
        # to be stored under a misspelled name, so the raw sum was written instead.
        author_row = np.append(tot_embedding / c, c)
        out_file.write(id_author+","+",".join("{:.8f}".format(round(x, 8)) for x in author_row)+"\n")

## Using Doc2Vec

In [None]:
emb_per_paper = load_dataset_file('/content/drive/MyDrive/altegrad_datachallenge/doc2vec_paper_embedding.txt')
DOC2VEC_DIM = 256  # length of the accumulator the per-paper embeddings are summed into
# Each CSV row is: author id, mean embedding (DOC2VEC_DIM values), number of papers averaged.
# `with` guarantees the file is flushed and closed even if a row fails.
with open("/content/drive/MyDrive/altegrad_datachallenge/doc2vec_author_embedding.csv","w") as out_file:
    for id_author in tqdm(d.keys()):
        tot_embedding = np.zeros(DOC2VEC_DIM)
        c = 0
        for id_paper in d[id_author]:
            paper_embedding = emb_per_paper.get(id_paper)
            if paper_embedding is None:
                # No stored embedding for this paper id: leave it out of the average.
                continue
            tot_embedding += paper_embedding
            c += 1
        # Avoid dividing by zero for authors without any embedded paper; such
        # authors get an all-zero vector and a count of 1.
        if c == 0:
            c = 1
        # Write the averaged vector with the count appended. The averaged row used
        # to be stored under a misspelled name, so the raw sum was written instead.
        author_row = np.append(tot_embedding / c, c)
        out_file.write(id_author+","+",".join("{:.8f}".format(round(x, 8)) for x in author_row)+"\n")