In [1]:
import pandas as pd

In [25]:
import re
from pathlib import Path

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.test.utils import common_texts
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer

## Doc2Vec

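This is done twice: once with an embedding length of 100 and once with an embedding length of 1000. The cells below show the 1000-dimensional run (`vector_size=1000`, saved to `doc2vec1000_embeddings.csv`).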

In [14]:
# Load the submissions and keep only the columns needed to build the corpus.
# .copy() makes the subset an independent frame, so adding a column below does
# not trigger pandas' SettingWithCopyWarning.
df_embeddings = pd.read_csv('data/original/df_submission_rating.csv')
df_embeddings = df_embeddings[['id', 'title', 'abstract']].copy()

# Fill missing values BEFORE concatenating: NaN + str is NaN, so filling
# afterwards would blank the whole text (title included) whenever only one of
# the two fields is missing.
df_embeddings['title_abstract'] = (
    df_embeddings['title'].fillna("") + ': ' + df_embeddings['abstract'].fillna("")
)

df_embeddings = df_embeddings[['id', 'title_abstract']].copy()

# One TaggedDocument per paper, tagged with the stringified row label so the
# trained vector can be fetched later with model.dv[str(label)].
# NOTE(review): word_tokenize presumably needs the NLTK tokenizer data to be
# downloaded beforehand; no download step is visible in this notebook - confirm.
tagged_data = [
    TaggedDocument(words=word_tokenize(text.lower()), tags=[str(label)])
    for label, text in df_embeddings['title_abstract'].items()
]

# vector_size is the embedding length (1000 for this run); min_count=1 keeps
# every token in the vocabulary. A fixed seed plus a single worker thread makes
# the training run repeatable; gensim additionally requires PYTHONHASHSEED to
# be set for identical results across interpreter launches.
model = Doc2Vec(vector_size=1000, min_count=1, epochs=20, seed=42, workers=1)
model.build_vocab(tagged_data)
model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)

In [15]:
# Fetch each paper's trained vector by row label. The documents were tagged
# with str(row label) when the corpus was built, so looking them up through
# df_embeddings.index (rather than by position) keeps vectors and rows aligned
# even if the frame's index is not 0..n-1.
vectors = [model.dv[str(label)] for label in df_embeddings.index]

# Attach the vectors to the existing frame as a new column (one vector per row).
df_embeddings['doc2vec_vector'] = vectors

df_embeddings

Unnamed: 0,id,title_abstract,doc2vec_vector
0,B1-Hhnslg,Prototypical Networks for Few-shot Learning: A...,"[0.10684066, -0.19433972, 0.22517942, 0.108567..."
1,B1-q5Pqxl,Machine Comprehension Using Match-LSTM and Ans...,"[0.13974802, -0.15090857, 0.13521388, 0.181433..."
2,B12Js_yRb,Learning to Count Objects in Natural Images fo...,"[0.06778164, -0.057509117, 0.09472902, 0.06823..."
3,B13EC5u6W,Thinking like a machine — generating visual ra...,"[-0.014361723, -0.14671512, 0.04236695, 0.2642..."
4,B13njo1R-,Progressive Reinforcement Learning with Distil...,"[0.11626778, -0.27097604, 0.15835746, 0.246660..."
...,...,...,...
1395,rywHCPkAW,Noisy Networks For Exploration: We introduce N...,"[0.025960615, -0.12322217, 0.11756279, 0.03782..."
1396,rywUcQogx,Differentiable Canonical Correlation Analysis:...,"[0.05128503, -0.1404907, 0.15931574, 0.0914060..."
1397,ryxB0Rtxx,Identity Matters in Deep Learning: An emerging...,"[0.24072312, -0.15895177, 0.11074136, -0.12267..."
1398,ryykVe-0W,Learning Independent Features with Adversarial...,"[-0.047164343, -0.008090606, -0.028973319, 0.2..."


In [16]:
# Fetch each paper's trained vector by row label. The documents were tagged
# with str(row label) when the corpus was built, so the lookup goes through
# df_embeddings.index instead of assuming positions 0..n-1.
vectors = [model.dv[str(label)] for label in df_embeddings.index]

# Pair every paper id with its vector.
df_vectors = pd.DataFrame({'id': df_embeddings['id'], 'doc2vec_vector': vectors})

# Build the wide layout that gets written to disk: one column per paper id,
# with a 'paper_id' row and an 'embedding' row.
paper_ids = df_vectors['id'].to_numpy()
df_transposed = pd.DataFrame(columns=paper_ids)
df_transposed.loc['paper_id'] = paper_ids
# Convert each vector to a plain Python list so every cell holds a bracketed,
# comma-separated list of floats.
df_transposed.loc['embedding'] = (
    df_vectors['doc2vec_vector'].map(lambda vec: vec.tolist()).to_numpy()
)


In [17]:
# Display the wide Doc2Vec frame: one column per paper id, with a 'paper_id'
# row and an 'embedding' row.
df_transposed

Unnamed: 0,B1-Hhnslg,B1-q5Pqxl,B12Js_yRb,B13EC5u6W,B13njo1R-,B14TlG-RW,B14uJzW0b,B16Jem9xe,B16_iGWCW,B16dGcqlx,...,rytstxWAW,ryup8-WCW,ryuxYmvel,ryvxcPeAb,rywDjg-RW,rywHCPkAW,rywUcQogx,ryxB0Rtxx,ryykVe-0W,ryzm6BATZ
paper_id,B1-Hhnslg,B1-q5Pqxl,B12Js_yRb,B13EC5u6W,B13njo1R-,B14TlG-RW,B14uJzW0b,B16Jem9xe,B16_iGWCW,B16dGcqlx,...,rytstxWAW,ryup8-WCW,ryuxYmvel,ryvxcPeAb,rywDjg-RW,rywHCPkAW,rywUcQogx,ryxB0Rtxx,ryykVe-0W,ryzm6BATZ
embedding,"[0.10684066265821457, -0.19433972239494324, 0....","[0.13974802196025848, -0.15090857446193695, 0....","[0.06778164207935333, -0.05750911682844162, 0....","[-0.014361723326146603, -0.14671511948108673, ...","[0.11626777797937393, -0.2709760367870331, 0.1...","[0.1743335872888565, -0.15381790697574615, 0.1...","[0.08692383766174316, -0.10689122974872589, 0....","[0.06834282726049423, -0.02778083272278309, 0....","[0.14603886008262634, -0.09772276878356934, -0...","[0.1080716922879219, -0.16476379334926605, 0.0...",...,"[0.04030180349946022, -0.09882833808660507, 0....","[0.1766229271888733, -0.23738908767700195, 0.1...","[0.10507836937904358, -0.019523752853274345, -...","[0.08721420168876648, 0.06386078149080276, -0....","[0.15877512097358704, -0.09361056983470917, 0....","[0.025960614904761314, -0.12322217226028442, 0...","[0.0512850284576416, -0.14049069583415985, 0.1...","[0.240723118185997, -0.1589517742395401, 0.110...","[-0.04716434329748154, -0.008090605959296227, ...","[0.10083272308111191, -0.1454930603504181, 0.1..."


In [18]:
# Persist the wide Doc2Vec frame. The output directory is created first so the
# write does not fail when data/embeddings/ does not exist yet.
doc2vec_path = Path('data/embeddings/doc2vec1000_embeddings.csv')
doc2vec_path.parent.mkdir(parents=True, exist_ok=True)
df_transposed.to_csv(doc2vec_path, header=True)

## Simple bag of words

In [43]:
# Load the data
df_embeddings = pd.read_csv('data/original/df_submission_rating.csv')

# Select necessary columns and preprocess
df_embeddings = df_embeddings[['id', 'title', 'abstract']]
df_embeddings['title_abstract'] = df_embeddings['title'] + ': ' + df_embeddings['abstract']
df_embeddings = df_embeddings[['id', 'title_abstract']]
df_embeddings['title_abstract'] = df_embeddings['title_abstract'].fillna("")

# Initialize CountVectorizer (default tokenizer handles punctuation)
vectorizer = CountVectorizer()

# Fit and transform the text data to get BoW embeddings
bow_matrix = vectorizer.fit_transform(df_embeddings['title_abstract'].str.lower())

# Convert the BoW matrix to a list of vectors
bow_vectors = bow_matrix.toarray().tolist()

# Create a DataFrame with id and the vectors
df_bow_vectors = pd.DataFrame({'id': df_embeddings['id'], 'bow_vector': bow_vectors})

# Prepare the transposed DataFrame
df_transposed = pd.DataFrame(columns=df_embeddings['id'].values)
df_transposed.loc['paper_id'] = df_bow_vectors['id'].values
df_transposed.loc['embedding'] = df_bow_vectors['bow_vector'].values

# Save the transposed DataFrame to a CSV file
df_transposed.to_csv('data/embeddings/bow_embeddings.csv', header=True)

In [44]:
# Display the wide bag-of-words frame: one column per paper id, with a
# 'paper_id' row and an 'embedding' row holding the term-count list.
df_transposed

Unnamed: 0,B1-Hhnslg,B1-q5Pqxl,B12Js_yRb,B13EC5u6W,B13njo1R-,B14TlG-RW,B14uJzW0b,B16Jem9xe,B16_iGWCW,B16dGcqlx,...,rytstxWAW,ryup8-WCW,ryuxYmvel,ryvxcPeAb,rywDjg-RW,rywHCPkAW,rywUcQogx,ryxB0Rtxx,ryykVe-0W,ryzm6BATZ
paper_id,B1-Hhnslg,B1-q5Pqxl,B12Js_yRb,B13EC5u6W,B13njo1R-,B14TlG-RW,B14uJzW0b,B16Jem9xe,B16_iGWCW,B16dGcqlx,...,rytstxWAW,ryup8-WCW,ryuxYmvel,ryvxcPeAb,rywDjg-RW,rywHCPkAW,rywUcQogx,ryxB0Rtxx,ryykVe-0W,ryzm6BATZ
embedding,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
