In [1]:
# !pip install -U sentence-transformers


In [2]:
from sentence_transformers import SentenceTransformer, util
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources used by the text-cleaning step below:
#   - 'punkt' / 'punkt_tab': tokenizer data for word_tokenize. Newer NLTK
#     releases look up 'punkt_tab' while older ones use 'punkt', so both are
#     requested; a resource unknown to the installed version is only reported
#     by nltk.download and does not raise.
#   - 'stopwords': the English stop-word list.
#   - 'wordnet': the database behind WordNetLemmatizer.
for _nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(_nltk_resource)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/anuragmaravi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/anuragmaravi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/anuragmaravi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:

# Instantiate the model
# Loads the 'all-MiniLM-L6-v2' sentence-transformer; the resulting `model`
# object is what the later cells call `.encode(...)` on.
model = SentenceTransformer('all-MiniLM-L6-v2')


In [4]:
# Example comments
# Three candidate statements that will be embedded and ranked by their
# similarity to `question`.
comments = [
    "The uncertainty principle is a key topic in quantum mechanics.",
    "Quantum entanglement involves the pairing of particles in a way that they cannot be described independently.",
    "A wave function collapse results from a measurement in quantum physics."
]


In [5]:

# Query text that each entry of `comments` is compared against.
question = "What is quantum entanglement?"


In [6]:

# Embed the query and the candidate comments as tensors
question_embedding = model.encode(question, convert_to_tensor=True)
comment_embeddings = model.encode(comments, convert_to_tensor=True)

# Cosine similarity of the single query against every comment
similarity_matrix = util.pytorch_cos_sim(question_embedding, comment_embeddings)

# Keep only the first (and only) row as a plain Python list of floats
cosine_scores = similarity_matrix.cpu().tolist()[0]

# Pair each comment with its score and order the pairs from most to least similar
relevance_scores = sorted(
    zip(comments, cosine_scores),
    key=lambda pair: pair[1],
    reverse=True,
)

# Show the ranked (comment, score) pairs
print(relevance_scores)


[('Quantum entanglement involves the pairing of particles in a way that they cannot be described independently.', 0.7740704417228699), ('The uncertainty principle is a key topic in quantum mechanics.', 0.4301987588405609), ('A wave function collapse results from a measurement in quantum physics.', 0.40187445282936096)]


### More Data

In [7]:
# Number of rows to load from the CSV; set to None to read the whole file.
nRowsRead = 500

# ArXiv.csv may contain more rows; only the first `nRowsRead` are loaded here.
df = pd.read_csv('ArXiv.csv', sep=',', nrows=nRowsRead)
df.dataframeName = 'ArXiv.csv'

# Report the size of what was actually loaded
nRow = df.shape[0]
nCol = df.shape[1]
print(f'There are {nRow} rows and {nCol} columns')

There are 500 rows and 6 columns


In [8]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0,title,abstract,categories,created,id,doi
0,"General System theory, Like-Quantum Semantics ...",It is outlined the possibility to extend the q...,"['physics.gen-ph', 'quant-ph']",2007-03-31,704.0042,
1,Entanglement of Subspaces and Error Correcting...,We introduce the notion of entanglement of sub...,['quant-ph'],2007-04-02,704.0251,10.1103/PhysRevA.76.042309
2,General sequential quantum cloning,Some multipartite quantum states can be genera...,['quant-ph'],2007-04-03,704.0323,10.1088/1751-8113/41/15/155303
3,Levy-Lieb constrained-search formulation as a ...,The constrained-search formulation of Levy and...,['quant-ph'],2007-04-03,704.0372,10.1088/1751-8113/40/11/013
4,Review: Semiconductor Quantum Light Sources,Lasers and LEDs display a statistical distribu...,['quant-ph'],2007-04-03,704.0403,10.1038/nphoton.2007.46


## 1. Data Preprocessing
Prepare the dataset to be in a format suitable for training a model:

- Clean the article titles and abstracts.
- Tokenize the text and remove stop words.
- Perform stemming or lemmatization.

In [9]:
# Data cleaning helpers
def _clean_text_resources():
    """Return the (stop_words, lemmatizer) pair, building it on first use.

    The pair is cached on the function object so the stop-word corpus is read
    and the lemmatizer constructed once, not once per cleaned document.
    """
    cached = getattr(_clean_text_resources, '_cached', None)
    if cached is None:
        cached = (frozenset(stopwords.words('english')), WordNetLemmatizer())
        _clean_text_resources._cached = cached
    return cached


def clean_text(text):
    """Normalise a piece of text into a space-joined string of lemmas.

    Steps: tokenize with ``word_tokenize``, lower-case, keep purely
    alphabetic tokens, drop English stop words, lemmatize with
    ``WordNetLemmatizer``.

    Parameters
    ----------
    text : str
        Raw text (e.g. a title or an abstract).

    Returns
    -------
    str
        The cleaned tokens joined by single spaces. Non-string input (such as
        the NaN pandas uses for missing cells) yields an empty string instead
        of raising inside the tokenizer.
    """
    # Missing values arrive from pandas as float NaN; there is nothing to clean.
    if not isinstance(text, str):
        return ''

    stop_words, lemmatizer = _clean_text_resources()

    # Lower-case before the stop-word test: the stop-word list is lower-case.
    lowered = (token.lower() for token in word_tokenize(text))
    # isalpha() drops punctuation, numbers and mixed tokens such as "x1".
    kept = [word for word in lowered if word.isalpha() and word not in stop_words]
    return ' '.join(lemmatizer.lemmatize(word) for word in kept)


In [10]:

# Clean the title and abstract columns, storing each result in its own new column
for source_column, target_column in (
    ('title', 'cleaned_title'),
    ('abstract', 'cleaned_abstract'),
):
    df[target_column] = df[source_column].apply(clean_text)

# Preview the two cleaned columns
df[['cleaned_title', 'cleaned_abstract']].head()


Unnamed: 0,cleaned_title,cleaned_abstract
0,general system theory semantics fuzzy set,outlined possibility extend quantum formalism ...
1,entanglement subspace error correcting code,introduce notion entanglement subspace measure...
2,general sequential quantum cloning,multipartite quantum state generated sequentia...
3,formulation minimization correlation functional,formulation levy lieb formally defines exact f...
4,review semiconductor quantum light source,laser led display statistical distribution num...


## 2. Feature Engineering
Convert the text data into numerical vectors:

- Use embedding techniques to generate vectors for titles and abstracts.
- Encode categories to enrich the feature set.

In [11]:
from sentence_transformers import SentenceTransformer

# Reuse the 'all-MiniLM-L6-v2' SentenceTransformer already held in `model`
# (loaded near the top of the notebook) rather than loading the same
# checkpoint a second time.

# Generate embeddings for titles and abstracts
# Note: Depending on the size of your dataset, you might need to batch this process to avoid memory issues.
title_embeddings = model.encode(df['cleaned_title'].tolist(), show_progress_bar=True)
abstract_embeddings = model.encode(df['cleaned_abstract'].tolist(), show_progress_bar=True)

# Store one embedding vector per row; list(...) splits the 2-D result into
# per-row arrays so each dataframe cell holds a single vector.
df['title_embeddings'] = list(title_embeddings)
df['abstract_embeddings'] = list(abstract_embeddings)


Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

### Encoding of Categories


In [12]:
from sklearn.preprocessing import MultiLabelBinarizer

# Initialize the MultiLabelBinarizer
mlb = MultiLabelBinarizer()

# The 'categories' column holds the string form of a list, e.g.
# "['physics.gen-ph', 'quant-ph']"; turn each cell into a real list of labels.
df['categories_list'] = df['categories'].apply(lambda x: x.strip("[]").replace("'", "").split(", "))

# One-hot encode the categories (one column per distinct label)
categories_one_hot = mlb.fit_transform(df['categories_list'])

# Build the indicator frame on df's own index so the concat below aligns
# row-for-row even if df no longer has a default 0..n-1 index.
categories_df = pd.DataFrame(categories_one_hot, columns=mlb.classes_, index=df.index)

# Attach the indicator columns. Columns with the same names are dropped first,
# so re-running this cell replaces them instead of appending duplicates.
df = pd.concat(
    [df.drop(columns=list(mlb.classes_), errors='ignore'), categories_df],
    axis=1,
)


In [13]:
# Preview the frame after the embedding and one-hot category columns were added.
df.head()

Unnamed: 0,title,abstract,categories,created,id,doi,cleaned_title,cleaned_abstract,title_embeddings,abstract_embeddings,...,physics.data-an,physics.flu-dyn,physics.gen-ph,physics.geo-ph,physics.hist-ph,physics.optics,physics.plasm-ph,q-bio.GN,q-bio.OT,quant-ph
0,"General System theory, Like-Quantum Semantics ...",It is outlined the possibility to extend the q...,"['physics.gen-ph', 'quant-ph']",2007-03-31,704.0042,,general system theory semantics fuzzy set,outlined possibility extend quantum formalism ...,"[0.035404414, -0.025593145, 0.00084112806, -0....","[-0.020028133, -0.0009899782, -0.05651996, -0....",...,0,0,1,0,0,0,0,0,0,1
1,Entanglement of Subspaces and Error Correcting...,We introduce the notion of entanglement of sub...,['quant-ph'],2007-04-02,704.0251,10.1103/PhysRevA.76.042309,entanglement subspace error correcting code,introduce notion entanglement subspace measure...,"[-0.04198437, 0.044631943, 0.009335664, 0.0282...","[-0.012097446, -0.04231486, 0.0082591325, -0.0...",...,0,0,0,0,0,0,0,0,0,1
2,General sequential quantum cloning,Some multipartite quantum states can be genera...,['quant-ph'],2007-04-03,704.0323,10.1088/1751-8113/41/15/155303,general sequential quantum cloning,multipartite quantum state generated sequentia...,"[-0.16208114, -0.06516198, 0.0041780276, -0.01...","[-0.090092696, -0.008661873, -0.052945428, -0....",...,0,0,0,0,0,0,0,0,0,1
3,Levy-Lieb constrained-search formulation as a ...,The constrained-search formulation of Levy and...,['quant-ph'],2007-04-03,704.0372,10.1088/1751-8113/40/11/013,formulation minimization correlation functional,formulation levy lieb formally defines exact f...,"[0.0076493416, -0.020693908, 0.06052915, -0.00...","[-0.08496914, 0.060600147, 0.02267859, 0.03819...",...,0,0,0,0,0,0,0,0,0,1
4,Review: Semiconductor Quantum Light Sources,Lasers and LEDs display a statistical distribu...,['quant-ph'],2007-04-03,704.0403,10.1038/nphoton.2007.46,review semiconductor quantum light source,laser led display statistical distribution num...,"[-0.06981001, 0.057882775, -0.059592254, 0.062...","[-0.038788173, -0.0013268375, -0.07536729, 0.0...",...,0,0,0,0,0,0,0,0,0,1


## 3. Model Training

In [14]:
import pandas as pd

# Assuming df is your DataFrame and 'categories' is the column with the category data
# df = pd.read_csv('your_file.csv')  # Uncomment this line if you're reading from a CSV file

# Function to convert the string representation of a list into an actual list
def string_to_list(string):
    """Parse the string form of a list of labels into a list of strings.

    Parameters
    ----------
    string : str
        Text such as ``"['physics.gen-ph', 'quant-ph']"``.

    Returns
    -------
    list of str
        The labels in their original order. ``"[]"`` gives ``[]`` rather than
        a list holding one empty label, and the separator may be ``","`` with
        or without a following space.
    """
    import ast  # local import keeps this cell self-contained

    text = string.strip()

    # Preferred path: the text is a valid Python list/tuple literal.
    try:
        parsed = ast.literal_eval(text)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        parsed = None
    if isinstance(parsed, (list, tuple)):
        return [str(item) for item in parsed]

    # Fallback for text that is not a literal (e.g. unquoted labels):
    # strip brackets and quotes, split on commas, discard empty pieces.
    inner = text.strip("[]").replace("'", "").replace('"', "")
    return [piece.strip() for piece in inner.split(",") if piece.strip()]

# Turn every 'categories' string into a real list of labels
df['categories_list'] = df['categories'].apply(string_to_list)

# Gather every distinct label that appears in any row
unique_labels = {
    label
    for labels_in_row in df['categories_list']
    for label in labels_in_row
}

# Show the full set of distinct category labels
print(unique_labels)


{'cond-mat.mtrl-sci', 'math.MP', 'cond-mat.other', 'math.CT', 'astro-ph', 'cond-mat.supr-con', 'math.IT', 'physics.atm-clus', 'q-bio.GN', 'physics.plasm-ph', 'math.FA', 'cs.IT', 'cs.DM', 'cs.CR', 'physics.gen-ph', 'nlin.CD', 'math.SP', 'gr-qc', 'math.OA', 'q-bio.OT', 'math.PR', 'physics.ao-ph', 'physics.data-an', 'math.CA', 'physics.flu-dyn', 'cond-mat.quant-gas', 'nlin.PS', 'cs.DS', 'cond-mat.mes-hall', 'math.GT', 'math.CO', 'astro-ph.EP', 'cs.AI', 'physics.chem-ph', 'cs.DB', 'cond-mat.dis-nn', 'physics.comp-ph', 'physics.atom-ph', 'math.QA', 'math.AP', 'nlin.SI', 'physics.hist-ph', 'physics.bio-ph', 'cs.ET', 'cond-mat.soft', 'nucl-th', 'cond-mat.str-el', 'physics.optics', 'math-ph', 'hep-th', 'physics.geo-ph', 'cs.GT', 'math.OC', 'quant-ph', 'cs.CC', 'cs.DC', 'cs.SE', 'hep-ph', 'cond-mat.stat-mech'}


In [17]:
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer, util

# Load your dataframe (assuming it's already been processed as per steps 1 and 2)
# df = pd.read_csv('your_processed_dataset.csv')

# Combine title and abstract embeddings by element-wise addition.
# Each dataframe cell holds one embedding vector, so .tolist() + np.array
# rebuilds a 2-D (rows x embedding-size) matrix for each column.
title_matrix = np.array(df['title_embeddings'].tolist())
abstract_matrix = np.array(df['abstract_embeddings'].tolist())
combined_embeddings = title_matrix + abstract_matrix

# Take the one-hot column names from the fitted MultiLabelBinarizer instead of
# a hand-pasted list: the set of labels depends on which rows were loaded, so
# a fixed list raises KeyError (or silently misses labels) once that changes.
# Column order therefore follows mlb.classes_.
category_columns = list(mlb.classes_)

categories = df[category_columns].to_numpy()

# Combine embeddings with categories to form the full feature set
full_features = np.hstack((combined_embeddings, categories))


In [21]:
from sentence_transformers import SentenceTransformer, util

# Example comments
comments = [
    "The uncertainty principle is a key topic in quantum mechanics.",
    "Quantum entanglement involves the pairing of particles in a way that they cannot be described independently.",
    "A wave function collapse results from a measurement in quantum physics."
]

# Example question
question = "What is quantum entanglement?"

# Reuse the 'all-MiniLM-L6-v2' SentenceTransformer already held in `model`
# instead of loading the same checkpoint again.

# Generate embeddings for the question and comments
question_embedding = model.encode(question, convert_to_tensor=True)
comment_embeddings = model.encode(comments, convert_to_tensor=True)

# Calculate cosine similarity between the question and each comment
cosine_scores = util.pytorch_cos_sim(question_embedding, comment_embeddings)

# Move the scores to the CPU and convert them to a NumPy array
cosine_scores = cosine_scores.cpu().numpy()

# Row 0 holds the scores for the single question; print them in the original
# comment order, pairing each comment with its score directly.
for comment, score in zip(comments, cosine_scores[0]):
    print(f"Comment: {comment}")
    print(f"Similarity Score: {score}\n")


Comment: The uncertainty principle is a key topic in quantum mechanics.
Similarity Score: 0.4301987588405609

Comment: Quantum entanglement involves the pairing of particles in a way that they cannot be described independently.
Similarity Score: 0.7740704417228699

Comment: A wave function collapse results from a measurement in quantum physics.
Similarity Score: 0.40187445282936096

