In [None]:
import os
import requests
import time
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [None]:
import csv
import re

def fix_commas_before_author(row_string):
    """Neutralise the commas and slashes inside the title part of a raw row.

    The row is matched against ``<title>,<field>,<4 digits>,<rest>``. Inside
    the title part every "," becomes "|" and every "/" becomes "~"; the rest
    of the row is left untouched. Rows that do not match are returned as-is.
    """
    # The leading group is greedy, so the split happens at the LAST comma that
    # is followed by "<comma-free field>,<4 digits>,".
    found = re.search(r'(.*)(,)([^,]+,\d{4},.*)', row_string)
    if found is None:
        return row_string

    raw_title, tail = found.group(1), found.group(3)
    safe_title = raw_title.replace(",", "|").replace("/", "~")
    return f"{safe_title},{tail}"

def process_csv(input_filepath, output_filepath):
    """Copy a CSV file row by row, passing each row through fix_commas_before_author.

    Args:
        input_filepath: Path of the CSV file to read (opened as UTF-8).
        output_filepath: Path of the CSV file to write (opened as UTF-8).

    Problems are reported with print() instead of being raised.
    """
    try:
        with open(input_filepath, 'r', encoding='utf-8') as source, \
                open(output_filepath, 'w', newline='', encoding='utf-8') as target:
            csv_out = csv.writer(target)
            for fields in csv.reader(source):
                # Re-join the parsed fields so the helper sees the row as one
                # string, then split on whatever commas are left afterwards.
                repaired = fix_commas_before_author(",".join(fields))
                csv_out.writerow(repaired.split(","))
    except FileNotFoundError:
        print(f"Input file not found: {input_filepath}")
    except Exception as e:
        print(f"An error occurred: {e}")

if __name__ == "__main__":
    # Replace "papers.csv" with the path to your own csv.
    input_csv, output_csv = "papers.csv", "fixed_arxiv_links.csv"
    process_csv(input_csv, output_csv)
    print(f"Fixed CSV file saved to: {output_csv}")

In [None]:
# Load the cleaned CSV ("fixed_arxiv_links.csv") into a DataFrame for inspection.
df = pd.read_csv("fixed_arxiv_links.csv")

In [None]:
# Display row 309; per the author's note this row illustrates the problem with "/" in a row.
df.iloc[309]  # NOTE(review): positional index, so it depends on the CSV row order staying the same

In [None]:
def download_page(title, authors, year, url, output_dir = None):
    """Download one document to ``<output_dir>/<title>.pdf`` unless it already exists.

    Args:
        title: Title used for the file name; "|" is turned back into ",".
        authors: Not used by this function (kept for the existing call signature).
        year: Not used by this function (kept for the existing call signature).
        url: Address to fetch.
        output_dir: Target directory, created when missing. When omitted or
            empty, the current working directory is used.

    Returns:
        None. Request and file errors are printed, not raised.

    Side effects:
        Sleeps 15 seconds after every attempt that was not skipped because the
        file already existed.
    """
    headers = {
        "User-Agent": "Script to download quantum computing arxiv papers (example@student.maastrichtuniversity.nl)" # replace with own email, this isn't mandatory but it's nice to let the server admin know who you are
    }

    if not output_dir:
        output_dir = "."

    # Stays False until we know the file is present, so failures still trigger the pause.
    file_already_exists = False

    try:
        os.makedirs(output_dir, exist_ok=True)

        # Sanitize filename
        # NOTE(review): the CSV cleaning step in this notebook writes "~" for "/",
        # while this function only maps "}" to "-" -- confirm which marker is intended.
        fixed_title = title.replace("|", ",").replace("}", "-")
        filepath = os.path.join(output_dir, fixed_title + ".pdf")

        file_already_exists = os.path.exists(filepath)
        if file_already_exists:
            print(f"File {filepath} already exists. Skipping download.")
            return

        print(f"Downloading {url} to {filepath}...")
        # One single request (with the identifying headers); the context manager
        # releases the streamed connection, and the timeout avoids hanging forever.
        with requests.get(url, headers=headers, stream=True, timeout=60) as response:
            response.raise_for_status() # Raise an exception for bad status codes (4xx or 5xx)

            # Write to a temporary name first so an interrupted download never
            # leaves a truncated .pdf that later runs would skip as "already exists".
            partial_path = filepath + ".part"
            with open(partial_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
        os.replace(partial_path, filepath)

        print(f"Downloaded {fixed_title} successfully!")

    except requests.exceptions.RequestException as e:
        print(f"Error downloading {url}: {e}")
        return
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return
    finally:
        if not file_already_exists:
            time.sleep(15)  # change this at own risk, this is basically the time between requests to the server as per robots.txt guidelines

In [None]:
import pandas as pd 
import time
import requests
import os 

In [None]:
def process_csv(csv_filepath):
    """Download every entry listed in a CSV file.

    Args:
        csv_filepath: The path to the CSV file. Its first four columns are
            read, in order, as title, authors, date and url.

    Each row is handed to download_page with ``papers/<date>`` as the output
    directory. Errors are printed, not raised; a failing row does not stop the
    remaining rows.
    """
    try:
        df = pd.read_csv(csv_filepath)
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return

    for row in df.itertuples():
        try:
            # row[0] is the DataFrame index, so the data columns start at 1.
            title, authors, date, url = row[1], row[2], row[3], row[4]
            # str() is required: pandas parses a purely numeric year column as integers.
            download_page(title, authors, date, url, output_dir = os.path.join("papers", str(date)))
        except Exception as e:
            print(f"An unexpected error occurred: {e}")

# Entry point for this cell: run process_csv on the cleaned CSV file.
if __name__ == "__main__":
    csv_file = "fixed_arxiv_links.csv" # replace with own csv path
    process_csv(csv_file)

In [None]:
# Show the contents of the current working directory.
os.listdir()

In [None]:
import pdfplumber

In [None]:
# Gather the text of every page so that `text` holds the whole document
# rather than only the final page.
with pdfplumber.open("/Users/lpaggen/Documents/DACS COURSES/dsdm_research_sem2/python/papers/3D Topological Quantum Computing.pdf") as pdf:
    page_texts = []
    for page in pdf.pages:
        # A page without extractable text contributes an empty string.
        page_texts.append(page.extract_text() or "")
text = "\n".join(page_texts)

In [None]:
# Print whatever the kernel currently holds in `text`.
print(text)

In [None]:
# Walk through the PDF page by page: print the page text when there is any,
# otherwise report how many images the page contains.
with pdfplumber.open("/Users/lpaggen/Documents/DACS COURSES/dsdm_research_sem2/python/papers/3D Topological Quantum Computing.pdf") as pdf:
    for page in pdf.pages:
        text = page.extract_text()
        if text:
            print(text)
        else:
            print("No text found, checking for images...")
            images = page.images
            print(f"Found {len(images)} images on this page.")

In [None]:
# Display the current value of `text` (after the loop above: the last page processed).
text

In [None]:
from pdfminer.high_level import extract_text
import re

pdf_path = "/Users/lpaggen/Documents/DACS COURSES/dsdm_research_sem2/python/papers/3D Topological Quantum Computing.pdf"
text = extract_text(pdf_path)

# Substitutions are applied strictly in this order. After the first one only
# ASCII letters and spaces remain, so the next five patterns find nothing.
for _regex, _replacement in (
    (r'[^a-zA-Z]', ' '),    # remove non-alphabetic characters
    (r'<.*?>', ''),         # remove angled brackets
    (r'[^\w\s]', ''),
    (r'\n', ''),            # remove new line characters from the text
    (r'\d', ''),            # remove digits
    (r'[\|#-]', ''),        # remove special characters
    (r'\b[a-zA-Z]\b', ''),  # remove single characters
    (r'\s+', ' '),          # remove extra whitespaces
):
    text = re.sub(_regex, _replacement, text)
text = text.strip()

stop_words = {"the", "is", "a", "an", "of", "in", "on", "at", "to", "and", "or", "it"}
text = " ".join(word for word in text.split() if word.lower() not in stop_words).lower()
print(text)

In [None]:
from pdfminer.high_level import extract_text
import re

def tokenize_text(dir):
    """Write a cleaned, lower-cased ``.txt`` file next to every PDF in a directory.

    Args:
        dir: The directory containing the PDF files.

    A PDF is skipped when its ``.txt`` counterpart already exists or when text
    extraction raises; both cases are reported with print().
    """
    # Applied in order; after the first substitution only ASCII letters and
    # spaces remain, so the following five patterns no longer match anything.
    substitutions = (
        (r'[^a-zA-Z]', ' '),    # remove non-alphabetic characters
        (r'<.*?>', ''),         # remove angled brackets
        (r'[^\w\s]', ''),
        (r'\n', ''),            # remove new line characters from the text
        (r'\d', ''),            # remove digits
        (r'[\|#-]', ''),        # remove special characters
        (r'\b[a-zA-Z]\b', ''),  # remove single characters
        (r'\s+', ' '),          # remove extra whitespaces
    )
    stop_words = {"the", "is", "a", "an", "of", "in", "on", "at", "to", "and", "or", "it"}

    for name in os.listdir(dir):
        if not name.endswith(".pdf"):
            continue
        pdf_path = os.path.join(dir, name)
        txt_path = pdf_path.replace(".pdf", ".txt")
        if os.path.exists(txt_path):
            print(f"skipping {pdf_path}, text file already exists / pdf already tokenized.")
            continue
        try:
            text = extract_text(pdf_path)
        except Exception as e:
            print(f"Error extracting text from {pdf_path}: {e}")
            continue

        for regex, replacement in substitutions:
            text = re.sub(regex, replacement, text)
        kept_words = [word for word in text.strip().split() if word.lower() not in stop_words]

        with open(txt_path, "w") as f:
            f.write(" ".join(kept_words).lower())

In [None]:
import os 

In [None]:
# Run tokenize_text on the PDFs that sit directly in the relative "papers/" directory.
tokenize_text("papers/")

In [None]:
# Dump the current `text` into a scratch file named "testing" in the working directory.
with open("testing", "w") as f:
	f.write(text)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

def compute_tfidf_bigrams(corpus):
    """
    Fit a bigram-only TF-IDF model on a corpus.

    :param corpus: List of text documents (strings)
    :return: tuple ``(tfidf_matrix, feature_names)``: the result of
             ``fit_transform`` and the bigrams from ``get_feature_names_out``
    """
    bigram_vectorizer = TfidfVectorizer(ngram_range=(2, 2))
    # The vectorizer has to be fitted before its feature names can be read.
    scores = bigram_vectorizer.fit_transform(corpus)
    return scores, bigram_vectorizer.get_feature_names_out()

In [None]:
# compute_tfidf_bigrams expects a list of documents, not a file path, so read
# the scratch file first and pass its content as a one-document corpus.
with open("/Users/lpaggen/Documents/DACS COURSES/dsdm_research_sem2/python/testing", "r") as testing_file:
    testing_corpus = [testing_file.read()]
compute_tfidf_bigrams(testing_corpus)

In [None]:
# Show the contents of the current working directory.
os.listdir()

In [None]:
# Read the scratch file "testing" back into `text`.
with open("testing", "r") as f:
	text = f.read()

In [None]:
# Debug probe: iterating over a str yields one-character strings, so when
# `text` is the string read from the "testing" file every iteration prints 'hi'.
for i in text:
    if isinstance(i, str):
        print('hi')
    else:
        print(i)

In [None]:
import re

In [None]:
# Case-insensitive pattern for the phrase 'quantum computing'.
pattern = r"(?i)quantum computing"

# Report whether the phrase occurs anywhere in `text`.
print("Found 'quantum computing' in text!" if re.search(pattern, text) else "Pattern not found.")

In [None]:
import shutil

In [None]:
# now to move all the .txt files to a separate folder

# shutil.move only drops a file *into* "papers_txt/" when that directory
# already exists, so create it up front.
os.makedirs("papers_txt", exist_ok=True)

for i in os.listdir("papers/"):
    if i.endswith(".txt"):
        shutil.move(f"papers/{i}", "papers_txt/")

In [None]:
import os
import shutil

In [None]:
# Move every .txt file from papers/<year>/ to text_data/<year>/.
papers_root = "/Users/lpaggen/Documents/DACS_COURSES/dsdm_research_sem2/papers"
text_root = "/Users/lpaggen/Documents/DACS_COURSES/dsdm_research_sem2/text_data"

for year in os.listdir(papers_root):
    if year == ".DS_Store":
        continue
    for paper in os.listdir(os.path.join(papers_root, year)):
        if paper.endswith(".txt"):
            # Build the directory with os.path.join so the directory that gets
            # created is the same one the file is moved into.
            year_target = os.path.join(text_root, year)
            os.makedirs(year_target, exist_ok=True)
            shutil.move(os.path.join(papers_root, year, paper), os.path.join(year_target, paper))

In [None]:
import pandas as pd 
import os

In [None]:
import os
import pandas as pd

def sort_by_year(csv_filepath, path_to_tokenized_papers):
    """Print, for every CSV row, the expected ``.txt`` path and whether it exists.

    Despite its name this function does not move or sort anything; it only
    reports.

    Args:
        csv_filepath: The path to the CSV file; the first column is read as the title.
        path_to_tokenized_papers: The directory containing tokenized papers.

    Errors are printed, not raised.
    """
    try:
        df = pd.read_csv(csv_filepath)
        for row in df.itertuples():
            title = row[1]  # row[0] is the DataFrame index

            # Undo the CSV escaping of the title. Spaces are kept as they are:
            # os.path works with the literal file name, no shell-style escaping.
            fixed_title = title.replace("|", ",").replace("}", "-")

            path_to_txt = os.path.join(path_to_tokenized_papers, f"{fixed_title}.txt")

            print(path_to_txt)
            print(os.path.exists(path_to_txt))
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

# Check the cleaned CSV against the directory of tokenized papers.
sort_by_year("fixed_arxiv_links.csv", "/Users/lpaggen/Documents/DACS_COURSES/dsdm_research_sem2/tokenized_papers")


In [None]:
# Show the contents of the current working directory.
os.listdir()

In [None]:
# Number of downloaded papers (.pdf files) per year directory.
papers_root = "/Users/lpaggen/Documents/DACS_COURSES/dsdm_research_sem2/papers/"
counter = {}

for i in os.listdir(papers_root):
    year_path = os.path.join(papers_root, i)
    if not os.path.isdir(year_path):
        continue  # skips .DS_Store and any other stray file at the top level
    # Count only PDFs so that .txt files and .DS_Store entries inside a year
    # directory do not inflate the paper count.
    counter[i] = sum(1 for j in os.listdir(year_path) if j.endswith(".pdf"))

In [None]:
# Display the per-directory counts collected above.
counter

In [None]:
sns.set_theme(style="whitegrid")

# Years in ascending order, with the counts aligned to them.
sorted_years = sorted(counter)
sorted_values = [counter[year] for year in sorted_years]

# Bar plot drawn on an explicit Axes object.
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(x=sorted_years, y=sorted_values, palette="Blues", edgecolor="black", ax=ax)

# Labels and title
ax.set_xlabel("Year", fontsize=12)
ax.set_ylabel("Count", fontsize=12)
ax.set_title("Quantum Computing Papers Published per Year", fontsize=14, fontweight="bold")

# Tilt the year labels
plt.xticks(rotation=45)

plt.show()


In [None]:
from pdfminer.high_level import extract_text
import re
import os

def tokenize_text(dir):
    """Write a cleaned, lower-cased ``.txt`` file next to every PDF in a directory.

    Args:
        dir: The directory containing the PDF files.

    A PDF is skipped when its ``.txt`` counterpart already exists or when text
    extraction raises; every outcome is reported with print().
    """
    # Applied in order; after the first substitution only ASCII letters and
    # spaces remain, so the following five patterns no longer match anything.
    substitutions = (
        (r'[^a-zA-Z]', ' '),    # remove non-alphabetic characters
        (r'<.*?>', ''),         # remove angled brackets
        (r'[^\w\s]', ''),
        (r'\n', ''),            # remove new line characters from the text
        (r'\d', ''),            # remove digits
        (r'[\|#-]', ''),        # remove special characters
        (r'\b[a-zA-Z]\b', ''),  # remove single characters
        (r'\s+', ' '),          # remove extra whitespaces
    )
    # NOTE(review): the filter below compares single words, so the two-word
    # entries in this set can never match -- confirm whether phrase removal
    # was intended.
    stop_words = {
        "the", "is", "a", "an", "of", "in", "on", "at", "to", "and", "or", "it",
        "can be",
        "is a",
        "of the",
        "in the",
        "to the",
        "it is",
        "that is",
        "with the",
        "for the",
        "on the",
        "and the",
        "be the",
        "cid",
        "cuj",
    }

    for name in os.listdir(dir):
        if not name.endswith(".pdf"):
            continue
        pdf_path = os.path.join(dir, name)
        txt_path = pdf_path.replace(".pdf", ".txt")
        if os.path.exists(txt_path):
            print(f"skipping {pdf_path}, text file already exists / pdf already tokenized.")
            continue
        try:
            text = extract_text(pdf_path)
        except Exception as e:
            print(f"Error extracting text from {pdf_path}: {e}")
            continue

        for regex, replacement in substitutions:
            text = re.sub(regex, replacement, text)
        kept_words = [word for word in text.strip().split() if word.lower() not in stop_words]

        with open(txt_path, "w") as f:
            f.write(" ".join(kept_words).lower())
        print(f"tokenized {pdf_path} to {txt_path}")

if __name__ == "__main__":
    # Tokenize each year directory in turn, ignoring the macOS metadata entry.
    for year in os.listdir("/Users/lpaggen/Documents/DACS_COURSES/dsdm_research_sem2/papers/"):
        if year == ".DS_Store":
            continue
        tokenize_text(f"/Users/lpaggen/Documents/DACS_COURSES/dsdm_research_sem2/papers/{year}")


In [None]:
import shutil

In [None]:
# Move every .txt file from papers/<year>/ into tokenized/<year>/.
source_dir = "/Users/lpaggen/Documents/DACS_COURSES/dsdm_research_sem2/papers/"
destination_dir = "/Users/lpaggen/Documents/DACS_COURSES/dsdm_research_sem2/tokenized/"

for i in os.listdir(source_dir):
    if i == ".DS_Store":
        continue
    for j in os.listdir(os.path.join(source_dir, i)):
        if not j.endswith(".txt"):
            continue
        # Create the per-year target directory the first time it is needed.
        destination_path = os.path.join(destination_dir, i)
        if not os.path.exists(destination_path):
            os.makedirs(destination_path)
        shutil.move(os.path.join(source_dir, i, j), os.path.join(destination_path, j))


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Toy corpus used to try out bigram TF-IDF before running it on the papers.
corpus = [
    "This is the first document with some bigrams.",
    "This document is the second document and has more bigrams.",
    "The third document is shorter and has fewer bigrams.",
    "A fourth document, also with bigrams."
]

# ngram_range=(2, 2) restricts the vocabulary to two-word sequences.
vectorizer = TfidfVectorizer(ngram_range=(2, 2))  # Bigrams
tfidf_matrix = vectorizer.fit_transform(corpus)

feature_names = vectorizer.get_feature_names_out()

# One row per document, one column per bigram.
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)
print(tfidf_df)

In [None]:
import os
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

source_dir = "/Users/lpaggen/Documents/DACS_COURSES/dsdm_research_sem2/tokenized/"  # Replace with your tokenized directory
top_n = 5  # Number of top bigrams to display

# Custom stop phrases
stop_phrases = [
    "can be",
    "is a",
    "of the",
    "in the",
    "to the",
    "it is",
    "that is",
    "with the",
    "for the",
    "on the",
    "and the",
    "be the",
    "cid cid",
    "cid uj",
    "uj cid"
]

for year_dir in os.listdir(source_dir):
    year_path = os.path.join(source_dir, year_dir)
    if year_dir == ".DS_Store" or not os.path.isdir(year_path):
        continue
    print(f"\n--- Processing year: {year_dir} ---")
    for filename in os.listdir(year_path):
        if not filename.endswith(".txt"):
            continue
        file_path = os.path.join(year_path, filename)
        try:
            with open(file_path, "r") as file:
                content = file.read()

            # Bigram TF-IDF for this one file.
            # NOTE(review): the vectorizer is fitted on a single document, so
            # every bigram gets the same IDF and the scores are effectively
            # normalised term frequencies -- confirm this is intended.
            vectorizer = TfidfVectorizer(ngram_range=(2, 2))
            tfidf_matrix = vectorizer.fit_transform([content])

            feature_names = vectorizer.get_feature_names_out()
            tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)

            # Remove the stop phrases BEFORE ranking, so that top_n real
            # bigrams are shown instead of fewer than top_n.
            candidates = tfidf_df.drop(columns=stop_phrases, errors="ignore")
            top_bigrams = candidates.T.nlargest(top_n, 0)

            print(f"\nFile: {filename}")
            for bigram, score in top_bigrams[0].items():
                print(f"  {bigram}: {score:.4f}")

        except FileNotFoundError:
            print(f"File not found: {file_path}")
        except Exception as e:
            print(f"Error processing {file_path}: {e}")

In [None]:
from pdfminer.high_level import extract_text
import re
import os

def tokenize_text(dir):
    """Write the raw extracted text of every PDF in a directory to a ``.txt`` file.

    Unlike what the name suggests, this version does no cleaning: the output
    is exactly what extract_text returns.

    Args:
        dir: The directory containing the PDF files.

    A PDF is skipped when its ``.txt`` counterpart already exists or when text
    extraction raises; every outcome is reported with print().
    """
    pdf_files = [f for f in os.listdir(dir) if f.endswith(".pdf")]
    for pdf_file in pdf_files:
        pdf_path = os.path.join(dir, pdf_file)
        # Swap only the final extension; str.replace would also rewrite any
        # ".pdf" that appears earlier in the path.
        txt_path = os.path.splitext(pdf_path)[0] + ".txt"
        if os.path.exists(txt_path):
            print(f"skipping {pdf_path}, text file already exists / pdf already tokenized.")
            continue
        try:
            text = extract_text(pdf_path)
        except Exception as e:
            print(f"Error extracting text from {pdf_path}: {e}")
            continue

        # Raw PDF text is not limited to ASCII, so pin the encoding (the
        # keyword-extraction cell reads these files back as UTF-8).
        with open(txt_path, "w", encoding="utf-8") as f:
            f.write(text)
        print(f"tokenized {pdf_path} to {txt_path}")

if __name__ == "__main__": # change to the correct directory
    # Process each year directory in turn, ignoring the macOS metadata entry.
    for year in os.listdir("/Users/lpaggen/Documents/DACS_COURSES/dsdm_research_sem2/papers/"):
        if year == ".DS_Store":
            continue
        tokenize_text(f"/Users/lpaggen/Documents/DACS_COURSES/dsdm_research_sem2/papers/{year}")


In [None]:
# an example of using an LLM on a year to get the top terms
# (as written, this cell only reads each .txt file into `content`; nothing is
# done with the text yet)

for year_dir in os.listdir(source_dir):
    if year_dir == ".DS_Store":
        continue
    year_path = os.path.join(source_dir, year_dir)
    if not os.path.isdir(year_path):
        continue
    print(f"\n--- Processing year: {year_dir} ---")
    for filename in os.listdir(year_path):
        if not filename.endswith(".txt"):
            continue
        file_path = os.path.join(year_path, filename)
        try:
            with open(file_path, "r") as file:
                content = file.read()
        except FileNotFoundError:
            print(f"File not found: {file_path}")
        except Exception as e:
            print(f"Error processing {file_path}: {e}")

In [None]:
import openai
import json

In [None]:
# Take the key from the OPENAI_API_KEY environment variable instead of pasting
# it into the notebook. When the variable is not set this falls back to "",
# the same placeholder value used before.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")

openai.api_key = OPENAI_API_KEY

client = openai.OpenAI(api_key=OPENAI_API_KEY)

In [None]:
def extract_keywords(text, model="gpt-4o-mini"):
    """Ask the chat model for keywords describing a text.

    Args:
        text: The text inserted into the prompt.
        model: Name of the chat model passed to the API.

    Returns:
        List of keyword strings, with surrounding whitespace removed and empty
        entries dropped. An empty list when the model returns no content.

    Uses the module-level ``client`` to call the chat completions API.
    """
    # NOTE(review): the prompt asks for the "top 10" keywords and later for the
    # "top 15" -- confirm which number is intended.
    prompt = f"""
    Extract the top 10 most important keywords from the following research abstract on quantum computing. 
    Focus on emerging concepts and new techniques. Return only a comma-separated list of keywords.
    
    You are an expert in quantum information science and natural language processing. Given the following research abstract on quantum computing, perform a multifaceted analysis and generate a structured output.

Keyword Extraction with Nuance:
Identify the top 15 most significant keywords, prioritizing emerging concepts, novel techniques, and theoretical breakthroughs.
Beyond simple term frequency, consider semantic relationships, contextual importance, and potential future impact.
Distinguish between keywords representing hardware advancements, algorithmic innovations, and theoretical frameworks.

Return only a comma-separated list of keywords.

    Abstract:
    {text}

    Keywords:
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0.3
    )
    # The content can be missing, and the model does not always put exactly
    # ", " between items, so split on "," and trim each entry.
    raw_keywords = response.choices[0].message.content or ""
    return [keyword.strip() for keyword in raw_keywords.split(",") if keyword.strip()]

# Directory containing research papers (assuming .txt files)
papers_dir = "/Users/lpaggen/Documents/DACS_COURSES/dsdm_research_sem2/papers/2021"
output_file = "extracted_keywords_2021.json"

# Process all papers
results = {}
for filename in os.listdir(papers_dir):
    if not filename.endswith(".txt"):
        continue
    with open(os.path.join(papers_dir, filename), "r", encoding="utf-8") as f:
        text = f.read()
    # Only the first 128000 characters are sent; slicing leaves shorter texts unchanged.
    keywords = extract_keywords(text[:128000])
    results[filename] = keywords

# Save extracted keywords
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(results, f, indent=4)

In [None]:
from collections import Counter
import matplotlib.pyplot as plt
import json

# Load extracted keywords
with open("extracted_keywords_2021.json", "r", encoding="utf-8") as f:
    keyword_data = json.load(f)

# Flatten all keywords into a single list, trimming whitespace and leaving out
# the topic itself in any capitalisation (not just two hard-coded spellings).
all_keywords = [
    keyword.strip()
    for keywords in keyword_data.values()
    for keyword in keywords
    if keyword.strip() and keyword.strip().lower() != "quantum computing"
]

# Count occurrences
keyword_counts = Counter(all_keywords)

# Get the top 15 most common keywords
top_keywords = keyword_counts.most_common(15)

if top_keywords:
    # Unpack for plotting
    labels, counts = zip(*top_keywords)

    # Plot
    plt.figure(figsize=(12, 6))
    plt.barh(labels, counts, color="royalblue")
    plt.xlabel("Frequency")
    plt.ylabel("Keywords")
    plt.title("Top 15 Most Frequent Keywords in Quantum Computing Papers")
    plt.gca().invert_yaxis()  # Invert so most common is on top
    plt.show()
else:
    # zip(*[]) cannot be unpacked into two names, so handle "no data" explicitly.
    print("No keywords to plot.")