# FETCHING DATA THROUGH ArXiv API

In [1]:
'''''''''''''''
import requests
import feedparser
import time
import math
import csv
from datetime import datetime

def fetch_arxiv_papers(search_query, total_results=10000, batch_size=100, sort_by='relevance', sort_order='descending', delay=3):
    base_url = 'http://export.arxiv.org/api/query?'
    max_batch_size = 1000

    if batch_size > max_batch_size:
        print(f"Batch size {batch_size} exceeds maximum of {max_batch_size}. Setting to {max_batch_size}.")
        batch_size = max_batch_size

    num_batches = math.ceil(total_results / batch_size)
    print(f"Total results to fetch: {total_results}")
    print(f"Fetching in {num_batches} batches of up to {batch_size} papers each.")

    all_papers = []

    for batch_num in range(num_batches):
        start = batch_num * batch_size
        current_batch_size = min(batch_size, total_results - start)
        params = {
            'search_query': f'all:{search_query}',
            'start': start,
            'max_results': current_batch_size,
            'sortBy': sort_by,
            'sortOrder': sort_order
        }

        try:
            response = requests.get(base_url, params=params)
            if response.status_code != 200:
                print(f"Error fetching batch {batch_num + 1}: HTTP {response.status_code}")
                break

            feed = feedparser.parse(response.text)

            for entry in feed.entries:
                paper = {
                    'title': entry.title.strip().replace('\n', ' '),
                    'authors': ', '.join([author.name for author in entry.authors]),
                    'summary': entry.summary.strip().replace('\n', ' '),
                    'published': entry.published,
                    'updated': entry.updated,
                    'link': entry.link,
                    'pdf_url': None,
                    'categories': ', '.join(tag.term for tag in entry.tags) if 'tags' in entry else 'N/A'
                }

                for link in entry.links:
                    if link.rel == 'related' and link.type == 'application/pdf':
                        paper['pdf_url'] = link.href
                        break

                all_papers.append(paper)

            print(f"Fetched batch {batch_num + 1}/{num_batches} (start: {start})")
            time.sleep(delay)

        except Exception as e:
            print(f"An error occurred during batch {batch_num + 1}: {str(e)}")
            break

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"arxiv_papers_{search_query.replace(' ', '_')}_{timestamp}.csv"

    with open(filename, mode='w', encoding='utf-8', newline='') as csv_file:
        fieldnames = ['title', 'authors', 'summary', 'published', 'updated', 'link', 'pdf_url', 'categories']
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)

        writer.writeheader()
        for paper in all_papers:
            writer.writerow(paper)

    print(f"Data fetching complete. {len(all_papers)} papers saved to {filename}.")


if __name__ == "__main__":
    search_term = "computer science"
    total_papers = 10000  # Total number of papers to fetch
    batch_size = 100        # Number of papers per request (adjust as needed)
    delay_seconds = 3      # Delay between requests to respect rate limits

    fetch_arxiv_papers(
        search_query=search_term,
        total_results=total_papers,
        batch_size=batch_size,
        sort_by='relevance',
        sort_order='descending',
        delay=delay_seconds
    )
'''''''''''''''''

'\nimport requests\nimport feedparser\nimport time\nimport math\nimport csv\nfrom datetime import datetime\n\ndef fetch_arxiv_papers(search_query, total_results=10000, batch_size=100, sort_by=\'relevance\', sort_order=\'descending\', delay=3):\n    base_url = \'http://export.arxiv.org/api/query?\'\n    max_batch_size = 1000\n\n    if batch_size > max_batch_size:\n        print(f"Batch size {batch_size} exceeds maximum of {max_batch_size}. Setting to {max_batch_size}.")\n        batch_size = max_batch_size\n\n    num_batches = math.ceil(total_results / batch_size)\n    print(f"Total results to fetch: {total_results}")\n    print(f"Fetching in {num_batches} batches of up to {batch_size} papers each.")\n\n    all_papers = []\n\n    for batch_num in range(num_batches):\n        start = batch_num * batch_size\n        current_batch_size = min(batch_size, total_results - start)\n        params = {\n            \'search_query\': f\'all:{search_query}\',\n            \'start\': start,\n       

In [2]:
''''''''''''''''
import requests
import feedparser
import time
import math
import csv
from datetime import datetime

def fetch_arxiv_papers(search_query, total_results=10000, batch_size=100, sort_by='relevance', sort_order='descending', delay=3):
    base_url = 'http://export.arxiv.org/api/query?'
    max_batch_size = 1000

    if batch_size > max_batch_size:
        print(f"Batch size {batch_size} exceeds maximum of {max_batch_size}. Setting to {max_batch_size}.")
        batch_size = max_batch_size

    num_batches = math.ceil(total_results / batch_size)
    print(f"Total results to fetch: {total_results}")
    print(f"Fetching in {num_batches} batches of up to {batch_size} papers each.")

    all_papers = []

    for batch_num in range(num_batches):
        start = batch_num * batch_size
        current_batch_size = min(batch_size, total_results - start)
        params = {
            'search_query': f'all:{search_query}',
            'start': start,
            'max_results': current_batch_size,
            'sortBy': sort_by,
            'sortOrder': sort_order
        }

        try:
            response = requests.get(base_url, params=params)
            if response.status_code != 200:
                print(f"Error fetching batch {batch_num + 1}: HTTP {response.status_code}")
                break

            feed = feedparser.parse(response.text)

            for entry in feed.entries:
                paper = {
                    'title': entry.title.strip().replace('\n', ' '),
                    'authors': ', '.join([author.name for author in entry.authors]),
                    'summary': entry.summary.strip().replace('\n', ' '),
                    'published': entry.published,
                    'updated': entry.updated,
                    'link': entry.link,
                    'pdf_url': None,
                    'categories': ', '.join(tag.term for tag in entry.tags) if 'tags' in entry else 'N/A'
                }

                for link in entry.links:
                    if link.rel == 'related' and link.type == 'application/pdf':
                        paper['pdf_url'] = link.href
                        break

                all_papers.append(paper)

            print(f"Fetched batch {batch_num + 1}/{num_batches} (start: {start})")
            time.sleep(delay)

        except Exception as e:
            print(f"An error occurred during batch {batch_num + 1}: {str(e)}")
            break

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"arxiv_papers_{search_query.replace(' ', '_')}_{timestamp}.csv"

    with open(filename, mode='w', encoding='utf-8', newline='') as csv_file:
        fieldnames = ['title', 'authors', 'summary', 'published', 'updated', 'link', 'pdf_url', 'categories']
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)

        writer.writeheader()
        for paper in all_papers:
            writer.writerow(paper)

    print(f"Data fetching complete. {len(all_papers)} papers saved to {filename}.")


if __name__ == "__main__":
    search_term = "physic"
    total_papers = 10000  # Total number of papers to fetch
    batch_size = 100        # Number of papers per request (adjust as needed)
    delay_seconds = 3      # Delay between requests to respect rate limits

    fetch_arxiv_papers(
        search_query=search_term,
        total_results=total_papers,
        batch_size=batch_size,
        sort_by='relevance',
        sort_order='descending',
        delay=delay_seconds
    )
'''''''''''''''''

'\'\nimport requests\nimport feedparser\nimport time\nimport math\nimport csv\nfrom datetime import datetime\n\ndef fetch_arxiv_papers(search_query, total_results=10000, batch_size=100, sort_by=\'relevance\', sort_order=\'descending\', delay=3):\n    base_url = \'http://export.arxiv.org/api/query?\'\n    max_batch_size = 1000\n\n    if batch_size > max_batch_size:\n        print(f"Batch size {batch_size} exceeds maximum of {max_batch_size}. Setting to {max_batch_size}.")\n        batch_size = max_batch_size\n\n    num_batches = math.ceil(total_results / batch_size)\n    print(f"Total results to fetch: {total_results}")\n    print(f"Fetching in {num_batches} batches of up to {batch_size} papers each.")\n\n    all_papers = []\n\n    for batch_num in range(num_batches):\n        start = batch_num * batch_size\n        current_batch_size = min(batch_size, total_results - start)\n        params = {\n            \'search_query\': f\'all:{search_query}\',\n            \'start\': start,\n     

In [3]:
'''''''''''''''''
import requests
import feedparser
import time
import math
import csv
from datetime import datetime

def fetch_arxiv_papers(search_query, total_results=10000, batch_size=100, sort_by='relevance', sort_order='descending', delay=3):
    base_url = 'http://export.arxiv.org/api/query?'
    max_batch_size = 1000

    if batch_size > max_batch_size:
        print(f"Batch size {batch_size} exceeds maximum of {max_batch_size}. Setting to {max_batch_size}.")
        batch_size = max_batch_size

    num_batches = math.ceil(total_results / batch_size)
    print(f"Total results to fetch: {total_results}")
    print(f"Fetching in {num_batches} batches of up to {batch_size} papers each.")

    all_papers = []

    for batch_num in range(num_batches):
        start = batch_num * batch_size
        current_batch_size = min(batch_size, total_results - start)
        params = {
            'search_query': f'all:{search_query}',
            'start': start,
            'max_results': current_batch_size,
            'sortBy': sort_by,
            'sortOrder': sort_order
        }

        try:
            response = requests.get(base_url, params=params)
            if response.status_code != 200:
                print(f"Error fetching batch {batch_num + 1}: HTTP {response.status_code}")
                break

            feed = feedparser.parse(response.text)

            for entry in feed.entries:
                paper = {
                    'title': entry.title.strip().replace('\n', ' '),
                    'authors': ', '.join([author.name for author in entry.authors]),
                    'summary': entry.summary.strip().replace('\n', ' '),
                    'published': entry.published,
                    'updated': entry.updated,
                    'link': entry.link,
                    'pdf_url': None,
                    'categories': ', '.join(tag.term for tag in entry.tags) if 'tags' in entry else 'N/A'
                }

                for link in entry.links:
                    if link.rel == 'related' and link.type == 'application/pdf':
                        paper['pdf_url'] = link.href
                        break

                all_papers.append(paper)

            print(f"Fetched batch {batch_num + 1}/{num_batches} (start: {start})")
            time.sleep(delay)

        except Exception as e:
            print(f"An error occurred during batch {batch_num + 1}: {str(e)}")
            break

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"arxiv_papers_{search_query.replace(' ', '_')}_{timestamp}.csv"

    with open(filename, mode='w', encoding='utf-8', newline='') as csv_file:
        fieldnames = ['title', 'authors', 'summary', 'published', 'updated', 'link', 'pdf_url', 'categories']
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)

        writer.writeheader()
        for paper in all_papers:
            writer.writerow(paper)

    print(f"Data fetching complete. {len(all_papers)} papers saved to {filename}.")


if __name__ == "__main__":
    search_term = "network"
    total_papers = 10000  # Total number of papers to fetch
    batch_size = 100        # Number of papers per request (adjust as needed)
    delay_seconds = 3      # Delay between requests to respect rate limits

    fetch_arxiv_papers(
        search_query=search_term,
        total_results=total_papers,
        batch_size=batch_size,
        sort_by='relevance',
        sort_order='descending',
        delay=delay_seconds
    )
'''''''''''''''''

'\'\'\nimport requests\nimport feedparser\nimport time\nimport math\nimport csv\nfrom datetime import datetime\n\ndef fetch_arxiv_papers(search_query, total_results=10000, batch_size=100, sort_by=\'relevance\', sort_order=\'descending\', delay=3):\n    base_url = \'http://export.arxiv.org/api/query?\'\n    max_batch_size = 1000\n\n    if batch_size > max_batch_size:\n        print(f"Batch size {batch_size} exceeds maximum of {max_batch_size}. Setting to {max_batch_size}.")\n        batch_size = max_batch_size\n\n    num_batches = math.ceil(total_results / batch_size)\n    print(f"Total results to fetch: {total_results}")\n    print(f"Fetching in {num_batches} batches of up to {batch_size} papers each.")\n\n    all_papers = []\n\n    for batch_num in range(num_batches):\n        start = batch_num * batch_size\n        current_batch_size = min(batch_size, total_results - start)\n        params = {\n            \'search_query\': f\'all:{search_query}\',\n            \'start\': start,\n   

In [4]:
'''''''''''''''''
import requests
import feedparser
import time
import math
import csv
from datetime import datetime

def fetch_arxiv_papers(search_query, total_results=10000, batch_size=100, sort_by='relevance', sort_order='descending', delay=3):
    base_url = 'http://export.arxiv.org/api/query?'
    max_batch_size = 1000

    if batch_size > max_batch_size:
        print(f"Batch size {batch_size} exceeds maximum of {max_batch_size}. Setting to {max_batch_size}.")
        batch_size = max_batch_size

    num_batches = math.ceil(total_results / batch_size)
    print(f"Total results to fetch: {total_results}")
    print(f"Fetching in {num_batches} batches of up to {batch_size} papers each.")

    all_papers = []

    for batch_num in range(num_batches):
        start = batch_num * batch_size
        current_batch_size = min(batch_size, total_results - start)
        params = {
            'search_query': f'all:{search_query}',
            'start': start,
            'max_results': current_batch_size,
            'sortBy': sort_by,
            'sortOrder': sort_order
        }

        try:
            response = requests.get(base_url, params=params)
            if response.status_code != 200:
                print(f"Error fetching batch {batch_num + 1}: HTTP {response.status_code}")
                break

            feed = feedparser.parse(response.text)

            for entry in feed.entries:
                paper = {
                    'title': entry.title.strip().replace('\n', ' '),
                    'authors': ', '.join([author.name for author in entry.authors]),
                    'summary': entry.summary.strip().replace('\n', ' '),
                    'published': entry.published,
                    'updated': entry.updated,
                    'link': entry.link,
                    'pdf_url': None,
                    'categories': ', '.join(tag.term for tag in entry.tags) if 'tags' in entry else 'N/A'
                }

                for link in entry.links:
                    if link.rel == 'related' and link.type == 'application/pdf':
                        paper['pdf_url'] = link.href
                        break

                all_papers.append(paper)

            print(f"Fetched batch {batch_num + 1}/{num_batches} (start: {start})")
            time.sleep(delay)

        except Exception as e:
            print(f"An error occurred during batch {batch_num + 1}: {str(e)}")
            break

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"arxiv_papers_{search_query.replace(' ', '_')}_{timestamp}.csv"

    with open(filename, mode='w', encoding='utf-8', newline='') as csv_file:
        fieldnames = ['title', 'authors', 'summary', 'published', 'updated', 'link', 'pdf_url', 'categories']
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)

        writer.writeheader()
        for paper in all_papers:
            writer.writerow(paper)

    print(f"Data fetching complete. {len(all_papers)} papers saved to {filename}.")


if __name__ == "__main__":
    search_term = "data"
    total_papers = 10000  # Total number of papers to fetch
    batch_size = 100        # Number of papers per request (adjust as needed)
    delay_seconds = 3      # Delay between requests to respect rate limits

    fetch_arxiv_papers(
        search_query=search_term,
        total_results=total_papers,
        batch_size=batch_size,
        sort_by='relevance',
        sort_order='descending',
        delay=delay_seconds
    )
'''''''''''''''''

'\'\'\nimport requests\nimport feedparser\nimport time\nimport math\nimport csv\nfrom datetime import datetime\n\ndef fetch_arxiv_papers(search_query, total_results=10000, batch_size=100, sort_by=\'relevance\', sort_order=\'descending\', delay=3):\n    base_url = \'http://export.arxiv.org/api/query?\'\n    max_batch_size = 1000\n\n    if batch_size > max_batch_size:\n        print(f"Batch size {batch_size} exceeds maximum of {max_batch_size}. Setting to {max_batch_size}.")\n        batch_size = max_batch_size\n\n    num_batches = math.ceil(total_results / batch_size)\n    print(f"Total results to fetch: {total_results}")\n    print(f"Fetching in {num_batches} batches of up to {batch_size} papers each.")\n\n    all_papers = []\n\n    for batch_num in range(num_batches):\n        start = batch_num * batch_size\n        current_batch_size = min(batch_size, total_results - start)\n        params = {\n            \'search_query\': f\'all:{search_query}\',\n            \'start\': start,\n   

In [5]:
'''''''''''''''''
import requests
import feedparser
import time
import math
import csv
from datetime import datetime

def fetch_arxiv_papers(search_query, total_results=10000, batch_size=100, sort_by='relevance', sort_order='descending', delay=3):
    base_url = 'http://export.arxiv.org/api/query?'
    max_batch_size = 1000

    if batch_size > max_batch_size:
        print(f"Batch size {batch_size} exceeds maximum of {max_batch_size}. Setting to {max_batch_size}.")
        batch_size = max_batch_size

    num_batches = math.ceil(total_results / batch_size)
    print(f"Total results to fetch: {total_results}")
    print(f"Fetching in {num_batches} batches of up to {batch_size} papers each.")

    all_papers = []

    for batch_num in range(num_batches):
        start = batch_num * batch_size
        current_batch_size = min(batch_size, total_results - start)
        params = {
            'search_query': f'all:{search_query}',
            'start': start,
            'max_results': current_batch_size,
            'sortBy': sort_by,
            'sortOrder': sort_order
        }

        try:
            response = requests.get(base_url, params=params)
            if response.status_code != 200:
                print(f"Error fetching batch {batch_num + 1}: HTTP {response.status_code}")
                break

            feed = feedparser.parse(response.text)

            for entry in feed.entries:
                paper = {
                    'title': entry.title.strip().replace('\n', ' '),
                    'authors': ', '.join([author.name for author in entry.authors]),
                    'summary': entry.summary.strip().replace('\n', ' '),
                    'published': entry.published,
                    'updated': entry.updated,
                    'link': entry.link,
                    'pdf_url': None,
                    'categories': ', '.join(tag.term for tag in entry.tags) if 'tags' in entry else 'N/A'
                }

                for link in entry.links:
                    if link.rel == 'related' and link.type == 'application/pdf':
                        paper['pdf_url'] = link.href
                        break

                all_papers.append(paper)

            print(f"Fetched batch {batch_num + 1}/{num_batches} (start: {start})")
            time.sleep(delay)

        except Exception as e:
            print(f"An error occurred during batch {batch_num + 1}: {str(e)}")
            break

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"arxiv_papers_{search_query.replace(' ', '_')}_{timestamp}.csv"

    with open(filename, mode='w', encoding='utf-8', newline='') as csv_file:
        fieldnames = ['title', 'authors', 'summary', 'published', 'updated', 'link', 'pdf_url', 'categories']
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)

        writer.writeheader()
        for paper in all_papers:
            writer.writerow(paper)

    print(f"Data fetching complete. {len(all_papers)} papers saved to {filename}.")


if __name__ == "__main__":
    search_term = "signal"
    total_papers = 10000  # Total number of papers to fetch
    batch_size = 100        # Number of papers per request (adjust as needed)
    delay_seconds = 3      # Delay between requests to respect rate limits

    fetch_arxiv_papers(
        search_query=search_term,
        total_results=total_papers,
        batch_size=batch_size,
        sort_by='relevance',
        sort_order='descending',
        delay=delay_seconds
    )
'''''''''''''''''

'\'\'\nimport requests\nimport feedparser\nimport time\nimport math\nimport csv\nfrom datetime import datetime\n\ndef fetch_arxiv_papers(search_query, total_results=10000, batch_size=100, sort_by=\'relevance\', sort_order=\'descending\', delay=3):\n    base_url = \'http://export.arxiv.org/api/query?\'\n    max_batch_size = 1000\n\n    if batch_size > max_batch_size:\n        print(f"Batch size {batch_size} exceeds maximum of {max_batch_size}. Setting to {max_batch_size}.")\n        batch_size = max_batch_size\n\n    num_batches = math.ceil(total_results / batch_size)\n    print(f"Total results to fetch: {total_results}")\n    print(f"Fetching in {num_batches} batches of up to {batch_size} papers each.")\n\n    all_papers = []\n\n    for batch_num in range(num_batches):\n        start = batch_num * batch_size\n        current_batch_size = min(batch_size, total_results - start)\n        params = {\n            \'search_query\': f\'all:{search_query}\',\n            \'start\': start,\n   

In [6]:
'''''''''''''''''
import requests
import feedparser
import time
import math
import csv
from datetime import datetime

def fetch_arxiv_papers(search_query, total_results=10000, batch_size=100, sort_by='relevance', sort_order='descending', delay=3):
    base_url = 'http://export.arxiv.org/api/query?'
    max_batch_size = 1000

    if batch_size > max_batch_size:
        print(f"Batch size {batch_size} exceeds maximum of {max_batch_size}. Setting to {max_batch_size}.")
        batch_size = max_batch_size

    num_batches = math.ceil(total_results / batch_size)
    print(f"Total results to fetch: {total_results}")
    print(f"Fetching in {num_batches} batches of up to {batch_size} papers each.")

    all_papers = []

    for batch_num in range(num_batches):
        start = batch_num * batch_size
        current_batch_size = min(batch_size, total_results - start)
        params = {
            'search_query': f'all:{search_query}',
            'start': start,
            'max_results': current_batch_size,
            'sortBy': sort_by,
            'sortOrder': sort_order
        }

        try:
            response = requests.get(base_url, params=params)
            if response.status_code != 200:
                print(f"Error fetching batch {batch_num + 1}: HTTP {response.status_code}")
                break

            feed = feedparser.parse(response.text)

            for entry in feed.entries:
                paper = {
                    'title': entry.title.strip().replace('\n', ' '),
                    'authors': ', '.join([author.name for author in entry.authors]),
                    'summary': entry.summary.strip().replace('\n', ' '),
                    'published': entry.published,
                    'updated': entry.updated,
                    'link': entry.link,
                    'pdf_url': None,
                    'categories': ', '.join(tag.term for tag in entry.tags) if 'tags' in entry else 'N/A'
                }

                for link in entry.links:
                    if link.rel == 'related' and link.type == 'application/pdf':
                        paper['pdf_url'] = link.href
                        break

                all_papers.append(paper)

            print(f"Fetched batch {batch_num + 1}/{num_batches} (start: {start})")
            time.sleep(delay)

        except Exception as e:
            print(f"An error occurred during batch {batch_num + 1}: {str(e)}")
            break

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"arxiv_papers_{search_query.replace(' ', '_')}_{timestamp}.csv"

    with open(filename, mode='w', encoding='utf-8', newline='') as csv_file:
        fieldnames = ['title', 'authors', 'summary', 'published', 'updated', 'link', 'pdf_url', 'categories']
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)

        writer.writeheader()
        for paper in all_papers:
            writer.writerow(paper)

    print(f"Data fetching complete. {len(all_papers)} papers saved to {filename}.")


if __name__ == "__main__":
    search_term = "information"
    total_papers = 10000  # Total number of papers to fetch
    batch_size = 100        # Number of papers per request (adjust as needed)
    delay_seconds = 3      # Delay between requests to respect rate limits

    fetch_arxiv_papers(
        search_query=search_term,
        total_results=total_papers,
        batch_size=batch_size,
        sort_by='relevance',
        sort_order='descending',
        delay=delay_seconds
    )
'''''''''''''''''

'\'\'\nimport requests\nimport feedparser\nimport time\nimport math\nimport csv\nfrom datetime import datetime\n\ndef fetch_arxiv_papers(search_query, total_results=10000, batch_size=100, sort_by=\'relevance\', sort_order=\'descending\', delay=3):\n    base_url = \'http://export.arxiv.org/api/query?\'\n    max_batch_size = 1000\n\n    if batch_size > max_batch_size:\n        print(f"Batch size {batch_size} exceeds maximum of {max_batch_size}. Setting to {max_batch_size}.")\n        batch_size = max_batch_size\n\n    num_batches = math.ceil(total_results / batch_size)\n    print(f"Total results to fetch: {total_results}")\n    print(f"Fetching in {num_batches} batches of up to {batch_size} papers each.")\n\n    all_papers = []\n\n    for batch_num in range(num_batches):\n        start = batch_num * batch_size\n        current_batch_size = min(batch_size, total_results - start)\n        params = {\n            \'search_query\': f\'all:{search_query}\',\n            \'start\': start,\n   

In [7]:
'''''''''''''''''
import requests
import feedparser
import time
import math
import csv
from datetime import datetime

def fetch_arxiv_papers(search_query, total_results=10000, batch_size=100, sort_by='relevance', sort_order='descending', delay=3):
    base_url = 'http://export.arxiv.org/api/query?'
    max_batch_size = 1000

    if batch_size > max_batch_size:
        print(f"Batch size {batch_size} exceeds maximum of {max_batch_size}. Setting to {max_batch_size}.")
        batch_size = max_batch_size

    num_batches = math.ceil(total_results / batch_size)
    print(f"Total results to fetch: {total_results}")
    print(f"Fetching in {num_batches} batches of up to {batch_size} papers each.")

    all_papers = []

    for batch_num in range(num_batches):
        start = batch_num * batch_size
        current_batch_size = min(batch_size, total_results - start)
        params = {
            'search_query': f'all:{search_query}',
            'start': start,
            'max_results': current_batch_size,
            'sortBy': sort_by,
            'sortOrder': sort_order
        }

        try:
            response = requests.get(base_url, params=params)
            if response.status_code != 200:
                print(f"Error fetching batch {batch_num + 1}: HTTP {response.status_code}")
                break

            feed = feedparser.parse(response.text)

            for entry in feed.entries:
                paper = {
                    'title': entry.title.strip().replace('\n', ' '),
                    'authors': ', '.join([author.name for author in entry.authors]),
                    'summary': entry.summary.strip().replace('\n', ' '),
                    'published': entry.published,
                    'updated': entry.updated,
                    'link': entry.link,
                    'pdf_url': None,
                    'categories': ', '.join(tag.term for tag in entry.tags) if 'tags' in entry else 'N/A'
                }

                for link in entry.links:
                    if link.rel == 'related' and link.type == 'application/pdf':
                        paper['pdf_url'] = link.href
                        break

                all_papers.append(paper)

            print(f"Fetched batch {batch_num + 1}/{num_batches} (start: {start})")
            time.sleep(delay)

        except Exception as e:
            print(f"An error occurred during batch {batch_num + 1}: {str(e)}")
            break

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"arxiv_papers_{search_query.replace(' ', '_')}_{timestamp}.csv"

    with open(filename, mode='w', encoding='utf-8', newline='') as csv_file:
        fieldnames = ['title', 'authors', 'summary', 'published', 'updated', 'link', 'pdf_url', 'categories']
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)

        writer.writeheader()
        for paper in all_papers:
            writer.writerow(paper)

    print(f"Data fetching complete. {len(all_papers)} papers saved to {filename}.")


if __name__ == "__main__":
    search_term = "statistics"
    total_papers = 10000  # Total number of papers to fetch
    batch_size = 100        # Number of papers per request (adjust as needed)
    delay_seconds = 3      # Delay between requests to respect rate limits

    fetch_arxiv_papers(
        search_query=search_term,
        total_results=total_papers,
        batch_size=batch_size,
        sort_by='relevance',
        sort_order='descending',
        delay=delay_seconds
    )
'''''''''''''''''

'\'\'\nimport requests\nimport feedparser\nimport time\nimport math\nimport csv\nfrom datetime import datetime\n\ndef fetch_arxiv_papers(search_query, total_results=10000, batch_size=100, sort_by=\'relevance\', sort_order=\'descending\', delay=3):\n    base_url = \'http://export.arxiv.org/api/query?\'\n    max_batch_size = 1000\n\n    if batch_size > max_batch_size:\n        print(f"Batch size {batch_size} exceeds maximum of {max_batch_size}. Setting to {max_batch_size}.")\n        batch_size = max_batch_size\n\n    num_batches = math.ceil(total_results / batch_size)\n    print(f"Total results to fetch: {total_results}")\n    print(f"Fetching in {num_batches} batches of up to {batch_size} papers each.")\n\n    all_papers = []\n\n    for batch_num in range(num_batches):\n        start = batch_num * batch_size\n        current_batch_size = min(batch_size, total_results - start)\n        params = {\n            \'search_query\': f\'all:{search_query}\',\n            \'start\': start,\n   

In [8]:
'''''''''''''''''
import requests
import feedparser
import time
import math
import csv
from datetime import datetime

def fetch_arxiv_papers(search_query, total_results=10000, batch_size=100, sort_by='relevance', sort_order='descending', delay=3):
    base_url = 'http://export.arxiv.org/api/query?'
    max_batch_size = 1000

    if batch_size > max_batch_size:
        print(f"Batch size {batch_size} exceeds maximum of {max_batch_size}. Setting to {max_batch_size}.")
        batch_size = max_batch_size

    num_batches = math.ceil(total_results / batch_size)
    print(f"Total results to fetch: {total_results}")
    print(f"Fetching in {num_batches} batches of up to {batch_size} papers each.")

    all_papers = []

    for batch_num in range(num_batches):
        start = batch_num * batch_size
        current_batch_size = min(batch_size, total_results - start)
        params = {
            'search_query': f'all:{search_query}',
            'start': start,
            'max_results': current_batch_size,
            'sortBy': sort_by,
            'sortOrder': sort_order
        }

        try:
            response = requests.get(base_url, params=params)
            if response.status_code != 200:
                print(f"Error fetching batch {batch_num + 1}: HTTP {response.status_code}")
                break

            feed = feedparser.parse(response.text)

            for entry in feed.entries:
                paper = {
                    'title': entry.title.strip().replace('\n', ' '),
                    'authors': ', '.join([author.name for author in entry.authors]),
                    'summary': entry.summary.strip().replace('\n', ' '),
                    'published': entry.published,
                    'updated': entry.updated,
                    'link': entry.link,
                    'pdf_url': None,
                    'categories': ', '.join(tag.term for tag in entry.tags) if 'tags' in entry else 'N/A'
                }

                for link in entry.links:
                    if link.rel == 'related' and link.type == 'application/pdf':
                        paper['pdf_url'] = link.href
                        break

                all_papers.append(paper)

            print(f"Fetched batch {batch_num + 1}/{num_batches} (start: {start})")
            time.sleep(delay)

        except Exception as e:
            print(f"An error occurred during batch {batch_num + 1}: {str(e)}")
            break

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"arxiv_papers_{search_query.replace(' ', '_')}_{timestamp}.csv"

    with open(filename, mode='w', encoding='utf-8', newline='') as csv_file:
        fieldnames = ['title', 'authors', 'summary', 'published', 'updated', 'link', 'pdf_url', 'categories']
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)

        writer.writeheader()
        for paper in all_papers:
            writer.writerow(paper)

    print(f"Data fetching complete. {len(all_papers)} papers saved to {filename}.")


if __name__ == "__main__":
    search_term = "electrical engineering"
    total_papers = 10000  # Total number of papers to fetch
    batch_size = 100        # Number of papers per request (adjust as needed)
    delay_seconds = 3      # Delay between requests to respect rate limits

    fetch_arxiv_papers(
        search_query=search_term,
        total_results=total_papers,
        batch_size=batch_size,
        sort_by='relevance',
        sort_order='descending',
        delay=delay_seconds
    )
'''''''''''''''''

'\'\'\nimport requests\nimport feedparser\nimport time\nimport math\nimport csv\nfrom datetime import datetime\n\ndef fetch_arxiv_papers(search_query, total_results=10000, batch_size=100, sort_by=\'relevance\', sort_order=\'descending\', delay=3):\n    base_url = \'http://export.arxiv.org/api/query?\'\n    max_batch_size = 1000\n\n    if batch_size > max_batch_size:\n        print(f"Batch size {batch_size} exceeds maximum of {max_batch_size}. Setting to {max_batch_size}.")\n        batch_size = max_batch_size\n\n    num_batches = math.ceil(total_results / batch_size)\n    print(f"Total results to fetch: {total_results}")\n    print(f"Fetching in {num_batches} batches of up to {batch_size} papers each.")\n\n    all_papers = []\n\n    for batch_num in range(num_batches):\n        start = batch_num * batch_size\n        current_batch_size = min(batch_size, total_results - start)\n        params = {\n            \'search_query\': f\'all:{search_query}\',\n            \'start\': start,\n   

In [9]:
'''''''''''''''''
import requests
import feedparser
import time
import math
import csv
from datetime import datetime

def fetch_arxiv_papers(search_query, total_results=10000, batch_size=100, sort_by='relevance', sort_order='descending', delay=3):
    base_url = 'http://export.arxiv.org/api/query?'
    max_batch_size = 1000

    if batch_size > max_batch_size:
        print(f"Batch size {batch_size} exceeds maximum of {max_batch_size}. Setting to {max_batch_size}.")
        batch_size = max_batch_size

    num_batches = math.ceil(total_results / batch_size)
    print(f"Total results to fetch: {total_results}")
    print(f"Fetching in {num_batches} batches of up to {batch_size} papers each.")

    all_papers = []

    for batch_num in range(num_batches):
        start = batch_num * batch_size
        current_batch_size = min(batch_size, total_results - start)
        params = {
            'search_query': f'all:{search_query}',
            'start': start,
            'max_results': current_batch_size,
            'sortBy': sort_by,
            'sortOrder': sort_order
        }

        try:
            response = requests.get(base_url, params=params)
            if response.status_code != 200:
                print(f"Error fetching batch {batch_num + 1}: HTTP {response.status_code}")
                break

            feed = feedparser.parse(response.text)

            for entry in feed.entries:
                paper = {
                    'title': entry.title.strip().replace('\n', ' '),
                    'authors': ', '.join([author.name for author in entry.authors]),
                    'summary': entry.summary.strip().replace('\n', ' '),
                    'published': entry.published,
                    'updated': entry.updated,
                    'link': entry.link,
                    'pdf_url': None,
                    'categories': ', '.join(tag.term for tag in entry.tags) if 'tags' in entry else 'N/A'
                }

                for link in entry.links:
                    if link.rel == 'related' and link.type == 'application/pdf':
                        paper['pdf_url'] = link.href
                        break

                all_papers.append(paper)

            print(f"Fetched batch {batch_num + 1}/{num_batches} (start: {start})")
            time.sleep(delay)

        except Exception as e:
            print(f"An error occurred during batch {batch_num + 1}: {str(e)}")
            break

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"arxiv_papers_{search_query.replace(' ', '_')}_{timestamp}.csv"

    with open(filename, mode='w', encoding='utf-8', newline='') as csv_file:
        fieldnames = ['title', 'authors', 'summary', 'published', 'updated', 'link', 'pdf_url', 'categories']
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)

        writer.writeheader()
        for paper in all_papers:
            writer.writerow(paper)

    print(f"Data fetching complete. {len(all_papers)} papers saved to {filename}.")


if __name__ == "__main__":
    search_term = "system science"
    total_papers = 10000  # Total number of papers to fetch
    batch_size = 100        # Number of papers per request (adjust as needed)
    delay_seconds = 3      # Delay between requests to respect rate limits

    fetch_arxiv_papers(
        search_query=search_term,
        total_results=total_papers,
        batch_size=batch_size,
        sort_by='relevance',
        sort_order='descending',
        delay=delay_seconds
    )
'''''''''''''''''

'\'\'\nimport requests\nimport feedparser\nimport time\nimport math\nimport csv\nfrom datetime import datetime\n\ndef fetch_arxiv_papers(search_query, total_results=10000, batch_size=100, sort_by=\'relevance\', sort_order=\'descending\', delay=3):\n    base_url = \'http://export.arxiv.org/api/query?\'\n    max_batch_size = 1000\n\n    if batch_size > max_batch_size:\n        print(f"Batch size {batch_size} exceeds maximum of {max_batch_size}. Setting to {max_batch_size}.")\n        batch_size = max_batch_size\n\n    num_batches = math.ceil(total_results / batch_size)\n    print(f"Total results to fetch: {total_results}")\n    print(f"Fetching in {num_batches} batches of up to {batch_size} papers each.")\n\n    all_papers = []\n\n    for batch_num in range(num_batches):\n        start = batch_num * batch_size\n        current_batch_size = min(batch_size, total_results - start)\n        params = {\n            \'search_query\': f\'all:{search_query}\',\n            \'start\': start,\n   

# Load data

In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS


In [11]:
path_physic = "/Users/daniellai/MSDS_2026/MSDS_2024_2026/Winter_2025/DSCI521/Project/data/physic.csv"
data_physic = pd.read_csv(path_physic)
data_physic.head()

Unnamed: 0,title,authors,summary,published,updated,link,pdf_url,categories
0,Is Physics Sick? [In Praise of Classical Physics],Hisham Ghassib,"In this paper, it is argued that theoretical p...",2012-09-04T10:32:56Z,2012-09-04T10:32:56Z,http://arxiv.org/abs/1209.0592v1,http://arxiv.org/pdf/1209.0592v1,"physics.gen-ph, physics.hist-ph"
1,Modern Mathematical Physics: what it should be?,Ludwig Faddeev,Personal view of author on goals and content o...,2000-02-08T13:13:00Z,2000-02-10T10:14:56Z,http://arxiv.org/abs/math-ph/0002018v2,http://arxiv.org/pdf/math-ph/0002018v2,"math-ph, hep-th, math.MP"
2,Topology in Physics,R. Jackiw,The phenomenon of quantum number fractionaliza...,2005-03-15T16:00:59Z,2005-03-15T16:00:59Z,http://arxiv.org/abs/math-ph/0503039v1,http://arxiv.org/pdf/math-ph/0503039v1,"math-ph, cond-mat.mes-hall, math.MP, physics.c..."
3,Contents of Physics Related E-Print Archives,"E. R. Prakasan, Anil Kumar, Anil Sagar, Lalit ...",The frontiers of physics related e-print archi...,2003-08-28T13:12:57Z,2003-08-28T13:12:57Z,http://arxiv.org/abs/physics/0308107v1,http://arxiv.org/pdf/physics/0308107v1,physics.data-an
4,Fundamental Dilemmas in Theoretical Physics,Hisham Ghassib,"In this paper, we argue that there are foundat...",2014-05-22T07:49:09Z,2014-05-22T07:49:09Z,http://arxiv.org/abs/1405.5530v1,http://arxiv.org/pdf/1405.5530v1,physics.hist-ph


In [12]:
path_algebra = "/Users/daniellai/MSDS_2026/MSDS_2024_2026/Winter_2025/DSCI521/Project/data/algebra.csv"
data_algebra = pd.read_csv(path_algebra)
data_algebra.head()

Unnamed: 0,title,authors,summary,published,updated,link,pdf_url,categories
0,Invariant Algebras,Keqin Liu,We introduce invariant algebras and representa...,2011-04-20T07:06:40Z,2011-04-20T07:06:40Z,http://arxiv.org/abs/1104.3954v1,http://arxiv.org/pdf/1104.3954v1,math.RA
1,Deformation of algebras over the Landweber-Nov...,Donald Yau,An algebraic deformation theory of algebras ov...,2005-01-28T17:57:07Z,2006-02-19T13:29:55Z,http://arxiv.org/abs/math/0501518v2,http://arxiv.org/pdf/math/0501518v2,"math.AC, math.AT, math.RA, 13D10; 55N22"
2,Hopf-like Algebras and Extended P-B-W Theorems,Keqin Liu,"Based on invariant algebras, we introduce repr...",2010-12-13T19:59:26Z,2010-12-13T19:59:26Z,http://arxiv.org/abs/1012.2844v1,http://arxiv.org/pdf/1012.2844v1,math.RA
3,Deformation quantization of vertex Poisson alg...,Shintarou Yanagida,We introduce dg Lie algebras controlling the d...,2016-07-07T16:15:51Z,2016-07-07T16:15:51Z,http://arxiv.org/abs/1607.02068v1,http://arxiv.org/pdf/1607.02068v1,math.QA
4,Symplectic reflection algebras and non-homogen...,"Roland Berger, Victor Ginzburg","From symplectic reflection algebras, some alge...",2005-06-06T08:25:02Z,2005-06-06T08:25:02Z,http://arxiv.org/abs/math/0506093v1,http://arxiv.org/pdf/math/0506093v1,"math.RA, math.QA"


In [13]:
path_bio = "/Users/daniellai/MSDS_2026/MSDS_2024_2026/Winter_2025/DSCI521/Project/data/biology.csv"
data_bio = pd.read_csv(path_bio)
data_bio.head()

Unnamed: 0,title,authors,summary,published,updated,link,pdf_url,categories
0,"Borges Dilemma, Fundamental Laws, and Systems ...",P Ao,I reason here that the known folk law in biolo...,2007-09-01T00:16:45Z,2007-09-01T00:16:45Z,http://arxiv.org/abs/0709.0025v1,http://arxiv.org/pdf/0709.0025v1,"q-bio.QM, q-bio.OT"
1,Quantum Biology at the Cellular Level - elemen...,"Michael Bordonaro, Vasily Ogryzko",Quantum Biology is emerging as a new field at ...,2013-04-02T16:38:18Z,2013-04-03T12:45:52Z,http://arxiv.org/abs/1304.0683v2,http://arxiv.org/pdf/1304.0683v2,q-bio.OT
2,Landscape Paradigms in Physics and Biology: In...,David Sherrington,A brief introductory overview in general terms...,1996-08-21T16:24:55Z,1996-08-21T16:24:55Z,http://arxiv.org/abs/cond-mat/9608088v1,http://arxiv.org/pdf/cond-mat/9608088v1,"cond-mat, q-bio"
3,Mathematics at the eve of a historic transitio...,Guo-Wei Wei,A century ago physicists and mathematicians wo...,2017-11-06T16:39:24Z,2017-11-06T16:39:24Z,http://arxiv.org/abs/1711.02001v1,http://arxiv.org/pdf/1711.02001v1,q-bio.OT
4,G-quadruplexes and mRNA localization,Valentina Agoni,G-quadruplexes represent a novelty for molecul...,2013-10-01T09:43:40Z,2013-10-01T09:43:40Z,http://arxiv.org/abs/1310.0213v1,http://arxiv.org/pdf/1310.0213v1,q-bio.OT


In [14]:
path_cell = "/Users/daniellai/MSDS_2026/MSDS_2024_2026/Winter_2025/DSCI521/Project/data/cell.csv"
data_cell = pd.read_csv(path_cell)
data_cell.head()

Unnamed: 0,title,authors,summary,published,updated,link,pdf_url,categories
0,Emergence of collective propulsion through cel...,Katsuyoshi Matsushita,The mechanisms driving the collective movement...,2017-08-31T07:23:49Z,2017-08-31T07:23:49Z,http://arxiv.org/abs/1708.09593v1,http://arxiv.org/pdf/1708.09593v1,physics.bio-ph
1,Surface activity of cancer cells: the fusion o...,"Ivana Pajic-Lijakovic, Milan Milivojevic",Although a good comprehension of how cancer ce...,2021-09-20T11:04:43Z,2021-09-20T11:04:43Z,http://arxiv.org/abs/2109.09424v1,http://arxiv.org/pdf/2109.09424v1,q-bio.CB
2,Regulative Differentiation as Bifurcation of I...,"Akihiko Nakajima, Kunihiko Kaneko","In multicellular organisms, several cell state...",2007-12-04T15:16:59Z,2007-12-04T15:16:59Z,http://arxiv.org/abs/0712.0545v1,http://arxiv.org/pdf/0712.0545v1,q-bio.CB
3,Modelling cell-cell collision and adhesion wit...,"Nikolaos Sfakianakis, Diane Peurichard, Aaron ...",We extend the live-cell motility Filament Base...,2018-09-20T20:57:02Z,2018-09-20T20:57:02Z,http://arxiv.org/abs/1809.07852v1,http://arxiv.org/pdf/1809.07852v1,q-bio.CB
4,Pitchfork Bifurcation In A Coupled Cell System,"Shikhar Raj, Biplab Bose","Various biological phenomena, like cell differ...",2024-11-25T14:08:52Z,2024-11-25T14:08:52Z,http://arxiv.org/abs/2411.16400v1,http://arxiv.org/pdf/2411.16400v1,"math.DS, nlin.CD, q-bio.CB"


In [15]:
path_cs= "/Users/daniellai/MSDS_2026/MSDS_2024_2026/Winter_2025/DSCI521/Project/data/cs.csv"
data_cs= pd.read_csv(path_cs)
data_cs.head()

Unnamed: 0,title,authors,summary,published,updated,link,pdf_url,categories
0,Mathematical Logic in Computer Science,Assaf Kfoury,The article retraces major events and mileston...,2018-02-07T22:21:43Z,2018-02-07T22:21:43Z,http://arxiv.org/abs/1802.03292v1,http://arxiv.org/pdf/1802.03292v1,"cs.LO, 03B70, 68Q99, F.4.1; F.4.m; K.2"
1,Defining Data Science,"Yangyong Zhu, Yun Xiong",Data science is gaining more and more and wide...,2015-01-21T02:41:55Z,2015-01-21T02:41:55Z,http://arxiv.org/abs/1501.05039v1,http://arxiv.org/pdf/1501.05039v1,"cs.DB, cs.CY"
2,Ten Research Challenge Areas in Data Science,Jeannette M. Wing,Although data science builds on knowledge from...,2020-01-27T21:39:57Z,2020-01-27T21:39:57Z,http://arxiv.org/abs/2002.05658v1,http://arxiv.org/pdf/2002.05658v1,"cs.CY, cs.AI, A.0; E.0; G.3; I.2; I.5"
3,Why The Trans Programmer?,Skye Kychenthal,Through online anecdotal evidence and online c...,2022-05-03T15:06:23Z,2022-05-03T15:06:23Z,http://arxiv.org/abs/2205.01553v1,http://arxiv.org/pdf/2205.01553v1,cs.CY
4,Proceedings 11th Doctoral Workshop on Mathemat...,"Jan Bouda, Lukáš Holík, Jan Kofroň, Jan Strejč...",MEMICS provides a forum for doctoral students ...,2016-12-13T05:47:19Z,2016-12-13T05:47:19Z,http://arxiv.org/abs/1612.04037v1,http://arxiv.org/pdf/1612.04037v1,"cs.LO, cs.DS, cs.SE"


In [16]:
path_data =  "/Users/daniellai/MSDS_2026/MSDS_2024_2026/Winter_2025/DSCI521/Project/data/data.csv"
data_data= pd.read_csv(path_data)
data_data.head()

Unnamed: 0,title,authors,summary,published,updated,link,pdf_url,categories
0,Data Gathering from Path Constrained Mobile Se...,Dinesh Dash,In Wireless Sensor Network (WSN) sensor nodes ...,2016-12-13T08:04:36Z,2016-12-13T08:04:36Z,http://arxiv.org/abs/1612.04053v1,http://arxiv.org/pdf/1612.04053v1,cs.NI
1,A Survey on Sampling and Profiling over Big Da...,"Zhicheng Liu, Aoqian Zhang",Due to the development of internet technology ...,2020-05-08T02:54:07Z,2020-05-08T02:54:07Z,http://arxiv.org/abs/2005.05079v1,http://arxiv.org/pdf/2005.05079v1,cs.DB
2,Data Science: A Comprehensive Overview,Longbing Cao,The twenty-first century has ushered in the ag...,2020-07-01T02:33:58Z,2020-07-01T02:33:58Z,http://arxiv.org/abs/2007.03606v1,http://arxiv.org/pdf/2007.03606v1,cs.CY
3,A systematic data characteristic understanding...,"Zhipeng Ma, Bo Nørregaard Jørgensen, Zheng Gra...",Big data present new opportunities for modern ...,2025-01-22T08:49:44Z,2025-01-22T08:49:44Z,http://arxiv.org/abs/2501.12720v1,http://arxiv.org/pdf/2501.12720v1,cs.IR
4,BDGS: A Scalable Big Data Generator Suite in B...,"Zijian Ming, Chunjie Luo, Wanling Gao, Rui Han...",Data generation is a key issue in big data ben...,2014-01-22T02:17:52Z,2014-02-27T03:40:26Z,http://arxiv.org/abs/1401.5465v3,http://arxiv.org/pdf/1401.5465v3,cs.DB


In [17]:
path_econ= "/Users/daniellai/MSDS_2026/MSDS_2024_2026/Winter_2025/DSCI521/Project/data/economic.csv"
data_econ = pd.read_csv(path_econ)
data_econ.head()

Unnamed: 0,title,authors,summary,published,updated,link,pdf_url,categories
0,The Link Between Standardization and Economic ...,"Jussi Heikkilä, Timo Ali-Vehmas, Julius Rissanen",We analyze the link between standardization an...,2022-01-22T20:28:33Z,2022-01-22T20:28:33Z,http://arxiv.org/abs/2201.09125v1,http://arxiv.org/pdf/2201.09125v1,"econ.GN, q-fin.EC"
1,Why Economic Theories and Policies Fail? Unnot...,Victor Olkhov,Accuracy of economic theories and efficiency o...,2022-08-14T09:43:22Z,2022-08-14T09:43:22Z,http://arxiv.org/abs/2208.07839v1,http://arxiv.org/pdf/2208.07839v1,"econ.GN, q-fin.EC, q-fin.GN, q-fin.PR"
2,Business Cycles as Collective Risk Fluctuations,Victor Olkhov,We suggest use continuous numerical risk grade...,2020-12-08T15:45:11Z,2020-12-08T15:45:11Z,http://arxiv.org/abs/2012.04506v1,http://arxiv.org/pdf/2012.04506v1,"econ.GN, q-fin.EC, q-fin.RM"
3,"Econophysics of Macroeconomics: ""Action-at-a-D...",Victor Olkhov,We present macroeconomic model that describes ...,2017-02-09T09:36:46Z,2017-02-09T09:36:46Z,http://arxiv.org/abs/1702.02763v1,http://arxiv.org/pdf/1702.02763v1,q-fin.EC
4,Econophysics Macroeconomic Model,Victor Olkhov,This paper presents macroeconomic model that i...,2017-01-20T12:36:57Z,2017-01-20T12:36:57Z,http://arxiv.org/abs/1701.06625v1,http://arxiv.org/pdf/1701.06625v1,q-fin.EC


In [18]:
path_ee = "/Users/daniellai/MSDS_2026/MSDS_2024_2026/Winter_2025/DSCI521/Project/data/electrical.csv"
data_ee = pd.read_csv(path_ee)
data_ee.head()

Unnamed: 0,title,authors,summary,published,updated,link,pdf_url,categories
0,Driving towards net-zero: The impact of electr...,"Tobias Verheugen Hvidsten, Maximilian Roithner...",Electric vehicle batteries have a proven flexi...,2025-01-11T10:42:40Z,2025-01-11T10:42:40Z,http://arxiv.org/abs/2501.06502v1,http://arxiv.org/pdf/2501.06502v1,physics.soc-ph
1,Statistic electromotive force of solid-state c...,"Zhengliang Wang, Shanfei Chen, Gelin Wang",Based on the energy conversion of the dynamic ...,2023-12-04T03:18:24Z,2023-12-04T03:18:24Z,http://arxiv.org/abs/2312.01596v1,http://arxiv.org/pdf/2312.01596v1,physics.chem-ph
2,Charge screening and carrier transport in AA-s...,Yawar Mohammadi,The static dielectric function in AA-stacked b...,2014-08-14T15:52:37Z,2014-08-15T04:48:16Z,http://arxiv.org/abs/1408.3322v2,http://arxiv.org/pdf/1408.3322v2,cond-mat.mes-hall
3,Uniform and Staggered electric axial moment in...,Satoru Hayami,We theoretically investigate electronic orderi...,2023-07-19T00:00:59Z,2023-07-19T00:00:59Z,http://arxiv.org/abs/2307.09686v1,http://arxiv.org/pdf/2307.09686v1,cond-mat.str-el
4,A veracity preserving model for synthesizing s...,"Yunyou Huang, Jianfeng Zhan, Chunjie Luo, Lei ...",Electricity users are the major players of the...,2018-02-10T01:56:06Z,2018-02-10T01:56:06Z,http://arxiv.org/abs/1802.03500v1,http://arxiv.org/pdf/1802.03500v1,cs.OH


In [19]:
path_fin = "/Users/daniellai/MSDS_2026/MSDS_2024_2026/Winter_2025/DSCI521/Project/data/finance.csv"
data_fin = pd.read_csv(path_fin)
data_fin.head()

Unnamed: 0,title,authors,summary,published,updated,link,pdf_url,categories
0,Behavioral Finance -- Asset Prices Predictabil...,"Svetlozar Rachev, Stoyan Stoyanov, Stefan Mitt...",In this paper we address three main objections...,2017-10-09T17:45:44Z,2020-02-02T05:31:57Z,http://arxiv.org/abs/1710.03211v2,http://arxiv.org/pdf/1710.03211v2,"q-fin.MF, q-fin.GN"
1,International Trade Finance from the Origins t...,"Olivier Accominotti, Stefano Ugolini",This chapter presents a history of internation...,2020-09-18T07:43:19Z,2020-09-18T07:43:19Z,http://arxiv.org/abs/2009.08668v1,http://arxiv.org/pdf/2009.08668v1,"q-fin.GN, q-fin.TR"
2,"AI in Finance: Challenges, Techniques and Oppo...",Longbing Cao,AI in finance broadly refers to the applicatio...,2021-07-20T01:39:10Z,2021-07-20T01:39:10Z,http://arxiv.org/abs/2107.09051v1,http://arxiv.org/pdf/2107.09051v1,"q-fin.CP, cs.AI, cs.CE, cs.LG"
3,A model-free approach to continuous-time finance,"Henry Chiu, Rama Cont","We present a non-probabilistic, pathwise appro...",2022-11-28T16:39:35Z,2022-11-28T16:39:35Z,http://arxiv.org/abs/2211.15531v1,http://arxiv.org/pdf/2211.15531v1,q-fin.MF
4,A Survey on Blockchain-based Supply Chain Fina...,Zhengdong Luo,Supply Chain Finance is very important for sup...,2024-08-14T15:08:51Z,2024-08-14T15:08:51Z,http://arxiv.org/abs/2408.08915v1,http://arxiv.org/pdf/2408.08915v1,"cs.CR, cs.AI"


In [20]:
path_geo = "/Users/daniellai/MSDS_2026/MSDS_2024_2026/Winter_2025/DSCI521/Project/data/Geometry.csv"
data_geo = pd.read_csv(path_geo)
data_geo.head()

Unnamed: 0,title,authors,summary,published,updated,link,pdf_url,categories
0,Lectures notes in universal algebraic geometry,A. Shevlyakov,Lectures notes in universal algebraic geometry...,2016-01-12T06:26:37Z,2016-01-12T06:26:37Z,http://arxiv.org/abs/1601.02743v1,http://arxiv.org/pdf/1601.02743v1,math.AG
1,Seven Lectures on the Universal Algebraic Geom...,Boris Plotkin,Some notions of algebraic geometry can be defi...,2002-04-19T14:41:00Z,2002-04-19T14:41:00Z,http://arxiv.org/abs/math/0204245v1,http://arxiv.org/pdf/math/0204245v1,"math.GM, math.AG, 08A199, 03G99, 14A99"
2,Algebras with the same (algebraic) geometry,B. Plotkin,Some basic notions of classical algebraic geom...,2002-10-14T07:35:44Z,2002-10-14T07:35:44Z,http://arxiv.org/abs/math/0210194v1,http://arxiv.org/pdf/math/0210194v1,"math.GM, math.CT, 08A99, 18A99"
3,Algebraic Geometry over Lie Algebras,Ilya Kazachkov,This is a survey paper on Alegbraic Geometry o...,2006-05-10T03:03:27Z,2006-05-10T03:03:27Z,http://arxiv.org/abs/math/0605248v1,http://arxiv.org/pdf/math/0605248v1,"math.AG, math.LO, 08B20; 17B01"
4,An algebraic geometry perspective on topologic...,Paul Breiding,A short survey on applications of algebraic ge...,2020-01-06T15:01:23Z,2020-01-06T15:01:23Z,http://arxiv.org/abs/2001.02098v1,http://arxiv.org/pdf/2001.02098v1,"math.AG, math.AT"


In [21]:
path_info = "/Users/daniellai/MSDS_2026/MSDS_2024_2026/Winter_2025/DSCI521/Project/data/info.csv"
data_info = pd.read_csv(path_info)
data_info.head()

Unnamed: 0,title,authors,summary,published,updated,link,pdf_url,categories
0,An Information Identity for State-dependent Ch...,Nicolas Limal,"In this technical note, we investigate informa...",2014-02-05T03:21:22Z,2014-02-05T03:21:22Z,http://arxiv.org/abs/1402.0925v1,http://arxiv.org/pdf/1402.0925v1,"cs.IT, math.IT"
1,How Semantic Information G Measure Relates to ...,Chenguang Lu,To improve communication efficiency and provid...,2022-12-22T23:59:52Z,2022-12-22T23:59:52Z,http://arxiv.org/abs/2304.13502v1,http://arxiv.org/pdf/2304.13502v1,"cs.IT, math.IT, 94A15, 94A17, 94A34, 62B10, 68..."
2,Foundations of Information Theory,Mark Burgin,Information is the basic concept of informatio...,2008-08-06T04:04:55Z,2008-08-06T04:04:55Z,http://arxiv.org/abs/0808.0768v1,http://arxiv.org/pdf/0808.0768v1,"cs.IT, math.IT"
3,Separable Computation of Information Measures,"Xiangxiang Xu, Lizhong Zheng",We study a separable design for computing info...,2025-01-25T18:53:55Z,2025-01-25T18:53:55Z,http://arxiv.org/abs/2501.15301v1,http://arxiv.org/pdf/2501.15301v1,"cs.IT, cs.LG, math.IT, stat.ML"
4,Understanding Shannon's Entropy metric for Inf...,Sriram Vajapeyam,"Shannon's metric of ""Entropy"" of information i...",2014-03-24T17:37:16Z,2014-03-24T17:37:16Z,http://arxiv.org/abs/1405.2061v1,http://arxiv.org/pdf/1405.2061v1,"cs.IT, math.IT"


In [22]:
path_ML= "/Users/daniellai/MSDS_2026/MSDS_2024_2026/Winter_2025/DSCI521/Project/data/machine_learning.csv"
data_ML = pd.read_csv(path_ML)
data_ML.head()

Unnamed: 0,title,authors,summary,published,updated,link,pdf_url,categories
0,Lecture Notes: Optimization for Machine Learning,Elad Hazan,Lecture notes on optimization for machine lear...,2019-09-08T21:49:42Z,2019-09-08T21:49:42Z,http://arxiv.org/abs/1909.03550v1,http://arxiv.org/pdf/1909.03550v1,"cs.LG, stat.ML"
1,An Optimal Control View of Adversarial Machine...,Xiaojin Zhu,I describe an optimal control view of adversar...,2018-11-11T14:28:34Z,2018-11-11T14:28:34Z,http://arxiv.org/abs/1811.04422v1,http://arxiv.org/pdf/1811.04422v1,"cs.LG, stat.ML"
2,Minimax deviation strategies for machine learn...,"Michail Schlesinger, Evgeniy Vodolazskiy",The article is devoted to the problem of small...,2017-07-16T09:15:08Z,2017-07-16T09:15:08Z,http://arxiv.org/abs/1707.04849v1,http://arxiv.org/pdf/1707.04849v1,cs.LG
3,Machine Learning for Clinical Predictive Analy...,Wei-Hung Weng,"In this chapter, we provide a brief overview o...",2019-09-19T22:02:00Z,2019-09-19T22:02:00Z,http://arxiv.org/abs/1909.09246v1,http://arxiv.org/pdf/1909.09246v1,"cs.LG, stat.ML"
4,Towards Modular Machine Learning Solution Deve...,"Samiyuru Menik, Lakshmish Ramaswamy",Machine learning technologies have demonstrate...,2023-01-23T22:54:34Z,2023-01-23T22:54:34Z,http://arxiv.org/abs/2301.09753v1,http://arxiv.org/pdf/2301.09753v1,"cs.LG, cs.SE"


In [23]:
path_math= "/Users/daniellai/MSDS_2026/MSDS_2024_2026/Winter_2025/DSCI521/Project/data/math.csv"
data_math = pd.read_csv(path_math)
data_math.head()

Unnamed: 0,title,authors,summary,published,updated,link,pdf_url,categories
0,Math Neurosurgery: Isolating Language Models' ...,"Bryan R. Christ, Zack Gottesman, Jonathan Krop...",Math reasoning is a highly active area of Larg...,2024-10-22T12:00:58Z,2024-10-22T12:00:58Z,http://arxiv.org/abs/2410.16930v1,http://arxiv.org/pdf/2410.16930v1,"cs.CL, cs.AI"
1,Low progress math in a high performing system,"A. Jamaludin, A. I. Jabir, F. J. Wang, A. L. Tan",Math anxiety negatively relates to math perfor...,2023-01-26T02:33:09Z,2023-01-26T02:33:09Z,http://arxiv.org/abs/2302.01910v1,http://arxiv.org/pdf/2302.01910v1,math.HO
2,"Solutions to Problems in Amer. Math. Monthly, ...",Raymond Mortini,In this arxiv-post I present my solutions (pub...,2025-01-09T09:21:34Z,2025-01-09T09:21:34Z,http://arxiv.org/abs/2501.05096v1,http://arxiv.org/pdf/2501.05096v1,"math.HO, 00, 01, 26, 28, 30, 97"
3,Automatic Generation of Headlines for Online M...,"Ke Yuan, Dafang He, Zhuoren Jiang, Liangcai Ga...",Mathematical equations are an important part o...,2019-11-27T20:37:26Z,2019-11-27T20:37:26Z,http://arxiv.org/abs/1912.00839v1,http://arxiv.org/pdf/1912.00839v1,cs.CL
4,"Comments on ""Comment on ""Finiteness of corner ...",Jiten C Kalita,In this short note we provide clarification to...,2018-06-30T16:43:48Z,2018-06-30T16:43:48Z,http://arxiv.org/abs/1807.00200v1,http://arxiv.org/pdf/1807.00200v1,physics.flu-dyn


In [24]:
path_networkh= "/Users/daniellai/MSDS_2026/MSDS_2024_2026/Winter_2025/DSCI521/Project/data/network.csv"
data_network = pd.read_csv(path_networkh)
data_network.head()

Unnamed: 0,title,authors,summary,published,updated,link,pdf_url,categories
0,A Survey on the Network Models applied in the ...,"Chao Dong, Xiaoxiong Xiong, Qiulin Xue, Zhengz...",Network architecture design is very important ...,2022-09-17T09:25:03Z,2022-09-17T09:25:03Z,http://arxiv.org/abs/2209.08294v1,http://arxiv.org/pdf/2209.08294v1,cs.SI
1,Characterization of Fundamental Networks,"Manuela A D Aguiar, Ana P S Dias, Pedro Soares","In the framework of coupled cell systems, a co...",2017-12-05T13:47:29Z,2017-12-05T13:47:29Z,http://arxiv.org/abs/1712.01616v1,http://arxiv.org/pdf/1712.01616v1,math.CO
2,Distinct dynamical behavior in Erdős-Rényi net...,"Marinho A. Lopes, Alexander V. Goltsev",Neuronal network dynamics depends on network s...,2018-07-21T11:22:40Z,2019-02-05T13:26:14Z,http://arxiv.org/abs/1807.08129v2,http://arxiv.org/pdf/1807.08129v2,q-bio.NC
3,NetO-App: A Network Orchestration Application ...,"Dewang Gedia, Levi Perigo",Software-defined networking (SDN) is reshaping...,2018-08-04T18:33:44Z,2018-08-04T18:33:44Z,http://arxiv.org/abs/1808.01519v1,http://arxiv.org/pdf/1808.01519v1,cs.NI
4,Predicting Hidden Links and Missing Nodes in S...,"Rakib Hassan Pran, Ljupco Todorovski",There are many networks in real life which exi...,2021-09-25T10:23:28Z,2021-09-25T10:23:28Z,http://arxiv.org/abs/2109.12331v1,http://arxiv.org/pdf/2109.12331v1,"cs.SI, cs.LG"


In [25]:
path_robotics= "/Users/daniellai/MSDS_2026/MSDS_2024_2026/Winter_2025/DSCI521/Project/data/robotics.csv"
data_robotics = pd.read_csv(path_robotics)
data_robotics.head()

Unnamed: 0,title,authors,summary,published,updated,link,pdf_url,categories
0,The Use of Agricultural Robots in Orchard Mana...,"Qin Zhang, Manoj Karkee, Amy Tabb",Book chapter that summarizes recent research o...,2019-07-30T17:56:17Z,2019-07-30T17:56:17Z,http://arxiv.org/abs/1907.13114v1,http://arxiv.org/pdf/1907.13114v1,cs.RO
1,Robotics in Snow and Ice,François Pomerleau,"Definition: The terms ""robotics in snow and ic...",2022-08-10T01:02:57Z,2022-08-10T01:02:57Z,http://arxiv.org/abs/2208.05095v1,http://arxiv.org/pdf/2208.05095v1,cs.RO
2,Robot Accident Investigation: a case study in ...,"Alan F. T. Winfield, Katie Winkle, Helena Webb...","Robot accidents are inevitable. Although rare,...",2020-05-15T11:31:54Z,2020-05-15T11:31:54Z,http://arxiv.org/abs/2005.07474v1,http://arxiv.org/pdf/2005.07474v1,cs.RO
3,Pattern Formation for Asynchronous Robots with...,"Sruti Gan Chaudhuri, Swapnil Ghike, Shrainik J...",This paper presents a deterministic algorithm ...,2014-03-11T16:12:58Z,2014-03-11T16:12:58Z,http://arxiv.org/abs/1403.2625v1,http://arxiv.org/pdf/1403.2625v1,"cs.DC, cs.RO"
4,Formation of General Position by Asynchronous ...,"S. Bhagat, S. Gan Chaudhuri, K. Mukhopadhyaya",The traditional distributed model of autonomou...,2014-08-09T07:43:54Z,2014-08-09T07:43:54Z,http://arxiv.org/abs/1408.2072v1,http://arxiv.org/pdf/1408.2072v1,"cs.DC, cs.RO"


In [26]:
path_signal= "/Users/daniellai/MSDS_2026/MSDS_2024_2026/Winter_2025/DSCI521/Project/data/signal.csv"
data_signal= pd.read_csv(path_signal)
data_signal.head()

Unnamed: 0,title,authors,summary,published,updated,link,pdf_url,categories
0,Common Signal Analysis,Chengpu Wang,A common signal is defined for any two signals...,2011-03-22T13:35:34Z,2011-03-24T12:56:53Z,http://arxiv.org/abs/1103.4578v2,http://arxiv.org/pdf/1103.4578v2,"cs.IT, math.IT"
1,A Nondeterministic Model for Abstract Geometri...,"Rakhshan Harifi, Sama Goliaei",A signal machine is an abstract geometrical mo...,2016-09-28T12:05:56Z,2016-09-28T12:05:56Z,http://arxiv.org/abs/1609.08874v1,http://arxiv.org/pdf/1609.08874v1,cs.ET
2,The hyperanalytic signal,"Nicolas Le Bihan, Stephen J. Sangwine",The concept of the analytic signal is extended...,2010-06-24T11:20:59Z,2010-06-24T11:20:59Z,http://arxiv.org/abs/1006.4751v1,http://arxiv.org/pdf/1006.4751v1,"math.NA, math.RA, 65T50, 11R52"
3,Reversible Joint Hilbert and Linear Canonical ...,"Soo-Chang Pei, Shih-Gu Huang",Generalized analytic signal associated with th...,2017-09-20T02:34:17Z,2017-09-20T02:34:17Z,http://arxiv.org/abs/1709.06706v1,http://arxiv.org/pdf/1709.06706v1,"cs.IT, math.IT"
4,Signal Transformation for Effective Multi-Chan...,Sunil Kumar Kopparapu,Electroencephalography (EEG) is an non-invasiv...,2024-12-23T11:09:53Z,2024-12-23T11:09:53Z,http://arxiv.org/abs/2412.17478v1,http://arxiv.org/pdf/2412.17478v1,"eess.SP, cs.AI"


In [27]:
path_stats= "/Users/daniellai/MSDS_2026/MSDS_2024_2026/Winter_2025/DSCI521/Project/data/stats.csv"
data_stats= pd.read_csv(path_stats)
data_stats.head()

Unnamed: 0,title,authors,summary,published,updated,link,pdf_url,categories
0,"Rejoinder to ""Equi-energy sampler with applica...","S. C. Kou, Qing Zhou, Wing H. Wong",Rejoinder to ``Equi-energy sampler with applic...,2006-11-08T11:33:47Z,2006-11-08T11:33:47Z,http://arxiv.org/abs/math/0611224v1,http://arxiv.org/pdf/math/0611224v1,"math.ST, stat.TH"
1,Upward and downward statistical continuities,Huseyin Cakalli,A real valued function $f$ defined on a subset...,2013-07-09T12:27:31Z,2013-07-09T12:27:31Z,http://arxiv.org/abs/1307.2418v1,http://arxiv.org/pdf/1307.2418v1,"math.GM, 26A15, 40A05, 40A30"
2,Applications of Information Theory: Statistics...,Khizar Qureshi,The method of optimizing entropy is used to (i...,2016-03-05T22:03:30Z,2016-03-05T22:03:30Z,http://arxiv.org/abs/1603.02589v1,http://arxiv.org/pdf/1603.02589v1,"math.ST, stat.TH"
3,Order statistics on the spacings between order...,Iosif Pinelis,Closed-form expressions for the distributions ...,2019-09-13T18:56:15Z,2019-09-13T18:56:15Z,http://arxiv.org/abs/1909.06406v1,http://arxiv.org/pdf/1909.06406v1,"math.ST, stat.TH, 62E15, 62F03"
4,Statistical Inference: The Big Picture,Robert E. Kass,Statistics has moved beyond the frequentist-Ba...,2011-06-15T07:47:09Z,2011-06-22T09:07:34Z,http://arxiv.org/abs/1106.2895v2,http://arxiv.org/pdf/1106.2895v2,"stat.OT, stat.ME"


In [28]:
path_ss= "/Users/daniellai/MSDS_2026/MSDS_2024_2026/Winter_2025/DSCI521/Project/data/system_science.csv"
data_ss= pd.read_csv(path_ss)
data_ss.head()

Unnamed: 0,title,authors,summary,published,updated,link,pdf_url,categories
0,Quantum information science and complex quantu...,Michael A. Nielsen,What makes quantum information science a scien...,2002-10-01T13:23:14Z,2002-10-01T13:23:14Z,http://arxiv.org/abs/quant-ph/0210005v1,http://arxiv.org/pdf/quant-ph/0210005v1,quant-ph
1,Impact of JD Bernal Thoughts in the Science of...,"Yong Zhao, Jian Du, Yishan Wu",John Desmond Bernal (1901-1970) was one of the...,2020-03-03T02:29:11Z,2020-03-05T06:38:16Z,http://arxiv.org/abs/2003.02135v2,http://arxiv.org/pdf/2003.02135v2,"physics.soc-ph, cs.DL, physics.hist-ph"
2,A Lesson from the James Webb Space Telescope: ...,"Heidi B. Hammel, Stefanie N. Milam",Astrophysics facilities have been of tremendou...,2020-07-16T20:36:24Z,2020-07-16T20:36:24Z,http://arxiv.org/abs/2007.08618v1,http://arxiv.org/pdf/2007.08618v1,"astro-ph.IM, astro-ph.EP"
3,The Globalization of Science: The Increasing P...,Marek Kwiek,National science systems have become embedded ...,2023-12-09T12:12:51Z,2023-12-09T12:12:51Z,http://arxiv.org/abs/2402.03313v1,http://arxiv.org/pdf/2402.03313v1,physics.soc-ph
4,"Finite Dynamical Systems, Linear Automata, and...","Oscar Moreno, Dorothy Bollman, Maria A. Avino-...",We establish a connection between finite field...,2006-04-25T14:05:56Z,2006-04-25T14:05:56Z,http://arxiv.org/abs/math/0604534v1,http://arxiv.org/pdf/math/0604534v1,"math.DS, math.RA, q-bio.GN, 11T06, 37B10, 92B05"


In [29]:
path_theory= "/Users/daniellai/MSDS_2026/MSDS_2024_2026/Winter_2025/DSCI521/Project/data/theory.csv"
data_theory= pd.read_csv(path_theory)
data_theory.head()

Unnamed: 0,title,authors,summary,published,updated,link,pdf_url,categories
0,Mutations of ordinary torsion theories and gen...,Takeshi Yoshizawa,Understanding how torsion theories are describ...,2024-05-23T03:57:09Z,2024-05-23T03:57:09Z,http://arxiv.org/abs/2405.14152v1,http://arxiv.org/pdf/2405.14152v1,math.AC
1,F and M Theories as Gauge Theories of Area Pre...,Hirotaka Sugawara,F theory and M theory are formulated as gauge ...,1997-08-06T05:05:39Z,1997-08-06T05:05:39Z,http://arxiv.org/abs/hep-th/9708029v1,http://arxiv.org/pdf/hep-th/9708029v1,hep-th
2,Codimension two lump solutions in string field...,Nicolas Moeller,We present some solutions for lumps in two dim...,2000-08-11T22:16:32Z,2000-08-11T22:16:32Z,http://arxiv.org/abs/hep-th/0008101v1,http://arxiv.org/pdf/hep-th/0008101v1,hep-th
3,A General Framework for the Semantics of Type ...,Taichi Uemura,We propose an abstract notion of a type theory...,2019-04-08T14:37:59Z,2023-05-26T15:49:35Z,http://arxiv.org/abs/1904.04097v3,http://arxiv.org/pdf/1904.04097v3,"math.CT, cs.LO, math.LO, 03B38 (Primary) 18C10..."
4,Matrix String Theory As A Generalized Quantum ...,Djordje Minic,"Matrix String Theory of Banks, Fischler, Shenk...",1997-05-16T20:57:48Z,1997-05-16T20:57:48Z,http://arxiv.org/abs/hep-th/9705126v1,http://arxiv.org/pdf/hep-th/9705126v1,hep-th


In [30]:
path_Topology= "/Users/daniellai/MSDS_2026/MSDS_2024_2026/Winter_2025/DSCI521/Project/data/Topology.csv"
data_topology =  pd.read_csv(path_Topology)
data_topology.head()

Unnamed: 0,title,authors,summary,published,updated,link,pdf_url,categories
0,The graph topology,Lubica Hola,We study topological properties of the graph t...,2012-07-06T11:35:26Z,2012-07-06T11:35:26Z,http://arxiv.org/abs/1207.1589v1,http://arxiv.org/pdf/1207.1589v1,math.GN
1,Selective survey on spaces of closed subgroups...,Igor V. Protasov,We survey different topologizations of the set...,2018-09-01T09:03:23Z,2018-09-01T09:03:23Z,http://arxiv.org/abs/1809.00139v1,http://arxiv.org/pdf/1809.00139v1,math.GN
2,"Topological Expansion, Study and Applications",Helene Porchon,"In this paper, we introduce the notion of expa...",2012-11-13T12:33:25Z,2012-11-13T12:33:25Z,http://arxiv.org/abs/1211.3365v1,http://arxiv.org/pdf/1211.3365v1,"math.GM, 54A05 - 54A10 - 54D80 - 54F65 - 54H20"
3,On a topological simple Warne extension of a s...,"Iryna Fihel, Oleg Gutik, Kateryna Pavlyk",In the paper we introduce topological $\mathbb...,2013-01-07T15:45:04Z,2013-01-07T15:45:04Z,http://arxiv.org/abs/1301.1232v1,http://arxiv.org/pdf/1301.1232v1,"math.GR, 22A15, 54H15"
4,Separation Axioms in Bi-soft Topological Spaces,"Munazza Naz, Muhammad Shabir, Muhammad Irfan Ali",Concept of bi-soft topological spaces is intro...,2015-08-29T08:30:04Z,2015-08-29T08:30:04Z,http://arxiv.org/abs/1509.00866v1,http://arxiv.org/pdf/1509.00866v1,math.GN


In [31]:
path_architechture= "/Users/daniellai/MSDS_2026/MSDS_2024_2026/Winter_2025/DSCI521/Project/data/architecture.csv"
data_architechture =  pd.read_csv(path_architechture)
data_architechture.head()

Unnamed: 0,title,authors,summary,published,updated,link,pdf_url,categories
0,On Assessing the Complexity of Software Archit...,Jianjun Zhao,This paper proposes some new architectural met...,2001-05-05T09:18:11Z,2001-05-05T09:18:11Z,http://arxiv.org/abs/cs/0105010v1,http://arxiv.org/pdf/cs/0105010v1,"cs.SE, D.2.8; D.2.11"
1,Applying Slicing Technique to Software Archite...,Jianjun Zhao,Software architecture is receiving increasingl...,2001-05-05T08:09:08Z,2001-05-05T08:09:08Z,http://arxiv.org/abs/cs/0105008v1,http://arxiv.org/pdf/cs/0105008v1,"cs.SE, D.2.4; D.2.5; D.2.7; D.2.11"
2,InstaNAS: Instance-aware Neural Architecture S...,"An-Chieh Cheng, Chieh Hubert Lin, Da-Cheng Jua...",Conventional Neural Architecture Search (NAS) ...,2018-11-26T06:29:39Z,2019-05-23T09:25:04Z,http://arxiv.org/abs/1811.10201v3,http://arxiv.org/pdf/1811.10201v3,"cs.LG, cs.CV, stat.ML"
3,Disentangled Neural Architecture Search,"Xinyue Zheng, Peng Wang, Qigang Wang, Zhongcha...",Neural architecture search has shown its great...,2020-09-24T03:35:41Z,2020-09-24T03:35:41Z,http://arxiv.org/abs/2009.13266v1,http://arxiv.org/pdf/2009.13266v1,"cs.LG, cs.NE, stat.ML"
4,Domain-Specific Quantum Architecture Optimization,"Wan-Hsuan Lin, Bochen Tan, Murphy Yuezhen Niu,...",With the steady progress in quantum computing ...,2022-07-29T05:16:02Z,2022-07-29T05:16:02Z,http://arxiv.org/abs/2207.14482v1,http://arxiv.org/pdf/2207.14482v1,"cs.AR, quant-ph"


In [32]:
path_processing= "/Users/daniellai/MSDS_2026/MSDS_2024_2026/Winter_2025/DSCI521/Project/data/processing.csv"
data_processing =  pd.read_csv(path_processing)
data_processing.head()

Unnamed: 0,title,authors,summary,published,updated,link,pdf_url,categories
0,Process Comparison Using Object-Centric Proces...,"Anahita Farhang Ghahfarokhi, Alessandro Berti,...",Process mining provides ways to analyze busine...,2021-03-12T10:08:28Z,2021-03-12T10:08:28Z,http://arxiv.org/abs/2103.07184v1,http://arxiv.org/pdf/2103.07184v1,"cs.DB, cs.AI, cs.LG"
1,Integral Equations in the Theory of Levy Proce...,Lev Sakhnovich,In this article we consider the Levy processes...,2007-02-13T16:47:20Z,2007-02-13T16:47:20Z,http://arxiv.org/abs/math/0702378v1,http://arxiv.org/pdf/math/0702378v1,"math.PR, math.FA, 60G51, 60J45, 60G17, 45A05"
2,Bi-Entangled Hidden Markov Processes and Recur...,Soueidi El Gheteb,"In this paper, we introduce the notion of Bi-e...",2024-07-12T16:05:55Z,2024-07-12T16:05:55Z,http://arxiv.org/abs/2407.09384v1,http://arxiv.org/pdf/2407.09384v1,"quant-ph, math-ph, math.MP"
3,Remarks on the Poisson additive process,Haoming Wang,The Poisson additive process is a binary condi...,2024-07-31T14:53:22Z,2024-08-14T14:44:04Z,http://arxiv.org/abs/2407.21651v2,http://arxiv.org/pdf/2407.21651v2,"math.PR, stat.AP, 60A05, 60G05, 60G55"
4,BPCE: A Prototype for Co-Evolution between Bus...,"Linyue Liu, Xi Guo, Chun Ouyang, Patrick C. K....",With the continuous development of business pr...,2023-03-30T13:59:34Z,2023-03-30T13:59:34Z,http://arxiv.org/abs/2303.17388v1,http://arxiv.org/pdf/2303.17388v1,"cs.SE, 68N99, D.2.2"


In [33]:
path_quantitative= "/Users/daniellai/MSDS_2026/MSDS_2024_2026/Winter_2025/DSCI521/Project/data/quantitative.csv"
data_quantitative =  pd.read_csv(path_processing)
data_quantitative.head()

Unnamed: 0,title,authors,summary,published,updated,link,pdf_url,categories
0,Process Comparison Using Object-Centric Proces...,"Anahita Farhang Ghahfarokhi, Alessandro Berti,...",Process mining provides ways to analyze busine...,2021-03-12T10:08:28Z,2021-03-12T10:08:28Z,http://arxiv.org/abs/2103.07184v1,http://arxiv.org/pdf/2103.07184v1,"cs.DB, cs.AI, cs.LG"
1,Integral Equations in the Theory of Levy Proce...,Lev Sakhnovich,In this article we consider the Levy processes...,2007-02-13T16:47:20Z,2007-02-13T16:47:20Z,http://arxiv.org/abs/math/0702378v1,http://arxiv.org/pdf/math/0702378v1,"math.PR, math.FA, 60G51, 60J45, 60G17, 45A05"
2,Bi-Entangled Hidden Markov Processes and Recur...,Soueidi El Gheteb,"In this paper, we introduce the notion of Bi-e...",2024-07-12T16:05:55Z,2024-07-12T16:05:55Z,http://arxiv.org/abs/2407.09384v1,http://arxiv.org/pdf/2407.09384v1,"quant-ph, math-ph, math.MP"
3,Remarks on the Poisson additive process,Haoming Wang,The Poisson additive process is a binary condi...,2024-07-31T14:53:22Z,2024-08-14T14:44:04Z,http://arxiv.org/abs/2407.21651v2,http://arxiv.org/pdf/2407.21651v2,"math.PR, stat.AP, 60A05, 60G05, 60G55"
4,BPCE: A Prototype for Co-Evolution between Bus...,"Linyue Liu, Xi Guo, Chun Ouyang, Patrick C. K....",With the continuous development of business pr...,2023-03-30T13:59:34Z,2023-03-30T13:59:34Z,http://arxiv.org/abs/2303.17388v1,http://arxiv.org/pdf/2303.17388v1,"cs.SE, 68N99, D.2.2"


In [34]:
path_EE_2= "/Users/daniellai/MSDS_2026/MSDS_2024_2026/Winter_2025/DSCI521/Project/data/EE_2.csv"
data_EE_2 =  pd.read_csv(path_EE_2)
data_EE_2.head()

Unnamed: 0,title,authors,summary,published,updated,link,pdf_url,categories
0,Teaching Introductory Electrical Engineering C...,Vladimir Vasilich Tregub,This article is about the author's experience ...,2011-07-19T17:36:18Z,2011-07-19T17:36:18Z,http://arxiv.org/abs/1107.3785v1,http://arxiv.org/pdf/1107.3785v1,cs.CY
1,AB Space Engine,Alexander Bolonkin,On 4 January 2007 the author published the art...,2008-03-02T00:49:09Z,2008-03-02T00:49:09Z,http://arxiv.org/abs/0803.0089v1,http://arxiv.org/pdf/0803.0089v1,"physics.gen-ph, physics.space-ph"
2,Optimal efficiency and power and their trade-o...,"Jincheng Lu, Yefeng Liu, Rongqian Wang, Chen W...",We establish a theory of optimal efficiency an...,2019-05-27T06:31:15Z,2019-05-27T06:31:15Z,http://arxiv.org/abs/1905.10992v1,http://arxiv.org/pdf/1905.10992v1,cond-mat.mes-hall
3,Modelling Electricity Consumption in Office Bu...,"Tao Zhang, Peer-Olaf Siebers, Uwe Aickelin","In this paper, we develop an agent-based model...",2013-05-31T15:01:01Z,2013-05-31T15:01:01Z,http://arxiv.org/abs/1305.7437v1,http://arxiv.org/pdf/1305.7437v1,"cs.CE, cs.AI"
4,"Electric cars, assessment of green nature vis ...",Satish Vitta,A comprehensive analysis of energy requirement...,2021-04-29T12:15:05Z,2021-04-29T12:15:05Z,http://arxiv.org/abs/2104.14287v1,http://arxiv.org/pdf/2104.14287v1,cond-mat.mtrl-sci


In [35]:
path_EESS_2= "/Users/daniellai/MSDS_2026/MSDS_2024_2026/Winter_2025/DSCI521/Project/data/EESS_2.csv"
data_EESS_2 =  pd.read_csv(path_EESS_2)
data_EESS_2.head()

Unnamed: 0,title,authors,summary,published,updated,link,pdf_url,categories
0,HyperCon: Image-To-Video Model Transfer for Vi...,"Ryan Szeto, Mostafa El-Khamy, Jungwon Lee, Jas...",Video-to-video translation is more difficult t...,2019-12-10T19:47:53Z,2020-11-10T16:18:34Z,http://arxiv.org/abs/1912.04950v2,http://arxiv.org/pdf/1912.04950v2,cs.CV
1,Analyzing Zero-Shot Abilities of Vision-Langua...,"Avinash Madasu, Anahita Bhiwandiwalla, Vasudev...",Foundational multimodal models pre-trained on ...,2023-10-07T20:57:54Z,2023-11-24T22:25:07Z,http://arxiv.org/abs/2310.04914v2,http://arxiv.org/pdf/2310.04914v2,"cs.CV, cs.AI, cs.CL"
2,State-of-the-art in 360° Video/Image Processin...,"Chen Li, Mai Xu, Shanyi Zhang, Patrick Le Callet","Nowadays, 360{\deg} video/image has been incre...",2019-05-01T02:19:38Z,2019-10-28T08:32:46Z,http://arxiv.org/abs/1905.00161v2,http://arxiv.org/pdf/1905.00161v2,"eess.IV, cs.MM"
3,Two Decades of Colorization and Decolorization...,Shiguang Liu,"Colorization is a computer-aided process, whic...",2022-04-28T07:43:52Z,2022-07-18T09:22:00Z,http://arxiv.org/abs/2204.13322v2,http://arxiv.org/pdf/2204.13322v2,"cs.CV, cs.GR"
4,Fewer Tokens and Fewer Videos: Extending Video...,"Shimin Chen, Yitian Yuan, Shaoxiang Chen, Zequ...",Amidst the advancements in image-based Large V...,2024-06-12T09:22:45Z,2024-06-12T09:22:45Z,http://arxiv.org/abs/2406.08024v1,http://arxiv.org/pdf/2406.08024v1,"cs.CV, cs.AI"


In [36]:
path_EESS_3= "/Users/daniellai/MSDS_2026/MSDS_2024_2026/Winter_2025/DSCI521/Project/data/EESS_3.csv"
data_EESS_3 =  pd.read_csv(path_EESS_3)
data_EESS_3.head()

Unnamed: 0,title,authors,summary,published,updated,link,pdf_url,categories
0,Quantum information science and complex quantu...,Michael A. Nielsen,What makes quantum information science a scien...,2002-10-01T13:23:14Z,2002-10-01T13:23:14Z,http://arxiv.org/abs/quant-ph/0210005v1,http://arxiv.org/pdf/quant-ph/0210005v1,quant-ph
1,Impact of JD Bernal Thoughts in the Science of...,"Yong Zhao, Jian Du, Yishan Wu",John Desmond Bernal (1901-1970) was one of the...,2020-03-03T02:29:11Z,2020-03-05T06:38:16Z,http://arxiv.org/abs/2003.02135v2,http://arxiv.org/pdf/2003.02135v2,"physics.soc-ph, cs.DL, physics.hist-ph"
2,A Lesson from the James Webb Space Telescope: ...,"Heidi B. Hammel, Stefanie N. Milam",Astrophysics facilities have been of tremendou...,2020-07-16T20:36:24Z,2020-07-16T20:36:24Z,http://arxiv.org/abs/2007.08618v1,http://arxiv.org/pdf/2007.08618v1,"astro-ph.IM, astro-ph.EP"
3,The Globalization of Science: The Increasing P...,Marek Kwiek,National science systems have become embedded ...,2023-12-09T12:12:51Z,2023-12-09T12:12:51Z,http://arxiv.org/abs/2402.03313v1,http://arxiv.org/pdf/2402.03313v1,physics.soc-ph
4,"Finite Dynamical Systems, Linear Automata, and...","Oscar Moreno, Dorothy Bollman, Maria A. Avino-...",We establish a connection between finite field...,2006-04-25T14:05:56Z,2006-04-25T14:05:56Z,http://arxiv.org/abs/math/0604534v1,http://arxiv.org/pdf/math/0604534v1,"math.DS, math.RA, q-bio.GN, 11T06, 37B10, 92B05"


In [37]:
path_EESS_4= "/Users/daniellai/MSDS_2026/MSDS_2024_2026/Winter_2025/DSCI521/Project/data/EESS_4.csv"
data_EESS_4 =  pd.read_csv(path_EESS_4)
data_EESS_4.head()

Unnamed: 0,title,authors,summary,published,updated,link,pdf_url,categories
0,Quantum controllers for quantum systems,Seth Lloyd,This paper discusses fully coherent quantum fe...,1997-03-23T20:54:50Z,1997-03-23T20:54:50Z,http://arxiv.org/abs/quant-ph/9703042v1,http://arxiv.org/pdf/quant-ph/9703042v1,quant-ph
1,Learning Control of Quantum Systems,Daoyi Dong,This paper provides a brief introduction to le...,2021-01-19T04:35:36Z,2021-01-19T04:35:36Z,http://arxiv.org/abs/2101.07461v1,http://arxiv.org/pdf/2101.07461v1,"quant-ph, cs.SY, eess.SY"
2,Linear time logic control of linear systems wi...,"Jinjin Zhang, Zhaohui Zhu, Jianfei Yang",The formal analysis and design of control syst...,2012-12-29T10:30:53Z,2012-12-29T10:30:53Z,http://arxiv.org/abs/1212.6610v1,http://arxiv.org/pdf/1212.6610v1,math.OC
3,Four Generations of Control Theory Development ?,Tai Cheng Yang,This short article presents an opinion that co...,2021-02-11T18:38:19Z,2021-02-11T18:38:19Z,http://arxiv.org/abs/2102.08190v1,http://arxiv.org/pdf/2102.08190v1,"eess.SY, cs.AI, cs.SY"
4,"IEEEICM25: ""Stability of Digital Robust Motion...",Emre Sariyildiz,"In this paper, new stability analysis methods ...",2025-02-02T06:05:31Z,2025-02-02T06:05:31Z,http://arxiv.org/abs/2502.00683v1,http://arxiv.org/pdf/2502.00683v1,"eess.SY, cs.SY"


In [38]:
path_EESS_5= "/Users/daniellai/MSDS_2026/MSDS_2024_2026/Winter_2025/DSCI521/Project/data/EESS_5.csv"
data_EESS_5 =  pd.read_csv(path_EESS_5)
data_EESS_5.head()

Unnamed: 0,title,authors,summary,published,updated,link,pdf_url,categories
0,Audio-Visual Speech Codecs: Rethinking Audio-V...,"Karren Yang, Dejan Markovic, Steven Krenn, Vas...",Since facial actions such as lip movements con...,2022-03-31T17:57:10Z,2022-03-31T17:57:10Z,http://arxiv.org/abs/2203.17263v1,http://arxiv.org/pdf/2203.17263v1,"cs.CV, cs.LG, eess.AS"
1,Speech inpainting: Context-based speech synthe...,"Juan F. Montesinos, Daniel Michelsanti, Gloria...",Audio and visual modalities are inherently con...,2023-06-01T09:40:47Z,2023-06-01T09:40:47Z,http://arxiv.org/abs/2306.00489v1,http://arxiv.org/pdf/2306.00489v1,"cs.SD, cs.AI, eess.AS"
2,What Are They Doing? Joint Audio-Speech Co-Rea...,"Yingzhi Wang, Pooneh Mousavi, Artem Ploujnikov...","In audio and speech processing, tasks usually ...",2024-09-22T16:45:57Z,2025-01-12T10:48:00Z,http://arxiv.org/abs/2409.14526v2,http://arxiv.org/pdf/2409.14526v2,"cs.SD, cs.CL, eess.AS"
3,Neural Speech Tracking in a Virtual Acoustic E...,"Mareike Daeglau, Juergen Otten, Giso Grimm, Bo...","The audio visual benefit in speech perception,...",2025-01-14T14:00:57Z,2025-01-14T14:00:57Z,http://arxiv.org/abs/2501.08124v1,http://arxiv.org/pdf/2501.08124v1,"eess.AS, cs.SD"
4,Audio-visual Speech Enhancement Using Conditio...,"Mostafa Sadeghi, Simon Leglaive, Xavier Alamed...",Variational auto-encoders (VAEs) are deep gene...,2019-08-07T12:38:32Z,2020-05-26T09:38:39Z,http://arxiv.org/abs/1908.02590v3,http://arxiv.org/pdf/1908.02590v3,"cs.SD, cs.LG, eess.AS"


In [39]:
path_electrical_1= "/Users/daniellai/MSDS_2026/MSDS_2024_2026/Winter_2025/DSCI521/Project/data/electrical_1.csv"
data_electrical_1 =  pd.read_csv(path_electrical_1)
data_electrical_1.head()

Unnamed: 0,title,authors,summary,published,updated,link,pdf_url,categories
0,Minimising the expectation value of the procur...,"Naoya Yamaguchi, Maiya Hori, Yoshinari Ideguchi","In this paper, we formulate a method for minim...",2018-02-22T14:29:05Z,2018-08-01T11:13:19Z,http://arxiv.org/abs/1803.04532v2,http://arxiv.org/pdf/1803.04532v2,"q-fin.EC, q-fin.GN"
1,Long-term memory in electricity prices: Czech ...,"Ladislav Kristoufek, Petra Lunackova",We analyze long-term memory properties of hour...,2013-09-03T04:02:28Z,2013-09-03T04:02:28Z,http://arxiv.org/abs/1309.0582v1,http://arxiv.org/pdf/1309.0582v1,q-fin.ST
2,RFID-BASED Prepaid Power Meter,"Rozita Teymourzadeh, Mahmud Iwan, Ahmad J. A. ...",An Electric power meter is an important compon...,2018-06-10T19:04:30Z,2018-06-10T19:04:30Z,http://arxiv.org/abs/1807.10385v1,http://arxiv.org/pdf/1807.10385v1,eess.SP
3,Comparison of two models of electric neuro-sti...,Erich W. Schmid,Two simple mathematical models of electric neu...,2007-11-13T09:32:33Z,2007-11-13T09:32:33Z,http://arxiv.org/abs/0711.1938v1,http://arxiv.org/pdf/0711.1938v1,q-bio.NC
4,"Cellular velocity, electrical persistence and ...","Isabella Guido, Douglas Diehl, Nora Aleida Ols...",Cells have the ability to detect electric fiel...,2020-04-16T10:29:48Z,2020-04-16T10:29:48Z,http://arxiv.org/abs/2004.07575v1,http://arxiv.org/pdf/2004.07575v1,q-bio.CB


In [40]:
path_engineering_1= "/Users/daniellai/MSDS_2026/MSDS_2024_2026/Winter_2025/DSCI521/Project/data/engineering_1.csv"
data_engineering_1 =  pd.read_csv(path_engineering_1)
data_engineering_1.head()

Unnamed: 0,title,authors,summary,published,updated,link,pdf_url,categories
0,Artificial intelligence-aided protein engineer...,"Yuchi Qiu, Guo-Wei Wei",Protein engineering is an emerging field in bi...,2023-07-27T02:14:09Z,2023-07-27T02:14:09Z,http://arxiv.org/abs/2307.14587v1,http://arxiv.org/pdf/2307.14587v1,q-bio.BM
1,Validation of an LLM-based Multi-Agent Framewo...,"Zan Chen, Yungeng Liu, Yu Guang Wang, Yiqing Shen",Recent advancements in Large Language Models (...,2024-11-09T01:14:41Z,2024-11-09T01:14:41Z,http://arxiv.org/abs/2411.06029v1,http://arxiv.org/pdf/2411.06029v1,q-bio.QM
2,Remaining Useful Life Prediction for Aircraft ...,"Anees Peringal, Mohammed Basheer Mohiuddin, Ah...",This study uses a Long Short-Term Memory (LSTM...,2024-01-15T10:54:08Z,2024-01-15T10:54:08Z,http://arxiv.org/abs/2401.07590v1,http://arxiv.org/pdf/2401.07590v1,eess.SP
3,The spatial dynamics of ecosystem engineers,"Caroline Franco, José F. Fontanari",The changes on abiotic features of ecosystems ...,2016-11-28T18:52:36Z,2017-08-10T23:58:45Z,http://arxiv.org/abs/1611.09283v3,http://arxiv.org/pdf/1611.09283v3,q-bio.PE
4,AutoProteinEngine: A Large Language Model Driv...,"Yungeng Liu, Zan Chen, Yu Guang Wang, Yiqing Shen",Protein engineering is important for biomedica...,2024-11-07T05:23:31Z,2024-11-07T05:23:31Z,http://arxiv.org/abs/2411.04440v1,http://arxiv.org/pdf/2411.04440v1,q-bio.QM


In [41]:
path_image= "/Users/daniellai/MSDS_2026/MSDS_2024_2026/Winter_2025/DSCI521/Project/data/image.csv"
data_image =  pd.read_csv(path_image)
data_image.head()

Unnamed: 0,title,authors,summary,published,updated,link,pdf_url,categories
0,Snapshot multispectral imaging using a filter ...,Kazuma Shinoda,A multispectral filter array (MSFA) is one sol...,2018-08-28T04:04:47Z,2018-08-28T04:04:47Z,http://arxiv.org/abs/1808.09106v1,http://arxiv.org/pdf/1808.09106v1,eess.IV
1,A Self-supervised SAR Image Despeckling Strate...,"Liang Chen, Yifei Yin, Hao Shi, Qingqing Sheng...",Speckle noise is generated due to the SAR imag...,2023-08-11T07:19:45Z,2023-08-11T07:19:45Z,http://arxiv.org/abs/2308.05975v1,http://arxiv.org/pdf/2308.05975v1,eess.IV
2,Introduction to Brain and Medical Images,Moo K. Chung,This article is based on the first chapter of ...,2021-03-09T23:08:16Z,2021-03-09T23:08:16Z,http://arxiv.org/abs/2103.05772v1,http://arxiv.org/pdf/2103.05772v1,eess.IV
3,Studying the Effect of Digital Stain Separatio...,"Alison K. Cheeseman, Hamid R. Tizhoosh, Edward...","Due to recent advances in technology, digitize...",2020-03-31T15:41:36Z,2020-03-31T15:41:36Z,http://arxiv.org/abs/2003.14303v1,http://arxiv.org/pdf/2003.14303v1,eess.IV
4,A psychophysical evaluation of techniques for ...,"Lars C. Reining, Thomas S. A. Wallis",Mooney images can contribute to our understand...,2024-03-18T15:20:57Z,2024-03-18T15:20:57Z,http://arxiv.org/abs/2403.11867v1,http://arxiv.org/pdf/2403.11867v1,q-bio.NC


In [42]:
path_signal_1= "/Users/daniellai/MSDS_2026/MSDS_2024_2026/Winter_2025/DSCI521/Project/data/signal_1.csv"
data_signal_1=  pd.read_csv(path_signal_1)
data_signal_1.head()

Unnamed: 0,title,authors,summary,published,updated,link,pdf_url,categories
0,Spectral remapping of natural signals,Md. Shoaibur Rahman,Here we present an algorithm to procedurally r...,2019-12-22T02:05:35Z,2019-12-22T02:05:35Z,http://arxiv.org/abs/1912.10371v1,http://arxiv.org/pdf/1912.10371v1,eess.SP
1,To further understand graph signals,"Feng Ji, Wee Peng Tay",Graph signal processing (GSP) is a framework t...,2022-03-02T02:52:15Z,2023-03-10T02:55:18Z,http://arxiv.org/abs/2203.00832v2,http://arxiv.org/pdf/2203.00832v2,eess.SP
2,Topological Signal Processing over Simplicial ...,"Sergio Barbarossa, Stefania Sardellitti",The goal of this paper is to establish the fun...,2019-07-26T13:50:01Z,2020-03-13T13:48:55Z,http://arxiv.org/abs/1907.11577v2,http://arxiv.org/pdf/1907.11577v2,eess.SP
3,Signal denoising based on the Schrödinger oper...,"Peihao Li, Taous Meriem Laleg-Kirati","Recently, a new Signal processing method, name...",2019-08-21T09:22:52Z,2019-08-21T09:22:52Z,http://arxiv.org/abs/1908.07758v1,http://arxiv.org/pdf/1908.07758v1,eess.SP
4,Recovery of Graph Signals from Sign Measurements,"Wenwei Liu, Hui Feng, Kaixuan Wang, Feng Ji, B...",Sampling and interpolation have been extensive...,2021-09-26T12:00:55Z,2021-09-26T12:00:55Z,http://arxiv.org/abs/2109.12576v1,http://arxiv.org/pdf/2109.12576v1,eess.SP


In [43]:
path_system= "/Users/daniellai/MSDS_2026/MSDS_2024_2026/Winter_2025/DSCI521/Project/data/systems.csv"
data_system=  pd.read_csv(path_system)
data_system.head()

Unnamed: 0,title,authors,summary,published,updated,link,pdf_url,categories
0,BER Performance of Spatial Modulation Systems ...,"Yu Fu, Cheng-Xiang Wang, Xuming Fang, Li Yan, ...","In this paper, the bit error rate (BER) perfor...",2020-07-28T07:15:17Z,2020-07-28T07:15:17Z,http://arxiv.org/abs/2007.14027v1,http://arxiv.org/pdf/2007.14027v1,eess.SP
1,The multi-layer network nature of systemic ris...,"Sebastian Poledna, José Luis Molina-Borboa, Se...",The inability to see and quantify systemic fin...,2015-05-16T14:19:24Z,2015-05-16T14:19:24Z,http://arxiv.org/abs/1505.04276v1,http://arxiv.org/pdf/1505.04276v1,q-fin.RM
2,Measures of Systemic Risk,"Zachary Feinstein, Birgit Rudloff, Stefan Weber",Systemic risk refers to the risk that the fina...,2015-02-27T16:39:12Z,2016-10-13T22:20:27Z,http://arxiv.org/abs/1502.07961v5,http://arxiv.org/pdf/1502.07961v5,q-fin.RM
3,An Event-Driven Compressive Neuromorphic Syste...,"Jinbo Chen, Fengshi Tian, Jie Yang, Mohamad Sawan",Wearable electrocardiograph (ECG) recording an...,2022-05-26T12:12:31Z,2022-05-26T12:12:31Z,http://arxiv.org/abs/2205.13292v1,http://arxiv.org/pdf/2205.13292v1,eess.SP
4,Systemic risk governance in a dynamical model ...,"Lorella Fatone, Francesca Mariani",We consider the problem of governing systemic ...,2018-12-17T16:10:37Z,2018-12-17T16:10:37Z,http://arxiv.org/abs/1812.06973v1,http://arxiv.org/pdf/1812.06973v1,q-fin.RM


In [44]:
path_video= "/Users/daniellai/MSDS_2026/MSDS_2024_2026/Winter_2025/DSCI521/Project/data/video.csv"
data_video=  pd.read_csv(path_video)
data_video.head()

Unnamed: 0,title,authors,summary,published,updated,link,pdf_url,categories
0,VDPVE: VQA Dataset for Perceptual Video Enhanc...,"Yixuan Gao, Yuqin Cao, Tengchuan Kou, Wei Sun,...","Recently, many video enhancement methods have ...",2023-03-16T13:11:16Z,2023-03-16T13:11:16Z,http://arxiv.org/abs/2303.09290v1,http://arxiv.org/pdf/2303.09290v1,eess.IV
1,Coding Standards as Anchors for the CVPR CLIC ...,"Théo Ladune, Pierrick Philippe","In 2021, a new track has been initiated in the...",2021-05-20T15:33:36Z,2021-05-20T15:33:36Z,http://arxiv.org/abs/2105.09833v1,http://arxiv.org/pdf/2105.09833v1,eess.IV
2,PRNU-Based Source Device Attribution for YouTu...,"Emmanuel Kiegaing Kouokam, Ahmet Emir Dirik",Photo Response Non-Uniformity (PRNU) is a came...,2019-03-21T17:52:22Z,2019-04-01T15:51:12Z,http://arxiv.org/abs/1903.09141v2,http://arxiv.org/pdf/1903.09141v2,"eess.IV, eess.SP"
3,Tencent Video Dataset (TVD): A Video Dataset f...,"Xiaozhong Xu, Shan Liu, Zeqiang Li",Learning-based visual data compression and ana...,2021-05-12T20:46:56Z,2021-05-12T20:46:56Z,http://arxiv.org/abs/2105.05961v1,http://arxiv.org/pdf/2105.05961v1,eess.IV
4,Predicting the Quality of Compressed Videos wi...,"Xiangxu Yu, Neil Birkbeck, Yilin Wang, Christo...","Over the past decade, the online video industr...",2020-04-06T19:06:58Z,2020-04-06T19:06:58Z,http://arxiv.org/abs/2004.02943v1,http://arxiv.org/pdf/2004.02943v1,eess.IV


In [45]:
data_final = pd.concat([data_physic,data_algebra,data_bio,data_cell,data_cs,data_data,data_econ,data_ee,data_fin,data_geo,data_info,data_ML,data_math,data_network,data_robotics,data_signal,data_stats,data_ss,data_theory,data_topology,data_architechture,data_processing,data_quantitative, data_EE_2,data_EESS_2,data_EESS_3,data_EESS_4,data_EESS_4,data_EESS_5,data_electrical_1,data_engineering_1, data_image,data_video,data_system,data_signal_1],axis=0)
data_final.info()

<class 'pandas.core.frame.DataFrame'>
Index: 166383 entries, 0 to 1304
Data columns (total 8 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   title       166383 non-null  object
 1   authors     166383 non-null  object
 2   summary     166383 non-null  object
 3   published   166383 non-null  object
 4   updated     166383 non-null  object
 5   link        166383 non-null  object
 6   pdf_url     166383 non-null  object
 7   categories  166383 non-null  object
dtypes: object(8)
memory usage: 11.4+ MB


# Data Processing

In [46]:
#total duplicate
total_dup = data_final['link'].duplicated()
total_dup.sum()

16212

In [47]:
#checking
unique_duplicate_values = data_final.loc[data_final['link'].duplicated(), 'link'].unique()
unique_duplicate_values

array(['http://arxiv.org/abs/0905.0882v1',
       'http://arxiv.org/abs/0711.1389v1',
       'http://arxiv.org/abs/2301.10930v1', ...,
       'http://arxiv.org/abs/2011.02109v2',
       'http://arxiv.org/abs/2011.05698v1',
       'http://arxiv.org/abs/2011.06161v1'], dtype=object)

In [48]:
#After cleaning
data = data_final.drop_duplicates(subset='link', keep='first')
data['link'].duplicated().sum()

0

In [49]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 150171 entries, 0 to 1304
Data columns (total 8 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   title       150171 non-null  object
 1   authors     150171 non-null  object
 2   summary     150171 non-null  object
 3   published   150171 non-null  object
 4   updated     150171 non-null  object
 5   link        150171 non-null  object
 6   pdf_url     150171 non-null  object
 7   categories  150171 non-null  object
dtypes: object(8)
memory usage: 10.3+ MB


In [50]:
#Extracting the categories 
data['categories'].value_counts()

categories
cs.RO                                                              4159
hep-th                                                             4085
cs.IT, math.IT                                                     2380
econ.GN, q-fin.EC                                                  2304
quant-ph                                                           2106
                                                                   ... 
cs.CY, cs.AI, cs.CL, cs.CV                                            1
math.OC, econ.GN, math.PR, q-fin.EC, 91A16, 49N80, 49N10, 91B76       1
econ.EM, cs.CV, math.ST, stat.AP, stat.ME, stat.TH                    1
econ.GN, cs.MM, q-fin.EC, stat.AP                                     1
q-bio.NC, eess.SP, q-bio.TO                                           1
Name: count, Length: 41286, dtype: int64

In [51]:
#Clean the categories
def clean_category_str(cat_string):
    # Pre-define sets for each target based on the prefix
    physics_prefixes = {
        'physics', 'cond-mat', 'hep-ex', 'hep-lat', 'astro-ph', 
        'gr-qc', 'hep-ph', 'math-ph', 'hep-th', 'nlin', 
        'nucl-ex', 'nucl-th', 'quant-ph'
    }
    
    targets = set()
    #Total combination could be 2^6 () different categories
    for cat in cat_string.split(','):
        # Normalize and split by dot to get the prefix (e.g., 'cs.AI' -> 'cs') and merge math categories with stat , fin and econ together
        cat = cat.strip().lower()
        prefix = cat.split('.')[0]
        if prefix == 'cs' or prefix == 'corr':
            targets.add('cs')
        elif prefix in {'math','stat'}:
            targets.add('math-stats')
        elif prefix == 'bio' or prefix == 'q-bio':
            targets.add('bio')
        elif prefix in physics_prefixes:
            targets.add('physic')
        elif prefix in {'fin', 'q-fin','econ'}:
            targets.add('econ-qfin')
        elif prefix == 'eess':
            targets.add('eess')
    
    return ','.join(sorted(targets))


data['target'] = data['categories'].apply(clean_category_str)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['target'] = data['categories'].apply(clean_category_str)


In [52]:
data['target'].unique()

array(['physic', 'math-stats,physic', 'cs,physic', 'cs,eess', 'cs',
       'math-stats', 'bio,math-stats,physic', 'cs,math-stats,physic',
       'cs,math-stats', 'bio', 'econ-qfin,physic', 'bio,physic',
       'bio,cs,physic', 'cs,eess,physic', 'bio,cs',
       'econ-qfin,math-stats', 'econ-qfin', 'cs,eess,math-stats',
       'bio,math-stats', 'bio,cs,math-stats', 'bio,econ-qfin,math-stats',
       'bio,cs,math-stats,physic', 'bio,cs,eess,physic', 'bio,cs,eess',
       'eess', 'eess,physic', 'cs,econ-qfin,math-stats', 'bio,eess',
       'cs,econ-qfin,physic', 'bio,cs,eess,math-stats', 'eess,math-stats',
       'bio,econ-qfin,physic', 'bio,eess,math-stats',
       'eess,math-stats,physic', 'cs,econ-qfin,math-stats,physic',
       'bio,cs,eess,math-stats,physic', 'bio,econ-qfin,math-stats,physic',
       'bio,eess,physic', 'cs,econ-qfin', 'cs,eess,math-stats,physic',
       'bio,cs,econ-qfin,physic', 'cs,econ-qfin,eess', 'bio,cs,econ-qfin',
       'cs,econ-qfin,eess,math-stats', 'bio,eco

In [53]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 150171 entries, 0 to 1304
Data columns (total 9 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   title       150171 non-null  object
 1   authors     150171 non-null  object
 2   summary     150171 non-null  object
 3   published   150171 non-null  object
 4   updated     150171 non-null  object
 5   link        150171 non-null  object
 6   pdf_url     150171 non-null  object
 7   categories  150171 non-null  object
 8   target      150171 non-null  object
dtypes: object(9)
memory usage: 11.5+ MB


In [54]:
data.to_parquet('data_final.parquet', index=False)
