In [1]:
# Create a virtual environment directory named `genai-fake-article`.
!python3 -m venv genai-fake-article
# NOTE(review): each `!` command runs in its own temporary subshell, so this
# activation presumably does not carry over to later cells or to the running
# kernel -- confirm how the notebook kernel is meant to use the venv.
!source genai-fake-article/bin/activate

In [None]:
!pip install transformers torch datasets

In [6]:
# Create the project folder/file skeleton.
# Safe to re-run: existing folders and existing file contents are left intact.

import os

folders = [
    'genai-fake-article/data',
    'genai-fake-article/models',
    'genai-fake-article/scripts',
    'genai-fake-article/outputs',
]

files = [
    'genai-fake-article/README.md',
    'genai-fake-article/data/arxiv_papers.json',
]

for folder in folders:
    os.makedirs(folder, exist_ok=True)

for file_path in files:
    # Append mode creates the file when it is missing but never truncates it,
    # so re-running this cell does not wipe the README or previously saved data
    # (mode 'w' would reset both to empty files).
    with open(file_path, 'a'):
        pass

print("Structure created successfully!")

Structure created successfully!


### Data Collection

In [9]:
import requests
import xml.etree.ElementTree as ET
import json

def fetch_arxiv_papers(query, max_results=100, timeout=30):
    """Fetch paper titles and abstracts from the arXiv API.

    Args:
        query: Search string sent as the ``search_query`` parameter.
        max_results: Maximum number of entries to request (default 100).
        timeout: Seconds to wait for the server before giving up; forwarded
            to ``requests.get`` so a stalled connection cannot hang the
            notebook indefinitely.

    Returns:
        A list of ``{'title': str, 'abstract': str}`` dicts, one per Atom
        ``entry`` element in the response. A field that is missing or empty
        in the feed is returned as an empty string.

    Raises:
        Exception: If the API responds with a status code other than 200.
    """
    atom_ns = '{http://www.w3.org/2005/Atom}'

    # Define the base URL and parameters
    base_url = 'http://export.arxiv.org/api/query'
    params = {
        'search_query': query,
        'max_results': max_results,
        'start': 0,
        'sortBy': 'relevance',
        'sortOrder': 'descending'
    }

    # Make the request to arXiv API (bounded by `timeout`)
    response = requests.get(base_url, params=params, timeout=timeout)

    # Check if the response was successful
    if response.status_code != 200:
        raise Exception(f"Failed to fetch data from arXiv API, status code: {response.status_code}")

    # Parse the XML response
    root = ET.fromstring(response.content)

    # Extract titles and abstracts. findtext() returns the default when the
    # child element is absent, and `or ''` covers an element with no text, so
    # an incomplete entry no longer raises AttributeError.
    papers = []
    for entry in root.findall(atom_ns + 'entry'):
        title = (entry.findtext(atom_ns + 'title', default='') or '').strip()
        abstract = (entry.findtext(atom_ns + 'summary', default='') or '').strip()
        papers.append({'title': title, 'abstract': abstract})

    return papers

def save_papers_to_json(papers, filename):
    """Write ``papers`` to ``filename`` as 4-space-indented JSON.

    The target file is opened in write mode (replacing any previous content),
    and a confirmation line naming the file is printed afterwards.
    """
    with open(filename, 'w') as out_file:
        out_file.write(json.dumps(papers, indent=4))
    print(f"Data saved to {filename}")

# Smoke run: fetch a small batch of papers and store them in the data folder.
if __name__ == "__main__":
    query, max_results = "machine learning", 10
    papers = fetch_arxiv_papers(query, max_results)
    save_papers_to_json(
        papers,
        'genai-fake-article/data/arxiv_papers.json',
    )


Data saved to genai-fake-article/data/arxiv_papers.json
