In [1]:
# Create a virtual environment for the project in ./genai-fake-article.
!python3 -m venv genai-fake-article
# NOTE(review): this `source` runs in the shell spawned for this single `!` line, so it
# presumably does not activate the environment for the notebook kernel itself — confirm,
# and select the venv as the notebook kernel instead if that is the intent.
!source genai-fake-article/bin/activate

In [6]:
# Create the project folder/file structure.
# The cell is safe to re-run: existing folders are kept and existing files are
# left untouched (they are NOT truncated).

import os

folders = [
    'genai-fake-article/data',
    'genai-fake-article/models',
    'genai-fake-article/scripts',
    'genai-fake-article/outputs',
]

files = [
    'genai-fake-article/README.md',
    'genai-fake-article/data/arxiv_papers.json',
]

for folder in folders:
    os.makedirs(folder, exist_ok=True)

for file_path in files:
    # Append mode creates the file when it is missing but, unlike 'w', does not
    # wipe content written by a later cell (e.g. the downloaded arXiv papers)
    # when this cell is executed again.
    with open(file_path, 'a'):
        pass

print("Structure created successfully!")

Structure created successfully!


### Data Collection

In [9]:
import requests
import xml.etree.ElementTree as ET
import json

def fetch_arxiv_papers(query, max_results=100):
    """Fetch paper titles and abstracts from the arXiv API.

    Args:
        query: Search expression sent as the ``search_query`` parameter.
        max_results: Maximum number of entries to request (default 100).

    Returns:
        A list of ``{'title': ..., 'abstract': ...}`` dicts, one per Atom
        ``entry`` element in the response, with surrounding whitespace
        stripped. A missing or empty title/summary yields an empty string.

    Raises:
        Exception: If the API answers with a status code other than 200.
        requests.exceptions.RequestException: On connection problems or when
            the request exceeds the timeout.
        xml.etree.ElementTree.ParseError: If the response body is not
            well-formed XML.
    """
    atom_ns = '{http://www.w3.org/2005/Atom}'

    # Define the base URL and parameters
    base_url = 'http://export.arxiv.org/api/query'
    params = {
        'search_query': query,
        'max_results': max_results,
        'start': 0,
        'sortBy': 'relevance',
        'sortOrder': 'descending'
    }

    # Make the request to arXiv API. The timeout keeps a stalled connection
    # from blocking the notebook cell indefinitely.
    response = requests.get(base_url, params=params, timeout=30)

    # Check if the response was successful
    if response.status_code != 200:
        raise Exception(f"Failed to fetch data from arXiv API, status code: {response.status_code}")

    # Parse the XML response
    root = ET.fromstring(response.content)

    # Extract titles and abstracts
    papers = []
    for entry in root.findall(f'{atom_ns}entry'):
        # findtext() returns a string even when the element is absent or has no
        # text, so an incomplete entry no longer raises AttributeError on .strip().
        title = entry.findtext(f'{atom_ns}title', default='').strip()
        abstract = entry.findtext(f'{atom_ns}summary', default='').strip()
        papers.append({'title': title, 'abstract': abstract})

    return papers

def save_papers_to_json(papers, filename):
    """Write ``papers`` to ``filename`` as JSON indented by 4 spaces.

    The target file is opened in write mode, and a confirmation line naming
    the file is printed once the data has been written.
    """
    with open(filename, 'w') as out_file:
        json.dump(papers, out_file, indent=4)
    print(f"Data saved to {filename}")

# Tests: download a small sample of papers and store it in the data folder
if __name__ == "__main__":
    query, max_results = "machine learning", 10
    papers = fetch_arxiv_papers(query, max_results=max_results)
    save_papers_to_json(papers, filename='genai-fake-article/data/arxiv_papers.json')


Data saved to genai-fake-article/data/arxiv_papers.json


### Generate fake abstracts for the collected titles

In [None]:
from openai import OpenAI
import json

# Read the key first and pass it to the client explicitly, so the value looked
# up here is the one the client actually uses. `os` is imported in the
# folder-setup cell earlier in the notebook.
api_key = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=api_key)

def load_titles(filename):
    """Return the ``title`` value of every paper stored in the JSON file ``filename``.

    The file is expected to hold a JSON list of objects that each carry a
    ``title`` key; the titles are returned in file order.
    """
    with open(filename, 'r') as source:
        records = json.load(source)

    titles = []
    for record in records:
        titles.append(record['title'])
    return titles

def generate_abstracts(titles, model="gpt-3.5-turbo", max_tokens=200):
    """Generate one synthetic abstract per title using the chat completions API.

    Args:
        titles: Iterable of paper titles.
        model: Name of the chat model to query.
        max_tokens: Upper bound on the length of each generated abstract.

    Returns:
        A list of ``{'title': ..., 'abstract': ...}`` dicts in the order of
        ``titles``. Generation is best-effort: when a request fails, the error
        is printed and the abstract is set to ``"Error generating abstract"``
        so the remaining titles are still processed.
    """
    fake_papers = []
    for title in titles:
        try:
            # The default model is a chat model, so use the chat completions
            # endpoint (the same call generate_section makes). The previous
            # `client.completion.create(engine=...)` form is not available on
            # this client, which made every title fall into the except branch.
            response = client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": "You are an AI assistant skilled in writing academic paper abstracts."},
                    {"role": "user", "content": f"Title: {title}\nAbstract:"},
                ],
                max_tokens=max_tokens,
                n=1,
                temperature=0.7,
            )
            # The message content may be None; treat that as an empty abstract.
            abstract = (response.choices[0].message.content or "").strip()
            fake_papers.append({'title': title, 'abstract': abstract})
        except Exception as e:
            print(f"Error generating abstract for title '{title}': {e}")
            fake_papers.append({'title': title, 'abstract': "Error generating abstract"})
    return fake_papers

def save_generated_papers(fake_papers, filename):
    """Write ``fake_papers`` to ``filename`` as JSON indented by 4 spaces.

    The target file is opened in write mode, and a confirmation line naming
    the file is printed once the data has been written.
    """
    with open(filename, 'w') as out_file:
        json.dump(fake_papers, out_file, indent=4)
    print(f"Generated papers saved to {filename}")

# Usage: generate an abstract for every collected title and store the result
if __name__ == "__main__":
    titles = load_titles(filename='genai-fake-article/data/arxiv_papers.json')
    fake_papers = generate_abstracts(titles=titles)
    save_generated_papers(fake_papers, filename='genai-fake-article/outputs/fake_papers.json')

### Generate fake articles from specific titles/abstracts

In [None]:
import json
import time
from docx import Document


def load_papers(filename):
    """Read the JSON document stored in ``filename`` and return the parsed content."""
    with open(filename, 'r') as source:
        return json.load(source)

def generate_section(title, abstract, section_name, model="gpt-3.5-turbo", max_tokens=300):
    """Generate the text of one paper section through the chat completions API.

    The request is sent with the module-level ``client``. The user prompt
    holds the title, the abstract and the section name, each on its own line.
    The first choice of the response is stripped and passed through
    ``clean_generated_text`` before being returned.
    """
    system_message = {
        "role": "system",
        "content": "You are an AI assistant skilled in generating academic paper sections.",
    }
    user_message = {
        "role": "user",
        "content": f"Title: {title}\nAbstract: {abstract}\n{section_name}:\n",
    }

    completion = client.chat.completions.create(
        model=model,
        messages=[system_message, user_message],
        max_tokens=max_tokens,
        temperature=0.7,
    )

    raw_text = completion.choices[0].message.content
    return clean_generated_text(raw_text.strip(), section_name)

def generate_full_paper(paper, sections, model="gpt-4", max_tokens=300):
    """Assemble a paper dict with one generated text per requested section.

    The result starts with the ``title`` and ``abstract`` of ``paper`` and then
    maps every name in ``sections``, in order, to the text produced by
    ``generate_section`` for that name.
    """
    title, abstract = paper['title'], paper['abstract']
    full_paper = {"title": title, "abstract": abstract}
    full_paper.update(
        (name, generate_section(title, abstract, name, model, max_tokens))
        for name in sections
    )
    return full_paper

def clean_generated_text(text, section_name):
    """
    Removes the echoed title, abstract, and section heading from the generated text.

    Rules (all comparisons are case-insensitive and ignore surrounding whitespace):
      * lines starting with "Title:" or "Abstract:" are dropped;
      * a line consisting only of the section name, optionally followed by a
        colon, is dropped;
      * a "<section name>: body" line keeps its body and loses the label;
      * any other line is kept unchanged, including content that merely begins
        with the section name (e.g. "Regularization is ...").

    Args:
        text: Raw text returned by the model.
        section_name: Name of the section the text was generated for. An empty
            name disables the heading rules instead of matching every line.

    Returns:
        The kept lines joined with single spaces.
    """
    heading = section_name.strip()

    cleaned_lines = []
    for line in text.strip().split('\n'):
        stripped = line.strip()
        lowered = stripped.lower()

        if lowered.startswith("title:") or lowered.startswith("abstract:"):
            continue

        # Only look for a heading when there is a section name to look for.
        if heading and stripped[:len(heading)].lower() == heading.lower():
            remainder = stripped[len(heading):].strip()
            if remainder in ("", ":"):
                # Heading-only line: redundant, drop it.
                continue
            if remainder.startswith(":"):
                # "Section: body" -> keep the body, drop the label.
                cleaned_lines.append(remainder[1:].strip())
                continue
            # Otherwise the line is ordinary content that happens to start
            # with the section name; fall through and keep it.

        cleaned_lines.append(line)

    # Join the cleaned lines back into a single string
    return ' '.join(cleaned_lines)

def save_paper_to_docx(paper, filename):
    """Write ``paper`` to a .docx file at ``filename``.

    The title becomes the level-0 heading, followed by an "Abstract" heading
    with the abstract text. Every other key of ``paper`` is written, in dict
    order, as a level-1 heading with its value as the paragraph below it.
    A confirmation line naming the file is printed after saving.
    """
    document = Document()
    document.add_heading(paper['title'], 0)
    document.add_heading('Abstract', level=1)
    document.add_paragraph(paper['abstract'])

    front_matter = ('title', 'abstract')
    for heading, body in paper.items():
        # Title and abstract were already written above.
        if heading in front_matter:
            continue
        document.add_heading(heading, level=1)
        document.add_paragraph(body)

    document.save(filename)
    print(f"Paper saved to {filename}")

# Usage
if __name__ == "__main__":
    # Load papers (which includes both titles and abstracts)
    papers = load_papers('genai-fake-article/data/arxiv_papers.json')
    # TODO: Look at the possibility of making this dynamic to process either one or any specified number of papers
    paper = papers[0]

    # Define the sections of the paper
    # TODO: We can possibly make this dynamic as well, look into the logic later
    sections = ['Introduction', 'Basic concepts in optimization and analysis', 'Stochastic Gradient Descent', 'Generalization and Non-Smooth Optimization', 'Regularization','Adaptive Regularization', 'Variance Reduction', 'Nesterov Acceleration', 'The conditional gradient method', 'Second order methods for machine learning', 'Hyperparameter Optimization', 'Bibliography']

    # Generate the full paper
    full_paper = generate_full_paper(paper, sections)

    # Build the output name from the title plus a timestamp so repeated runs do
    # not overwrite each other. (The previous `str.replace(...).time.now()`
    # expression raised AttributeError: strings have no `time` attribute.)
    safe_title = paper["title"].replace("/", "_")
    timestamp = time.strftime("%Y%m%d-%H%M%S")

    # Save the paper to a .docx file
    save_paper_to_docx(full_paper, f'genai-fake-article/outputs/{safe_title}_{timestamp}.docx')


Paper saved to ../outputs/Lecture Notes: Optimization for Machine Learning.docx
