This notebook takes a list of question IDs from Stack Overflow and creates a PDF archive of the thread each question started, including all answers and comments. It requires a Stack Exchange API key, which can be obtained at https://api.stackexchange.com

In [25]:
import requests
from pprint import pprint
from fpdf import FPDF
import html2text
from datetime import datetime

# Stack Exchange API key sent with every request below. Replace the
# placeholder with a real key before running; generate_pdf() reads this
# module-level value directly.
api_key = 'INPUT STACK EXCHANGE API KEY HERE'

def get_question_data(question_id, api_key):
    """Fetch a single Stack Overflow question, including its body.

    Args:
        question_id: Numeric id of the question.
        api_key: Stack Exchange API key sent as the ``key`` parameter.

    Returns:
        The first entry of the response's ``items`` list (a dict describing
        the question).

    Raises:
        requests.HTTPError: If the API answers with an error status.
        IndexError: If the API returned no question for ``question_id``.
    """
    url = f"https://api.stackexchange.com/2.3/questions/{question_id}"
    # Let requests build and escape the query string instead of hand-formatting it.
    params = {'site': 'stackoverflow', 'key': api_key, 'filter': 'withbody'}
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(url, params=params, timeout=30)
    response.raise_for_status()
    items = response.json()['items']
    if not items:
        # Same exception type the bare ``items[0]`` lookup raised, but with a
        # message that says which id was not found.
        raise IndexError(f"No Stack Overflow question found with id {question_id}")
    return items[0]

def get_answers_data(question_id, api_key):
    """Fetch every answer to a Stack Overflow question, including bodies.

    The API returns results in pages; this walks all pages so threads with
    many answers are not truncated to the first page.

    Args:
        question_id: Numeric id of the question.
        api_key: Stack Exchange API key sent as the ``key`` parameter.

    Returns:
        A list of answer dicts in the order the API returned them
        (empty if the question has no answers).

    Raises:
        requests.HTTPError: If the API answers with an error status.
    """
    import time

    url = f"https://api.stackexchange.com/2.3/questions/{question_id}/answers"
    params = {
        'site': 'stackoverflow',
        'key': api_key,
        'filter': 'withbody',
        'pagesize': 100,
        'page': 1,
    }
    answers = []
    while True:
        # A timeout keeps a stalled connection from hanging the notebook forever.
        response = requests.get(url, params=params, timeout=30)
        response.raise_for_status()
        data = response.json()
        answers.extend(data['items'])
        if not data.get('has_more'):
            return answers
        # The API may ask clients to pause before calling the same method again.
        if data.get('backoff'):
            time.sleep(data['backoff'])
        params['page'] += 1

def get_comments_data(post_id, api_key):
    """Fetch every comment on a post (question or answer), including bodies.

    The API returns results in pages; this walks all pages so heavily
    commented posts are not truncated to the first page.

    Args:
        post_id: Numeric id of the question or answer.
        api_key: Stack Exchange API key sent as the ``key`` parameter.

    Returns:
        A list of comment dicts in the order the API returned them
        (empty if the post has no comments).

    Raises:
        requests.HTTPError: If the API answers with an error status.
    """
    import time

    url = f"https://api.stackexchange.com/2.3/posts/{post_id}/comments"
    params = {
        'site': 'stackoverflow',
        'key': api_key,
        'filter': 'withbody',
        'pagesize': 100,
        'page': 1,
    }
    comments = []
    while True:
        # A timeout keeps a stalled connection from hanging the notebook forever.
        response = requests.get(url, params=params, timeout=30)
        response.raise_for_status()
        data = response.json()
        comments.extend(data['items'])
        if not data.get('has_more'):
            return comments
        # The API may ask clients to pause before calling the same method again.
        if data.get('backoff'):
            time.sleep(data['backoff'])
        params['page'] += 1

def _pdf_safe(text):
    """Return ``text`` limited to characters the built-in PDF fonts can encode."""
    # Arial/Courier here are core PDF fonts, which only cover Latin-1. Keep the
    # readable ellipsis substitution, then turn anything else unencodable into
    # '?' rather than letting PDF generation fail on it.
    return text.replace('\u2026', '...').encode('latin-1', 'replace').decode('latin-1')


def _plain(value):
    """Decode HTML entities in an API string field and make it PDF-safe."""
    # Imported locally because the name ``html`` is not imported at file level.
    from html import unescape

    # The API HTML-encodes string fields such as titles and display names.
    return _pdf_safe(unescape(str(value)))


def _author(post):
    """Return the display name of a post's owner, or 'unknown' if missing."""
    owner = post.get('owner') or {}
    return _plain(owner.get('display_name', 'unknown'))


def _timestamp(post):
    """Format a post's ``creation_date`` (epoch seconds) as local date and time."""
    return datetime.fromtimestamp(post['creation_date']).strftime('%Y-%m-%d %H:%M:%S')


def _write_body(pdf, body_html, text_size, code_size, indent=0):
    """Write an HTML post body to the PDF as text, one non-blank line at a time.

    Args:
        pdf: The FPDF document being built.
        body_html: HTML body of the post.
        text_size: Font size for ordinary text (Arial).
        code_size: Font size for code-block lines (Courier).
        indent: Horizontal offset applied before each line; 0 for none.
    """
    text_maker = html2text.HTML2Text()
    text_maker.ignore_links = True
    for line in text_maker.handle(body_html).split('\n'):
        if not line.strip():
            continue
        # html2text has already stripped the tags, so a literal '<code>' test
        # never matches; it renders <pre> blocks as lines indented by four
        # spaces, which is what marks code here.
        if line.startswith('    '):
            pdf.set_font('Courier', '', code_size)
        else:
            pdf.set_font('Arial', '', text_size)
        if indent:
            pdf.cell(indent)
        pdf.write(10, _pdf_safe(line))
        pdf.ln()


def _write_comments(pdf, comments):
    """Write a numbered, indented list of comments in chronological order."""
    ordered = sorted(comments, key=lambda c: c['creation_date'])
    for number, comment in enumerate(ordered, start=1):
        pdf.cell(10)
        pdf.set_font('Arial', 'B', 8)
        pdf.cell(0, 10, f"Comment {number}", 0, 1)
        pdf.cell(10)
        pdf.set_font('Arial', 'I', 8)
        pdf.cell(0, 10, f"Posted by {_author(comment)}, {_timestamp(comment)}, Score: {comment['score']}", 0, 1)
        _write_body(pdf, comment['body'], text_size=8, code_size=6, indent=10)


def generate_pdf(question_id):
    """Archive one Stack Overflow thread as ``stackoverflow_<question_id>.pdf``.

    Downloads the question, its comments, its answers and each answer's
    comments using the module-level ``api_key``, then writes them to a PDF
    in the current working directory. Answers and comments are listed
    oldest first.

    Args:
        question_id: Numeric id of the question that started the thread.
    """
    question = get_question_data(question_id, api_key)
    answers = get_answers_data(question_id, api_key)
    comments = get_comments_data(question_id, api_key)

    pdf = FPDF()
    pdf.add_page()

    # Header: when the PDF was produced and the content licence.
    pdf.set_font('Arial', 'I', 10)
    produced = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    # The licence field is not guaranteed to be present on every post.
    license_name = _plain(question.get('content_license', 'unknown'))
    pdf.cell(0, 10, f"PDF produced on {produced}, Content license type: {license_name}", 0, 1)
    pdf.ln()

    # The question itself.
    pdf.set_font('Arial', 'B', 14)
    pdf.cell(0, 10, _plain(question['title']), 0, 1)
    pdf.set_font('Arial', 'I', 10)
    pdf.cell(0, 10, f"Posted by {_author(question)}, {_timestamp(question)}, Score: {question['score']}", 0, 1)
    pdf.cell(0, 8, _pdf_safe(f"{question['link']}"), 0, 1, 'L', False, question['link'])
    _write_body(pdf, question['body'], text_size=10, code_size=8)

    # Comments on the question.
    _write_comments(pdf, comments)

    # Answers, each followed by its own comments.
    ordered_answers = sorted(answers, key=lambda a: a['creation_date'])
    for number, answer in enumerate(ordered_answers, start=1):
        pdf.set_font('Arial', 'B', 10)
        pdf.cell(0, 10, f"Answer {number}", 0, 1)
        pdf.set_font('Arial', 'I', 10)
        status = 'Accepted' if answer['is_accepted'] else 'Not-accepted'
        pdf.cell(0, 10, f"Posted by {_author(answer)}, {_timestamp(answer)}, Score: {answer['score']}, {status}", 0, 1)
        _write_body(pdf, answer['body'], text_size=10, code_size=8)
        _write_comments(pdf, get_comments_data(answer['answer_id'], api_key))

    # Save the PDF file.
    pdf.output(f'stackoverflow_{question_id}.pdf', 'F')

# Prompt the user for a list of question IDs.
question_ids = input("Enter a comma-separated list of question IDs: ").split(',')

# Generate a PDF file for each question ID. Blank entries (empty input, a
# trailing comma, or doubled commas) are skipped instead of crashing in int().
for question_id in question_ids:
    if question_id.strip():
        generate_pdf(int(question_id.strip()))