In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from dotenv import load_dotenv
import os

In [2]:
# Load variables from a local .env file (if one exists) into the process
# environment, then read the login credentials used by the login cell.
load_dotenv()

email= os.getenv("EMAIL")
password= os.getenv("PASSWORD")

# Fail fast: os.getenv returns None for an unset variable, and passing None to
# send_keys() later fails with an error that does not mention the real cause.
_missing_credentials = [
    name for name, value in (("EMAIL", email), ("PASSWORD", password)) if not value
]
if _missing_credentials:
    raise RuntimeError(
        "Missing environment variable(s): "
        + ", ".join(_missing_credentials)
        + ". Define them in a .env file or in the environment before running this notebook."
    )

In [3]:
# Reading-history ("readings") page of the account being scraped. It is the
# first page the driver opens; presumably the site answers with its login form
# while the session is not authenticated -- TODO confirm.
url= 'https://archiveofourown.org/users/Sw33tBl0550m/readings'

def web_driver():
    """Create a Chrome WebDriver configured with a fixed set of command-line flags.

    The flags are applied in the order listed below; `--headless` means no
    browser window is shown.

    Returns:
        The `webdriver.Chrome` instance built from those options.
    """
    # NOTE(review): the window-size value contains a space after the comma;
    # confirm Chrome parses it as 1920x1200.
    chrome_flags = (
        "--verbose",
        '--no-sandbox',
        '--headless',
        '--disable-gpu',
        "--window-size=1920, 1200",
        '--disable-dev-shm-usage',
    )

    chrome_options = webdriver.ChromeOptions()
    for flag in chrome_flags:
        chrome_options.add_argument(flag)

    return webdriver.Chrome(options=chrome_options)

In [4]:
# Start the browser and request the history URL. The next cell types into a
# login form on the loaded page, so this request presumably lands on the
# site's login page when no session exists yet -- TODO confirm.
driver = web_driver()
driver.get(url)


In [5]:
# Fill in the login form on the currently loaded page, then submit it.
# Each entry pairs the XPath of an input field with the value typed into it.
login_fields = (
    ('//*[@id="user_login"]', email),
    ('//*[@id="user_password"]', password),
)
for field_xpath, field_value in login_fields:
    driver.find_element(By.XPATH, field_xpath).send_keys(field_value)

# The submit control is addressed by its position inside the form, so this
# XPath depends on the form's current markup.
submit_control = driver.find_element(By.XPATH, '//*[@id="new_user"]/dl/dd[4]/input')
submit_control.click()

In [18]:
import pandas as pd
import time
# Column order of the DataFrame returned by scrape_titles_two_pages.
_HISTORY_COLUMNS = [
    "Title", "Author", "Fandom Tags", "Relationship Tags", "Character Tags",
    "Freeform Tags", "Summary", "Language", "Chapters", "Words", "Collections",
    "Kudos", "Bookmarks", "Hits", "Comments", "Latest View", "Link",
]


def _first_text(article, selector, default):
    """Return the text of the first element matching `selector`, or `default` if none match."""
    matches = article.find_elements(By.CSS_SELECTOR, selector)
    return matches[0].text if matches else default


def _joined_text(article, selector, default, sep=", "):
    """Join the text of all elements matching `selector` with `sep`, or return `default` if none match."""
    matches = article.find_elements(By.CSS_SELECTOR, selector)
    return sep.join(match.text for match in matches) if matches else default


def _extract_work(article):
    """Build one row (column name -> string) from a single `li[role="article"]` element."""
    title_links = article.find_elements(By.CSS_SELECTOR, 'h4.heading a[href^="/works/"]')
    if title_links:
        title = title_links[0].text
        link = title_links[0].get_attribute('href')
    else:
        title, link = "No title", "No link"

    return {
        "Title": title,
        "Link": link,
        "Author": _joined_text(article, 'h4.heading a[rel="author"]', "Anonymous"),
        "Fandom Tags": _joined_text(article, 'h5.fandoms a', "No fandom tags"),
        "Relationship Tags": _joined_text(article, 'li.relationships a', "No relationship tags"),
        "Character Tags": _joined_text(article, 'li.characters a', "No character tags"),
        "Freeform Tags": _joined_text(article, 'li.freeforms a', "No freeform tags"),
        # Summary paragraphs are kept apart by a blank line.
        "Summary": _joined_text(article, 'blockquote.userstuff p', "No summary", sep="\n\n"),
        # Stats are stored as the raw text shown on the page (strings, not numbers).
        "Words": _first_text(article, 'dd.words', "0"),
        "Kudos": _first_text(article, 'dd.kudos', "0"),
        "Hits": _first_text(article, 'dd.hits', "0"),
        "Bookmarks": _first_text(article, 'dd.bookmarks', "0"),
        "Collections": _first_text(article, 'dd.collections', "0"),
        "Comments": _first_text(article, 'dd.comments', "0"),
        "Chapters": _first_text(article, 'dd.chapters', "0"),
        "Language": _first_text(article, 'dd.language', "No language"),
        "Latest View": _first_text(
            article, 'div.user.module.group h4.viewed.heading', "No view history"
        ),
    }


def scrape_titles_two_pages(driver, max_pages=426,
                            base_url="https://archiveofourown.org/users/Sw33tBl0550m/readings",
                            delay=2):
    """Scrape every work listed on pages 1..max_pages of a reading-history listing.

    Despite the name, the function is not limited to two pages: it visits
    `max_pages` pages and collects one row per `li[role="article"]` element.

    Args:
        driver: Selenium WebDriver used for all requests. It presumably needs an
            authenticated session to see the history pages -- confirm against
            the login cell.
        max_pages: Number of the last page to visit (pages are 1-based). A value
            below 1 visits nothing and yields an empty DataFrame.
        base_url: Listing URL without the `?page=` query string.
        delay: Seconds to sleep before each page request.

    Returns:
        pandas.DataFrame with the columns in `_HISTORY_COLUMNS`. Every value is
        the text scraped from the page, or a placeholder such as "No title",
        "Anonymous" or "0" when the element is absent.

    Raises:
        Any Selenium error other than a missing `.pagination` element
        propagates to the caller; rows collected so far are not returned.
    """
    records = []

    for current_page in range(1, max_pages + 1):
        # Pause before every request (presumably to avoid hammering the site).
        time.sleep(delay)
        driver.get(f"{base_url}?page={current_page}")
        print(f"Scraping page {current_page}")

        # One <li role="article"> per work on the page.
        articles = driver.find_elements(By.CSS_SELECTOR, 'li[role="article"]')
        print(f"Found {len(articles)} works on page {current_page}")

        # Diagnostics that help spot error or login pages served instead of the listing.
        print(f"Page URL: {driver.current_url}")
        print(f"Page title: {driver.title}")
        print(f"Page source length: {len(driver.page_source)}")
        try:
            pagination_info = driver.find_element(By.CSS_SELECTOR, '.pagination').text
            print(f"Pagination info: {pagination_info}")
        except NoSuchElementException:
            # Only a missing pagination block is expected here; anything else
            # (including KeyboardInterrupt) must not be swallowed.
            print("No pagination info found")

        # Each row is built atomically, so all columns always have equal length.
        records.extend(_extract_work(article) for article in articles)
        print(f"Total scraped: {len(records)} works")

        if current_page < max_pages:
            print(f"Successfully moved to page {current_page + 1}")
        else:
            print("Reached maximum page limit")

    # `columns=` fixes the column order and keeps the columns on an empty result.
    return pd.DataFrame(records, columns=_HISTORY_COLUMNS)



In [19]:
# Use the function
all_titles = scrape_titles_two_pages(driver)
print(f"\nTotal titles collected: {len(all_titles)}")

Scraping page 1
Found 20 works on page 1
Found 20 works on page 1
Page URL: https://archiveofourown.org/users/Sw33tBl0550m/readings?page=1
Page title: History | Archive of Our Own
Page source length: 194123
Pagination info: ← Previous1234567…426Next →
Total scrapped: 20 titles, 20 links, 20 authors, 20 fandom tags, 20 relationship tags, 20 character tags, 20 freeform tags, 20 summaries, 20 words, 20 chapters, 20 latest views, 20 comments, 20 kudos, 20 hits, 20 bookmarks, 20 collections, 20 languages
Successfully moved to page 2
Scraping page 2
Found 20 works on page 2
Found 20 works on page 2
Page URL: https://archiveofourown.org/users/Sw33tBl0550m/readings?page=2
Page title: History | Archive of Our Own
Page source length: 174524
Pagination info: ← Previous1234567…426Next →
Total scrapped: 40 titles, 40 links, 40 authors, 40 fandom tags, 40 relationship tags, 40 character tags, 40 freeform tags, 40 summaries, 40 words, 40 chapters, 40 latest views, 40 comments, 40 kudos, 40 hits, 40 b

In [20]:
# Sanity check of the scraped frame: row count, column names, non-null counts and dtypes.
all_titles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8495 entries, 0 to 8494
Data columns (total 17 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Title              8495 non-null   object
 1   Author             8495 non-null   object
 2   Fandom Tags        8495 non-null   object
 3   Relationship Tags  8495 non-null   object
 4   Character Tags     8495 non-null   object
 5   Freeform Tags      8495 non-null   object
 6   Summary            8495 non-null   object
 7   Language           8495 non-null   object
 8   Chapters           8495 non-null   object
 9   Words              8495 non-null   object
 10  Collections        8495 non-null   object
 11  Kudos              8495 non-null   object
 12  Bookmarks          8495 non-null   object
 13  Hits               8495 non-null   object
 14  Comments           8495 non-null   object
 15  Latest View        8495 non-null   object
 16  Link               8495 non-null   object


In [21]:
# Display the scraped frame as the cell output (pandas abbreviates long frames and long cell values).
all_titles

Unnamed: 0,Title,Author,Fandom Tags,Relationship Tags,Character Tags,Freeform Tags,Summary,Language,Chapters,Words,Collections,Kudos,Bookmarks,Hits,Comments,Latest View,Link
0,"better days, satoru",uhsato,"呪術廻戦 | Jujutsu Kaisen (Manga), 呪術廻戦 | Jujutsu ...",Aizawa Shouta | Eraserhead & Gojo Satoru,"Aizawa Shouta | Eraserhead, Gojo Satoru, Origi...","Gojo Satoru is a Little Shit, Gojo Satoru Need...","""why would I want to be rescued by someone who...",English,10/?,26733,0,2322,507,42255,177,Last visited: 26 Aug 2025 (Latest version.) Vi...,https://archiveofourown.org/works/46834918
1,A Blessing In Disguise,Hokkaido_milk_pudding,"呪術廻戦 | Jujutsu Kaisen (Manga), 僕のヒーローアカデミア | B...","Fushiguro Toji/Todoroki Rei, Fushiguro Megumi ...","Fushiguro Toji, Todoroki Rei, Fushiguro Megumi...","Fushiguro Toji Lives, Rei is Megumi's Mother, ...",Toji never intended to have so many kids. He h...,English,23/?,111711,21,3810,1310,132170,705,Last visited: 26 Aug 2025 (Latest version.) Vi...,https://archiveofourown.org/works/36102247
2,To The Life I Could Have Led,Hokkaido_milk_pudding,"呪術廻戦 | Jujutsu Kaisen (Manga), 呪術廻戦 | Jujutsu ...","Fushiguro Toji/Todoroki Rei, Fushiguro Megumi ...","Fushiguro Toji, Todoroki Rei, Fushiguro Megumi...","Crossover, Crossovers & Fandom Fusions, Crosso...",With Megumi and Touya about to set off to Juju...,English,2/?,8164,2,1046,232,17213,126,Last visited: 26 Aug 2025 (Latest version.) Vi...,https://archiveofourown.org/works/50856163
3,Are You My Family?,LadyDyDy35,"Boruto (Anime & Manga), Naruto (Anime & Manga)","Gaara/Haruno Sakura, Gaara & Shinki (Naruto), ...","Gaara (Naruto), Haruno Sakura, Shinki (Boruto)...","Shinki-centric, Angst with a Happy Ending, Gro...",Shinki isn't dumb; he knows the Suna Council i...,English,1/1,9956,0,73,5,740,8,Last visited: 26 Aug 2025 (Latest version.) Vi...,https://archiveofourown.org/works/65654701
4,Brotherly Bonds,LadyDyDy35,"Naruto (Anime & Manga), Boruto (Anime & Manga)","Gaara/Haruno Sakura, Shinki & Uchiha Sarada, G...","Shinki (Boruto), Uchiha Sarada, Gaara (Naruto)...","Established Relationship, GaaSaku - Freeform, ...",Sarada takes her duties as Shinki's younger si...,English,1/1,4999,0,50,3,462,0,Last visited: 26 Aug 2025 (Latest version.) Vi...,https://archiveofourown.org/works/67330183
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8490,Haruno Sakura: Omake,Jememe,"Naruto, Boruto: Naruto Next Generations","Haruno Sakura/Hatake Kakashi, Hyuuga Hinata/Uz...","Haruno Sakura, Hatake Kakashi, Shiranui Genma,...","Omake, POV Alternating, many people terrified ...","Side plots, alternate POVs and bonus scenes fr...",English,4/?,5288,2,2277,351,25346,67,Last visited: 08 Dec 2020 (Update available.) ...,https://archiveofourown.org/works/27946265
8491,The Nearest Star,summersirius,Naruto,"Haruno Sakura/Namikaze Minato, Haruno Sakura/C...","Haruno Sakura, Namikaze Minato, Uzumaki Naruto...","Basically, space-time ninjutsu gone wrong, or ...",The quest for strength comes from the spirit o...,English,26/26,190318,20,4532,1441,166933,1350,Last visited: 06 Dec 2020 (Update available.) ...,https://archiveofourown.org/works/23366386
8492,"wrong place, wrong time",orphan_account,僕のヒーローアカデミア | Boku no Hero Academia | My Hero ...,"Haruno Sakura/?, Haruno Sakura/being a badass,...","Haruno Sakura, Midoriya Izuku, Original Charac...","the crossover no one asked for, Reincarnation,...","When Sakura first realizes what's happening, s...",English,10/?,62209,1,3498,1206,72607,741,Last visited: 06 Dec 2020 (Latest version.) Vi...,https://archiveofourown.org/works/23696176
8493,When Mothers Meddle,CalliopesKissingStories,Naruto,Haruno Sakura/Uchiha Itachi,"Haruno Sakura, Uchiha Itachi, Tsunade (Naruto)...","Alternate Universe - Canon Divergence, No Uchi...","Itachi and Sakura both have busy lives, and pl...",English,19/19,37220,0,372,54,9146,55,Last visited: 05 Dec 2020 (Update available.) ...,https://archiveofourown.org/works/27714443


In [23]:
# Save the scraped history as CSV, without the DataFrame index.
output_path = "data/MyAO3Historydata_Final.csv"

# to_csv does not create missing directories, so make sure the target folder exists.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
all_titles.to_csv(output_path, index=False)

# Report the path that was actually written.
print(f"Data saved to {output_path}")

Data saved to MyAO3Historydata.csv
