In [141]:
import requests
from bs4 import BeautifulSoup
import sqlite3
import os
import urllib
from retry import retry
import time
import pdb
from tqdm.notebook import tqdm 
import sqlite3
from ratelimiter import RateLimiter
import re
from ao3 import AO3
from ao3.works import Work as AWork
from math import ceil
import sqlalchemy
from datetime import datetime

from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import (Column, Integer, String, JSON, 
                        ForeignKey, Table, DateTime)
from sqlalchemy.orm import relationship, Session, configure_mappers

##### schema 

In [2]:
# Persist to an on-disk SQLite file. Swap the URL for 'sqlite:///:memory:'
# to work against a throwaway in-memory database instead.
engine = sqlalchemy.create_engine('sqlite:///kudos.db')
Base = declarative_base()


def _user_work_link(table_name):
    """Build a many-to-many association table linking users to works."""
    return Table(
        table_name,
        Base.metadata,
        Column('user', String, ForeignKey('users.id')),
        Column('work', Integer, ForeignKey('works.id')),
        # Lets the cell be re-run without a "table already defined" error.
        extend_existing=True,
    )


# Association tables used as `secondary` by the Work relationships.
kudos_table = _user_work_link('kudos')
bookmarks_table = _user_work_link('bookmarks')

class User(Base):
    """ORM model for a row of the ``users`` table, keyed by a string id."""
    __tablename__ = 'users'
    # Allow the class to be redefined when the notebook cell is re-run.
    __table_args__ = {'extend_existing': True} 

    # String primary key; the scraping code in this notebook stores usernames here.
    id = Column(String, primary_key=True)
    # Works authored by this user; the backref adds ``Work.author``.
    works = relationship('Work', backref='author')
    # NOTE(review): never written in the visible cells -- presumably the time
    # this user's own bookmarks were last scraped; confirm.
    bookmarks_ts = Column(DateTime, nullable=True)
    
    def __repr__(self):
        """Return a short debugging representation showing only the id."""
        return f"User(id='{self.id}')"

    
class Work(Base):
    """ORM model for a row of the ``works`` table.

    Besides its scalar columns, a work is linked to users through two
    many-to-many relationships (``kudos`` and ``bookmarks``) and, via the
    ``author`` backref declared on ``User.works``, to its author.
    """
    __tablename__ = 'works'
    # Allow the class to be redefined when the notebook cell is re-run.
    __table_args__ = {'extend_existing': True}

    id = Column(Integer, primary_key=True)
    title = Column(String)
    # Nullable: a work may have no associated author row.
    author_id = Column(String, ForeignKey('users.id'), nullable=True)
    json = Column(JSON, nullable=True)

    # Users linked through the ``kudos`` association table; the backref
    # exposes the reverse collection as ``User.kudos``.
    kudos = relationship("User",
                         secondary=kudos_table,
                         backref='kudos')

    # Users linked through the ``bookmarks`` association table; the backref
    # exposes the reverse collection as ``User.bookmarks``.
    bookmarks = relationship("User",
                             secondary=bookmarks_table,
                             backref='bookmarks')

    # Set by the scraper once the work's bookmarks have been stored.
    bookmarks_ts = Column(DateTime, nullable=True)

    def __repr__(self):
        """Return ``Work(id=..., "title" by author)`` for debugging and logs.

        ``author_id`` is nullable, so ``self.author`` can be ``None``. The
        repr is used inside error handlers and must never raise, so it falls
        back to the raw ``author_id`` instead of dereferencing ``None``.
        """
        author = self.author
        author_name = author.id if author is not None else self.author_id
        return f"Work(id={repr(self.id)}, \"{self.title}\" by {author_name})"

    

# Emit CREATE TABLE for every table registered on Base.metadata, using `engine`.
Base.metadata.create_all(engine)

In [3]:
# ORM session shared by every query, commit and rollback in this notebook.
session = Session(bind=engine)
# NOTE(review): `conn` is not used in any visible cell -- confirm it is still needed.
conn = engine.connect()

In [179]:
# Throttle for outgoing requests: at most 2 calls per 6.667-second window.
ratelimiter = RateLimiter(max_calls=2, period=6.667)

##### scraping

In [4]:
# Load every stored work and user into memory.
works = session.query(Work).all()
# NOTE(review): `users` is not read by any visible cell -- confirm it is needed.
users = session.query(User).all()

In [5]:
# One shared HTTP session so every request carries the same headers.
s = requests.Session()
s.headers['user-agent'] = 'robot'

In [209]:
@retry(RuntimeError, tries=5, delay=2, backoff=5, max_delay=600)
def helper(url, timeout=60):
    """GET ``url`` through the shared session, respecting the rate limiter.

    Args:
        url: Address to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the scrape forever.

    Returns:
        The ``requests`` response object.

    Raises:
        RuntimeError: On HTTP 429. This is the only exception the ``retry``
            decorator reacts to (up to 5 attempts with growing delays).
        requests.HTTPError: On any other 4xx/5xx status (not retried).
        requests.Timeout: If the server does not answer within ``timeout``
            seconds (not retried).
    """
    with ratelimiter:
        resp = s.get(url, timeout=timeout)
    if resp.status_code == 429:
        raise RuntimeError("Rate limit reached")
    if resp.status_code != 200:
        # Raises for 4xx/5xx; other non-200 statuses fall through unchanged.
        resp.raise_for_status()
    return resp

def get_max_pages(soup):
    """Return the number of result pages advertised by a parsed page.

    Looks for the ``<ol role="navigation">`` pagination list and reads the
    second-to-last ``<li>`` as the highest page number. A page without that
    list counts as a single page.
    """
    nav = soup.find('ol', role='navigation')
    if not nav:
        return 1
    entries = nav.find_all('li')
    # The last entry is skipped; the one before it holds the page count.
    return int(entries[-2].text)

def process_page(soup):
    """Collect the usernames found in the bylines of one parsed page.

    Every ``<h5 class="byline heading">`` that contains a link contributes
    the third ``/``-separated piece of that link's ``href``; headings
    without a link are ignored. Duplicates and order are preserved.
    """
    bylines = soup.find_all('h5', class_='byline heading')
    # Presumably hrefs look like "/users/<name>/...", so index 2 is the name.
    return [byline.a['href'].split('/')[2] for byline in bylines if byline.a]

def estimate_bookmark_pages(work):
    """Estimate how many bookmark pages ``work`` has from its stored JSON.

    The bookmark count is read from the ``"bookmarks": <n>,`` entry of
    ``work.json`` (treated as text) and divided by 20 per page, rounding up.

    Returns:
        The estimated page count as an int. When the count cannot be found,
        or ``work.json`` is not a string (e.g. ``None``), a message is
        printed and 0 is returned, so callers can always do arithmetic on
        the result.
    """
    raw = work.json
    m = re.search(r'"bookmarks": (\d+),', raw) if isinstance(raw, str) else None
    if m:
        return ceil(int(m.group(1)) / 20)
    print(f"Couldn't estimate pages for {work}")
    return 0

def get_bookmarks_for_work(work, pbar = None):
    """Scrape every bookmarks page of ``work`` and return the names found.

    Args:
        work: Object whose ``id`` identifies the work on the site.
        pbar: Optional tqdm-style progress bar. It is advanced by one per
            page fetched, and its ``total`` is corrected once the real page
            count is known.

    Returns:
        A list of usernames, one per bookmark byline (duplicates possible).

    Raises:
        Whatever ``helper`` raises for a failed request.
    """
    url = f'https://www.archiveofourown.org/works/{work.id}/bookmarks'
    # A tqdm bar is falsy while its total is 0, so test identity rather than
    # truthiness; otherwise progress updates would be silently skipped.
    track = pbar is not None

    # Name the parser explicitly so parsing does not depend on which optional
    # parsers happen to be installed (and to avoid bs4's warning).
    soup = BeautifulSoup(helper(url).content, 'html.parser')
    max_pages = get_max_pages(soup)
    if track:
        # The estimate may be missing (None); treat that as zero pages.
        est_pages = estimate_bookmark_pages(work) or 0
        if max_pages != est_pages:
            pbar.total += max_pages - est_pages

    users = process_page(soup)
    if track:
        pbar.update(1)
    # Page 1 is already parsed; fetch the remaining pages, if any.
    for i in range(2, max_pages + 1):
        resp = helper(url + f'?page={i}')
        users += process_page(BeautifulSoup(resp.content, 'html.parser'))
        if track:
            pbar.update(1)
    return users

def get_total_bookmark_pages(works):
    """Return the summed bookmark-page estimate over ``works``.

    A work whose estimate is unavailable (``None``) contributes 0 instead of
    raising ``TypeError``. An empty iterable yields 0.
    """
    return sum(estimate_bookmark_pages(work) or 0 for work in works)

def register_bookmarks_to_work(work, bookmarks, user_set):
    """Replace ``work.bookmarks`` with the given users and commit.

    Args:
        work: The work whose bookmarks are being stored.
        bookmarks: Iterable of usernames; duplicates are collapsed.
        user_set: Mutable set caching the usernames known to exist in the
            database. Names in the set are looked up; others get a new
            ``User``. New names are added only after the commit succeeds.

    Raises:
        Whatever ``session.commit()`` raises. In that case ``user_set`` is
        left untouched, so it stays valid after the caller rolls back.
    """
    created = []
    bookmarkers = []
    for username in set(bookmarks):
        user = session.query(User).get(username) if username in user_set else None
        if user is None:
            # Either genuinely new, or the cache was stale. Never append
            # None to the relationship.
            user = User(id=username)
            created.append(username)
        bookmarkers.append(user)

    work.bookmarks = bookmarkers
    work.bookmarks_ts = datetime.now()
    session.commit()
    # Publish the new names only now that their rows are persisted.
    user_set.update(created)



In [210]:
def process_works(works, user_set=None):
    """Scrape and store bookmarks for every work not yet processed.

    Args:
        works: Works to process. Those with a truthy ``bookmarks_ts`` are
            skipped quietly.
        user_set: Mutable set of usernames already present in the database.
            When omitted it is loaded from the ``users`` table.

    A failure on one work is rolled back, reported with ``print`` and does
    not stop the run. The progress bar is rewound so skipped or failed works
    do not count towards the page total.
    """
    if user_set is None:
        # The old default of None made every work fail on `username in None`.
        user_set = {row[0] for row in session.query(User.id)}

    est = get_total_bookmark_pages(works)
    with tqdm(total=est) as pbar:
        for work in works:
            pbar_start, total_start = pbar.n, pbar.total
            try:
                if work.bookmarks_ts:
                    # Already done: drop its pages from the total and move on
                    # without a rollback or an error message.
                    pbar.total -= estimate_bookmark_pages(work) or 0
                    continue
                bookmarks = get_bookmarks_for_work(work, pbar)
                register_bookmarks_to_work(work, bookmarks, user_set)
            except Exception as e:
                session.rollback()
                print(f"Couldn't process {work}")
                print(e)
                # Undo any progress recorded for this work, including total
                # corrections made before the failure.
                pbar.n = pbar_start
                pbar.total = total_start - (estimate_bookmark_pages(work) or 0)


In [219]:
# Usernames already stored in the database (each result row holds just the id).
user_set = {row[0] for row in session.query(User.id)}
# Estimated number of bookmark pages across all works; used as the progress total.
est = get_total_bookmark_pages(works)

In [221]:
# Scrape bookmarks for every work that has not been processed yet.
with tqdm(total=est) as pbar:
    for work in works:
        pbar_start, total_start = pbar.n, pbar.total
        try:
            if work.bookmarks_ts:
                # Already scraped on an earlier run: remove its pages from the
                # total and move on. This replaces raising Exception("skip")
                # and matching on the message text, which also swallowed any
                # genuine error whose message happened to contain "skip".
                pbar.total -= estimate_bookmark_pages(work) or 0
                continue
            bookmarks = get_bookmarks_for_work(work, pbar)
            register_bookmarks_to_work(work, bookmarks, user_set)
        except Exception as e:
            session.rollback()
            print(f"Couldn't process {work}")
            print(e)
            # Rewind the bar so the failed work contributes nothing; a missing
            # estimate (None) counts as zero pages.
            pbar.n = pbar_start
            pbar.total = total_start - (estimate_bookmark_pages(work) or 0)


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=18086.0), HTML(value='')))

Couldn't process Work(id=20114854, "Was it my Fault?" by GarbageFanfics)
404 Client Error: Not Found for url: https://www.archiveofourown.org/works/20114854/bookmarks
Couldn't process Work(id=20189578, "Uninvited" by GarbageFanfics)
404 Client Error: Not Found for url: https://www.archiveofourown.org/works/20189578/bookmarks
Couldn't process Work(id=24283147, "Don't Let the Bed Bugs Bite." by BlazingNerz)
404 Client Error: Not Found for url: https://www.archiveofourown.org/works/24283147/bookmarks
Couldn't process Work(id=24307228, "We Must Be Strong" by BlazingNerz)
404 Client Error: Not Found for url: https://www.archiveofourown.org/works/24307228/bookmarks
Couldn't process Work(id=24334198, "-Message Incoming-" by BlazingNerz)
404 Client Error: Not Found for url: https://www.archiveofourown.org/works/24334198/bookmarks
Couldn't process Work(id=25987297, ""I Just Came Here To The Party For The-"" by smalltiddygothgf)
404 Client Error: Not Found for url: https://www.archiveofourown.or

In [220]:
# Display the cached usernames. NOTE(review): this dumps the whole set into the
# cell output; consider showing only its size or a small sample.
user_set

{'apartment',
 'AuddaxEnderDragon',
 'simberthalutz',
 'Lovely_Lies',
 'Just_Roman_Around',
 'Quetampatolles',
 'lauralizzie',
 'dicksp8jr',
 'beingelsewhere',
 'GarlicPigs',
 'Jyair_M',
 'heidzz',
 'melekinh',
 'RabbitofWhite',
 'Adam29',
 'LightsThatNeverGoOut',
 'Awesome_Know_It_All',
 'GlassOwls',
 'MaddieMare',
 'LibraryMinion',
 'princessparkmanor',
 'SimonKilnsworth',
 'BunnieLouise',
 'mostlymeh',
 'BearShapedBastard',
 'telracsactually',
 'sospn',
 'Lady_Romanoff',
 'tellyisdreaming',
 'WereAh',
 'blooplesnoot',
 'irumimii',
 'CBeanz',
 'chiminies',
 'Didiza',
 'Auxcouleur',
 'Sbear45',
 'WolfyGames14',
 'bluemoonbibliophile',
 '8Antisocial_hyooman8',
 'minjiyoo',
 'ill_interrogate_the_cat_castiel',
 'gaykoifish',
 'Rowboat129',
 'alien__puppy',
 'ErrorScreen',
 'Lowqualityenglish',
 'MaryIsANerd',
 'FMusik',
 'georgiou',
 'Brooklin',
 'Camandrea7',
 'UponACrowsWing',
 'Ashstorm160',
 'Death_Madness',
 'spiiderblurbs',
 'zomnic',
 'vampwagner',
 '5elma',
 'helphelp11',
 'catgi

In [196]:
# NOTE(review): looks like a debugging leftover -- `e` is not read by any visible cell.
e = Exception('asdf')

In [188]:
# Manually discard the session's pending transaction state.
session.rollback()

In [157]:
# Number of works loaded from the database.
len(works)

11734