In [1]:
import requests
from bs4 import BeautifulSoup
import sqlite3
import os
import urllib
from retry import retry
import time
import pdb
from tqdm.notebook import tqdm 
import sqlite3
from ratelimiter import RateLimiter
import re
from ao3 import AO3
from ao3.works import Work as AWork
import sqlalchemy

from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, Integer, String, JSON, ForeignKey, Table
from sqlalchemy.orm import relationship, Session, configure_mappers

In [2]:
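# Uncomment to delete the SQLite file and start from an empty database (`del` is the Windows shell command).
# !del kudos.db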

##### schema

In [2]:
# Persistent SQLite file in the working directory; the commented-out line
# below swaps in an in-memory database instead.
engine = sqlalchemy.create_engine('sqlite:///kudos.db')
# engine = sqlalchemy.create_engine('sqlite:///:memory:')
Base = declarative_base()


def _user_work_link_table(name):
    """Build an association table with a `user` and a `work` foreign-key column."""
    return Table(
        name,
        Base.metadata,
        Column('user', String, ForeignKey('users.id')),
        Column('work', Integer, ForeignKey('works.id')),
        extend_existing=True,
    )


# Many-to-many link tables: who left kudos on / bookmarked which work.
kudos_table = _user_work_link_table('kudos')
bookmarks_table = _user_work_link_table('bookmarks')

class User(Base):
    """Row of the `users` table, keyed by a string id.

    `works` holds the Work rows whose `author_id` points at this user; the
    same relationship installs an `author` backref on Work.
    """
    __tablename__ = 'users'
    # Lets the cell be re-run in a live kernel without a "table already defined" error.
    __table_args__ = dict(extend_existing=True)

    id = Column(String, primary_key=True)
    works = relationship('Work', backref='author')

    def __repr__(self):
        return f"User(id='{self.id}')"

    
class Work(Base):
    """Row of the `works` table.

    Columns: integer `id` (primary key), `title`, nullable `author_id`
    (foreign key to users.id) and a nullable `json` payload.

    `kudos` and `bookmarks` are many-to-many relationships to User through
    the `kudos` / `bookmarks` association tables; each also installs a backref
    of the same name on User.
    """
    __tablename__ = 'works'
    __table_args__ = {'extend_existing': True}

    id = Column(Integer, primary_key=True)
    title = Column(String)
    author_id = Column(String, ForeignKey('users.id'), nullable=True)
    json = Column(JSON, nullable=True)

    kudos = relationship("User",
                         secondary=kudos_table,
                         backref='kudos')

    bookmarks = relationship("User",
                             secondary=bookmarks_table,
                             backref='bookmarks')

    def __repr__(self):
        # author_id is nullable (and may reference a user row that was never
        # stored), so `self.author` can be None; fall back to the raw column
        # value instead of raising AttributeError from inside a repr.
        author = self.author
        author_label = author.id if author is not None else self.author_id
        return f"Work(id={repr(self.id)}, \"{self.title}\" by {author_label})"

# Issue the DDL for the tables registered on Base.metadata (users, works, kudos, bookmarks).
Base.metadata.create_all(engine)

In [3]:
# ORM session shared by the ingestion, kudos and export cells below.
session = Session(bind=engine)
# Raw engine connection; no cell visible in this notebook references it.
conn = engine.connect()

In [4]:
# Shared throttle for every request to AO3: at most 12 calls per period of 41
# (presumably seconds — confirm against the ratelimiter package).
ratelimiter = RateLimiter(max_calls=12, period=41)

##### ingestion

In [5]:

def get_or_create(session, model, **kwargs):
    """Return the first `model` row matching `kwargs`, creating it if absent.

    A missing row is built from the same keyword arguments, added to
    `session` and committed immediately before being returned.
    """
    found = session.query(model).filter_by(**kwargs).first()
    if found:
        return found

    created = model(**kwargs)
    session.add(created)
    session.commit()
    return created
    
def awork_to_work(session, awork, works, users):
    """Stage a scraped work (and its author, if new) for insertion.

    Parameters
    ----------
    session : sqlalchemy.orm.Session
        Session the new rows are added to. Nothing is committed here.
    awork : object exposing `id`, `author`, `title` and `json()`
        The parsed work blurb.
    works : set
        Ids of works already stored or staged; updated in place.
    users : set
        Ids of users already stored or staged; updated in place.

    Everything that can fail while reading `awork` is evaluated before the
    session or the tracking sets are touched. Otherwise an exception in
    `awork.json()` would leave the author recorded in `users` even though the
    caller's rollback discards the row, and later works by that author would
    be stored with a dangling `author_id`.
    """
    author_id = awork.author
    work_id = awork.id

    new_work = None
    if work_id not in works:
        new_work = Work(id=work_id,
                        author_id=author_id,
                        title=awork.title,
                        json=awork.json())

    # Only mutate shared state once the work has been built successfully.
    if author_id not in users:
        session.add(User(id=author_id))
        users.add(author_id)
    if new_work is not None:
        session.add(new_work)
        works.add(work_id)


In [6]:
def ingest_page(session, page, works=None, users=None):
    """Load every work blurb from one saved listing page into the database.

    Parameters
    ----------
    session : sqlalchemy.orm.Session
        Session used for inserts; committed once per work.
    page : int or str
        Page number; the HTML is read from ./data/pages/shera-{page}.html.
    works, users : set, optional
        Ids already present, used to skip duplicates and updated in place.
        When omitted they are loaded from the database. (The previous
        `set()` defaults were created once and silently shared between calls.)

    A work that cannot be processed is reported and skipped; the session is
    rolled back so that the remaining blurbs can still be committed.
    """
    if works is None:
        works = {row[0] for row in session.query(Work.id).all()}
    if users is None:
        users = {row[0] for row in session.query(User.id).all()}

    with open(f'./data/pages/shera-{page}.html', 'rb') as fin:
        data = fin.read()
    soup = BeautifulSoup(data.decode('utf-8'))
    blurbs = soup.find('ol', {'class': 'work index group'.split()}) \
                .find_all('li', {'class': 'work blurb group'.split()})
    for blurb in blurbs:
        # Resolved inside the try, but bound beforehand so the error message
        # never refers to an undefined name or to the previous blurb's work.
        work_id = None
        try:
            work_id = int(blurb.a['href'].split('/')[-1])
            awork = AWork(work_id, soup=blurb)
            awork_to_work(session,
                          awork,
                          works,
                          users
                         )
            session.commit()
        except Exception as exc:
            # `Exception` rather than a bare except, so Ctrl-C still stops the run.
            print(f"Could not process work {work_id} on page {page} ({exc!r})")
            session.rollback()
            continue

In [23]:
# Seed the duplicate-detection sets with the ids already in the database,
# then ingest the saved listing pages 476 through 591.
works = {work_id for (work_id,) in session.query(Work.id).all()}
users = {user_id for (user_id,) in session.query(User.id).all()}
for page in tqdm(range(476, 592)):
    ingest_page(session, page, works, users)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=116.0), HTML(value='')))

Could not process work 27588422 on page 477
Could not process work 29148855 on page 543
Could not process work 29434971 on page 553



In [None]:
# Ids of the three works reported as "Could not process" by the ingestion run above.
# NOTE(review): the name suggests "no word count" as the cause — confirm.
no_wc = [27588422,29148855,29434971]

In [7]:
# Row count of the users table.
session.query(User).count()

4646

In [8]:
# Row count of the works table.
session.query(Work).count()

11734

##### kudos

In [29]:
class RatelimitedHttpHandler(object):
    """Fetch AO3 work pages through a rate limiter, retrying when throttled.

    Parameters
    ----------
    ratelimiter : context manager
        Entered around every HTTP request.
    sess : object with a ``get(url)`` method, optional
        HTTP session; a new ``requests.Session`` is created when omitted.
    max_retries : int
        How many times a 'Retry later' response is retried before giving up.
    retry_delay : int or float
        Seconds to sleep before each retry.
    """

    def __init__(self, ratelimiter, sess=None, max_retries=3, retry_delay=300):
        self.ratelimiter = ratelimiter
        self.sess = sess or requests.Session()
        self.retries = 0
        self.max_retries = max_retries
        self.retry_delay = retry_delay

    def _get(self, url, work_id):
        """GET `url` under the rate limiter, retrying 'Retry later' responses."""
        self.retries = 0
        while True:
            # Use the limiter this handler was built with, not a notebook global.
            with self.ratelimiter:
                req = self.sess.get(url)

            if req.status_code == 200:
                self.retries = 0
                return req
            if req.status_code == 404:
                raise LookupError('Unable to find a work with id %r' % work_id)
            if req.text == 'Retry later' and self.retries < self.max_retries:
                self.retries += 1
                time.sleep(self.retry_delay)
                continue

            # Out of retries, or an unexpected status: fail loudly instead of
            # handing an error page back to the caller as if it were the work.
            self.retries = 0
            raise RuntimeError(
                'Unexpected response from AO3 for work %r: status %r'
                % (work_id, req.status_code))

    def get_work_soup(self, work_id):
        """Get the HTML of a given work.

        Parameters
        ----------
        work_id : str or int
            AO3 ID of work.

        Returns
        -------
        str
            Raw HTML text of the work page (not a parsed BeautifulSoup object).

        Raises
        ------
        LookupError
            The work does not exist (HTTP 404).
        RuntimeError
            The work requires login, the retries were exhausted, or AO3
            answered with an unexpected status code.
        """
        url = 'https://archiveofourown.org/works/%s?view_adult=true' % work_id
        req = self._get(url, work_id)

        # For some works, AO3 throws up an interstitial page asking you to
        # confirm that you really want to see the adult works.  Yes, we do.
        if 'This work could have adult content' in req.text:
            req = self._get(url, work_id)

        if 'This work is only available to registered users' in req.text:
            raise RuntimeError('Looking at work ID {} requires login'.format(work_id))

        return req.text


In [5]:
# HTTP session shared by `helper` and the kudos scrapers below.
# The headers mark each request as an XMLHttpRequest that prefers JavaScript.
# NOTE(review): presumably this makes AO3 answer the kudos endpoint with its
# JavaScript fragment, which the `[:-2]` slicing in get_kudos_for_work appears
# to rely on — confirm before changing these.
s = requests.Session()
s.headers.update({
    'accept': '*/*;q=0.5, text/javascript, application/javascript, application/ecmascript, application/x-ecmascript',
    'accept-encoding': 'gzip, deflate, br',
    'x-requested-with': 'XMLHttpRequest',
})

In [6]:
@retry(RuntimeError, tries=4, delay=2, backoff=5, max_delay=300)
def helper(url):
    """GET `url` with the shared session `s` and return the response.

    A non-200 reply whose body contains "retry later" (case-insensitive)
    raises RuntimeError, which is the exception the `retry` decorator is
    configured to retry on. Any other non-200 reply goes through
    `raise_for_status()`; if that does not raise, the response is returned.
    """
    response = s.get(url)
    if response.status_code == 200:
        return response

    throttled = 'retry later' in response.text.lower()
    if throttled:
        raise RuntimeError("Rate limit reached")

    response.raise_for_status()
    return response

def get_kudos_for_work(work_id):
    """Yield the kudos-givers of a work, one list of user names per page.

    `work_id` may be an id or a Work instance. Pages are fetched through
    `helper` under the shared `ratelimiter`; iteration follows the
    `kudos?before=<n>` cursor found in each response and stops when a
    response contains none.
    """
    if isinstance(work_id, Work):
        work_id = work_id.id

    base_url = f'https://www.archiveofourown.org/works/{work_id}/kudos'
    user_link = re.compile(r'/users/.*')
    before = None
    while True:
        url = (base_url+f'?before={before}') if before else base_url
        with ratelimiter:
            resp = helper(url)

        soup = BeautifulSoup(resp.content)
        # The user name is the last path segment of each /users/ link minus its
        # final two characters — presumably an escaped quote left over from the
        # JavaScript-wrapped response; confirm.
        yield [anchor['href'].split('/')[-1][:-2]
               for anchor in soup.find_all('a', {'href': user_link})]

        cursor = re.search(r'.*\/kudos\?before=(\d*).*', resp.content.decode())
        if not cursor:
            return
        before = cursor.group(1)
        
def get_kudos_for_work_list(work_id):
    """Return every kudos-giver of a work as one flat list of user names.

    `work_id` may be an id or a Work instance. This is the eager counterpart
    of `get_kudos_for_work` and delegates to it, so both share the same
    paging, rate limiting and retry handling. The previous copy of the loop
    called `s.get` directly, without any status check, and stopped in a
    leftover `breakpoint()` on every page.
    """
    return [user
            for page_users in get_kudos_for_work(work_id)
            for user in page_users]

In [7]:
def insert_kudos_for_work(work, user_set):
    """Replace `work.kudos` with the users scraped from AO3 and commit.

    Parameters
    ----------
    work : Work
        Work whose kudos are (re)loaded; its current kudos list is cleared first.
    user_set : set
        Ids of users already in the database. Unknown kudos-givers are created
        as new User rows and their ids are added to this set.

    If anything fails, the ids added to `user_set` during this call are
    removed again before the exception is re-raised. The caller is expected to
    roll the session back, which discards the new User rows; leaving their ids
    in `user_set` would make later works look them up in the database, find
    nothing, and silently lose those kudos.
    """
    added_ids = []
    try:
        work.kudos = []
        for user_list in get_kudos_for_work(work.id):
            existing_users = []
            new_users = []
            for user in user_list:
                if user not in user_set:
                    new_users.append(User(id=user))
                    user_set.add(user)
                    added_ids.append(user)
                else:
                    existing_users.append(user)
            work.kudos += (session.query(User)
                          .filter(User.id.in_(existing_users))
                          .all()) + new_users
            session.add_all(new_users)
        session.commit()
    except BaseException:
        # BaseException so an interrupt mid-scrape also restores the set;
        # the exception is always re-raised.
        user_set.difference_update(added_ids)
        raise
            

In [8]:
# Ids of every user currently stored; insert_kudos_for_work uses this to tell
# existing users from ones it has to create.
user_set = {x.id for x in session.query(User)}
# All works, loaded up front for the kudos scraping loop.
works = session.query(Work).all()

In [16]:
# Scrape kudos for every work from index 1400 onwards, skipping works that
# already have kudos stored.
for work in tqdm(works[1400:]):
    if work.kudos:
        continue
    try:
        insert_kudos_for_work(work, user_set)
    except Exception as exc:
        # Roll back first: printing the work loads attributes through the
        # session, which is unusable until a failed transaction is rolled back.
        session.rollback()
        print(f"Failed to insert kudos for {work}: {exc!r}")
    except BaseException:
        # Interrupts stop the loop, but must not leave a half-filled kudos
        # list pending in the session where a later commit would persist it.
        session.rollback()
        raise

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10334.0), HTML(value='')))

Failed to insert kudos for Work(id=22129282, "A Night to Remember" by Alice_h)
Failed to insert kudos for Work(id=24318193, "finally, a future" by scatteredpeas)



In [21]:
# Reload all works before the CSV export below.
works = session.query(Work).all()

In [22]:
import csv

In [27]:
# Export one row per (work, kudos-giver) pair: title, work id, user id.
# Fields are written as text: passing `bytes` to csv.writer stores their repr
# (b'...') in the file. The file handles the UTF-8 encoding, and newline=''
# lets the csv module control line endings itself.
with open('kudos.csv', 'w', newline='', encoding='utf-8') as fout:
    writer = csv.writer(fout, quoting=csv.QUOTE_ALL)
    for work in tqdm(works):
        for user in work.kudos:
            writer.writerow([work.title, work.id, user.id])

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=11734.0), HTML(value='')))




In [31]:
# Fetch a single user by primary key.
me = session.query(User).get('McRibbedForHerPleasure')

In [34]:
# Number of works this user left kudos on (User.kudos is the backref of Work.kudos).
len(me.kudos)

98