### This is a script you can use and modify to see what your historical/recent AO3 activity has been. It keeps your login credentials local to your computer, so you don't have to worry about compromising your account. You can select whether to look at all your history or just what you've bookmarked, and it will tell you what your most commonly read freeform tags, characters, and relationships are. It will also tell you the total and average wordcount of fics you've read in the set you select. If some freeform tags should logically be grouped together (e.g., "Slow Burn" means the same thing as "Slooooow Burn"), you can do that, and if you'd like to ignore some tags, you can do that also.

### You will need to fill in a few pieces of information, specifically the headers and your username. Everything else you can leave as-is, though you can also modify it if you want. 

### Here are instructions on how to get the headers: https://stackoverflow.com/questions/23102833/how-to-scrape-a-website-which-requires-login-using-python-and-beautifulsoup

### Tl;dr
1. In your browser, open the developer tools or inspect mode
2. Go to AO3's website and log in
3. After logging in, go to the Network tab, and then refresh the page
4. Right click the site request (the top one), hover over copy, and then copy as cURL
5. Then go to this site which converts cURL into python requests: https://curl.trillworks.com/
6. Add 'bot' to the end of the user-agent per AO3's request
7. Take the python code and use the generated headers to proceed

## User inputs

In [None]:
# Request headers that are passed to every requests.get() call in the scraping cells below.
# Fill this parameter according to the directions above: the values come from the
# "copy as cURL" output of a request made by your browser after logging in.
# The formatting should look a bit like the template below, but with the empty strings filled in.
# NOTE: the values are copied from a logged-in browser request (the cookie in particular),
# so avoid sharing a filled-in copy of this cell.
# Please also add 'bot' to the end of your user-agent string to play nice with AO3:
# https://archiveofourown.org/admin_posts/18804
headers = {
    'authority': 'archiveofourown.org',
    'cache-control': '',
    'sec-ch-ua': '',
    'sec-ch-ua-mobile': '',
    'sec-ch-ua-platform': '',
    'upgrade-insecure-requests': '',
    'user-agent': '',
    'accept': '',
    'sec-fetch-site': '',
    'sec-fetch-mode': '',
    'sec-fetch-user': '',
    'sec-fetch-dest': '',
    'referer': 'https://archiveofourown.org/users/',
    'accept-language': 'en-US,en;q=0.9',
    'cookie': '',
} 

# --- Account and scraping scope ---------------------------------------------

# your AO3 username (used to build the bookmark and history URLs)
username = 'octocombo'

# how many pages to scrape, counted from the most recent page
num_pages_bookmark = 5   # pages of bookmarks
num_pages_hist = 15      # pages of full reading history

# how many of the top tags / tag groups to print in the results
num_tags_wanted = 25

# --- Tag grouping -----------------------------------------------------------

# Each inner list is one group of tags that should be counted as the same tag.
# Use this for tags that cannot easily be captured by a regular expression.
# Matching is exact and case-sensitive.
equivalencies = [
    [
        'Specific Tag Number 1',
        'Similar Tag Number 2',
        'Related Tag Number 3',
    ],
    [
        'First Time',
        'First Kiss',
        'Loss of Virginity',
        'First Love',
    ],
]

# Each entry is a regular expression; every tag matching it is counted in one group.
# Patterns are applied with re.match (anchored at the start of the tag, hence the
# leading '.*') and matching is NOT case-sensitive.
stem_list = [
    '.*the period means any character, the asterisk means zero or more times.*',
    '(put things in parentheses with a pipe in between)|(if there are a few options)',
    '.*Slow B.*',
    '(.*Non-Canon.*)|(.*Crack.*)',
]

# --- Tags to ignore ---------------------------------------------------------

# Exact tags to leave out of the counts, e.g. very common tags or tags whose
# frequency you don't care about. Matching is exact and case-sensitive.
blacklist = [
    'Canon-Typical Violence',
    'Getting Together',
    'Fluff and Smut',
    'Fluff',
]

# Regular expressions for tags to leave out of the counts because they are not
# meaningful to you. Applied like stem_list: re.match, NOT case-sensitive.
stemblacklist = [
    '(.*Alternate Universe.*)|(.* AU.*)',
    '.*No Beta.*',
]


##  Imports and class definitions

In [None]:
import requests
from bs4 import BeautifulSoup
import re
import time

# number of seconds passed to time.sleep() before every page request in the scraping cells
delay = 5

class ao3_tag:
    """One counted tag (or tag group) in the results.

    Attributes:
        count:  how many times the tag has been seen so far.
        text:   what is printed for this entry by get_data().
        equivs: the tag texts that count towards this entry.
    """

    def __init__(self, c=1, t=''):
        """Create an entry with starting count `c` and label `t`."""
        self.count = c
        self.text = t

        # A tag normally only stands for itself ...
        self.equivs = [self.text]
        # ... unless it is listed in one of the user's equivalency groups, in
        # which case the whole group counts towards this entry. Every group is
        # checked, so if the tag is in several groups the last one is used.
        for group in equivalencies:
            if self.text in group:
                self.equivs = group

    def get_data(self):
        """Print this entry as '<count> - <text>'."""
        print(f'{self.count} - {self.text}')

    def get_count(self):
        """Return the current count."""
        return self.count

    def get_text(self):
        """Return the label of this entry."""
        return self.text

    def add_text(self, t=''):
        """Append `t` to the label."""
        self.text = self.text + t

    def get_equivs(self):
        """Return the tag texts that count towards this entry."""
        return self.equivs

    def increment(self, adder=1):
        """Raise the count by `adder` (1 by default)."""
        self.count += adder

## Collect URLs to scrape

In [None]:
# Bookmark URLs
# Run this cell if you want the results to cover only the fics you have bookmarked

# start from empty accumulators: one match list and one counter per stem_list entry
tag_list = []
stem_add_blacklist = []
stem_add_list = [[] for _ in stem_list]
stem_counts = [0 for _ in stem_list]

# the first page has no page parameter; later pages are numbered from 2 upwards
bookmark0 = 'https://archiveofourown.org/users/' + username + '/bookmarks'
bookmarkx = 'https://archiveofourown.org/users/' + username + '/bookmarks?page='
urls = [bookmark0]
urls.extend(bookmarkx + str(page) for page in range(2, num_pages_bookmark + 1))

In [None]:
# History URLs
# Run this cell if you want the results to cover all of your recent history.
# It replaces `urls`, so running it after the bookmark cell means the later
# cells look at your full history, not just your bookmarks.

# reset the accumulators: one match list and one counter per stem_list entry
stem_counts = [0 for _ in stem_list]
stem_add_list = [[] for _ in stem_list]
stem_add_blacklist = []
tag_list = []

# page 1 is the plain readings URL; pages 2..num_pages_hist carry a page parameter
history0 = 'https://archiveofourown.org/users/' + username + '/readings'
historyx = 'https://archiveofourown.org/users/' + username + '/readings?page='
later_pages = [historyx + str(page_number) for page_number in range(2, num_pages_hist + 1)]
urls = [history0] + later_pages

## Freeform tags

In [None]:
# Scrape the freeform tags from every page in `urls` and count them.
# Each tag that is not on the exact blacklist is handled by the first rule that applies:
#   1. it matches one or more stem_list patterns -> counted in each matching regex group
#   2. it matches a stemblacklist pattern        -> ignored (but remembered in stem_add_blacklist)
#   3. it belongs to an already-seen tag entry   -> that entry's count goes up
#   4. otherwise                                 -> a new entry is created with count 1
# The output of this cell is the number of distinct tag entries (rule 3/4) that were found.

# seconds to wait for a response; without a timeout a stalled connection would hang the cell forever
request_timeout = 60

for u in urls:
    time.sleep(delay)  # pause before every request
    response = requests.get(u, headers=headers, timeout=request_timeout)
    # Stop with an error on a 4xx/5xx response (e.g. rate limiting) instead of
    # silently parsing an error page and counting no tags for it.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    for tag_block in soup.find_all("ul", class_="tags commas"):
        for t in tag_block.find_all("li", class_="freeforms"):
            tx = t.text
            if tx in blacklist:
                continue

            # 1. regex groups: a tag may match (and be counted in) several groups
            found = False
            for i, stem in enumerate(stem_list):
                if re.match(stem, tx, re.IGNORECASE) is not None:
                    found = True
                    if tx not in stem_add_list[i]:
                        stem_add_list[i].append(tx)
                    stem_counts[i] += 1
            if found:
                continue

            # 2. regex blacklist: remember what was filtered so it can be reviewed later
            if any(re.match(pattern, tx, re.IGNORECASE) is not None for pattern in stemblacklist):
                if tx not in stem_add_blacklist:
                    stem_add_blacklist.append(tx)
                continue

            # 3. a tag (or one of its equivalents) that has been seen before
            for known_tag in tag_list:
                if tx in known_tag.get_equivs():
                    known_tag.increment()
                    found = True

            # 4. first time this tag is seen
            if not found:
                tag_list.append(ao3_tag(t=tx))

print(len(tag_list))

In [None]:
# Get results
# The number is how many times the tag or tag group appeared in fics you read
# If you have groupings, everything that was included in that grouping will be listed together
# If you find this list includes a lot of tags you don't care about, add them to the blacklist and re-run
# If you find this list has a lot of similar tags, you can group them in the initial parameters and re-run
# These will be sorted from most common to least common, if there's a tie the sorting is arbitrary
# At most num_tags_wanted results are printed (fewer if fewer tags were found)

# Turn every regex group into a result entry: its label is the list of tags that
# matched the pattern, its count is the number of matches.
stem_tags = [ao3_tag(t=matched, c=hits) for matched, hits in zip(stem_add_list, stem_counts)]

# Combine into a new list instead of appending to tag_list, so that re-running
# this cell does not add the regex groups a second time.
sortedlist = sorted(tag_list + stem_tags, key=lambda tag: tag.count, reverse=True)

# Slicing (rather than indexing 0..num_tags_wanted-1) avoids an IndexError when
# there are fewer results than requested.
for entry in sortedlist[:max(num_tags_wanted, 0)]:
    entry.get_data()

In [None]:
# If you want to check that your regex blacklist isn't filtering out things that matter to you,
# you can see which tags were filtered out by the regex blacklist by running this print
print(stem_add_blacklist)

## Character or relationship tags

In [None]:
# Get characters or relationships
# Counts the relationship (or character) tags on every page in `urls` and prints the most common ones.

# which kind of tag to count: leave as "relationships", or change to "characters"
tag_kind = "relationships"

# seconds to wait for a response; without a timeout a stalled connection would hang the cell forever
request_timeout = 60

tag_list = []
for u in urls:
    time.sleep(delay)  # pause before every request
    response = requests.get(u, headers=headers, timeout=request_timeout)
    # Stop with an error on a 4xx/5xx response (e.g. rate limiting) instead of
    # silently parsing an error page and counting no tags for it.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    for tag_block in soup.find_all("ul", class_="tags commas"):
        for t in tag_block.find_all("li", class_=tag_kind):
            tx = t.text
            if tx in blacklist:
                continue

            # count the tag towards an existing entry if it (or an equivalent) was seen before
            found = False
            for known_tag in tag_list:
                if tx in known_tag.get_equivs():
                    known_tag.increment()
                    found = True
            if not found:
                tag_list.append(ao3_tag(t=tx))

# number of distinct tags found
print(len(tag_list))

sortedlist = sorted(tag_list, key=lambda tag: tag.count, reverse=True)

# Slicing (rather than indexing 0..num_tags_wanted-1) avoids an IndexError when
# there are fewer distinct tags than requested.
for entry in sortedlist[:max(num_tags_wanted, 0)]:
    entry.get_data()

## Word count

In [None]:
# Add up the word counts of every fic listed on the pages in `urls`
# and print the total and the average per fic.
fic_count = 0
word_sum = 0

# seconds to wait for a response; without a timeout a stalled connection would hang the cell forever
request_timeout = 60

for u in urls:
    time.sleep(delay)  # pause before every request
    response = requests.get(u, headers=headers, timeout=request_timeout)
    # Stop with an error on a 4xx/5xx response (e.g. rate limiting) instead of
    # silently parsing an error page and finding no word counts on it.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    for stats in soup.find_all("dl", class_="stats"):
        for words in stats.find_all("dd", class_="words"):
            # word counts are written with thousands separators, e.g. "12,345"
            digits = words.text.replace(',', '').strip()
            if not digits.isdecimal():
                # a blank or non-numeric entry would make int() raise; leave it out of the totals
                continue
            fic_count += 1
            word_sum += int(digits)

print("Total words in all included fics: "+str(word_sum))
if fic_count:
    print("Avg words per fic: "+ str(word_sum / fic_count))
else:
    # no fic with a word count was found, so there is nothing to average (avoids dividing by zero)
    print("Avg words per fic: n/a (no fics with a word count were found)")