### Read in the MyAnimeList data as a pandas DataFrame

In [2]:
import pandas as pd

# Restrict the load to the title, genre and theme-song columns; every other
# column in the CSV is skipped by `usecols`.
columns = ['title', 'title_english', 'genre', 'opening_theme', 'ending_theme']

# Path is relative to the notebook's working directory.
df = pd.read_csv('../data/AnimeList.csv', usecols=columns)
# Last expression of the cell: show the first rows as a sanity check.
df.head()

Unnamed: 0,title,title_english,genre,opening_theme,ending_theme
0,Inu x Boku SS,Inu X Boku Secret Service,"Comedy, Supernatural, Romance, Shounen","['""Nirvana"" by MUCC']","['#1: ""Nirvana"" by MUCC (eps 1, 11-12)', '#2: ..."
1,Seto no Hanayome,My Bride is a Mermaid,"Comedy, Parody, Romance, School, Shounen","['""Romantic summer"" by SUN&LUNAR']","['#1: ""Ashita e no Hikari (明日への光)"" by Asuka Hi..."
2,Shugo Chara!! Doki,Shugo Chara!! Doki,"Comedy, Magic, School, Shoujo","['#1: ""Minna no Tamago (みんなのたまご)"" by Shugo Cha...","['#1: ""Rottara Rottara (ロッタラ ロッタラ)"" by Buono! ..."
3,Princess Tutu,Princess Tutu,"Comedy, Drama, Magic, Romance, Fantasy","['""Morning Grace"" by Ritsuko Okazaki']","['""Watashi No Ai Wa Chiisaikeredo"" by Ritsuko ..."
4,Bakuman. 3rd Season,Bakuman.,"Comedy, Drama, Romance, Shounen","['#1: ""Moshimo no Hanashi (もしもの話)"" by nano.RIP...","['#1: ""Pride on Everyday"" by Sphere (eps 1-13)..."


### Read in the pickled Reddit r/AnimeThemes wiki page

In [46]:
import pickle 
import bs4

# Load the cached wiki page and parse it as HTML.
# NOTE(security): pickle.load can execute arbitrary code while unpickling, so
# only run this on a pickle file you created yourself.
# NOTE(review): wiki_raw is presumably the page's HTML markup, since it is
# handed straight to the HTML parser -- confirm against the code that wrote it.
with open('../data/animethemes_wiki.pickle', 'rb') as f:
    wiki_raw = pickle.load(f)
    # The parser is named explicitly so bs4 does not have to guess one.
    soup = bs4.BeautifulSoup(wiki_raw, 'html.parser')

In [20]:
# Requires `soup` from the pickle-loading cell; run that cell first.
# Select the element carrying the 'md wiki' classes, then collect every
# paragraph inside it.
wiki = soup.find(class_='md wiki')
links = wiki.find_all('p')


# Inspect a single entry: the first <a> inside the second paragraph.
# Index 1 is just a sample pick for eyeballing the structure.
example_song = links[1].a
link = example_song.get('href')

# contents[0] is the anchor's first child, i.e. its visible label.
print('name:', example_song.contents[0])
print('link:', link)

name: .hack//Liminality (2002)
link: /r/AnimeThemes/wiki/2002#wiki_.hack.2F.2Fliminality


In [2]:
import json
import praw

# User-agent string passed to the Reddit API client below.
user_agent = 'AnimeThemes wiki scraper (by /u/nickyu42)'

# API credentials are kept out of the notebook: they are read from a local
# JSON file whose 'praw' entry provides 'client_id' and 'client_secret'.
with open('../data/credentials.json', 'r') as f:
    credentials = json.load(f)['praw']
    
reddit = praw.Reddit(client_id=credentials['client_id'], 
                     client_secret=credentials['client_secret'],
                     user_agent=user_agent)

# Handle on the subreddit's wiki; later cells look pages up by name,
# e.g. wiki_page['2002'].
wiki_page = reddit.subreddit('AnimeThemes').wiki

In [18]:
# Parse the rendered HTML of the '2002' wiki page.
# The parser is named explicitly (matching the pickle-loading cell): without
# it bs4 picks whichever parser happens to be installed and emits a
# GuessedAtParserWarning, so the tree could differ between environments.
soup1 = bs4.BeautifulSoup(wiki_page['2002'].content_html, 'html.parser')
wiki1 = soup1.find(class_='md wiki')

# Collect the <h3> headings (anime titles) and the <table> elements (their
# themes) in document order.
# Only the first 20 elements are kept while testing the parsing code.
links = wiki1.find_all(['h3', 'table'])[:20]

# Preview the first heading and its table.
links[:2]

[<h3 id="wiki_duel_masters"><a href="https://myanimelist.net/anime/1685/" rel="nofollow">Duel Masters</a></h3>,
 <table><thead>
 <tr>
 <th>Theme title</th>
 <th align="center">Links</th>
 <th align="center">Episodes</th>
 <th align="center">Notes</th>
 </tr>
 </thead><tbody>
 <tr>
 <td>OP ""</td>
 <td align="center"><a href="https://animethemes.moe/video/DuelMastersDub-OP1.webm" rel="nofollow">Webm</a></td>
 <td align="center"></td>
 <td align="center"></td>
 </tr>
 </tbody></table>]

In [57]:
def is_link(a):
    """Tell whether `a` is a bs4 tag that carries an `href` attribute.

    Anything that is not a `bs4.element.Tag` (for example a plain text node)
    yields False; for a tag, the result is whether `href` is present.
    """
    if not isinstance(a, bs4.element.Tag):
        return False
    return a.has_attr('href')


def parse_row(row):
    """Extract the theme name and video link from one table row.

    Parameters
    ----------
    row : bs4.element.Tag
        A ``<tr>`` whose first cell holds the theme name and whose second
        cell holds the link.

    Returns
    -------
    tuple
        ``(name, link)``. ``name`` is the ``.string`` of the first cell, or
        ``None`` when the row has no cells. ``link`` is the ``href`` of the
        first anchor in the second cell, or ``None`` when the row has no
        second cell or that cell contains no anchor with an ``href``.
    """
    # Look at the row's own cells only. Matching every descendant tag would
    # also pick up markup nested inside the name cell and push the link cell
    # out of position 1.
    cells = row.find_all(['td', 'th'], recursive=False)
    if not cells:
        return None, None

    name = cells[0].string
    if len(cells) < 2:
        return name, None

    # Searching the cell (rather than indexing its first child) tolerates an
    # empty cell or leading whitespace before the anchor.
    anchor = cells[1].find('a', href=True)
    if anchor is None:
        return name, None
    return name, anchor.get('href')
        

# Maps anime title -> list of (theme name, video link) tuples.
song_table = {}
# Title of the most recent <h3>; stays '' until the first heading is seen.
current_anime = ''

for element in links:
    if element.name == 'h3':
        # A heading starts a new anime section.
        current_anime = element.string

        # WARNING: two different anime sharing one title are not told apart.
        # setdefault at least keeps the songs already collected for that
        # title instead of resetting the list.
        song_table.setdefault(current_anime, [])

    else:
        # Anything that is not a heading is a theme table for current_anime.
        # setdefault also covers a table that appears before any heading,
        # which would otherwise raise KeyError on the '' key.
        songs = song_table.setdefault(current_anime, [])

        for row in element.tbody.find_all('tr'):
            name, link = parse_row(row)

            # Rows without a video link are skipped.
            if link is not None:
                songs.append((name, link))

# Print each anime followed by its themes, one per line.
for name, songs in song_table.items():
    print(name)
    print('\n'.join(f'- {song} :: {link}' for song, link in songs))

Duel Masters
- OP "" :: https://animethemes.moe/video/DuelMastersDub-OP1.webm
GetBackers
- OP1 "Yuragu Koto Nai Ai" :: https://animethemes.moe/video/GetBackers-OP1.webm
- OP2 V1 "Barairo no Sekai" :: https://animethemes.moe/video/GetBackers-OP2.webm
- OP2 V2 "Barairo no Sekai" :: https://animethemes.moe/video/GetBackers-OP2v2.webm
- ED1 "Ichibyo No Refrain" :: https://animethemes.moe/video/GetBackers-ED1.webm
- ED2 "Namida no Hurricane" :: https://animethemes.moe/video/GetBackers-ED2.webm
- ED3 "Mr. Déjà vu" :: https://animethemes.moe/video/GetBackers-ED3.webm
- ED4 "Changin'" :: https://animethemes.moe/video/GetBackers-ED4.webm
Ghost in the Shell: Stand Alone Complex
- OP "Inner Universe" :: https://animethemes.moe/video/GhostInTheShellSAC-OP1.webm
- ED "Lithium Flower" :: https://animethemes.moe/video/GhostInTheShellSAC-ED1.webm
- OP "GET9" :: https://animethemes.moe/video/GhostInTheShellSAC-OP2.webm
Green Green OVA
- OP "Green Green" :: https://animethemes.moe/video/GreenGreenOVA-OP