### Read in the MyAnimeList data as a pandas DataFrame

In [1]:
import pandas as pd

# only these columns are read from the CSV (passed to `usecols` below)
columns = ['title', 'title_english', 'title_japanese', 'genre', 'opening_theme', 'ending_theme']

# the path is relative, so it resolves against the notebook's working directory
df = pd.read_csv('../analysis/data/AnimeList.csv', usecols=columns)
df.head()

Unnamed: 0,title,title_english,title_japanese,genre,opening_theme,ending_theme
0,Inu x Boku SS,Inu X Boku Secret Service,妖狐×僕SS,"Comedy, Supernatural, Romance, Shounen","['""Nirvana"" by MUCC']","['#1: ""Nirvana"" by MUCC (eps 1, 11-12)', '#2: ..."
1,Seto no Hanayome,My Bride is a Mermaid,瀬戸の花嫁,"Comedy, Parody, Romance, School, Shounen","['""Romantic summer"" by SUN&LUNAR']","['#1: ""Ashita e no Hikari (明日への光)"" by Asuka Hi..."
2,Shugo Chara!! Doki,Shugo Chara!! Doki,しゅごキャラ！！どきっ,"Comedy, Magic, School, Shoujo","['#1: ""Minna no Tamago (みんなのたまご)"" by Shugo Cha...","['#1: ""Rottara Rottara (ロッタラ ロッタラ)"" by Buono! ..."
3,Princess Tutu,Princess Tutu,プリンセスチュチュ,"Comedy, Drama, Magic, Romance, Fantasy","['""Morning Grace"" by Ritsuko Okazaki']","['""Watashi No Ai Wa Chiisaikeredo"" by Ritsuko ..."
4,Bakuman. 3rd Season,Bakuman.,バクマン。,"Comedy, Drama, Romance, Shounen","['#1: ""Moshimo no Hanashi (もしもの話)"" by nano.RIP...","['#1: ""Pride on Everyday"" by Sphere (eps 1-13)..."


In [2]:
def filter_genre(genre):
    """Build a row predicate that tests for membership of `genre`.

    The returned function reads `row['genre']`, which is expected to be a
    ', '-separated string, and returns True only when `genre` equals one of
    the separated entries exactly.
    """
    def wrap(row):
        raw = row['genre']

        # anything that is not a string (presumably a missing value) never matches
        if not isinstance(raw, str):
            return False

        return genre in raw.split(', ')

    return wrap

# evaluate one boolean row mask per genre, then select the matching rows
shounen_mask = df.apply(filter_genre('Shounen'), axis=1)
shoujo_mask = df.apply(filter_genre('Shoujo'), axis=1)

df_shounen = df[shounen_mask]
df_shoujo = df[shoujo_mask]
df_shoujo.head()

Unnamed: 0,title,title_english,title_japanese,genre,opening_theme,ending_theme
2,Shugo Chara!! Doki,Shugo Chara!! Doki,しゅごキャラ！！どきっ,"Comedy, Magic, School, Shoujo","['#1: ""Minna no Tamago (みんなのたまご)"" by Shugo Cha...","['#1: ""Rottara Rottara (ロッタラ ロッタラ)"" by Buono! ..."
5,Yume-iro Pâtissière,,夢色パティシエール,"Kids, School, Shoujo","['""Yume ni Yell! Patissiere♪ (夢にエール！パティシエール♪)""...","['""Ichigo no Miracle (いちごのミラクルール)"" by Yukina S..."
6,Ultra Maniac,Ultramaniac - Magical Girl,ウルトラマニアック,"Magic, Comedy, Romance, School, Shoujo","['""Kagami no Naka"" by can/goo']","['""Hitotsu=Unmei Kyoudoutai"" by can/goo']"
9,Ouran Koukou Host Club,Ouran High School Host Club,桜蘭高校ホスト部,"Comedy, Harem, Romance, School, Shoujo","['""Sakura Kiss"" by Chieko Kawabe']","['""Shissou"" by Last Alliance']"
11,Kaichou wa Maid-sama!,Maid Sama!,会長はメイド様!,"Comedy, Romance, School, Shoujo","['""My Secret"" by Saaya Mizuno (水野佐彩)']","['#1: ""Yokan (予感)"" by heidi. (eps 1-15)', '#2:..."


### Read in the pickled reddit /r/AnimeThemes wiki page

In [3]:
import pickle 
import bs4

# NOTE: pickle.load can run arbitrary code from the file it reads;
# only load a pickle that comes from a trusted source
with open('../analysis/data/animethemes_wiki.pickle', 'rb') as f:
    wiki_raw = pickle.load(f)

# the file handle is no longer needed once the raw page has been unpickled
soup = bs4.BeautifulSoup(wiki_raw, 'html.parser')

# collect every <p> inside the wiki body
wiki = soup.find(class_='md wiki')
links = wiki.find_all('p')

# show the first link-bearing paragraph as an example
example_song = links[1].a
example_link = example_song.get('href')

print('name:', example_song.string)
print('link:', example_link)

name: .hack//Liminality (2002)
link: /r/AnimeThemes/wiki/2002#wiki_.hack.2F.2Fliminality


In [4]:
existing_links = {}

# the first <p> is a message, so ignore
for paragraph in links[1:]:
    anchor = paragraph.a

    # skip paragraphs that hold no link, or whose link text is not a single string
    if anchor is None or anchor.string is None:
        continue

    # the names are in the format <name> (<year>)
    # split from the right so that only the trailing "(<year>)" is removed;
    # parentheses that belong to the name itself, e.g. "<name> (TV) (<year>)",
    # stay part of the key
    name = anchor.string.rsplit('(', 1)[0].strip()

    # add each link with the anime name as key
    # NOTE: a later entry with the same name overwrites the earlier one;
    # that is accepted, as we don't care which version of op is used
    # as long as both have same genres
    existing_links[name] = anchor.get('href')

In [5]:
def pre_process_title(title):
    """Placeholder: should create multiple versions of given title.

    TODO: not implemented yet; the function currently does nothing and
    returns None.
    """


def find_samples(df, sample_size, links=None):
    """Find anime in df that have a link on /r/AnimeThemes.

    Each row is tried under its 'title', 'title_english' and
    'title_japanese' values, in that order. The first value that is a key of
    `links` is collected and the remaining columns of that row are skipped.

    Args:
        df: object whose `iterrows()` yields (index, row) pairs, where each
            row supports lookup by the three title column names.
        sample_size: maximum number of names to collect. Zero or a negative
            value yields an empty list.
        links: mapping keyed by anime name. When omitted, the notebook-level
            `existing_links` mapping is used.

    Returns:
        List of the matched names in row order, at most `sample_size` long.
        It is shorter when df runs out of matching rows first.
    """
    if links is None:
        links = existing_links

    result = []

    # a non-positive size can never be reached by counting matches upwards,
    # so answer immediately instead of scanning the whole frame
    if sample_size <= 0:
        return result

    title_columns = ('title', 'title_english', 'title_japanese')

    for _, row in df.iterrows():
        for column in title_columns:
            if row[column] in links:
                result.append(row[column])
                # one hit per row is enough
                break

        if len(result) >= sample_size:
            break

    return result
    
    
# sample count per label
# N shounen/ N shoujo
# this is an upper bound: a list ends up shorter when fewer rows have a matching link
SAMPLE_SIZE = 100

shounen_samples = find_samples(df_shounen, SAMPLE_SIZE)
shoujo_samples = find_samples(df_shoujo, SAMPLE_SIZE)

# preview the first few matched names
shoujo_samples[:5]

['Shugo Chara!! Doki',
 'Ouran Koukou Host Club',
 'Kaichou wa Maid-sama!',
 'Tonari no Kaibutsu-kun',
 'Kimi ni Todoke']

### Get download links from /r/AnimeThemes wiki

In [6]:
import json
import praw

# user agent string handed to praw.Reddit below
user_agent = 'AnimeThemes wiki scraper (by /u/nickyu42)'

# the client id and secret are read from the 'praw' entry of a local JSON file,
# so they do not appear in the notebook itself
with open('../analysis/data/credentials.json', 'r') as f:
    credentials = json.load(f)['praw']
    
reddit = praw.Reddit(client_id=credentials['client_id'], 
                     client_secret=credentials['client_secret'],
                     user_agent=user_agent)

# handle to the wiki of /r/AnimeThemes; individual pages are looked up by name
wiki_page = reddit.subreddit('AnimeThemes').wiki

In [7]:
# name the parser explicitly so the parse tree does not depend on which
# parser libraries happen to be installed
soup1 = bs4.BeautifulSoup(wiki_page['2002'].content_html, 'html.parser')
wiki1 = soup1.find(class_='md wiki')

# only use 20 in the table for test
# <h3> headings and <table> elements come back interleaved, in document order
# NOTE: this rebinds `links`, replacing the list of <p> tags bound to that name earlier
links = wiki1.find_all(['h3', 'table'])[:20]

links[:2]

[<h3 id="wiki_duel_masters"><a href="https://myanimelist.net/anime/1685/" rel="nofollow">Duel Masters</a></h3>,
 <table><thead>
 <tr>
 <th>Theme title</th>
 <th align="center">Links</th>
 <th align="center">Episodes</th>
 <th align="center">Notes</th>
 </tr>
 </thead><tbody>
 <tr>
 <td>OP ""</td>
 <td align="center"><a href="https://animethemes.moe/video/DuelMastersDub-OP1.webm" rel="nofollow">Webm</a></td>
 <td align="center"></td>
 <td align="center"></td>
 </tr>
 </tbody></table>]

In [8]:
def is_link(a):
    """Return True when `a` is a bs4 Tag that carries an 'href' attribute."""
    # plain strings and other non-Tag nodes cannot hold attributes
    if not isinstance(a, bs4.element.Tag):
        return False

    return a.has_attr('href')


def parse_row(row):
    """Extract the theme name and its link from one table row.

    Args:
        row: a bs4 <tr> tag. Its first cell is read as the theme name and
            its second cell as the link column.

    Returns:
        A (name, href) tuple when the second cell starts with a link,
        otherwise (name, None). `name` is the `.string` of the first cell,
        or None when the row has no cells at all.
    """
    # look only at the row's own cells; matching every descendant tag would
    # also return tags nested inside a cell and shift the cell positions
    cols = row.find_all(['td', 'th'], recursive=False)

    if not cols:
        return None, None

    name = cols[0].string

    # a missing or empty link cell has no first child to inspect
    if len(cols) > 1 and cols[1].contents:
        first_child = cols[1].contents[0]

        if is_link(first_child):
            # name, link
            return name, first_child.get('href')

    return name, None
        

# maps anime name -> list of (theme name, link) tuples
song_table = {}
current_anime = ''

for node in links:
    if node.name == 'h3':
        # a heading starts a new anime; the tables that follow belong to it
        current_anime = node.string

        # WARNING: duplicate detection is not done
        # another anime with the same name may break links
        song_table[current_anime] = []
        continue

    # anything that is not a heading is treated as a table of themes
    parsed = map(parse_row, node.tbody.find_all('tr'))
    with_link = [pair for pair in parsed if pair[1] is not None]

    # rows without a link are dropped
    if with_link:
        song_table[current_anime].extend(with_link)

for anime, themes in song_table.items():
    print(anime)
    print('\n'.join(f'- {song} :: {link}' for song, link in themes))

Duel Masters
- OP "" :: https://animethemes.moe/video/DuelMastersDub-OP1.webm
GetBackers
- OP1 "Yuragu Koto Nai Ai" :: https://animethemes.moe/video/GetBackers-OP1.webm
- OP2 V1 "Barairo no Sekai" :: https://animethemes.moe/video/GetBackers-OP2.webm
- OP2 V2 "Barairo no Sekai" :: https://animethemes.moe/video/GetBackers-OP2v2.webm
- ED1 "Ichibyo No Refrain" :: https://animethemes.moe/video/GetBackers-ED1.webm
- ED2 "Namida no Hurricane" :: https://animethemes.moe/video/GetBackers-ED2.webm
- ED3 "Mr. Déjà vu" :: https://animethemes.moe/video/GetBackers-ED3.webm
- ED4 "Changin'" :: https://animethemes.moe/video/GetBackers-ED4.webm
Ghost in the Shell: Stand Alone Complex
- OP "Inner Universe" :: https://animethemes.moe/video/GhostInTheShellSAC-OP1.webm
- ED "Lithium Flower" :: https://animethemes.moe/video/GhostInTheShellSAC-ED1.webm
- OP "GET9" :: https://animethemes.moe/video/GhostInTheShellSAC-OP2.webm
Green Green OVA
- OP "Green Green" :: https://animethemes.moe/video/GreenGreenOVA-OP

In [19]:
# draw one row from the shounen subset and keep its title column;
# the fixed seed makes the draw repeatable across kernel restarts
s = df_shounen.sample(random_state=42)['title']

In [20]:
# indexing the sampled frame by column gives a Series, not a plain string;
# `.values` exposes the underlying array of titles
print(type(s))
s.values

<class 'pandas.core.series.Series'>


array(['Doraemon Movie 22: Nobita to Tsubasa no Yuusha-tachi'],
      dtype=object)