In [5]:
from pathlib import Path

# Gather every dumped *.wiki file and order them by the integer value of
# the file stem (so "2" sorts before "10").
articles = list(Path("./dump").glob("*.wiki"))
articles.sort(key=lambda wiki_file: int(wiki_file.stem))

# Stems of articles that are excluded from further processing.
IGNORE_LIST = [
    '39',  # Sandbox
]

print('all_articles:', len(articles))

# Keep only the articles whose stem is not listed in IGNORE_LIST.
articles = list(filter(lambda wiki_file: wiki_file.stem not in IGNORE_LIST, articles))

print('content_articles', len(articles))

all_articles: 10246
content_articles 10245


In [6]:
import csv
# Lookup table: article id (string, from the 'id' column) -> page name
# (from the 'name' column) of the tab-separated index file.
page_names = {}
with open('dump/page_names.tsv', newline='', encoding='utf-8') as tsvfile:
    page_names.update(
        (record['id'], record['name'])
        for record in csv.DictReader(tsvfile, delimiter='\t')
    )

# Sanity check: show the name registered for id '1'.
print(page_names['1'])


トップページ


In [7]:
import re
def extract_song_details(text: str) -> dict:
    """Pull song metadata out of an article's wiki markup.

    Args:
        text: Full text of one article.

    Returns:
        A dict with these entries, in this order:

        * ``'furigana'`` - argument of the first ``&furigana(...)`` call;
          the key is absent when no such call is found.
        * ``'composer'``, ``'lyricist'``, ``'arranger'``, ``'singer'`` -
          content of the first ``[[...]]`` link directly following
          ``作曲：``, ``作詞：``, ``編曲：`` and ``唄：`` respectively;
          always present, ``None`` when not found.
        * ``'lyrics'`` - the text after a ``**歌詞`` heading line up to
          (not including) the next newline that is followed by ``//``,
          stripped of surrounding whitespace; the key is absent when that
          pattern does not occur.
    """
    details = {}

    # Reading of the title, if the page declares one.
    furigana_found = re.search(r'&furigana\((.*?)\)', text)
    if furigana_found is not None:
        details['furigana'] = furigana_found.group(1)

    # Credit lines: result key -> pattern capturing the linked name.
    credit_patterns = {
        'composer': r'作曲：\[\[(.*?)\]\]',
        'lyricist': r'作詞：\[\[(.*?)\]\]',
        'arranger': r'編曲：\[\[(.*?)\]\]',
        'singer': r'唄：\[\[(.*?)\]\]',
    }
    for field, pattern in credit_patterns.items():
        found = re.search(pattern, text)
        details[field] = None if found is None else found.group(1)

    # Lyrics body: DOTALL lets the lazy group span multiple lines.
    lyrics_found = re.search(r'\*\*歌詞\n(.*?)(?=\n//)', text, re.DOTALL)
    if lyrics_found is not None:
        details['lyrics'] = lyrics_found.group(1).strip()

    return details


In [8]:
from tqdm.notebook import tqdm
# Explore: print the full text of every article that has BOTH
#   * a line starting with '**特徴' ("features") within its first 15 lines, and
#   * a line starting with '**曲紹介' / '** 曲紹介' ("song introduction") anywhere.
# Other markers previously probed the same way: a line starting with
# '&nicovideo_mylist' in the first 10 lines, or with '作り手名' in the first 15.
for file in tqdm(articles):
    # Decode explicitly as UTF-8 (like page_names.tsv) instead of relying on
    # the platform's locale-dependent default encoding.
    with open(file, encoding='utf-8') as f:
        content = f.read()
    lines = content.splitlines()

    # First matching line, or None when the article has no such line.
    tokuchou = next((line for line in lines[:15] if line.startswith('**特徴')), None)
    kyokushoukai = next(
        (line for line in lines if line.startswith(('**曲紹介', '** 曲紹介'))),
        None,
    )

    if tokuchou and kyokushoukai:
        print('-'* 40)
        print(file.stem)
        print('-'* 40)
        print(content)
        print('-'* 40)


  0%|          | 0/10245 [00:00<?, ?it/s]

In [9]:
from tqdm.notebook import tqdm
# NOTE: `songs` is initialised here but not populated by this cell.
songs = []

# Print "id  page name  furigana" for every article that
#   * starts with '#right()', and
#   * has neither a '&nicovideo_mylist' line in its first 10 lines nor a
#     '**特徴' / '**とくちょう' ("features") heading in its first 15 lines.
# NOTE(review): the excluded pages are presumably not song pages - confirm.
for file in tqdm(articles):
    # Named `article_id` so the builtin `id` is not shadowed for later cells.
    article_id = file.stem

    # Decode explicitly as UTF-8 (like page_names.tsv) instead of relying on
    # the platform's locale-dependent default encoding.
    with open(file, encoding='utf-8') as f:
        content = f.read()

    if not content.startswith('#right()'):
        continue

    lines = content.splitlines()
    nicovideo_mylist = next(
        (line for line in lines[:10] if line.startswith('&nicovideo_mylist')),
        None,
    )
    tokuchou = next(
        (line for line in lines[:15] if line.startswith(('**特徴', '**とくちょう'))),
        None,
    )
    if nicovideo_mylist or tokuchou:
        continue

    # Only parse the articles that are actually listed.
    details = extract_song_details(content)
    # 'furigana' is only present when the page declares one.
    furigana = details.get('furigana', '')
    print(f"{article_id:>5} {page_names[article_id]:<25} {furigana:<25}")
            

  0%|          | 0/10245 [00:00<?, ?it/s]

   11 ミクたんのテーマ                  みくたんのてーま                 
   12 恋スルVOC@LOID               こいするVOCALOID             
   14 大好き！お兄ちゃん！                たいすきおにいちゃん               
   15 猫になった日                    ねこになったひ                  
   16 Packaged                  Packaged                 
   17 仕方ないのよね                   しかたないのよね                 
   18 1枚のはね                     1まいのはね                   
   19 White Letter/GonGoss      WhiteLetter              
   20 みくみくにしてあげる♪               みくみくにしてあける               
   21 あなたの歌姫                    あなたのうたひめ                 
   22 宝物                        たからもの                    
   24 シューティング☆スター/かじゅきP         しゅーてぃんくすたー               
   25 moon/iroha                moon                     
   28 I Sing For You/haruno-suke ISingForYou              
   29 おしえて！だぁりん                 おしえてたぁりん                 
   30 恋愛ボーカロイド                  れんあいほーかろいと               
   31 BEST FRIENDS              BESTFRIENDS              
   32 この想い伝えた