In [1]:
import requests
import time
import random
import numpy as np
import pandas as pd
import multiprocessing
import concurrent.futures
from bs4 import BeautifulSoup


def extract_features(a_soup):
    """Extract the anime-info features of one parsed anime page.

    Parameters
    ----------
    a_soup : bs4.BeautifulSoup
        Parsed anime page. It must contain the title ``h1`` and a
        ``div#content`` whose first link points at the anime URL.

    Returns
    -------
    pandas.DataFrame
        A single row (index 0) holding exactly the columns of
        ``req_features_list``, in that order. Features that are absent
        from the page are left as NaN.
    """
    # Output schema. This is the same column list the notebook uses to
    # initialise `anime-info.csv`, so the returned row can be appended to
    # that file with `header=False` and stay aligned with its header.
    req_features_list = ['anime_id', 'Title', 'Synonyms', 'Type', 'Episodes', 'Status', 'Aired', 'Premiered', 'Broadcast', 'Producers', 'Licensors', 'Studios', 'Source', 'Genres', 'Themes', 'Demographic', 'Duration', 'Rating', 'Score', 'Ranked', 'Popularity', 'Members', 'Favorites']

    a_dict = {}

    # Fetch anime title & id (the id is the 5th '/'-separated piece of the link)
    a_dict['Title'] = a_soup.find('h1', class_='title-name h1_bold_none').get_text()
    a_dict['anime_id'] = a_soup.find('div', id='content').find('a')['href'].split(sep='/')[4]

    # Fetch alternative titles, information & statistics
    for div in a_soup.find_all('div', class_='spaceit_pad'):
        if div.span is None:
            continue
        feature = div.get_text(strip=True).split(':', maxsplit=1)
        if len(feature) != 2:
            # No "label:value" separator in this entry; nothing to store
            continue
        label, value = feature
        # The singular label is stored under the plural column name
        # NOTE(review): other labels may also have singular/plural variants
        # (e.g. 'Genre'); those are currently dropped by the filter - confirm.
        if label == 'Theme':
            label = 'Themes'
        a_dict[label] = value

    # # Fetch characters
    # a_characters = []
    # for h3 in a_soup.find_all('h3', class_='h3_characters_voice_actors'):
    #     tmp = h3.getText(strip=True).split(', ')
    #     a_characters.append(' '.join(tmp[::-1]))
    # a_dict['Characters'] = ','.join(a_characters)

    # # Fetch voice actors
    # a_voice_actors = []
    # for td in a_soup.find_all('td', class_='va-t ar pl4 pr4'):
    #     tmp = td.a.getText(strip=True).split(', ')
    #     a_voice_actors.append(' '.join(tmp[::-1]))
    # a_dict['Voice Actors'] = ','.join(a_voice_actors)

    # Keep only the requested features
    a_features = {key: value for (key, value) in a_dict.items() if key in req_features_list}

    # Build the one-row frame directly: `DataFrame.append` no longer exists in
    # pandas 2.x, and the schema is known here so the CSV need not be re-read.
    return pd.DataFrame([a_features], columns=req_features_list)


def extract_reviews(a_soup):
    """Extract the id, title and first five reviews of one parsed anime page.

    Parameters
    ----------
    a_soup : bs4.BeautifulSoup
        Parsed anime page. It must contain the title ``h1`` and a
        ``div#content`` whose first link points at the anime URL.

    Returns
    -------
    pandas.DataFrame
        A single row (index 0) with the columns ``anime_id``, ``Title`` and
        ``review_1`` ... ``review_5`` - the same columns the notebook writes
        as the header of `anime-reviews.csv`. Missing reviews are NaN, so
        every row has the same number of fields.
    """
    max_reviews = 5
    review_columns = [f'review_{i}' for i in range(1, max_reviews + 1)]

    r_dict = {}

    # Fetch title & id (the id is the 5th '/'-separated piece of the link)
    r_dict['anime_id'] = a_soup.find('div', id='content').find('a')['href'].split(sep='/')[4]
    r_dict['Title'] = a_soup.find('h1', class_='title-name h1_bold_none').get_text()

    # Fetch the first `max_reviews` reviews; zip() stops at whichever runs out
    # first, so extra reviews on the page are ignored.
    review_divs = a_soup.find_all('div', attrs={'class': 'spaceit textReadability word-break pt8 mt8'})
    for column, div in zip(review_columns, review_divs):
        # A review is assembled from the 3rd child (a text node) and the text
        # of the 4th child of its container.
        # NOTE(review): assumes every review div has at least 4 children - confirm.
        text_0 = div.contents[2].strip()
        text_1 = div.contents[3].get_text(strip=True)
        r_dict[column] = ''.join([text_0, ' ', text_1])

    # Fixed column list keeps the row aligned with the CSV header
    return pd.DataFrame([r_dict], columns=['anime_id', 'Title'] + review_columns)


def scrap_url(url):
    """Download one anime page and append its data to the two CSV datasets.

    Reads the notebook-level names `headers`, `wait_time`, `file_path_1`
    and `file_path_2`.

    Parameters
    ----------
    url : str
        URL of the anime page to fetch.

    Returns
    -------
    None
        One row is appended to `file_path_1` (anime info) and one to
        `file_path_2` (reviews). Nothing is written when the final response
        status is not 200.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails or does not answer within the timeout.
    """
    # Without a timeout a stalled connection would block this worker forever
    request_timeout = 30  # seconds

    # Fetch page from URL, then pause for a random time to spread the requests
    a_page = requests.get(url, headers=headers, timeout=request_timeout)
    time.sleep(random.uniform(0, wait_time))

    if a_page.status_code != 200:
        print('SERVER OVERLOAD', end='\r')

    # Request again after 120s for as long as Error 403 is hit
    while a_page.status_code == 403:
        time.sleep(120)
        a_page = requests.get(url, headers=headers, timeout=request_timeout)

    # Any other failure is not an anime page, so there is nothing to extract
    if a_page.status_code != 200:
        print(f'SKIPPED {url}: HTTP {a_page.status_code}')
        return

    # Parse the request response
    a_soup = BeautifulSoup(a_page.content, 'html.parser')

    # Empty the hidden tags so their text does not leak into extracted values
    for tag in a_soup.find_all('span', attrs={'style': 'display: none'}):
        tag.clear()

    # NOTE(review): several threads/processes append to the same two files
    # without a lock; this assumes rows are never interleaved - confirm.

    # Fetch anime-info features and write to `anime-info.csv`
    a_features_df = extract_features(a_soup)
    a_features_df.to_csv(file_path_1, mode='a', sep=';', index=False, header=False)

    # Fetch anime reviews and write to `anime-reviews.csv`
    a_reviews_df = extract_reviews(a_soup)
    a_reviews_df.to_csv(file_path_2, mode='a', sep=';', index=False, header=False)


def build_dataset(anime_urls):
    """Scrape every URL in `anime_urls` with a pool of worker threads.

    Reads the notebook-level constant `MAX_THREADS` and calls `scrap_url`
    once per URL.

    Parameters
    ----------
    anime_urls : list of str
        URLs to scrape. An empty list is a no-op.

    Returns
    -------
    None
        A URL whose scrape raises is reported on stdout and does not stop
        the remaining URLs.
    """
    # A pool needs at least one worker, so there is nothing to do when empty
    if len(anime_urls) == 0:
        return

    threads = min(MAX_THREADS, len(anime_urls))

    with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
        future_to_url = {executor.submit(scrap_url, url): url for url in anime_urls}
        for future in concurrent.futures.as_completed(future_to_url):
            # Retrieving the result is what surfaces a worker's exception;
            # otherwise a failed scrape would vanish without a trace.
            try:
                future.result()
            except Exception as exc:
                print(f'FAILED {future_to_url[future]}: {exc!r}')

def build_dataset_multiprocess(urls, n_cores):
    """Split `urls` into `n_cores` chunks and scrape each in its own process.

    Each non-empty chunk is handed to `build_dataset` in a separate
    `multiprocessing.Process`; the call returns once all of them finished.

    Parameters
    ----------
    urls : list of str
        URLs to scrape.
    n_cores : int
        Number of chunks to split `urls` into, i.e. the maximum number of
        processes started.

    Returns
    -------
    None
    """
    # NOTE(review): with the "spawn" start method the child must be able to
    # import `build_dataset`; functions defined in a notebook cell may not
    # qualify - confirm on the target platform.
    url_chunks = [chunk.tolist() for chunk in np.array_split(urls, n_cores)]

    # array_split yields empty chunks when there are fewer URLs than cores;
    # starting a process for those would only waste a worker.
    processes = [
        multiprocessing.Process(target=build_dataset, args=(chunk, ))
        for chunk in url_chunks
        if chunk
    ]

    for process in processes:
        process.start()

    for process in processes:
        process.join()

In [2]:
# Upper bound (in seconds) of the random pause taken after each page request
wait_time = 1

# Define file paths
file_path_1 = './Datasets/anime-info.csv'
file_path_2 = './Datasets/anime-reviews.csv'
url_file_path = './Datasets/anime-urls-main.csv'

# Init anime-info dataset: (re)create the file with only its header row
info_header_df = pd.DataFrame(
    columns=['anime_id', 'Title', 'Synonyms', 'Type', 'Episodes', 'Status', 'Aired', 'Premiered', 'Broadcast', 'Producers', 'Licensors', 'Studios', 'Source', 'Genres', 'Themes', 'Demographic', 'Duration', 'Rating', 'Score', 'Ranked', 'Popularity', 'Members', 'Favorites'],
    dtype=object
)
info_header_df.to_csv(file_path_1, sep=';', index=False)

# Init anime-review dataset: (re)create the file with only its header row
reviews_header_df = pd.DataFrame(
    columns=['anime_id', 'Title', 'review_1','review_2','review_3','review_4','review_5'],
    dtype=object
)
reviews_header_df.to_csv(file_path_2, sep=';', index=False)

# Browser-like request headers sent with every page request.
# 'br' is deliberately not advertised in Accept-Encoding: requests can only
# decode brotli responses when an optional brotli package is installed, and
# an undecoded body cannot be parsed as HTML.
headers = {
    'Host': 'myanimelist.net',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:94.0) Gecko/20100101 Firefox/94.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate',
    'Referer': 'https://myanimelist.net/anime.php',
    'Connection': 'keep-alive'
}

# Maximum number of scraping threads per process
MAX_THREADS = 30

# A few known anime pages for a quick trial run
sample_urls = [
    'https://myanimelist.net/anime/5114/Fullmetal_Alchemist__Brotherhood',
    'https://myanimelist.net/anime/38524/Shingeki_no_Kyojin_Season_3_Part_2',
    'https://myanimelist.net/anime/37521/Vinland_Saga',
    'https://myanimelist.net/anime/1535/Death_Note'
]

# Read URLs from CSV file written by `fetch_urls()` (no header row; URLs in the first column)
df_urls = pd.read_csv(url_file_path, header=None)
urls = df_urls[0].to_list()

In [7]:
# Sample run: the sample URL list repeated 8 times, split across 8 processes
build_dataset_multiprocess(urls=sample_urls * 8, n_cores=8)