# 1. Data Collection

### 1.1 Get the List of Animes

In [47]:
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import urllib.request 
import os
import codecs
import pandas as pd
from datetime import datetime
import time
import csv

Downloading each anime's url to a single text file

In [24]:
links_text = open("links.txt", "w")
for page in tqdm(range(0, 400)):
    url = 'https://myanimelist.net/topanime.php?limit=' + str(page * 50)
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    for tag in soup.find_all('tr'):
        links = tag.find_all('a')
        for link in links:        
            if type(link.get('id')) == str and len(link.contents[0]) > 1:
                data = link.get('href')
                links_text.write(data)
                links_text.write("\n")
links_text.close()

100%|██████████| 400/400 [05:44<00:00,  1.16it/s]


Reading how many lines in the text file

In [25]:
file = open("links.txt", "r")
line_count = 0
for line in file:
    if line != "\n":
        line_count += 1
file.close()

print('There are total {} lines in this file.'.format(line_count))


There are total 19124 lines in this file.


### 1.2 Crawl the Animes

In [161]:
directory = 'html_pages'
parent_dir = "/Users/onurergun/Desktop/ADM-HW3"
path = os.path.join(parent_dir, directory)
os.mkdir(path)

Creating subfolders for each page of the list </br>
After creating the subfolders, we add each html file to corresponding page number folder.

In [34]:
directory = 'html_pages'
file_read = open('links.txt', 'r')
anime_urls_list = file_read.readlines()
file_read.close()

for i in range(1,401):
    html_page_name = 'page'+str(i)
    os.makedirs(os.path.join(directory, html_page_name ))
    directory_subfolder = directory+'/'+html_page_name+'/'
    
    for j in range(1,51):
        anime_num = 50*(i-1)+j
        html_file_name = directory_subfolder+'article_'+str(anime_num)+'.html'
        temp_text = open(html_file_name, "w")
        url = anime_urls_list[(anime_num-1)].encode('ascii','backslashreplace').decode('utf-8')
        urllib.request.urlretrieve(url,html_file_name)
        temp_text.close()

FileExistsError: [Errno 17] File exists: 'html_pages/page1'

### 1.3 Parse downloaded pages


In [254]:
animeTitle = []
animeType = []
animeNumEpisode = []
releaseDate = []
endDate = []
animeNumMembers = []
animeScore = []
animeUsers = []
animeRank = []
animePopularity = []
animeDescription = []
animeRelated = []
animeCharacters = []
animeVoices = []
animeStaff = []
directory = 'html_pages'

In [259]:
def parse_function(html_file_path):
    """
    Function that extracts anime's informations.
    Input: path (a string that is related to the position of each anime page in the folder tree)
    Output: a list of lists with all the informations mentioned above
    """
    # take article_i.html from the directory 
    soup = BeautifulSoup(open(html_file_path), "html.parser")
    divs = soup.find_all("div", {"class": "spaceit_pad"})
    try:
        animeTitle.append(str(soup.find_all('strong')[0].contents[0]))
    except:
        animeTitle.append('...')

    for div in divs:
        spans = div.find_all("span")
        for span in spans:
            # TYPES
            if span.contents[0] == 'Type:':
                try:
                    animeType.append(str(div.find_all('a')[0].contents[0]))
                except:
                    animeType.append('...')
            # NUMBER OF EPISODES
            if span.contents[0] == 'Episodes:':
                try: 
                    animeNumEpisode.append(int(div.contents[2]))
                except:
                    animeNumEpisode.append(0)
            # DATES
            if span.contents[0] == 'Aired:':
                try:
                    if len(div.contents[2]) > 21:
                        release = pd.to_datetime(div.contents[2][1:16]).to_pydatetime().strftime('%m/%d/%Y')
                        releaseDate.append(release)
                        end = pd.to_datetime(div.contents[2][1:16]).to_pydatetime().strftime('%m/%d/%Y')
                        endDate.append(end)
                    else:
                        release = pd.to_datetime(div.contents[2][1:16]).to_pydatetime().strftime('%m/%d/%Y')
                        releaseDate.append(release)
                        endDate.append('-')
                except:
                        releaseDate.append('unknown')
                        endDate.append('unknown')

    divs = soup.find_all("div", {"class": "stats-block po-r clearfix"})
    for div in divs:
        
        # MEMBERS
        members = div.find_all("span", {"class": "numbers members"})
        animeNumMembers.append(int(members[0].contents[1].contents[0].replace(',', '')))
        
        
        # SCORE
        # center of the html page
        rating=soup.find(name="div",attrs={"class":"fl-l score"})
        try:        
            animeScore.append(float(rating.text.strip()))
        except:
            animeScore.append(None)

     
        # USERS
        users = div.find_all("div", {"class": "fl-l score"})
        # here we we eliminate the word 'user '   
        # that is why there is the [:-6] part
        # we also replace the comma divisor
        try:
            animeUsers.append(int(users[0]['data-user'][:-6].replace(',', '')))
        except:
            animeUsers.append(0)


        # RANK
        rank = div.find_all("span", {"class": "numbers ranked"})
        try:
            animeRank.append(int(rank[0].contents[1].contents[0][1:]))
        except:
            animeRank.append(None)

        # POPULARITY
        popularity = div.find_all("span", {"class": "numbers popularity"})
        animePopularity.append(int(popularity[0].contents[1].contents[0][1:]))
    
    # DESCRIPTION
    # center of the html page
    animeDescription.append(soup.find_all("p", itemprop = "description")[0].text.strip().replace('\n', '').replace('  ', ''))


    # RELATED 
    related = soup.find_all("table", {"class": "anime_detail_related_anime"})
    if(len(related)!=0):
        x = []
        y = []
        for tr in related:
            td = tr.find_all("td")
            for i in range(0, len(td), 2):
                x.append(td[i].contents[0])
                t = td[i+1].find_all("a")
                if(len(t[0].contents)!=0):  
                    y.append(t[0].contents[0])
                else:
                    y.append('...')
            animeRelated.append('\n'.join([f'{x} {y}' for x, y in dict(zip(x, y)).items()]).split('\n'))
    else:
        animeRelated.append('...')
    
    # CHARACTERS
    try:
        characters = soup.find_all("div", {"class": "detail-characters-list clearfix"})
        chars = characters[0].find_all("h3", {"class": "h3_characters_voice_actors"})
        x = []
        for i in chars:
            x.append(i.contents[0].contents[0])
        animeCharacters.append(x)
    except:
        animeCharacters.append("NA")
    
    
   # VOICES
    try:
        voices = characters[0].find_all("td", {"class": "va-t ar pl4 pr4"})
        y = []
        for i in voices:
            y.append(i.contents[1].contents[0])
        animeVoices.append(y)
    except:
        animeVoices.append("NA")
    
    # STAFF
    try:
        staff = soup.find_all("div", {"class": "detail-characters-list clearfix"})
        staff = staff[1].find_all("td")
        x = []
        y = []
        for i in range(1, len(staff), 2):
            x.append(staff[i].contents[1].contents[0])
            y.append(staff[i].find_all("small")[0].contents[0])
        animeStaff.append([list(i) for i in list(zip(x,y))])
    
    except:
        animeStaff.append("NA")
               

In [256]:
os.makedirs('tsv_files')

In [257]:
def tsv_create(i):
    tsv_columns = ['animeTitle','animeType','animeNumEpisode','releaseDate','endDate','animeNumMembers','animeScore',
                  'animeUsers','animeRank','animePopularity','animeDescription','animeRelated','animeCharacters',
                  'animeVoices','animeStaff']
    data = zip([animeTitle[i-1]],[animeType[i-1]],[animeNumEpisode[i-1]],[releaseDate[i-1]],[endDate[i-1]],[animeNumMembers[i-1]],[animeScore[i-1]],[animeUsers[i-1]],[animeRank[i-1]],[animePopularity[i-1]],[animeDescription[i-1]],[animeRelated[i-1]],[animeCharacters[i-1]],[animeVoices[i-1]],[animeStaff[i-1]])
    tsv_file_name = 'tsv_files/anime_'+str(i)+'.tsv'
    with open(tsv_file_name, 'w', newline='') as f_output:
        tsv_output = csv.writer(f_output, delimiter='\t')
        tsv_output.writerow(tsv_columns)
        for title,typ,numEp,relD,endD,numMem,score,user,rank,popularity,descr,relat,charac,voices,staff in data:
                tsv_output.writerow([title,typ,numEp,relD,endD,numMem,score,user,rank,popularity,descr,relat,charac,voices,staff])


In [260]:
for i in range(1,384):
    html_page_name = 'page'+str(i)
    directory_subfolder = directory+'/'+html_page_name+'/'
    if(i!=383):
        # 383th page has less than 50 animes
        for j in range(1,51):
            anime_num = 50*(i-1)+j
            html_file_path = directory_subfolder+'article_'+str(anime_num)+'.html'
            soup = BeautifulSoup(open(html_file_path), "html.parser")
            parse_function(html_file_path)
            tsv_create(anime_num)
    else:
        for j in range(1,25):
            anime_num = 50*(i-1)+j
            html_file_path = directory_subfolder+'article_'+str(anime_num)+'.html'
            soup = BeautifulSoup(open(html_file_path), "html.parser")
            parse_function(html_file_path)
            tsv_create(anime_num)