In [1]:
import bs4
from typing import List
import requests
import os
import tqdm
import pandas as pd
import json
import numpy as np

In [2]:
def string_to_enum(s):
    
    s = s.replace(' ','_')
    s = s.replace('-','_')
    
    return s

In [3]:
def convert_abv(s):
    ab_dict = {'K':1000,'M':1000000,'B':1000000000}
    
    if s[-1].isdigit():
        if ',' in s:
            s = s.replace(',','')
        return int(s)
    
    return int(float(s[:-1]) * ab_dict[s[-1]])

In [4]:
def scrape_genres():
    url = 'https://www.webtoons.com/en/genre'
    resp = requests.get(url)
    soup = bs4.BeautifulSoup(resp.content,'html.parser')
    
    visibles = soup.find('ul',{'class':'snb _genre'})
    hiddens = soup.find('ul',{'class':'ly_lst_genre as_genre'})
    
    visible_genres = visibles.find_all('li')
    hidden_genres = hiddens.find_all('li')

    genres = [g['data-genre'] for g in visible_genres[:-1]+hidden_genres]
    return genres

In [5]:
def scrape_details(url):
    
    resp = requests.get(url)
    soup = bs4.BeautifulSoup(resp.content,'html.parser')
    
    info_div = soup.find('div',{'class':'info'})
    genre = info_div.find('h2').text
    grade_area = soup.find('ul',{'class':'grade_area'})
    back_div = soup.find('div',{'class':'detail_header'})
    
    details = soup.find('div',{'class':'aside detail'})
    summary = details.find('p',{'class':'summary'}).text
    day_info = details.find('p',{'class':'day_info'}).text
    back_img_url = back_div.find('img')['src']
    
    grade_area = [x.text for x in grade_area.find_all('em')]
    view, follower, rate = grade_area
    
    view = convert_abv(view)
    follower = convert_abv(follower)
    
    #day info UPEVERY WEDNESDAY -> WEDNESDAY
    if len(day_info.split(' '))==2:
        day_info = day_info.split(' ')[1]
    
    genre = string_to_enum(genre)
    
    return {
        'summary':summary,
        'day_info':day_info,
        'genre':genre,
        'view':view,
        'follower':follower,
        'rate':rate,
        'back_img_url':back_img_url
           }

In [6]:
def get_all_mangas(url,limit=2):
    data = []
    resp = requests.get(url)
    soup = bs4.BeautifulSoup(resp.content,'html.parser')

    card_list = soup.find('ul',{'class':'card_lst'})
    cards = card_list.find_all('li')
    
    for c in tqdm.tqdm(cards[:limit]):

        subj = c.find('p',{'class':'subj'}).text
        author = c.find('p',{'class':'author'}).text
        grade = c.find('em',{'class':'grade_num'}).text
        link = c.find('a')['href']
        tumbnail = c.find('a').find('img')['src']
        
        
        grade = convert_abv(grade)
        d = scrape_details(link)

        tmp = {
            'name':subj,
            'author':author,
            'grade':grade,
            'genre':d['genre'],
            'summary':d['summary'],
            'day_info':d['day_info'],
            'view':d['view'],
            'follower':d['follower'],
            'rate':d['rate'],
            'link':link,
            'tumbnail':tumbnail,
            'back_img_url':d['back_img_url']

        }

        data.append(tmp)
    df = pd.DataFrame(data)
    return df

In [7]:
df = get_all_mangas('https://www.webtoons.com/en/genre',500)
df

100%|█████████████████████████████████████████| 241/241 [02:26<00:00,  1.65it/s]


Unnamed: 0,name,author,grade,genre,summary,day_info,view,follower,rate,link,tumbnail,back_img_url
0,The Remarried Empress,Alphatart / Sumpul,22100000,Fantasy,Navier Ellie Trovi was an empress perfect in e...,SUNDAY,241700000,2600000,9.87,https://www.webtoons.com/en/fantasy/the-remarr...,https://webtoon-phinf.pstatic.net/20200904_268...,https://webtoon-phinf.pstatic.net/20200904_29/...
1,True Beauty,Yaongyi,46800000,Romance,"After binge-watching beauty videos online, a s...",WEDNESDAY,885100000,7000000,9.52,https://www.webtoons.com/en/romance/truebeauty...,https://webtoon-phinf.pstatic.net/20210129_65/...,https://webtoon-phinf.pstatic.net/20210129_175...
2,Midnight Poppy Land,Lilydusk,13900000,Romance,After making a grisly discovery in the country...,SATURDAY,203300000,2300000,9.80,https://www.webtoons.com/en/romance/midnight-p...,https://webtoon-phinf.pstatic.net/20191119_163...,https://webtoon-phinf.pstatic.net/20191119_132...
3,Reunion,stephattyy,867255,Romance,"After moving away for a decade, Rhea returns t...",FRIDAY,11500000,707935,9.77,https://www.webtoons.com/en/romance/reunion/li...,https://webtoon-phinf.pstatic.net/20220311_14/...,https://webtoon-phinf.pstatic.net/20220311_196...
4,Teenage Mercenary,YC / Rakyeon,4000000,Action,"At the age of eight, Ijin Yu lost his parents ...",WEDNESDAY,45700000,1000000,9.86,https://www.webtoons.com/en/action/teenage-mer...,https://webtoon-phinf.pstatic.net/20210430_189...,https://webtoon-phinf.pstatic.net/20210430_11/...
...,...,...,...,...,...,...,...,...,...,...,...,...
236,The Shadow Prophet,Anne Delseit / Marissa Delbressine,303443,Drama,The Great Prophet Godo has a place for everyon...,SUNDAY,1800000,91067,9.55,https://www.webtoons.com/en/drama/the-shadow-p...,https://webtoon-phinf.pstatic.net/20210329_188...,https://webtoon-phinf.pstatic.net/20200125_134...
237,In the Bleak Midwinter;,Kat / Ali,2400000,Sci_fi,With a dead sister and 25 years to go on her s...,WEDNESDAY,21600000,588535,9.77,https://www.webtoons.com/en/sf/in-the-bleak-mi...,https://webtoon-phinf.pstatic.net/20210329_117...,https://webtoon-phinf.pstatic.net/20200312_45/...
238,Unlovable Replacement,Nylana,6000000,Romance,"When his relationship went down the drain, Chi...",THURSDAY,76700000,1500000,9.77,https://www.webtoons.com/en/romance/unlovable-...,https://webtoon-phinf.pstatic.net/20210329_185...,https://webtoon-phinf.pstatic.net/20191029_163...
239,Version Day and Night,dewkneelight,1100000,Drama,Candy always dreamed of something more excitin...,FRIDAY,10400000,480302,9.52,https://www.webtoons.com/en/drama/version-day-...,https://webtoon-phinf.pstatic.net/20210329_80/...,https://webtoon-phinf.pstatic.net/20191114_100...


In [8]:
df['day_info'].value_counts()

COMPLETED           104
SATURDAY             25
TUESDAY              25
WEDNESDAY            21
SUNDAY               19
FRIDAY               19
MONDAY               17
THURSDAY             10
UPEVERY MON, THU      1
Name: day_info, dtype: int64

In [9]:
def send_req(x):
    name,author,grade,genre,summary,day_info,view,follower,rate,link,tumbnail,back_img = x
    url = "http://localhost:8080/add"

    payload = json.dumps({
      "id": 122,
      "name": name,
      "authorName":author,
      "genre":genre.upper(),
      "description":summary,
      "followerNumber": follower,
      "viewNumber": view,
      "rate": rate,
      "newEpisodeDay":day_info.upper(),
      "tumbnailURL": tumbnail,
      "backgroudImageURL": back_img
    },default= lambda x: x.item() if isinstance(x,np.generic) else x)
    headers = {
      'Content-Type': 'application/json'
    }

    response = requests.request("POST", url, headers=headers, data=payload)
    

In [14]:
df.apply(lambda x: send_req(x),axis=1)

0      None
1      None
2      None
3      None
4      None
       ... 
236    None
237    None
238    None
239    None
240    None
Length: 241, dtype: object

In [11]:
def get_episodes(url):
    resp = requests.get(url)
    soup = bs4.BeautifulSoup(resp.content,'html.parser')
    episodes = soup.find('ul',{'id':'_listUl'})
    episodes = episodes.find_all('li')
    
    result = []
    for episode in episodes:
        episode_img = episode.find('img')['src']
        episode_num = episode.find('span',{'class':'subj'}).text
        episode_date = episode.find('span',{'class':'date'}).text
        episode_likes = episode.find('span',{'class':'like_area'}).text[4:] #like123,23

        result.append({
            'episode_img':episode_img,
            'episode_num':episode_num,
            'episode_date':episode_date,
            'episode_lies':episode_likes
        })
    return result
    

In [12]:
get_episodes('https://www.webtoons.com/en/romance/truebeauty/list?title_no=1436')

[{'episode_img': 'https://webtoon-phinf.pstatic.net/20220322_207/1647935465711Udp8U_PNG/thumb_164793532601414362010.png?type=q90',
  'episode_num': 'Episode 200',
  'episode_date': 'May 31, 2022',
  'episode_lies': '71,627'},
 {'episode_img': 'https://webtoon-phinf.pstatic.net/20220322_242/1647935300738jmJ6D_PNG/thumb_164793495411114362004.png?type=q90',
  'episode_num': 'Episode 199',
  'episode_date': 'May 24, 2022',
  'episode_lies': '89,953'},
 {'episode_img': 'https://webtoon-phinf.pstatic.net/20220322_273/16479349315486199w_PNG/thumb_164793482947314361998.png?type=q90',
  'episode_num': 'Episode 198',
  'episode_date': 'May 17, 2022',
  'episode_lies': '98,305'},
 {'episode_img': 'https://webtoon-phinf.pstatic.net/20220307_289/16466517239892QIh6_PNG/thumb_164665164509914361983.png?type=q90',
  'episode_num': 'Episode 197',
  'episode_date': 'May 10, 2022',
  'episode_lies': '97,465'},
 {'episode_img': 'https://webtoon-phinf.pstatic.net/20220307_123/164665146929293txK_PNG/thumb_16

In [48]:
# traverse
resp = requests.get('https://www.webtoons.com/en/romance/truebeauty/episode-0-/viewer?title_no=1436&episode_no=1')
soup = bs4.BeautifulSoup(resp.content,'html.parser')

In [51]:
try:
    soup.find('a',{'class':'pg_next'})['href']
except Exception as e:
    print(e)

'https://www.webtoons.com/en/romance/truebeauty/episode-1/viewer?title_no=1436&episode_no=2'

In [55]:
def get_image_links_and_next(url):
    resp = requests.get(url)
    soup = bs4.BeautifulSoup(resp.content,'html.parser')
    print(soup.find('h1',{'class':'subj_episode'}).text)
    next_link = None
    try:
        next_link = soup.find('a',{'class':'pg_next'})['href']
    except Exception as e:
        print(e)
    return [], next_link

In [60]:
def get_all_episodes(url):
    _, next_exist = get_image_links_and_next(url)
    while next_exist and next_exist !='#':
        _,next_exist = get_image_links_and_next(next_exist)
    print('finished')
    

In [61]:
get_all_episodes('https://www.webtoons.com/en/romance/truebeauty/episode-199/viewer?title_no=1436&episode_no=200')

Episode 199
Episode 200
finished


In [9]:
resp = requests.get('https://www.webtoons.com/en/romance/truebeauty/list?title_no=1436')
soup = bs4.BeautifulSoup(resp.content,'html.parser')

In [13]:
pagi = soup.find('div',{'class':'paginate'})

In [46]:
def ge_and_pagination(url,visited):
    print('Start with '+url)
    if url not in visited:
        visited.add(url)
        resp = requests.get(url)
        soup = bs4.BeautifulSoup(resp.content,'html.parser')
        pagi = soup.find('div',{'class':'paginate'})
        base_url = 'https://webtoons.com'
        result = []

        pag_nums = pagi.find_all('a')

        for p in pagi.find_all('a')[1:]:
            if p['href'] != '#':
                print(base_url+p['href'])
                a = get_episodes(base_url+p['href'])
                result.append(a)
        x = base_url+p['href']
        ge_and_pagination(x,visited)


In [47]:
r = ge_and_pagination('https://www.webtoons.com/en/romance/truebeauty/list?title_no=1436',set())

Start with https://www.webtoons.com/en/romance/truebeauty/list?title_no=1436
https://webtoons.com/en/romance/truebeauty/list?title_no=1436&page=2
https://webtoons.com/en/romance/truebeauty/list?title_no=1436&page=3
https://webtoons.com/en/romance/truebeauty/list?title_no=1436&page=4
https://webtoons.com/en/romance/truebeauty/list?title_no=1436&page=5
https://webtoons.com/en/romance/truebeauty/list?title_no=1436&page=6
https://webtoons.com/en/romance/truebeauty/list?title_no=1436&page=7
https://webtoons.com/en/romance/truebeauty/list?title_no=1436&page=8
https://webtoons.com/en/romance/truebeauty/list?title_no=1436&page=9
https://webtoons.com/en/romance/truebeauty/list?title_no=1436&page=10
https://webtoons.com/en/romance/truebeauty/list?title_no=1436&page=11
Start with https://webtoons.com/en/romance/truebeauty/list?title_no=1436&page=11
https://webtoons.com/en/romance/truebeauty/list?title_no=1436&page=12
https://webtoons.com/en/romance/truebeauty/list?title_no=1436&page=13
https://we

AttributeError: 'NoneType' object has no attribute 'find_all'

In [21]:
url = 'https://www.webtoons.com/en/romance/truebeauty/list?title_no=1436'
base_url = 'https://webtoons.com'
for p in pagi.find_all('a')[1:]:
    print(base_url+p['href'])

https://webtoons.com/en/romance/truebeauty/list?title_no=1436&page=2
https://webtoons.com/en/romance/truebeauty/list?title_no=1436&page=3
https://webtoons.com/en/romance/truebeauty/list?title_no=1436&page=4
https://webtoons.com/en/romance/truebeauty/list?title_no=1436&page=5
https://webtoons.com/en/romance/truebeauty/list?title_no=1436&page=6
https://webtoons.com/en/romance/truebeauty/list?title_no=1436&page=7
https://webtoons.com/en/romance/truebeauty/list?title_no=1436&page=8
https://webtoons.com/en/romance/truebeauty/list?title_no=1436&page=9
https://webtoons.com/en/romance/truebeauty/list?title_no=1436&page=10
https://webtoons.com/en/romance/truebeauty/list?title_no=1436&page=11


In [37]:
def dummy(url):
    base_url = 'https://webtoons.com'
    result = []
    for p in pagi.find_all('a')[1:]:
        print(base_url+p['href'])
        a = get_episodes(base_url+p['href'])
        result.append(a)
    x = base_url+p['href']
    print('new round',x)
    ge_and_pagination(x)