### Video Downloader (BeautifulSoup)

<br>

### Development Environment

In [193]:
import os
import re
import time
import requests
import urllib
import threading
import pandas as pd
from glob import glob
from bs4 import BeautifulSoup
from urllib.request import urlretrieve

### Page Information

In [181]:
def make_page_soup(link, timeout=30):
    """Download ``link`` and return both its raw body and its parsed document.

    Parameters
    ----------
    link : str
        URL of the page to fetch.
    timeout : float or tuple, optional
        Passed to ``requests.get`` so that an unresponsive server raises
        instead of blocking the notebook indefinitely. Defaults to 30 seconds.

    Returns
    -------
    page : str
        ``str()`` applied to ``response.content``. This is the stringified raw
        body, not decoded text (NOTE(review): ``content`` is bytes in requests,
        so this is its ``b'...'`` repr). It is what ``getURL`` later scans for
        ``a href`` attributes.
    soup : BeautifulSoup
        ``response.text`` decoded as EUC-KR and parsed with ``html.parser``.

    Raises
    ------
    requests.HTTPError
        From ``raise_for_status()`` when the server answers with an error code.
    requests.exceptions.RequestException
        For connection problems and timeouts raised by ``requests.get``.
    """
    response = requests.get(link, timeout=timeout)
    response.raise_for_status()
    # Force the decoding used for ``response.text`` below.
    response.encoding = 'EUC-KR'
    page = str(response.content)
    soup = BeautifulSoup(response.text, 'html.parser')
    return page, soup

In [None]:
# Course page that is scraped for titles and video links (see Reference).
link = "http://ropas.snu.ac.kr/~kwang/4190.310/mooc/"
# page: stringified raw response body; soup: parsed document.
page, soup = make_page_soup(link)

### Video File Name Dataframe

In [182]:
def make_title_list(soup):
    """Return the cleaned text of every ``<h4>`` heading that mentions "Part".

    Parameters
    ----------
    soup : object exposing ``select`` (the parsed page)

    Returns
    -------
    list of str
        Heading texts, in document order, with ``:`` and newline characters
        removed. Headings without the substring "Part" are dropped.
    """
    heading_texts = (heading.get_text() for heading in soup.select("h4"))
    return [
        text.replace(':', '').replace('\n', '')
        for text in heading_texts
        if "Part" in text
    ]

In [183]:
def make_subtitle_list(soup):
    """Return the unique section headings found in the page's ``<li>`` items.

    The selected ``<li>`` elements are stringified as one list and split on
    the literal ``"<li>"``. Fragments that contain ``a href`` (video links) or
    are at most one character long are ignored. For the rest, the text before
    the first ``"<ul>"`` is stripped and cleaned of ``? & ; : ,``.
    The fixed entry "강의 마무리" is added last, and duplicates are removed
    while keeping first-seen order.

    Parameters
    ----------
    soup : object exposing ``select`` (the parsed page)

    Returns
    -------
    list of str
    """
    fragments = str(soup.select("li")).split("<li>")
    ordered_unique = {}
    for fragment in fragments:
        if "a href" in fragment or len(fragment) <= 1:
            continue
        # NOTE: when "<ul>" is absent, find() is -1 and the slice drops the
        # last character of the fragment.
        heading = fragment[:fragment.find("<ul>")].strip().replace("?", "")
        ordered_unique.setdefault(re.sub('(&|;|:|,)', '', heading))
    ordered_unique.setdefault("강의 마무리")
    return list(ordered_unique)

In [184]:
def make_single_title_list(soup):
    """Return the cleaned link text of every ``<a>`` element on the page.

    For each anchor, the text between the first ``>`` and ``</a>`` of its
    markup is taken and stripped. Anchors whose inner text contains ``font``
    or ``img`` are skipped. Double quotes, newlines, ``?`` and the characters
    ``& ; : ,`` are removed, and tabs become single spaces.

    Parameters
    ----------
    soup : object exposing ``select`` (the parsed page)

    Returns
    -------
    list of str
        One entry per kept anchor, in document order.
    """
    substitutions = (('"', ''), ('\n', ''), ('\t', ' '), ("?", ""))
    cleaned_titles = []
    for anchor in soup.select("a"):
        markup = str(anchor)
        label = markup[markup.find(">") + 1:markup.find("</a>")].strip()
        if 'font' in label or 'img' in label:
            continue
        for old, new in substitutions:
            label = label.replace(old, new)
        cleaned_titles.append(re.sub('(&|;|:|,)', '', label))
    return cleaned_titles

In [185]:
# Substrings that identify the boundary titles. make_video_title_df treats the
# position of each match as the last video of a section.
_SECTION_END_MARKERS = (
    "프로그래밍 언어 배우기",
    "귀납 증명 예제",
    "형식논리(formal logic) 표기법",
    "설탕 구조(syntactic sugar)",
    "메모리 주소의 유효기간(lifetime)",
    "K- 언어 전체 의미 파악 예제",
    "stop amp copy 알고리즘",
    "강의 전반부 마무리 및 강의 후반부 소개",
    "의미 정의 설탕 구조",
    "동일화 알고리즘은 충실한 구현",
    "기계중심 언어의 다형 타입 추론 2",
    "물건중심 언어(object oriented language)와의 관계 2",
    "실행중 결정되는 함수의 실체(late binding dynamic binding)",
    "강의 리뷰",
)


def make_idx_list(single_title_list, markers=_SECTION_END_MARKERS):
    """Return the positions of the titles that contain a boundary marker.

    Positions come from ``enumerate`` rather than ``list.index``. A title that
    occurs more than once is therefore reported at each of its own positions
    instead of repeatedly at its first occurrence, and the scan is linear.

    Parameters
    ----------
    single_title_list : list of str
        Cleaned video titles in page order.
    markers : iterable of str, optional
        Substrings to look for; a title matches if it contains any of them.
        Defaults to the markers hard-coded for the course page.

    Returns
    -------
    list of int
        Ascending positions of the matching titles.
    """
    markers = tuple(markers)
    return [
        position
        for position, single_title in enumerate(single_title_list)
        if any(marker in single_title for marker in markers)
    ]

In [186]:
# Index into title_list (the "Part" headings) for each of the 14 sections, in
# page order: sections 0-2 -> part 0, 3-7 -> part 1, 8-13 -> part 2.
_PART_OF_SECTION = (0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2)


def make_video_title_df(soup, title_list, subtitle_list, single_title_list, idx_list):
    """Build the "<video> - <section> - <part>" title table and save it to Excel.

    Each video at position ``idx`` belongs to the first section ``k`` whose
    inclusive end position satisfies ``idx <= idx_list[k]``. Its label is
    ``single_title_list[idx] - subtitle_list[k] - title_list[part]``, where
    ``part`` comes from the fixed section-to-part table above. Videos located
    after the last known section end get no row.

    The table is written to ``video_title_df.xlsx`` once, after all rows are
    built, rather than once per row.

    Parameters
    ----------
    soup : unused
        Kept for call compatibility.
    title_list : list of str
        "Part" headings.
    subtitle_list : list of str
        Section headings, one per section.
    single_title_list : list of str
        Video titles in page order.
    idx_list : list of int
        Position of the last video of each section (only the first 14 are used).

    Returns
    -------
    pandas.DataFrame
        Single ``title`` column, indexed by the video's position.

    Raises
    ------
    IndexError
        If there are videos but ``idx_list`` is empty, or if
        ``subtitle_list`` / ``title_list`` are too short for a section.
    """
    if len(single_title_list) > 0 and len(idx_list) == 0:
        raise IndexError("idx_list is empty: no section boundaries to assign videos to")

    section_ends = list(idx_list)[:len(_PART_OF_SECTION)]
    labels = {}
    for idx, chapter in enumerate(single_title_list):
        # First section whose inclusive end has not been passed yet.
        section = next(
            (k for k, end in enumerate(section_ends) if idx <= end),
            None,
        )
        if section is None:
            continue
        part = _PART_OF_SECTION[section]
        labels[idx] = chapter + " - " + subtitle_list[section] + " - " + title_list[part]

    # Build the frame in one go instead of growing it row by row.
    video_title_df = pd.DataFrame(
        {'title': list(labels.values())},
        index=list(labels.keys()),
    )
    video_title_df.to_excel("video_title_df.xlsx")

    return video_title_df

In [None]:
# Derive the naming inputs from the parsed page: "Part" headings, section
# headings, per-video link texts, and the positions of the boundary titles.
title_list = make_title_list(soup)
subtitle_list = make_subtitle_list(soup)
single_title_list = make_single_title_list(soup)
idx_list = make_idx_list(single_title_list)

In [None]:
# Combine the lists into one title per video; also writes video_title_df.xlsx.
video_title_df = make_video_title_df(soup, title_list, subtitle_list, single_title_list, idx_list)

In [188]:
# Display the resulting title table.
video_title_df

Unnamed: 0,title
0,강의의 목표 - 프로그래밍 언어란 무엇인가 - Part 0 기초
1,강의 내용 개괄 - 프로그래밍 언어란 무엇인가 - Part 0 기초
2,프로그래밍 언어란 무엇인가 - 프로그래밍 언어란 무엇인가 - Part 0 기초
3,프로그래밍 언어의 쓰임새 - 프로그래밍 언어란 무엇인가 - Part 0 기초
4,프로그래밍 언어 배우기 - 프로그래밍 언어란 무엇인가 - Part 0 기초
...,...
118,물건중심 언어(object oriented language)와의 관계 1 - 계층타...
119,물건중심 언어(object oriented language)와의 관계 2 - 계층타...
120,유일한 특성 상속(inheritance) - 물건중심의 언어(object-orien...
121,실행중 결정되는 함수의 실체(late binding dynamic binding) ...


### Video Downloader

In [189]:
def getURL(page):
    """Extract the next double-quoted ``a href`` target from ``page``.

    Parameters
    ----------
    page : str
        Text to scan.

    Returns
    -------
    (url, end_quote) : (str, int)
        The text between the first pair of double quotes that follows the
        first ``a href``, and the index of the closing quote. The caller can
        continue scanning from ``page[end_quote:]``.
    (None, 0)
        When ``page`` contains no further ``a href``, or when the opening or
        closing quote is missing. Without these guards ``str.find`` returning
        -1 would produce a bogus slice and a negative offset.
    """
    start_link = page.find("a href")
    if start_link == -1:
        return None, 0
    start_quote = page.find('"', start_link)
    if start_quote == -1:
        return None, 0
    end_quote = page.find('"', start_quote + 1)
    if end_quote == -1:
        return None, 0
    return page[start_quote + 1:end_quote], end_quote

In [190]:
def download(url, fileName, file_new_name):
    """Download ``url`` to ``fileName``, then move it to ``file_new_name``.

    Parameters
    ----------
    url : str
        Source URL, handed to ``urlretrieve``.
    fileName : str
        Local path the download is written to first.
    file_new_name : str
        Final local path of the file.

    Notes
    -----
    ``os.replace`` is used for the move so that an existing
    ``file_new_name`` (for example from an earlier run) is overwritten on
    every platform. ``os.rename`` raises on Windows in that situation.
    """
    urlretrieve(url, fileName)
    os.replace(fileName, file_new_name)

In [None]:
def video_downloader(page, viedo_title_df, file_extension, base_url=None, max_workers=10):
    """Download every linked file with ``file_extension`` found in ``page``.

    Links are taken from ``page`` one at a time with ``getURL``. The n-th
    matching link (counting from 0) is downloaded in a background thread via
    ``download`` and saved as
    ``<link without extension> - <viedo_title_df.loc[n, 'title']><extension>``.
    The function returns once all started downloads have finished.

    Parameters
    ----------
    page : str
        Text scanned for ``a href`` links.
    viedo_title_df : pandas.DataFrame
        Title table with a ``title`` column, indexed by video position.
    file_extension : str
        Suffix a link must have to be downloaded, e.g. ``'.mp4'``.
    base_url : str, optional
        Prefix concatenated with each link to form the download URL, so it
        should end with ``/``. Defaults to the notebook-level ``link``.
    max_workers : int, optional
        Maximum number of downloads running at the same time.

    Raises
    ------
    KeyError
        If there are more matching links than rows in ``viedo_title_df``.
    """
    if base_url is None:
        # The URL the page was fetched from, defined in the notebook.
        base_url = link

    # One slot per concurrent download. Counting our own workers avoids
    # depending on threading.active_count(), which also includes the main
    # thread and therefore never drops to zero.
    slots = threading.BoundedSemaphore(max_workers)

    def _download_and_release(source_url, temp_name, final_name):
        """Run one download and free its slot even if it fails."""
        try:
            download(source_url, temp_name, final_name)
        finally:
            slots.release()

    workers = []
    idx = -1
    while True:
        url, n = getURL(page)
        if url is None:
            break
        page = page[n:]

        if not url.endswith(file_extension):
            continue

        idx += 1
        # Strip the suffix by its real length instead of assuming 4 characters.
        stem = url[:len(url) - len(file_extension)]
        if idx % 10 == 0 and idx != 0:
            print()
        if len(stem) > 1:
            print(stem, end="  |  ")
        file_new_name = stem + " - " + viedo_title_df.loc[idx, 'title'] + file_extension

        slots.acquire()  # blocks while max_workers downloads are in flight
        t = threading.Thread(
            target=_download_and_release,
            args=(base_url + url, url, file_new_name),
        )
        t.daemon = True
        t.start()
        workers.append(t)

    # Wait for the remaining downloads before returning.
    for t in workers:
        t.join()
    

In [200]:
def check_download_file(video_title_df, file_extension):
    """List the downloaded files and report whether all videos are present.

    Parameters
    ----------
    video_title_df : pandas.DataFrame
        Title table; its length is the expected number of files.
    file_extension : str
        Either a suffix such as ``'.mp4'`` or a glob pattern such as
        ``'*.mp4'``. A value without ``*`` is turned into ``'*' + value``.

    Returns
    -------
    pandas.DataFrame
        Single ``file`` column with the matching file names from the current
        working directory, sorted by name.
    """
    # Honour the argument instead of a hard-coded "*.mp4"; accept both the
    # suffix form and the pattern form used by the notebook.
    pattern = file_extension if "*" in file_extension else "*" + file_extension
    video_file_list = sorted(glob(pattern))
    if len(video_file_list) == len(video_title_df):
        print("Video Download Complete")
    else:
        print("Found {} of {} expected files".format(len(video_file_list), len(video_title_df)))
    video_file_df = pd.DataFrame({'file': video_file_list})
    return video_file_df

In [191]:
# Pass the title table built above; the previous argument name
# (viedo_title_df) is not defined anywhere in the notebook.
video_downloader(page, video_title_df, '.mp4')

01-01  |  01-02  |  01-03  |  01-04  |  01-05  |  02-01  |  02-02  |  02-03  |  02-04  |  02-05  |  
02-06  |  02-07  |  02-08  |  02-09  |  02-10  |  03-01  |  03-02  |  03-03  |  03-04  |  03-05  |  
03-06  |  04-01  |  04-02  |  04-03  |  04-04  |  04-05  |  04-06  |  04-07  |  05-01  |  05-02  |  
05-03  |  05-04  |  05-05  |  05-06  |  05-07  |  05-08  |  05-09  |  05-10  |  05-11  |  06-01  |  
06-02  |  06-03  |  06-04  |  06-05  |  06-06  |  06-07  |  06-08  |  06-09  |  06-10  |  06-11  |  
06-12  |  06-13  |  06-14  |  06-15  |  06-16  |  06-17  |  07-01  |  07-02  |  07-03  |  07-04  |  
07-05  |  07-06  |  08-01  |  08-02  |  08-03  |  08-04  |  08-05  |  08-06  |  09-01  |  09-02  |  
09-03  |  09-04  |  09-05  |  09-06  |  09-07  |  09-08  |  09-09  |  09-10  |  09-11  |  09-12  |  
09-13  |  10-01  |  10-02  |  11-01  |  11-02  |  11-03  |  11-04  |  11-05  |  11-06  |  11-07  |  
11-08  |  11-09  |  11-10  |  11-11  |  11-12  |  11-13  |  11-14  |  11-15  |  11-16  |  1

KeyboardInterrupt: 

In [204]:
# Compare the files on disk with the title table, then display the file list.
video_file_df = check_download_file(video_title_df, "*.mp4")
video_file_df

Video Download Complete


Unnamed: 0,file
0,01-01 - 강의의 목표 - 프로그래밍 언어란 무엇인가 - Part 0 기초.mp4
1,01-02 - 강의 내용 개괄 - 프로그래밍 언어란 무엇인가 - Part 0 기초.mp4
2,01-03 - 프로그래밍 언어란 무엇인가 - 프로그래밍 언어란 무엇인가 - Part...
3,01-04 - 프로그래밍 언어의 쓰임새 - 프로그래밍 언어란 무엇인가 - Part ...
4,01-05 - 프로그래밍 언어 배우기 - 프로그래밍 언어란 무엇인가 - Part 0...
...,...
118,13-03 - 물건중심 언어(object oriented language)와의 관계...
119,13-04 - 물건중심 언어(object oriented language)와의 관계...
120,14-01 - 유일한 특성 상속(inheritance) - 물건중심의 언어(obje...
121,14-02 - 실행중 결정되는 함수의 실체(late binding dynamic b...


<br>

### Reference

<b>GitHub</b>
<br>[serithemage](https://github.com/serithemage/python_exercise/blob/master/downloader/downloader.py)

<br><b>MOOC</b>
<br>[Kwangkeun Yi 4190.310 Programming Languages](http://ropas.snu.ac.kr/~kwang/4190.310/mooc/)