## Video Downloader (BeautifulSoup)

<br>

### Development Environment

In [1]:
import os
import re
import time
import requests
import urllib
import threading
import pandas as pd
from glob import glob
from bs4 import BeautifulSoup
from urllib.request import urlretrieve

### Page Information

In [2]:
def make_page_soup(link, timeout=30):
    """Fetch ``link`` and return both a raw page string and its parsed soup.

    Parameters
    ----------
    link : str
        URL of the page to fetch.
    timeout : float or tuple, optional
        Passed to ``requests.get`` so that a stalled server raises instead of
        blocking the notebook forever. Defaults to 30 seconds.

    Returns
    -------
    page : str
        ``str()`` of the raw response bytes, i.e. the ``b'...'`` repr and not
        decoded text. ``getURL`` only scans it for ``a href`` and double
        quotes, so the escaping of other bytes does not matter there.
    soup : bs4.BeautifulSoup
        The response text parsed with ``html.parser``.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    requests.Timeout
        If no response arrives within ``timeout``.
    """
    response = requests.get(link, timeout=timeout)
    response.raise_for_status()
    # Force EUC-KR decoding; this must be set before response.text is read.
    response.encoding = 'EUC-KR'
    page = str(response.content)
    soup = BeautifulSoup(response.text, 'html.parser')

    return page, soup

In [3]:
# URL of the course page; `page` is the raw page string and `soup` the parsed
# document, both produced by make_page_soup and reused by the cells below.
link = "http://ropas.snu.ac.kr/~kwang/4190.310/mooc/"
page, soup = make_page_soup(link)

### Video File Name Dataframe

In [4]:
def make_title_list(soup):
    """Return the cleaned text of every ``<h4>`` heading that mentions "Part".

    Colons and newlines are stripped from each kept heading; headings without
    the substring "Part" are dropped. Order follows ``soup.select("h4")``.
    """
    heading_texts = (node.get_text() for node in soup.select("h4"))

    return [
        heading.replace(':', '').replace('\n', '')
        for heading in heading_texts
        if "Part" in heading
    ]

In [289]:
def make_subtitle_list(soup):
    """Extract the de-duplicated subtitle strings from the page's ``<li>`` items.

    The string form of ``soup.select("li")`` is split on ``"<li>"``. Fragments
    that contain ``a href`` (the video links) or are at most one character
    long are ignored. From every other fragment the text before the first
    ``"<ul>"`` is kept, stripped, and cleaned of ``?``, ``&``, ``;``, ``:``
    and ``,``. The literal "강의 마무리" is always appended, and duplicates are
    removed while keeping first-seen order.
    """
    fragments = str(soup.select("li")).split("<li>")

    cleaned = []
    for fragment in fragments:
        if "a href" in fragment or len(fragment) <= 1:
            continue
        # When "<ul>" is absent, find() returns -1 and the slice drops the
        # last character of the fragment, exactly as before.
        heading = fragment[:fragment.find("<ul>")].strip().replace("?", "")
        cleaned.append(re.sub('(&|;|:|,)', '', heading))
    cleaned.append("강의 마무리")

    # dict keys keep insertion order, so this is an order-preserving de-dup.
    return list(dict.fromkeys(cleaned))

In [290]:
def make_single_title_list(soup):
    """Return the cleaned link text of every ``<a>`` element in ``soup``.

    For each anchor, the text between the first ``>`` and the first ``</a>``
    of its string form is taken and stripped. Anchors whose inner text
    contains ``font`` or ``img`` are skipped. Double quotes, newlines, ``?``,
    ``&``, ``;``, ``:`` and ``,`` are removed and tabs become spaces.
    """
    single_title_list = []
    for anchor in soup.select("a"):
        markup = str(anchor)
        inner = markup[markup.find(">") + 1:markup.find("</a>")].strip()
        if 'font' in inner or 'img' in inner:
            continue
        for old, new in (('"', ''), ('\n', ''), ('\t', ' '), ("?", "")):
            inner = inner.replace(old, new)
        single_title_list.append(re.sub('(&|;|:|,)', '', inner))

    return single_title_list

In [291]:
def make_idx_list(single_title_list, if_statement_interval):
    """Return slice boundaries into ``single_title_list``.

    A boundary is recorded right after every title that contains (as a
    substring) any entry of ``if_statement_interval``. The list always starts
    with 0, so consecutive pairs can be used as ``[start:stop]`` slices.

    Parameters
    ----------
    single_title_list : list of str
        Video titles in page order.
    if_statement_interval : iterable of str
        Marker substrings; a title containing any of them closes a group.

    Returns
    -------
    list of int
        ``[0, p1 + 1, p2 + 1, ...]`` where each ``p`` is the position of a
        matching title.
    """
    idx_list = [0]
    # enumerate gives the real position of each title. list.index() would
    # return the first occurrence, which is wrong for repeated titles.
    for position, single_title in enumerate(single_title_list):
        if any(interval in single_title for interval in if_statement_interval):
            idx_list.append(position + 1)

    return idx_list

In [292]:
def make_video_title_dict(idx_list, title_list, subtitle_list, single_title_list, if_statement_interval):
    """Build a nested mapping ``title -> {subtitle -> [video titles]}``.

    Parameters
    ----------
    idx_list : list of int
        Slice boundaries into ``single_title_list``; consecutive pairs
        ``idx_list[j]:idx_list[j+1]`` are used as one group of video titles.
    title_list : list of str
        Top-level titles; they become the keys of the result.
    subtitle_list : list of str
        Subtitles, consumed front to back as groups are assigned to titles.
    single_title_list : list of str
        Individual video titles.
    if_statement_interval : sequence of str
        The first four entries are looked up in ``subtitle_list``; their
        positions (+1) decide where one batch of groups ends.

    Returns
    -------
    dict
        Titles that received a batch map to ``{subtitle: [video titles]}``;
        any remaining title keeps the integer position it was seeded with.

    Raises
    ------
    ValueError
        From ``list.index`` if one of the first four entries of
        ``if_statement_interval`` is not in ``subtitle_list``.
    IndexError
        If ``if_statement_interval`` has fewer than four entries.
    """

    nested_single_list = []
    # 1-based position of each boundary subtitle inside subtitle_list.
    interval_01 = subtitle_list.index(if_statement_interval[0]) + 1
    interval_02 = subtitle_list.index(if_statement_interval[1]) + 1
    interval_03 = subtitle_list.index(if_statement_interval[2]) + 1
    interval_04 = subtitle_list.index(if_statement_interval[3]) + 1
        
    for i in range(len(idx_list)):
        if i == interval_01:
            # First batch: groups 0 .. interval_01 - 1.
            inner_list = []
            for j in range(i):
                # Always true for j from range(i); kept as in the original.
                if j > -1:
                    inner_list.append(single_title_list[idx_list[j]:idx_list[j+1]])
            nested_single_list.append(inner_list)
        elif i == interval_02:
            # Second batch: groups interval_01 .. interval_02 - 1.
            inner_list = []
            for j in range(i):
                if j >= interval_01:
                    inner_list.append(single_title_list[idx_list[j]:idx_list[j+1]])
            nested_single_list.append(inner_list)
        elif i == interval_03:
            # Third batch is collected but NOT appended here.
            # NOTE(review): together with the branch below this merges the
            # third and fourth ranges into one batch - confirm it is intended.
            inner_list = []
            for j in range(i):
                if j >= interval_02:
                    inner_list.append(single_title_list[idx_list[j]:idx_list[j+1]])  
        elif i == interval_04:
            # Extends the inner_list left over from an earlier branch (it is
            # not reset here), then appends the combined batch.
            # NOTE(review): if no earlier branch ran, inner_list is unbound.
            for j in range(i):
                if j >= interval_03:
                    inner_list.append(single_title_list[idx_list[j]:idx_list[j+1]])  
            nested_single_list.append(inner_list)

    # Seed every title with its position; titles that get a batch below are
    # overwritten with a dict, the others keep the integer.
    video_title_dict = {val: key for key, val in enumerate(title_list)}
    for i in range(len(nested_single_list)):
        # Pair the leading subtitles with this batch's groups, in order.
        video_title_dict[title_list[i]] = {subtitle_list[j] : k for j, k in enumerate(nested_single_list[i])}
        # Drop the subtitles just used (rebinds the local name only).
        subtitle_list = subtitle_list[len(nested_single_list[i]):]
        
    return video_title_dict

In [293]:
def make_video_title_df(video_title_dict):
    """Flatten the nested title mapping into a one-column DataFrame.

    Each video becomes one row of the form
    ``"<video title> - <subtitle> - <title>"``, in dictionary order, with a
    0-based integer index. The frame is also written to
    ``video_title_df.xlsx`` in the current working directory.

    Parameters
    ----------
    video_title_dict : dict
        ``{title: {subtitle: [video titles]}}``. Titles whose value is not a
        dict are skipped; make_video_title_dict seeds each title with an
        integer before filling some of them in.

    Returns
    -------
    pandas.DataFrame
        A single ``title`` column; empty when there are no videos.
    """
    rows = []
    for title, subtitles in video_title_dict.items():
        if not isinstance(subtitles, dict):
            # Placeholder value: this title has no videos attached.
            continue
        for subtitle, single_titles in subtitles.items():
            for single_title in single_titles:
                rows.append(single_title + " - " + subtitle + " - " + title)

    # Build the frame once. Growing it with .loc from an integer placeholder
    # row upcasts the column on the first write and leaves a stray 0 row
    # behind when there is nothing to add.
    video_title_df = pd.DataFrame({'title': rows})

    video_title_df.to_excel("video_title_df.xlsx")

    return video_title_df

In [294]:
# Extract the three levels of names from the parsed page: <h4> headings,
# <li> subtitles, and the text of each <a> link.
title_list = make_title_list(soup)
subtitle_list = make_subtitle_list(soup)
single_title_list = make_single_title_list(soup)

In [295]:
# Marker strings for make_idx_list: a boundary is recorded right after every
# video title that contains one of these, so each entry is presumably the
# last video of its subtitle group (verify against the page if it changes).
if_statement_interval = ["프로그래밍 언어 배우기", "귀납 증명 예제", "형식논리(formal logic) 표기법", 
                         "설탕 구조(syntactic sugar)", "메모리 주소의 유효기간(lifetime)", 
                         "K- 언어 전체 의미 파악 예제", "stop amp copy 알고리즘", 
                         "강의 전반부 마무리 및 강의 후반부 소개", "의미 정의 설탕 구조", 
                         "동일화 알고리즘은 충실한 구현", "기계중심 언어의 다형 타입 추론 2", 
                         "물건중심 언어(object oriented language)와의 관계 2", 
                         "실행중 결정되는 함수의 실체(late binding dynamic binding)", "강의 리뷰"]

idx_list = make_idx_list(single_title_list, if_statement_interval)

In [296]:
# NOTE: this rebinds if_statement_interval with a different meaning than in
# the previous cell. Here the entries are subtitles that
# make_video_title_dict looks up in subtitle_list to decide where each batch
# of subtitle groups ends.
if_statement_interval = ["추론규칙(inference rules)", "번역(translation)과 가상 기계(virtual machine)", 
                         "물건중심의 언어(object-oriented language)", "강의 마무리"]

video_title_dict = make_video_title_dict(idx_list, title_list, subtitle_list, single_title_list, if_statement_interval)

In [298]:
# Flatten the nested mapping into one row per video (also writes video_title_df.xlsx).
video_title_df = make_video_title_df(video_title_dict)

In [299]:
# Display the title table as the cell output.
video_title_df

Unnamed: 0,title
0,강의의 목표 - 프로그래밍 언어란 무엇인가 - Part 0 기초
1,강의 내용 개괄 - 프로그래밍 언어란 무엇인가 - Part 0 기초
2,프로그래밍 언어란 무엇인가 - 프로그래밍 언어란 무엇인가 - Part 0 기초
3,프로그래밍 언어의 쓰임새 - 프로그래밍 언어란 무엇인가 - Part 0 기초
4,프로그래밍 언어 배우기 - 프로그래밍 언어란 무엇인가 - Part 0 기초
...,...
118,물건중심 언어(object oriented language)와의 관계 1 - 계층타...
119,물건중심 언어(object oriented language)와의 관계 2 - 계층타...
120,유일한 특성 상속(inheritance) - 물건중심의 언어(object-orien...
121,실행중 결정되는 함수의 실체(late binding dynamic binding) ...


### Video Downloader

In [189]:
def getURL(page):
    """Return the first double-quoted value after ``a href`` and its end offset.

    Returns ``(None, 0)`` when ``page`` contains no ``a href``. Otherwise the
    result is ``(url, end_quote)``, where ``url`` is the text between the next
    two double quotes and ``end_quote`` is the index of the closing quote, so
    the caller can continue scanning from ``page[end_quote:]``.
    """
    if "a href" not in page:
        return None, 0

    opening = page.find('"', page.find("a href"))
    closing = page.find('"', opening + 1)

    return page[opening + 1:closing], closing

In [190]:
def download(url, fileName, file_new_name):
    """Download ``url`` to ``fileName`` and then move it to ``file_new_name``.

    The file is fetched under its temporary name first, so the final name
    only appears once the transfer has finished.

    Parameters
    ----------
    url : str
        Address of the file to fetch.
    fileName : str
        Local path used while downloading.
    file_new_name : str
        Final local path.
    """
    urlretrieve(url, fileName)
    # os.replace overwrites an existing target on every platform. os.rename
    # raises on Windows when the target exists, which breaks notebook re-runs.
    os.replace(fileName, file_new_name)

In [None]:
def video_downloader(page, viedo_title_df, file_extension, base_url=None, max_workers=10):
    """Download every linked file ending in ``file_extension``, renamed by title.

    ``page`` is scanned with ``getURL``. Each matching link is downloaded in a
    background thread and saved as
    ``"<link without extension> - <title><file_extension>"``, where the title
    is row ``idx`` of ``viedo_title_df`` (rows are consumed in link order).
    Progress is printed ten entries per line. The function returns once all
    downloads it started have finished.

    Parameters
    ----------
    page : str
        Raw page string to scan for ``a href`` links.
    viedo_title_df : pandas.DataFrame
        Frame with a ``title`` column, indexed 0..n-1 in link order.
    file_extension : str
        Suffix a link must end with, e.g. ``'.mp4'``.
    base_url : str, optional
        Prefix prepended to each relative link. Defaults to the notebook-level
        ``link`` variable.
    max_workers : int, optional
        Maximum number of simultaneous downloads (default 10).

    Raises
    ------
    KeyError
        If the page has more matching links than ``viedo_title_df`` has rows.
    """
    if base_url is None:
        # The original body read a global named `target` that the notebook
        # never defines; the page URL it does define is `link`.
        base_url = link

    # Limit concurrency with our own semaphore. threading.active_count() also
    # counts the main thread and any unrelated kernel threads, so it can
    # neither throttle reliably nor ever drop to zero.
    slots = threading.BoundedSemaphore(max(1, max_workers))
    suffix_len = len(file_extension)

    def _run(file_url, file_name, file_new_name):
        try:
            download(file_url, file_name, file_new_name)
        finally:
            slots.release()

    workers = []
    idx = -1
    while True:
        url, n = getURL(page)
        page = page[n:]

        if not url:
            # No further links (or an empty href): stop scanning.
            break
        if not url.endswith(file_extension):
            continue

        idx += 1
        if idx % 10 == 0 and idx != 0:
            print()
        stem = url[:len(url) - suffix_len]
        if len(stem) > 1:
            print(stem, end="  |  ")
        file_new_name = stem + " - " + viedo_title_df.loc[idx, 'title'] + file_extension

        slots.acquire()  # blocks while max_workers downloads are running
        t = threading.Thread(target=_run, args=(base_url + url, url, file_new_name))
        t.daemon = True
        t.start()
        workers.append(t)

    # Wait for the downloads started above instead of polling the global
    # thread count, which never terminates.
    for t in workers:
        t.join()
    

In [200]:
def check_download_file(video_title_df, file_extension):
    """List downloaded files and report whether all titles are accounted for.

    Parameters
    ----------
    video_title_df : pandas.DataFrame
        Title table; only its length is used.
    file_extension : str
        Either a bare extension such as ``'.mp4'`` or a glob pattern such as
        ``'*.mp4'``. A value without wildcard characters is prefixed with
        ``*``.

    Returns
    -------
    pandas.DataFrame
        A single ``file`` column with the matching file names from the current
        working directory, sorted by name.

    Prints "Video Download Complete" when the number of matching files equals
    the number of rows in ``video_title_df``.
    """
    # Honour the argument instead of a hard-coded "*.mp4", accepting both
    # calling styles used in this notebook.
    has_wildcard = any(ch in file_extension for ch in "*?[")
    pattern = file_extension if has_wildcard else "*" + file_extension

    # glob() returns names in arbitrary order; sort for a stable listing.
    video_file_list = sorted(glob(pattern))
    if len(video_file_list) == len(video_title_df):
        print("Video Download Complete")
    video_file_df = pd.DataFrame({'file': video_file_list})

    return video_file_df

In [191]:
# Pass the title table built above; `viedo_title_df` was never defined and
# raised NameError on a fresh kernel.
video_downloader(page, video_title_df, '.mp4')

01-01  |  01-02  |  01-03  |  01-04  |  01-05  |  02-01  |  02-02  |  02-03  |  02-04  |  02-05  |  
02-06  |  02-07  |  02-08  |  02-09  |  02-10  |  03-01  |  03-02  |  03-03  |  03-04  |  03-05  |  
03-06  |  04-01  |  04-02  |  04-03  |  04-04  |  04-05  |  04-06  |  04-07  |  05-01  |  05-02  |  
05-03  |  05-04  |  05-05  |  05-06  |  05-07  |  05-08  |  05-09  |  05-10  |  05-11  |  06-01  |  
06-02  |  06-03  |  06-04  |  06-05  |  06-06  |  06-07  |  06-08  |  06-09  |  06-10  |  06-11  |  
06-12  |  06-13  |  06-14  |  06-15  |  06-16  |  06-17  |  07-01  |  07-02  |  07-03  |  07-04  |  
07-05  |  07-06  |  08-01  |  08-02  |  08-03  |  08-04  |  08-05  |  08-06  |  09-01  |  09-02  |  
09-03  |  09-04  |  09-05  |  09-06  |  09-07  |  09-08  |  09-09  |  09-10  |  09-11  |  09-12  |  
09-13  |  10-01  |  10-02  |  11-01  |  11-02  |  11-03  |  11-04  |  11-05  |  11-06  |  11-07  |  
11-08  |  11-09  |  11-10  |  11-11  |  11-12  |  11-13  |  11-14  |  11-15  |  11-16  |  1

KeyboardInterrupt: 

In [204]:
# List the downloaded files and compare their count with the title table,
# then display the listing as the cell output.
video_file_df = check_download_file(video_title_df, "*.mp4")
video_file_df

Video Download Complete


Unnamed: 0,file
0,01-01 - 강의의 목표 - 프로그래밍 언어란 무엇인가 - Part 0 기초.mp4
1,01-02 - 강의 내용 개괄 - 프로그래밍 언어란 무엇인가 - Part 0 기초.mp4
2,01-03 - 프로그래밍 언어란 무엇인가 - 프로그래밍 언어란 무엇인가 - Part...
3,01-04 - 프로그래밍 언어의 쓰임새 - 프로그래밍 언어란 무엇인가 - Part ...
4,01-05 - 프로그래밍 언어 배우기 - 프로그래밍 언어란 무엇인가 - Part 0...
...,...
118,13-03 - 물건중심 언어(object oriented language)와의 관계...
119,13-04 - 물건중심 언어(object oriented language)와의 관계...
120,14-01 - 유일한 특성 상속(inheritance) - 물건중심의 언어(obje...
121,14-02 - 실행중 결정되는 함수의 실체(late binding dynamic b...


<br>

### Reference

<b>GitHub</b>
<br>[serithemage](https://github.com/serithemage/python_exercise/blob/master/downloader/downloader.py)

<br><b>MOOC</b>
<br>[Kwangkeun Yi 4190.310 Programming Languages](http://ropas.snu.ac.kr/~kwang/4190.310/mooc/)