## Two Minute Papers @ Youtube

this script parses videos of https://www.youtube.com/c/K%C3%A1rolyZsolnai/videos  
the index was obtained manually  
Youtube has different layouts, to prevent scraping  

In [39]:
# imports
import requests
import json
import os
import sys
import time
import math
import pandas as pd
import numpy as np
import platform
import datetime
import urllib.parse
import re

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options

In [2]:
# let's speed up the scraping proccess by scraping multiple urls in one batch

# generic url scraper

def url_scraper_batch(
                urlbase = '',
                urls = [],
                selenium = True,
                windowSize = "1280,720",
                headless = True,
                quitOnEnd = True,
                waitForElement = '',
                waitForId = '',
                waitForClass = '',
                waitForIframeById = '',
                pressLink = [],
                waitBetweenPress = 30,
                waitUntilTimeout = 20,
               ):
    #meta = {
    #    'url': url
    #}
    
    # simple version
    if selenium == False:
        #page = requests.get(url)
        #meta['status'] = page.status_code
        #html = page.content
        print('batched version is for selenium only')
        return ''
      
    # selenium version
    if selenium == True:
        CHROMEDRIVER_PATH = './.selenium/chromedriver'
        WINDOW_SIZE = windowSize

        chrome_options = Options()
        if headless:
            chrome_options.add_argument("--headless")
        chrome_options.add_argument("--window-size=%s" % WINDOW_SIZE)

        driver = webdriver.Chrome(executable_path=CHROMEDRIVER_PATH, options=chrome_options)
        driver.implicitly_wait(waitBetweenPress)
        
        result = []
        for url in urls:
            try:
                item = {
                    'url': url
                }

                driver.get(urlbase + url)

                # wait for a given tag to be loaded (javascript generated code)
                if waitForElement != '':
                    try:
                        element = WebDriverWait(driver, waitUntilTimeout).until(EC.visibility_of_element_located((By.TAG_NAME, waitForElement)))
                        print ("Page is ready!", urlbase + url)
                    except TimeoutException:
                        print ("element - loading took too much time!", urlbase + url)

                # wait for a given ID to be loaded (javascript generated code)
                if waitForId != '':
                    try:
                        element = WebDriverWait(driver, waitUntilTimeout).until(EC.visibility_of_element_located((By.ID, waitForId)))
                        print ("Page is ready!", urlbase + url)
                    except TimeoutException:
                        print ("id - loading took too much time!", urlbase + url)

                # wait for a given class to be loaded (javascript generated code)
                if waitForClass != '':
                    try:
                        # not-found
                        element = WebDriverWait(driver, waitUntilTimeout).until(EC.visibility_of_element_located((By.CLASS_NAME, waitForClass)))
                        print ("Page is ready!", urlbase + url)
                    except TimeoutException:
                        print ("class - loading took too much time!", urlbase + url)

                # wait for a given ID to be loaded (javascript generated code)
                if waitForIframeById != '':
                    try:
                        element = WebDriverWait(driver, waitUntilTimeout).until(EC.presence_of_element_located((By.ID, waitForIframeById)))

                        driver.switch_to.frame(element)

                        item['iframe'] = driver.page_source

                        driver.switch_to.default_content()

                        print ("Page is ready!", urlbase + url)
                    except TimeoutException:
                        print ("iframe - loading took too much time!", urlbase + url)

                # press link
                if len(pressLink)>0:
                    driver.implicitly_wait(waitBetweenPress)
                    try:
                        for link in pressLink:
                            print(link)
                            element = WebDriverWait(driver, waitUntilTimeout).until(EC.visibility_of_element_located((By.LINK_TEXT, link)))
                            element.click()
                    except TimeoutException:
                        print ("link - loading took too much time!", urlbase + url)

                item['html'] = driver.page_source
                result.append(item)
                
            except Exception as e:
                print("Oops!", e.__class__, "occurred.")
                print(e)
                #break
            
            
        if quitOnEnd:
            driver.close()
    
    return result

urlbase = ''
urls = [
    'https://thecleverprogrammer.com/2020/11/15/machine-learning-projects/',
]

result = url_scraper_batch('', urls, waitForClass='ct-image-container', headless = False, quitOnEnd = True)
#print(result)
for item in result:
    print(item['url'], len(item['html']))
    #print(len(item['iframe']))

Page is ready! https://thecleverprogrammer.com/2020/11/15/machine-learning-projects/
https://thecleverprogrammer.com/2020/11/15/machine-learning-projects/ 174008


In [2]:
# generic store data to file function
def store_data(data, file, mode='w', toJson=False):
    if toJson:
        data = json.dumps(data)
    with open(file, mode, encoding='utf-8') as fp:
        result = fp.write(data)
        return result
    
# generic load data from file function
def load_data(file, fromJson=False):
    if os.path.isfile(file):
        with open(file, 'r', encoding='utf-8', errors="ignore") as fp:
            data = fp.read()
            if fromJson:
                data = json.loads(data)
            return data
    else:
        return 'file not found'

# test text
#print(store_data('Hello', '../data/repositories/mlart/test.txt'))
#print(load_data('../data/repositories/mlart/test.txt'))

# test json
#print(store_data({'msg':'Hello World'}, '../data/repositories/mlart/test.json', toJson=True))
#print(load_data('../data/repositories/mlart/test.json', fromJson=True))

#store_data(result[0]['html'], '../data/repositories/kaggle/notebook.html')
#store_data(result[0]['iframe'], '../data/repositories/kaggle/kernel.html')

In [3]:
# helper function to create folder create_folder
def create_folder(path):
    if not os.path.exists(os.path.dirname(path)):
        try:
            os.makedirs(os.path.dirname(path))
            print(path + ' created')
        except OSError as exc: # Guard against race condition
            if exc.errno != errno.EEXIST:
                raise

In [40]:
# scoring function to get a score between 0...1 for integer-values, 0.5 should be at ~100
def score(n, precision=3):
    if isinstance(n, int) or isinstance(i, float):
        return round(1-1/math.pow(1+n, 0.15), precision)
    else:
        return 0

for n in [0,1,10,25,50,100,1000,10000]:
    print(score(n))

0.0
0.099
0.302
0.387
0.446
0.5
0.645
0.749


In [12]:
# scrape links from index.html

folder = '../data/repositories/twominutepapers/'
file = 'index.html'
out = 'items.json'
#url = 'https://thecleverprogrammer.com/2020/11/15/machine-learning-projects/'

def scrape_links(html):
    soup = BeautifulSoup(html, 'html.parser')
    result = []
    
    partial = soup.find('ytd-section-list-renderer', class_="style-scope ytd-two-column-browse-results-renderer")
    if partial == None:
        return []
    items = partial.find_all('a', class_='yt-simple-endpoint style-scope ytd-grid-video-renderer') #.find_all('a', {"role": "listitem"})
    #print(len(items))
    
    for item in items:
        title = item.text.encode('ascii', 'ignore').decode('utf-8', 'ignore').strip()
        link = item.get('href')
        result.append({title: link})
        #result.append(link)
        #print(link)
    
    return result

links = []

# run once to test
html = load_data(folder+file)
links = scrape_links(html)
print(links)
store_data(links, folder+out, toJson=True)

[{'Building A Liquid Labyrinth!': '/watch?v=nJ86LCA0Asw'}, {'OpenAI DALLE: Fighter Jet For The Mind!': '/watch?v=C7D5EzkhT6A'}, {'Light Fields - Videos From The Future!': '/watch?v=9XM5-CJzrU0'}, {'NERFIES: The Selfies of The Future!': '/watch?v=IDMiMKWucaI'}, {'This AI Gave Elon Musk A Majestic Beard!': '/watch?v=Lt4Z5oOAeEY'}, {'Is Simulating Jelly And Bunnies Possible?': '/watch?v=tiO43nJKGJY'}, {'Painting the Mona Lisa...With Triangles!': '/watch?v=JmVQJg-glYA'}, {'Can An AI Design Our Tax Policy?': '/watch?v=Sr2ga3BBMTc'}, {'What Is 3D Photography?': '/watch?v=BjkgyKEQbSM'}, {'Soft Body Wiggles And JigglesEffortlessly!': '/watch?v=s8Nm_ytwO6w'}, {'Simulating Honey And Hot Showers For Bunnies!': '/watch?v=K940MNp7V8M'}, {'These Are Pixels Made of Wood!': '/watch?v=fPrxiRceAac'}, {'This Blind Robot Learned To Climb Any Terrain!': '/watch?v=knIzDj1Ocoo'}, {'Remember, This Meeting Never Happened!': '/watch?v=2pWK0arWAmU'}, {'AI-Based Sky Replacement Is Here!': '/watch?v=l_C3KFeI_l0'},

45836

In [30]:
# download projects

file = '../data/repositories/twominutepapers/items.json'
path = '../data/repositories/twominutepapers/items/html/'
urlbase = 'https://www.youtube.com'

df_links = load_data(file, fromJson=True)
print(len(df_links))
batch_size = 100

chunks = [df_links[i:i + batch_size] for i in range(0, len(df_links), batch_size)]
chunks_len = len(chunks)
print('total/chunks', len(df_links), len(chunks))

for i, chunk in enumerate(chunks):
    print('### chunk', i, '/', chunks_len, '###')
    
    # check if chunk is already scraped
    chunk_dataset = []
    names = []
    for j, item in enumerate(chunk):
        print('# chunk', i, '/', chunks_len, '# item ', j, item)
        
        #name = list(item.keys())[0]
        #name = name.replace('!','').replace('?','').replace('.','').replace(',','').replace('"','').replace(' ','_') + '.html'
        link = list(item.values())[0]
        name = link.split('/watch?v=')[-1] + '.html'
        
        print(name, link)
        #sys.exit()
    
        # check datasets
        file = path + name
        
        if not os.path.isfile(file):
            chunk_dataset.append(link)
            names.append(name)
    
    # get content for chunk
    if len(chunk_dataset) > 0:
        result = url_scraper_batch(urlbase, chunk_dataset, waitForClass='yt-simple-endpoint', headless = False, quitOnEnd = True)
    
        for k, item in enumerate(result):
            print(item['url'], len(item['html']))
            #name = names[k]
            # /watch?v=Tb6-JfI0HA0
            name = item['url'].split('/watch?v=')[-1] + '.html'
            file = path + name
            print(file)
            store_data(item['html'], file)



550
total/chunks 550 6
### chunk 0 / 6 ###
# chunk 0 / 6 # item  0 {'Building A Liquid Labyrinth!': '/watch?v=nJ86LCA0Asw'}
nJ86LCA0Asw.html /watch?v=nJ86LCA0Asw
# chunk 0 / 6 # item  1 {'OpenAI DALLE: Fighter Jet For The Mind!': '/watch?v=C7D5EzkhT6A'}
C7D5EzkhT6A.html /watch?v=C7D5EzkhT6A
# chunk 0 / 6 # item  2 {'Light Fields - Videos From The Future!': '/watch?v=9XM5-CJzrU0'}
9XM5-CJzrU0.html /watch?v=9XM5-CJzrU0
# chunk 0 / 6 # item  3 {'NERFIES: The Selfies of The Future!': '/watch?v=IDMiMKWucaI'}
IDMiMKWucaI.html /watch?v=IDMiMKWucaI
# chunk 0 / 6 # item  4 {'This AI Gave Elon Musk A Majestic Beard!': '/watch?v=Lt4Z5oOAeEY'}
Lt4Z5oOAeEY.html /watch?v=Lt4Z5oOAeEY
# chunk 0 / 6 # item  5 {'Is Simulating Jelly And Bunnies Possible?': '/watch?v=tiO43nJKGJY'}
tiO43nJKGJY.html /watch?v=tiO43nJKGJY
# chunk 0 / 6 # item  6 {'Painting the Mona Lisa...With Triangles!': '/watch?v=JmVQJg-glYA'}
JmVQJg-glYA.html /watch?v=JmVQJg-glYA
# chunk 0 / 6 # item  7 {'Can An AI Design Our Tax Policy?'

# chunk 1 / 6 # item  17 {'Cubify All The Things!': '/watch?v=g1sAjtDoItE'}
g1sAjtDoItE.html /watch?v=g1sAjtDoItE
# chunk 1 / 6 # item  18 {'AI Learns To Compute Game Physics In Microseconds': '/watch?v=atcKO15YVD8'}
atcKO15YVD8.html /watch?v=atcKO15YVD8
# chunk 1 / 6 # item  19 {'AIs Are Getting Too Smart - Time For A New "IQ Test': '/watch?v=nSHU-4Yt4eQ'}
nSHU-4Yt4eQ.html /watch?v=nSHU-4Yt4eQ
# chunk 1 / 6 # item  20 {'OpenAI Plays Hide and Seekand Breaks The Game!': '/watch?v=Lu56xVlZ40M'}
Lu56xVlZ40M.html /watch?v=Lu56xVlZ40M
# chunk 1 / 6 # item  21 {'AI Learns Human Movement From Unorganized Data': '/watch?v=882O_7hsAms'}
882O_7hsAms.html /watch?v=882O_7hsAms
# chunk 1 / 6 # item  22 {'Is a Realistic Honey Simulation Possible?': '/watch?v=7SM816P5G9s'}
7SM816P5G9s.html /watch?v=7SM816P5G9s
# chunk 1 / 6 # item  23 {'DeepFake Detector AIs Are Good Too!': '/watch?v=RoGHVI-w9bE'}
RoGHVI-w9bE.html /watch?v=RoGHVI-w9bE
# chunk 1 / 6 # item  24 {'Finally, Style Transfer For Smoke Simul

# chunk 2 / 6 # item  49 {'4 Experiments Where the AI Outsmarted Its Creators': '/watch?v=GdTBqBnqhaQ'}
GdTBqBnqhaQ.html /watch?v=GdTBqBnqhaQ
# chunk 2 / 6 # item  50 {'Gaussian Material Synthesis (SIGGRAPH 2018)': '/watch?v=6FzVhIV_t3s'}
6FzVhIV_t3s.html /watch?v=6FzVhIV_t3s
# chunk 2 / 6 # item  51 {'Evolving Generative Adversarial Networks | Two Minute Papers #242': '/watch?v=ni6P5KU3SDU'}
ni6P5KU3SDU.html /watch?v=ni6P5KU3SDU
# chunk 2 / 6 # item  52 {'Das tuscht deine Sicht | Two Minute Papers # 241': '/watch?v=AbxPbfODGcs'}
AbxPbfODGcs.html /watch?v=AbxPbfODGcs
# chunk 2 / 6 # item  53 {'One Pixel Attack Defeats Neural Networks | Two Minute Papers #240': '/watch?v=SA4YEAWVpbk'}
SA4YEAWVpbk.html /watch?v=SA4YEAWVpbk
# chunk 2 / 6 # item  54 {"DeepMind's AI Learns Complex Behaviors From Scratch | Two Minute Papers #239": '/watch?v=veWkBsK0nwU'}
veWkBsK0nwU.html /watch?v=veWkBsK0nwU
# chunk 2 / 6 # item  55 {"DeepMind's AI Masters Even More Atari Games | Two Minute Papers #238": '/w

# chunk 3 / 6 # item  84 {'3D Printing Acoustic Filters | Two Minute Papers #109': '/watch?v=7JbN9vXxGYE'}
7JbN9vXxGYE.html /watch?v=7JbN9vXxGYE
# chunk 3 / 6 # item  85 {'Synchronizing Animations To Sound | Two Minute Papers #108': '/watch?v=aMo7pkkaZ9o'}
aMo7pkkaZ9o.html /watch?v=aMo7pkkaZ9o
# chunk 3 / 6 # item  86 {'Deep Learning Program Simplifies Your Drawings | Two Minute Papers #107': '/watch?v=4MfG9CDufPA'}
4MfG9CDufPA.html /watch?v=4MfG9CDufPA
# chunk 3 / 6 # item  87 {'Human Pose Estimation With Deep Learning | Two Minute Papers #106': '/watch?v=NnzzSkKKoa8'}
NnzzSkKKoa8.html /watch?v=NnzzSkKKoa8
# chunk 3 / 6 # item  88 {'Computer Games Empower Deep Learning Research | Two Minute Papers #105': '/watch?v=QkqNzrsaxYc'}
QkqNzrsaxYc.html /watch?v=QkqNzrsaxYc
# chunk 3 / 6 # item  89 {'Building a Community Around Two Minute Papers': '/watch?v=sWZQxB2es88'}
sWZQxB2es88.html /watch?v=sWZQxB2es88
# chunk 3 / 6 # item  90 {'How To Steal a Lost Election With Gerrymandering | Two Minu

# chunk 5 / 6 # item  10 {'TU Wien Rendering #32 - Bidirectional Path Tracing, Multiple Importance Sampling': '/watch?v=RuBjYa4Q3dA'}
RuBjYa4Q3dA.html /watch?v=RuBjYa4Q3dA
# chunk 5 / 6 # item  11 {'TU Wien Rendering #31 - Unbiased, Consistent Algorithm Classes': '/watch?v=LB6NGEHtD7Y'}
LB6NGEHtD7Y.html /watch?v=LB6NGEHtD7Y
# chunk 5 / 6 # item  12 {'TU Wien Rendering #30 - Dispersion and Spectral Rendering': '/watch?v=UcI-RnWzASk'}
UcI-RnWzASk.html /watch?v=UcI-RnWzASk
# chunk 5 / 6 # item  13 {'TU Wien Rendering #29 - Path Tracing Implementation & Code Walkthrough': '/watch?v=cDi-uti2oLQ'}
cDi-uti2oLQ.html /watch?v=cDi-uti2oLQ
# chunk 5 / 6 # item  14 {'TU Wien Rendering #28 - Assignment 3': '/watch?v=z9p2nis3amM'}
z9p2nis3amM.html /watch?v=z9p2nis3amM
# chunk 5 / 6 # item  15 {'TU Wien Rendering #27 - Russian Roulette Path Termination': '/watch?v=vPwiqXjDgeo'}
vPwiqXjDgeo.html /watch?v=vPwiqXjDgeo
# chunk 5 / 6 # item  16 {'TU Wien Rendering #26 - Low Discrepancy Sequences': '/watch

In [118]:
def scrape_content(html):
    soup = BeautifulSoup(html, 'html.parser')
    result = {}
    url_blacklist = [
        'https://www.wandb.com/papers',
        'https://www.patreon.com/TwoMinutePapers',
        'https://www.instagram.com/twominutepapers/',
        'https://twitter.com/twominutepapers',
        'https://cg.tuwien.ac.at/~zsolnai/',
        'http://twominutepapers.com/',
        'https://twitter.com/karoly_zsolnai',
        'http://felicia.hu',
    ]
    
    title = soup.find('h1', class_='title') # style-scope ytd-video-primary-info-renderer')
    result['title'] = title.text.encode('ascii', 'ignore').decode('utf-8', 'ignore').strip()
    
    result['views'] = soup.find('span', class_='view-count').text.split(' ')[0] # style-scope yt-view-count-renderer
    result['views'] = result['views'].replace('.', '').strip()
    
    result['date'] = soup.find('div', attrs={"id":"date"}).text
    result['date'] = result['date'].replace('•','').strip()
    
    #result['comments'] = soup.find_all('h2', attrs={"id":"count"}).text
    
    partial = soup.find('div', attrs={"id":"description"})
    partial = urllib.parse.unquote(partial)
    
    d = partial.text.encode('ascii', 'ignore').decode('utf-8', 'ignore').strip()
    #result['description_raw'] = d
    d = d.replace('_','').split('We would like to thank our generous Patreon')[0]
    d = d.split('\n')
    d = [x.strip() for x in d if x != '']
    d = [x for x in d if not 'Check out Weights & Biases' in x]
    d = ' '.join(d)
    result['description'] = d
    
    
    
    items = partial.find_all('a')
    #result['links_raw'] = items
    links = []
    for item in items:
        #title = item.text
        link = item.get('href')
        #print(link)
        link = link.replace('/redirect?q=','')
        #link = link.split('&redir_token')[0]
        link = link.split('=')[-1]
        link = urllib.parse.unquote(link)
        if not link in links:
            #print(link)
            links.append(link)
        
    result['links'] = links
    #print('links:', links)
    
    if len(links) > 0:
        #print('replace links')
        d = d.replace('...','###')
        tag_in = 'http'
        tag_out = '###'
        #regex = re.compile('{}(.*){}'.format(re.escape(tag_in), re.escape(tag_out)))
        #shortened_links = regex.findall(d)
        shortened_links = re.findall(r'http(.+?)(###| )', d)
        #print('shortened_links:', shortened_links)
        for e in shortened_links:
            #print(type(e), e)
            e = e[0].replace(tag_out,'')
            l = [x for x in links if x.find(e) > -1]
            #print(e, l)
            if len(l) > 0:
                #print(e, l)
                d = d.replace(tag_in + e + tag_out, l[0])
                
        d = d.replace('###','...')
        result['description'] = d
        
    result['links'] = [x for x in result['links'] if not x in url_blacklist and 'http' in x and not 'twominutepapers' in x.lower() ]
    
    try:
        t = d.split('The paper')[1]
        t = t.split('is available here')[0]
        result['topic'] = t.replace('"','').strip()
    except:
        result['topic'] = ''
    
    #likes = soup.find('div', attrs={'id': 'menu'})
    #result['likes'] = likes
    
    likes = soup.find_all('yt-formatted-string', class_='style-scope ytd-toggle-button-renderer style-text')
    likes = [x.text for x in likes]
    result['likes'] = likes[0].replace('.','').strip()
    result['dislikes'] = likes[1].replace('.','').strip()
    
    for i in ['views', 'likes', 'dislikes']:
        try:
            result[i] = int(result[i])
        except:
            pass
    
    return result

import time

path_in = '../data/repositories/twominutepapers/items/html/'
path_out = '../data/repositories/twominutepapers/items/json/'
file = '2pWK0arWAmU.html'
file = 'XSWqLb0VyzM.html'
file = 'a-ovvd_ZrmA.html'

html = load_data(path_in+file)
content = scrape_content(html)
print(content)

files = os.listdir(path_in)

#sys.exit()

for i, file in enumerate(files):
    print(i, path+file)
    fp_out = path_out+file.replace('html','json')
    if not os.path.isfile(fp_out):
        html = load_data(path_in+file)
        print(len(html))
        
        try:
            content = scrape_content(html)
            content['url'] = file.replace('.html','')
            #items.append(content)
            store_data(content, fp_out, toJson=True)
        except Exception as e:
            print('error:', file)
            print("Oops!", e.__class__, "occurred.")
            print(e)
            #break

    if i > 5:
        #break
        pass

#print(items)


{'title': "How DeepMind's AlphaGo Defeated Lee Sedol | Two Minute Papers #53", 'views': 14426, 'date': '15.03.2016', 'description': 'This time around, Google DeepMind embarked on a journey to write an algorithm that plays Go. Go is an ancient chinese board game where the opposing players try to capture each other\'s stones on the board. Behind the veil of this deceptively simple ruleset, lies an enormous layer of depth and complexity. As scientists like to say, the search space of this problem is significantly larger than that of chess. So large, that one often has to rely on human intuition to find a suitable next move, therefore it is not surprising that playing Go on a high level is, or maybe was widely believed to be intractable for machines. The result is Google DeepMind\'s AlphaGo, the deep learning technique that defeated a professional player and world champion, Lee Sedol. What it also important to note is that the techniques used in this algorithm are general, and can be used 

52 ../data/repositories/twominutepapers/items/json/_ZLXKt4L-AA.json54YvCE8_7lM.html
3150721
53 ../data/repositories/twominutepapers/items/json/_ZLXKt4L-AA.json58tsN03IXlw.html
3102557
error: 58tsN03IXlw.html
Oops! <class 'AttributeError'> occurred.
'NoneType' object has no attribute 'text'
54 ../data/repositories/twominutepapers/items/json/_ZLXKt4L-AA.json5ePD83StI6A.html
3184698
55 ../data/repositories/twominutepapers/items/json/_ZLXKt4L-AA.json5NM_WBI9UBE.html
3107867
error: 5NM_WBI9UBE.html
Oops! <class 'AttributeError'> occurred.
'NoneType' object has no attribute 'text'
56 ../data/repositories/twominutepapers/items/json/_ZLXKt4L-AA.json5PSWr2ovBvU.html
3193597
57 ../data/repositories/twominutepapers/items/json/_ZLXKt4L-AA.json5vpklJw7uL0.html
3227904
58 ../data/repositories/twominutepapers/items/json/_ZLXKt4L-AA.json5xLSbj5SsSE.html
3252441
59 ../data/repositories/twominutepapers/items/json/_ZLXKt4L-AA.json62Q1NL4k8cI.html
3192380
60 ../data/repositories/twominutepapers/items/json

128 ../data/repositories/twominutepapers/items/json/_ZLXKt4L-AA.jsonBui3DWs02h4.html
3176940
129 ../data/repositories/twominutepapers/items/json/_ZLXKt4L-AA.jsonBv3yat484aQ.html
3447452
error: Bv3yat484aQ.html
Oops! <class 'AttributeError'> occurred.
'NoneType' object has no attribute 'text'
130 ../data/repositories/twominutepapers/items/json/_ZLXKt4L-AA.jsonbVGubOt_jLI.html
3201784
131 ../data/repositories/twominutepapers/items/json/_ZLXKt4L-AA.jsonbVXPnP8k6yo.html
3210935
132 ../data/repositories/twominutepapers/items/json/_ZLXKt4L-AA.jsonbXzauli1TyU.html
3157964
133 ../data/repositories/twominutepapers/items/json/_ZLXKt4L-AA.jsonbYGL3fLYudM.html
3233152
134 ../data/repositories/twominutepapers/items/json/_ZLXKt4L-AA.jsonC3DtGTr0jX8.html
3187446
135 ../data/repositories/twominutepapers/items/json/_ZLXKt4L-AA.jsonC6nonNRoF7g.html
3182209
136 ../data/repositories/twominutepapers/items/json/_ZLXKt4L-AA.jsonC7D5EzkhT6A.html
3313973
137 ../data/repositories/twominutepapers/items/json/_ZLX

203 ../data/repositories/twominutepapers/items/json/_ZLXKt4L-AA.jsong1sAjtDoItE.html
3164301
204 ../data/repositories/twominutepapers/items/json/_ZLXKt4L-AA.jsonGdTBqBnqhaQ.html
3154631
205 ../data/repositories/twominutepapers/items/json/_ZLXKt4L-AA.jsongHMY40kEXzs.html
3158664
206 ../data/repositories/twominutepapers/items/json/_ZLXKt4L-AA.jsonGm7szS1hQxs.html
3145530
207 ../data/repositories/twominutepapers/items/json/_ZLXKt4L-AA.jsongnctSz2ofU4.html
3167967
208 ../data/repositories/twominutepapers/items/json/_ZLXKt4L-AA.jsonGniyQkgGlUA.html
3180488
209 ../data/repositories/twominutepapers/items/json/_ZLXKt4L-AA.jsonGNx8rgNcw5c.html
3333318
210 ../data/repositories/twominutepapers/items/json/_ZLXKt4L-AA.jsongoD36hVVl7M.html
3221765
211 ../data/repositories/twominutepapers/items/json/_ZLXKt4L-AA.jsonGRQuRcpf5Gc.html
3210196
212 ../data/repositories/twominutepapers/items/json/_ZLXKt4L-AA.jsongvjCu7zszbQ.html
3183747
213 ../data/repositories/twominutepapers/items/json/_ZLXKt4L-AA.jsonHA

281 ../data/repositories/twominutepapers/items/json/_ZLXKt4L-AA.jsonknIzDj1Ocoo.html
3092908
error: knIzDj1Ocoo.html
Oops! <class 'AttributeError'> occurred.
'NoneType' object has no attribute 'text'
282 ../data/repositories/twominutepapers/items/json/_ZLXKt4L-AA.jsonkQ2bqz3HPJE.html
3200289
283 ../data/repositories/twominutepapers/items/json/_ZLXKt4L-AA.jsonKs7wDYsN4yM.html
3181443
284 ../data/repositories/twominutepapers/items/json/_ZLXKt4L-AA.jsonksCSL6Ql0Yg.html
3191686
285 ../data/repositories/twominutepapers/items/json/_ZLXKt4L-AA.jsonkwqme8mEgz4.html
3193020
286 ../data/repositories/twominutepapers/items/json/_ZLXKt4L-AA.jsonL7MOeQw47BM.html
3183594
287 ../data/repositories/twominutepapers/items/json/_ZLXKt4L-AA.jsonLB6NGEHtD7Y.html
3159346
288 ../data/repositories/twominutepapers/items/json/_ZLXKt4L-AA.jsonLBezOcnNJ68.html
3212417
289 ../data/repositories/twominutepapers/items/json/_ZLXKt4L-AA.jsonlc93pVlewGM.html
3165798
290 ../data/repositories/twominutepapers/items/json/_ZLX

363 ../data/repositories/twominutepapers/items/json/_ZLXKt4L-AA.jsonpBkFAIUmWu0.html
3324461
364 ../data/repositories/twominutepapers/items/json/_ZLXKt4L-AA.jsonpc_k-sgUYmY.html
3200726
365 ../data/repositories/twominutepapers/items/json/_ZLXKt4L-AA.jsonpjc1QAI6zS0.html
3134376
366 ../data/repositories/twominutepapers/items/json/_ZLXKt4L-AA.jsonPMSV7CjBuZI.html
3204823
367 ../data/repositories/twominutepapers/items/json/_ZLXKt4L-AA.jsonPopg7ej4AUU.html
3121441
error: Popg7ej4AUU.html
Oops! <class 'AttributeError'> occurred.
'NoneType' object has no attribute 'text'
368 ../data/repositories/twominutepapers/items/json/_ZLXKt4L-AA.jsonpQA8Wzt8wdw.html
3131590
369 ../data/repositories/twominutepapers/items/json/_ZLXKt4L-AA.jsonprMk6Znm4Bc.html
3277508
error: prMk6Znm4Bc.html
Oops! <class 'AttributeError'> occurred.
'NoneType' object has no attribute 'text'
370 ../data/repositories/twominutepapers/items/json/_ZLXKt4L-AA.jsonpsOPu3TldgY.html
3184814
371 ../data/repositories/twominutepapers/i

441 ../data/repositories/twominutepapers/items/json/_ZLXKt4L-AA.jsonu7kQ5lNfUfg.html
3208031
442 ../data/repositories/twominutepapers/items/json/_ZLXKt4L-AA.jsonu90TbxK7VEA.html
3147741
443 ../data/repositories/twominutepapers/items/json/_ZLXKt4L-AA.jsonu9kvJbWb_1U.html
3189187
444 ../data/repositories/twominutepapers/items/json/_ZLXKt4L-AA.jsonu9UUWqVquXo.html
3192317
445 ../data/repositories/twominutepapers/items/json/_ZLXKt4L-AA.jsonua8Aaf-XIO8.html
3115866
446 ../data/repositories/twominutepapers/items/json/_ZLXKt4L-AA.jsonUBORpapdAfU.html
3210004
447 ../data/repositories/twominutepapers/items/json/_ZLXKt4L-AA.jsonUcI-RnWzASk.html
3128883
448 ../data/repositories/twominutepapers/items/json/_ZLXKt4L-AA.jsonUEPbzj-ekAI.html
3476545
449 ../data/repositories/twominutepapers/items/json/_ZLXKt4L-AA.jsonUePDRN94C8c.html
3214103
450 ../data/repositories/twominutepapers/items/json/_ZLXKt4L-AA.jsonUGAzi1QBVEg.html
3194144
451 ../data/repositories/twominutepapers/items/json/_ZLXKt4L-AA.jsonug

error: ZaFqvM1IsP8.html
Oops! <class 'AttributeError'> occurred.
'NoneType' object has no attribute 'text'
523 ../data/repositories/twominutepapers/items/json/_ZLXKt4L-AA.jsonZBWTD2aNb_o.html
3265712
524 ../data/repositories/twominutepapers/items/json/_ZLXKt4L-AA.jsonZEjUqZU1hNQ.html
3203619
525 ../data/repositories/twominutepapers/items/json/_ZLXKt4L-AA.jsonZhN5-o397QI.html
3083604
error: ZhN5-o397QI.html
Oops! <class 'AttributeError'> occurred.
'NoneType' object has no attribute 'text'
526 ../data/repositories/twominutepapers/items/json/_ZLXKt4L-AA.jsonZHoNpxUHewQ.html
3169304
527 ../data/repositories/twominutepapers/items/json/_ZLXKt4L-AA.jsonziMHaGQJuSI.html
3215371
528 ../data/repositories/twominutepapers/items/json/_ZLXKt4L-AA.jsonZi_CVTgqJqI.html
3149956
529 ../data/repositories/twominutepapers/items/json/_ZLXKt4L-AA.jsonZj1N4uE1ehk.html
3109404
error: Zj1N4uE1ehk.html
Oops! <class 'AttributeError'> occurred.
'NoneType' object has no attribute 'text'
530 ../data/repositories/two

In [22]:
# scan text for predefined terms

text = 'We use LSTM for anomaly and object detection. As Convolutional Neural Networks are great for ML.'

pd_ml_terms = pd.read_csv('../data/patterns/ml_terms.csv')
ml_terms = pd_ml_terms['Term'].tolist()
ml_slugs = pd_ml_terms['Slug'].tolist()
ml_slugs = [x for x in ml_slugs if str(x) != 'nan']
ml_tags = pd_ml_terms['Tag'].tolist()
ml_tags = [x for x in ml_tags if str(x) != 'nan']

#print(ml_tags)

ml_libs = pd.read_csv('../data/patterns/ml_libraries.csv')
ml_libs = ml_libs['Python Package'].tolist()

def match_text(haystack, needles, toLower = False, unique = True):
    
    if toLower == True:
        haystack = haystack.lower()
        needles = [x.lower() for x in needles]
    
    if unique == True:
        matches = {x for x in needles if x in haystack}
        matches = list(matches)
    else:
        matches = [x for x in needles if x in haystack]
    
    return matches

def match_tags(haystack):
    df = pd.read_csv('../data/patterns/ml_terms.csv')
    tags = []
    
    df.set_index('Term', inplace = True)
    for item in haystack:
        try:
            tag = df.loc[item].get('Tag')
            if not 'nan' in str(tag):
                tags.append(tag)
        except:
            pass
        
    df.set_index('Slug', inplace = True)
    for item in haystack:
        try:
            tag = df.loc[item].get('Tag')
            if not 'nan' in str(tag):
                tags.append(str(tag))
        except:
            pass
        
    #if 'ANN' in tags or 'CNN' in tags or 'RNN' in tags:
    #    tags.remove('NN')
    
    return list(set(tags))

#ml_slugs, ml_terms, ml_libs, match_text(haystack, needles, toLower = False, unique = True)
needles = {
    'ml_slugs': ml_slugs,
    'ml_terms': ml_terms,
    'ml_libs': ml_libs,
}
needles_need_str_lower = {
    'ml_slugs': False,
    'ml_terms': True,
    'ml_libs': False,
}

matches = []

matches.extend(match_text(text, ml_terms, True))
matches.extend(match_text(text, ml_slugs, False))
print('matches', matches)

tags = match_tags(matches)
print('tags', tags)

matches ['neural network', 'anomaly', 'detect', 'convolutional neural network', 'object detection', 'lstm', 'ML']
tags ['LSTM', 'NN', 'CNN', 'Object Detection', 'ML']


In [23]:
# get file modifictaion date
# https://stackoverflow.com/questions/237079/how-to-get-file-creation-modification-date-times-in-python

def creation_date(path_to_file, datetime = True):
    """
    Try to get the date that a file was created, falling back to when it was
    last modified if that isn't possible.
    See http://stackoverflow.com/a/39501288/1709587 for explanation.
    """
    if platform.system() == 'Windows':
        timestamp = os.path.getctime(path_to_file)
    else:
        stat = os.stat(path_to_file)
        try:
            timestamp = stat.st_birthtime
        except AttributeError:
            # We're probably on Linux. No easy way to get creation dates here,
            # so we'll settle for when its content was last modified.
            timestamp = stat.st_mtime
        
    if datetime == True:
        timestamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(timestamp))

    return timestamp
    
folder_base = '../data/repositories/kaggle/competitions/c/'
folder = '3d-object-detection-for-autonomous-vehicles/notebooks/asimandia/lyft3d-inference-kernel/'
notebook = 'notebook_02.html'
kernel = 'kernel.html'
print(creation_date(folder_base+folder+notebook))

2020-12-12 16:08:03


In [68]:
# clear text formatting
import re
def clear_text(s):
    s = s.replace('\n',' ').replace('\r','').replace('¶','').strip()
    s = s.replace('_',' ')
    s = re.sub("\s\s+" , " ", s)
    s = s.encode('ascii', 'ignore').decode('utf-8', 'ignore').strip()
    return s

print(clear_text("The   fox jumped   over    the log."))

The fox jumped over the log.


In [122]:
# throw all parsed meta-data together in a single csv
# select only true ML cases

path_in = '../data/repositories/twominutepapers/items/json/'
csv_out = '../data/database/twominutepapers_01_original.csv'

base_url = 'https://www.youtube.com/watch?v='

quit = 0 # quit after n files processed / 0 ... no limit

files = os.listdir(path_in)
print('files:', len(files))
i = 0

runtime_start = time.time()
df = pd.DataFrame()

for file in files:
    #print('#', i, file)
    path = os.path.join(path_in, file)
    i += 1
    store = True

    #print(' - ', j, 'author:', author, '/ notebook:', notebook)
    data = load_data(path, fromJson=True)
    #print(data)
    
    data['url'] = base_url + data['url']
    
    # shorten description
    data['description'] = clear_text(data['description'])
    data['description'] = data['description'].split('WE WOULD LIKE TO THANK')[0].strip()
    data['description'] = data['description'].split('links: Patreon')[0].strip()
    data['description'] = data['description'].split('Patreon')[0].strip()
    data['description'] = data['description'].split('Subscribe')[0].strip()
    
    # reformat date
    date = data['date'].replace('Premiere am','').split('.')
    date = [x.strip() for x in date]
    data['date'] = '-'.join(reversed(date))
    data['date_scraped'] = '2021-02-01'
    data['score_likes'] = score(data['likes'] - data['dislikes'])
    data['score_views'] = score(data['views'])
    
    # get terms and slugs
    data['ml_terms'] = match_text(data['description'], ml_terms, True)
    data['ml_slugs'] = match_text(data['description'], ml_slugs, False)

    data['tags'] = match_tags(data['ml_terms'] + data['ml_slugs'])
     
    # until 2018 there were several types of videos
    # we are only interested in "Two Minute Papers"
    if int(date[0]) < 2018:
        if not 'Two Minute Papers' in data['title']:
            store = False
    
    data['title'] = data['title'].split('| Two Minute Papers')[0].strip()
    
    # count words
    words = data['description'].split(' ')
    data['words'] = len(words)
    
    # remove items with missing topic
    if data['topic'] == '':
        #store = False
        pass
    
    # remove items with missing paper
    if not 'paper' in data['description'].lower():
        store = False
        #data['paper'] = False
        #pass
    else:
        #data['paper'] = True
        pass
    
    '''
    data['text'] = clear_text(data['text'])
    
    # store only items with:
    # - ml_score >= 0.5
    if data['ml_score'] >= 0.5 and len(data['ml_libs']) > 0:
        if not 'all-machine-learning-algorithms-explained' in data['link']:
            df = df.append(data, ignore_index=True)
            #print(data['link'], 'is ML use case')
    '''
    
    if store:
        df = df.append(data, ignore_index=True)

    if quit!=0 and j>quit:
        break
        
# drop duplicates
#df = df.drop_duplicates(['link'])

# drop columns
#df.drop(columns=['description_raw'], inplace=True)

runtime_end = time.time()
print('runtime:', round(runtime_end - runtime_start, 3), 'seconds for', i, 'items')
print(df.shape)
print(df.head())

df.to_csv(csv_out, sep=';', index=False)
print('done')

files: 489
runtime: 3.005 seconds for 489 items
(229, 16)
         date date_scraped                                        description  \
0  2017-02-08   2021-02-01  Our Twitter feed is available here: https://tw...   
1  2016-02-28   2021-02-01  Image and color editing is an actively researc...   
2  2015-08-29   2021-02-01  Artificial neural networks were inspired by th...   
3  2016-05-25   2021-02-01  The paper "Surface-Only Liquids" is available ...   
4  2015-11-29   2021-02-01  Humanity is getting closer and closer to creat...   

   dislikes   likes                                              links  \
0       4.0   628.0  [http://gaps-zju.org/mlchai/resources/qin2014c...   
1       0.0    94.0  [http://gfx.cs.princeton.edu/pubs/Chang_2015_P...   
2      21.0  1990.0  [http://arxiv.org/abs/1508.06576v1, http://mas...   
3       3.0   324.0  [http://www.cs.columbia.edu/cg/surfaceliquids/...   
4      41.0   911.0  [http://waitbutwhy.com/2015/01/artificial-inte...   

  ml_slugs

In [63]:
# analyze links
#steps = 50
#df['words_round'] = [round(x/steps,0)*steps+steps for x in df['words']]
from urllib.parse import urlparse

data = []
for item in df['links']:
    if isinstance(item, list):
        data.extend(item)
    else:
        data.append(item)
        
data = list(set(data))
data = [urlparse(x).netloc for x in data]
data = list(set(data))

print(json.dumps(data, indent=2))

[
  "lightrig.de",
  "universe.openai.com",
  "belcour.github.io",
  "hi.cs.waseda.ac.jp:8081",
  "scholar.google.hu",
  "wwwcg.in.tum.de",
  "stanfordmlgroup.github.io",
  "tcwang0509.github.io",
  "www.cs.jhu.edu",
  "link.springer.com",
  "codewords.recurse.com",
  "www.wolframalpha.com",
  "tinyclouds.org",
  "www.luxrender.net",
  "yann.lecun.com",
  "swanintelligence.com",
  "jiaxianyao.github.io",
  "cs.stanford.edu",
  "www.deeplearningbook.org",
  "www.care2.com",
  "pjreddie.com",
  "www.cs.columbia.edu",
  "thatsmaths.com",
  "vc.cs.ovgu.de",
  "forum.computerschach.de",
  "www.idsc.ethz.ch",
  "ieeexplore.ieee.org",
  "playground.tensorflow.org",
  "aaronsplace.co.uk",
  "www.wired.com",
  "research.edm.uhasselt.be",
  "work.caltech.edu",
  "ge.in.tum.de",
  "www.hao-li.com",
  "audionautix.com",
  "neuralnetworksanddeeplearning.com",
  "gitxiv.com",
  "blog.mrtz.org",
  "www.cg.tuwien.ac.at",
  "luboslenco.com",
  "gamma.cs.unc.edu",
  "deepart.io",
  "wxs.ca",
  "www.natu