In [1]:
# imports
import requests
import json
import os
import time
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options

In [2]:
# generic url scraper
def url_scraper(url,
                selenium = False,
                windowSize = "1280,720",
                headless = True,
                quitOnEnd = True,
                waitForElement = '',
                waitForId = '',
                waitForClass = '',
                pressLink = None,
                waitBetweenPress = 30,
                waitTimeout = 1000,
               ):
    """Fetch a page and return its HTML together with a small meta dict.

    Two modes are supported:

    * ``selenium`` falsy: a plain ``requests.get``. ``html`` is the response
      body as returned by ``page.content`` and ``meta['status']`` is the HTTP
      status code.
    * ``selenium`` truthy: the page is opened in Chrome through the
      chromedriver at ``./.selenium/chromedriver``. ``html`` is
      ``driver.page_source`` and ``meta['status']`` is set to 0 (no HTTP
      status is read from the driver here).

    Parameters
    ----------
    url : str
        Address to load.
    selenium : bool
        Use the browser-driven mode instead of ``requests``.
    windowSize : str
        Value passed to Chrome's ``--window-size`` argument ("width,height").
    headless : bool
        Add Chrome's ``--headless`` argument.
    quitOnEnd : bool
        Shut the browser down before returning (also when an exception is
        raised). When False the browser is left running.
    waitForElement, waitForId, waitForClass : str
        If non-empty, wait until an element with that tag name / id / class
        name is visible before continuing. A timeout is reported with a
        printed message and does not abort the scrape.
    pressLink : list of str, optional
        Link texts to click one after another once the waits are done.
    waitBetweenPress : int
        Seconds passed to ``driver.implicitly_wait`` before links are pressed.
    waitTimeout : int
        Upper bound in seconds for every explicit wait.

    Returns
    -------
    dict
        ``{'meta': {'url': ..., 'status': ...}, 'html': ...}``
    """
    meta = {
        'url': url
    }

    # simple version
    if not selenium:
        page = requests.get(url)
        meta['status'] = page.status_code
        return {'meta': meta, 'html': page.content}

    # selenium version
    CHROMEDRIVER_PATH = './.selenium/chromedriver'

    chrome_options = Options()
    if headless:
        chrome_options.add_argument("--headless")
    chrome_options.add_argument("--window-size=%s" % windowSize)

    driver = webdriver.Chrome(executable_path=CHROMEDRIVER_PATH, options=chrome_options)
    try:
        driver.get(url)

        # wait for javascript generated content: tag name, then id, then class
        wait_targets = (
            (By.TAG_NAME, waitForElement),
            (By.ID, waitForId),
            (By.CLASS_NAME, waitForClass),
        )
        for locate_by, locate_value in wait_targets:
            if not locate_value:
                continue
            try:
                WebDriverWait(driver, waitTimeout).until(EC.visibility_of_element_located((locate_by, locate_value)))
                print ("Page is ready!")
            except TimeoutException:
                print ("Loading took too much time!")

        # press link
        if pressLink:
            driver.implicitly_wait(waitBetweenPress)
            try:
                for link in pressLink:
                    print(link)
                    element = WebDriverWait(driver, waitTimeout).until(EC.visibility_of_element_located((By.LINK_TEXT, link)))
                    element.click()
            except TimeoutException:
                print ("Loading took too much time!")

        html = driver.page_source
    finally:
        # quit() ends the whole browser session; close() would only close the
        # current window. Running this in `finally` also releases the browser
        # when a click or page load raises.
        if quitOnEnd:
            driver.quit()

    meta['status'] = 0
    return {'meta': meta, 'html': html }

# Try the scraper on the start page: open a visible browser, wait for the
# viewer header, press the pagination links "2" and "3", keep the browser open.
url = 'https://mlart.co/'
result = url_scraper(
    url,
    selenium=True,
    waitForClass='sv-viewer-header',
    headless=False,
    quitOnEnd=False,
    pressLink=['2', '3'],
)
#print(result['html'])

Page is ready!
2
3


In [3]:
# generic store data to file function
def store_data(data, file, mode='w', toJson=False):
    """Write ``data`` to ``file`` as UTF-8 text.

    Parameters
    ----------
    data : str or JSON-serialisable object
        Text to write. With ``toJson=True`` the object is serialised with
        ``json.dumps`` first.
    file : str
        Target path. Missing parent directories are created.
    mode : str
        Mode passed to ``open`` (``'w'`` overwrites, ``'a'`` appends).
    toJson : bool
        Serialise ``data`` to a JSON string before writing.

    Returns
    -------
    int
        The value returned by the file object's ``write`` call (number of
        characters written).
    """
    if toJson:
        data = json.dumps(data)

    # A missing target folder used to abort the write with FileNotFoundError.
    parent = os.path.dirname(file)
    if parent:
        os.makedirs(parent, exist_ok=True)

    with open(file, mode, encoding='utf-8') as fp:
        return fp.write(data)
    
# generic load data from file function
def load_data(file, fromJson=False):
    """Read ``file`` as UTF-8 text and return its content.

    Undecodable bytes are dropped (``errors="ignore"``). With
    ``fromJson=True`` the text is parsed with ``json.loads`` and the parsed
    object is returned instead. If ``file`` is not an existing regular file,
    the string ``'file not found'`` is returned rather than raising.
    """
    if not os.path.isfile(file):
        return 'file not found'

    with open(file, 'r', encoding='utf-8', errors="ignore") as handle:
        text = handle.read()

    return json.loads(text) if fromJson else text

# round-trip check: plain text first, then JSON
# (each pass prints store_data's return value followed by what load_data reads back)
for payload, target, as_json in (
    ('Hello', '../data/repositories/mlart/test.txt', False),
    ({'msg':'Hello World'}, '../data/repositories/mlart/test.json', True),
):
    print(store_data(payload, target, toJson=as_json))
    print(load_data(target, fromJson=as_json))

5
Hello
22
{'msg': 'Hello World'}


In [4]:
# get all sites from mlart.co and store them for later
# Page through the start page by pressing the pagination links "2".."i".
# NOTE: the recorded run of this cell stopped with an
# ElementClickInterceptedException while clicking link "3".
url = 'https://mlart.co/'
for i in range(4, 5):
    print(i)
    steps = list(map(str, range(2, i + 1)))
    print(steps)
    result = url_scraper(
        url,
        selenium=True,
        waitForClass='sv-viewer-header',
        headless=False,
        quitOnEnd=False,
        pressLink=steps,
        waitBetweenPress=60,
    )
    print(result['html'])


4
['2', '3', '4']
Page is ready!
2
3


ElementClickInterceptedException: Message: element click intercepted: Element <a data-v-8296...a40="" tabindex="0">3</a> is not clickable at point (645, 571). Other element would receive the click: <div class="sv-tiles-list" theme="sv-is-dark">...</div>
  (Session info: chrome=87.0.4280.66)


In [5]:
# writing a good automated scraper for mlart.co is hard:
# navigating between pages can only be done by clicking the next pagination item,
# so for now I manually load the html content of each page and store it

In [6]:
# scrape projects from html-code

def mlart_scrape_item_links(html):
    """Return the ``href`` of every ``<a class="sv-tile__slider">`` in ``html``.

    The markup is parsed with BeautifulSoup's ``html.parser``; the hrefs are
    returned in document order (``None`` for an anchor without ``href``).
    """
    soup = BeautifulSoup(html, 'html.parser')
    tile_anchors = soup.find_all('a', class_='sv-tile__slider')
    return [anchor.get('href') for anchor in tile_anchors]

#mlart_scrape_item_links(result['html'])

# Gather the item links from the manually saved index pages
# mlart-01.html ... mlart-17.html and persist them as JSON.
result = []
for i in range(1, 18):
    selector = '%02d' % i
    print(selector)

    html = load_data('../data/repositories/mlart/index/mlart-{}.html'.format(selector))
    result += mlart_scrape_item_links(html)

print(result)
print(len(result))

store_data(result, '../data/repositories/mlart/items.json', toJson=True)

01
02
03
04
05
06
07
08
09
10
11
12
13
14
15
16
17
['/item/neural-ca-based-experience_-highlighting-the-relation-between-individual-cells-and-their-collective', '/item/a-collaborative-fiction-with-gpt-3_-video_-personas_-and-voices-are-all-generated', '/item/an-interactive_-gradient-based-audio-and-video-alignment-of-thousands-of-covers-of-billie-eilish_s-_bad-guy', '/item/train-an-lstm-on-sacred-texts_-use-voice-synthesis-to-play-the-generated-scriptures-on-unsecured-surveillance-cameras-with-speakers', '/item/painting-by-viktor-vasnetsov_-animated-with-_animating-landscape_-and-multi-domain-multi-modality-i2i-translation', '/item/using-posenet-to-create-art-using-your-body-parts-like-eyes_-hands_-chest-and-knees', '/item/stylegan-interpolation_-trained-on-karl-blossfeldt_s-herbarium-_1928', '/item/an-interactive-demo-to-type-with-stylegan-generated-letterforms', '/item/generated-choreography-with-vaes-trained-on-high-quality-motion-capture-data-of-improvisational-dance', '/item/using

34200

In [7]:
# reload from file
# read the stored item links back in and report how many there are
mlart_items = load_data(
    '../data/repositories/mlart/items.json',
    fromJson=True,
)
print(len(mlart_items))

389


In [8]:
# load some item details
# two sample item pages; only url2 is fetched below
url1 = 'https://mlart.co/item/neural-ca-based-experience_-highlighting-the-relation-between-individual-cells-and-their-collective'
url2 = 'https://mlart.co/item/a-collaborative-fiction-with-gpt-3_-video_-personas_-and-voices-are-all-generated'

# the call-to-action button is used as the "page has rendered" signal
result = url_scraper(
    url2,
    selenium=True,
    waitForClass='sv-product__cta-btn',
)
print(result)

Page is ready!
{'meta': {'url': 'https://mlart.co/item/a-collaborative-fiction-with-gpt-3_-video_-personas_-and-voices-are-all-generated', 'status': 0}, 'html': '<html lang="en"><head><meta charset="utf-8"><meta http-equiv="X-UA-Compatible" content="IE=edge"><meta name="viewport" content="width=device-width,initial-scale=1"><meta name="host" content=""><title>Lisha Li, Vladimir Glazachev, Dzmitry Pletnikau, Eric Liu, Jessica Hu</title><link href="/css/index.css" rel="preload" as="style"><link href="/js/index.js" rel="preload" as="script"><link href="/css/index.css" rel="stylesheet"><style type="text/css">a[data-v-82963a40]{cursor:pointer}</style><script type="text/javascript" async="" src="https://www.google-analytics.com/analytics.js"></script><script async="" src="https://www.googletagmanager.com/gtm.js?id=GTM-PTWX9Z9"></script><script async="" src="https://static.mailerlite.com/js/universal.js?v1606893"></script><script async="true" id="xo-pptm" src="https://www.paypal.com/tagmanage

In [15]:
# scrape project-details from html-code

def mlart_scrape_details(html):
    """Extract the project details from the HTML of one item page.

    Parameters
    ----------
    html : str
        Markup of an item page; parsed with BeautifulSoup's ``html.parser``.

    Returns
    -------
    dict
        ``'title'``, ``'subtitle'`` and ``'link'`` (text of the first
        ``h1.sv-product__title`` / ``h2.sv-product__subtitle`` and the href of
        the first ``a.sv-product__cta-btn``), followed by one entry per spec
        row. A spec key is the text of a ``span.sv-product__spec-name`` with
        colons removed; its value is the stripped text of the matching
        ``span.sv-product__spec-value``, split into a list when it contains
        ``', '``.

        Elements that are absent yield ``None`` instead of raising
        ``IndexError``, so one incomplete page does not abort a whole batch.
    """
    soup = BeautifulSoup(html, 'html.parser')

    # find() returns the first match or None, unlike find_all(...)[0]
    title_tag = soup.find('h1', class_='sv-product__title')
    subtitle_tag = soup.find('h2', class_='sv-product__subtitle')
    link_tag = soup.find('a', class_='sv-product__cta-btn')

    result = {
        'title': title_tag.text.strip() if title_tag is not None else None,
        'subtitle': subtitle_tag.text.strip() if subtitle_tag is not None else None,
        'link': link_tag.get('href') if link_tag is not None else None,
    }

    specs_names = soup.select('span.sv-product__spec-name')
    specs_values = soup.select('span.sv-product__spec-value')

    # names and values are paired by position; zip stops at the shorter list
    for name_tag, value_tag in zip(specs_names, specs_values):
        key = name_tag.text.replace(':','')

        # the value text normally sits in a nested <span>; fall back to the
        # outer element when there is none
        inner_spans = value_tag.select('span')
        text_source = inner_spans[0] if inner_spans else value_tag
        val = text_source.text.strip()

        if ', ' in val:
            val = val.split(', ')
        result[key] = val

    return result

# Quick check of the parser: expects `result` to be a url_scraper() return
# value (a dict with an 'html' key) left over from an earlier cell.
print(mlart_scrape_details(result['html']))

{'title': 'Lisha Li, Vladimir Glazachev, Dzmitry Pletnikau, Eric Liu, Jessica Hu', 'subtitle': 'A collaborative fiction with GPT-3. Video, personas, and voices are all generated', 'link': 'https://rosebud.ai/humansofai', 'Theme': ['Narrative', 'Stories', 'Collaborative Fiction'], 'Medium': 'Video', 'Technology': ['GAN', 'Few Shot Animation', 'Text To Speech', 'GPT'], 'Days Since Featured': '8', 'Date': '11/01/2020'}


In [10]:
# traverse the items and download html

url_base = 'https://mlart.co'
store_raw_path = '../data/repositories/mlart/items/'

# `i` counts the items handled so far; the loop stops after 400 of them
i = 0
for elem in mlart_items:
    slug = elem.replace('/item/', '')
    url = url_base + elem
    fp = store_raw_path + slug + '.html'
    print(i, url)

    if not os.path.isfile(fp):
        print('starting download...')

        # render the page in the browser, then keep the raw html on disk
        response = url_scraper(url, selenium=True, waitForClass='sv-product__cta-btn')
        store_data(response['html'], fp)
    else:
        # a stored copy exists, nothing to fetch
        print('already downloaded')

    i += 1

    if i == 400:
        print('Forced Quit @ line:', i)
        break

0 https://mlart.co/item/neural-ca-based-experience_-highlighting-the-relation-between-individual-cells-and-their-collective
already downloaded
1 https://mlart.co/item/a-collaborative-fiction-with-gpt-3_-video_-personas_-and-voices-are-all-generated
already downloaded
2 https://mlart.co/item/an-interactive_-gradient-based-audio-and-video-alignment-of-thousands-of-covers-of-billie-eilish_s-_bad-guy
already downloaded
3 https://mlart.co/item/train-an-lstm-on-sacred-texts_-use-voice-synthesis-to-play-the-generated-scriptures-on-unsecured-surveillance-cameras-with-speakers
already downloaded
4 https://mlart.co/item/painting-by-viktor-vasnetsov_-animated-with-_animating-landscape_-and-multi-domain-multi-modality-i2i-translation
already downloaded
5 https://mlart.co/item/using-posenet-to-create-art-using-your-body-parts-like-eyes_-hands_-chest-and-knees
already downloaded
6 https://mlart.co/item/stylegan-interpolation_-trained-on-karl-blossfeldt_s-herbarium-_1928
already downloaded
7 https://

In [17]:
# traverse the items and scrape data

url_base = 'https://mlart.co'
store_raw_path = '../data/repositories/mlart/items/'
#store_csv_path = '../data/repositories/mlart/mlart.csv'
store_csv_path = '../data/database/mlart_01_original.csv'

# Rows are collected in a plain list and turned into a DataFrame once at the
# end: DataFrame.append copied the whole frame on every call and no longer
# exists in pandas 2.x.
records = []

for i, elem in enumerate(mlart_items):
    url = url_base + elem
    fp = store_raw_path + elem.replace('/item/','') + '.html'
    print(i, url)
    #print(fp)

    if os.path.isfile(fp):
        # reuse the stored copy of the page
        print('already downloaded')
        html = load_data(fp)

    else:
        print('starting download...')

        # load page
        response = url_scraper(url, selenium=True, waitForClass = 'sv-product__cta-btn')

        # save page
        store_data(response['html'], fp)

        html = response['html']

    # one row per item: its url plus everything parsed from the page
    data = {'url': url}
    data.update(mlart_scrape_details(html))
    records.append(data)

    # safety stop for very long item lists
    if i >= 1000:
        print('Forced Quit @ line:', i)
        break

# columns are the union of all keys seen; items lacking a key get NaN there
df = pd.DataFrame(records)
df.to_csv(store_csv_path)
print(df.head())

0 https://mlart.co/item/neural-ca-based-experience_-highlighting-the-relation-between-individual-cells-and-their-collective
already downloaded
1 https://mlart.co/item/a-collaborative-fiction-with-gpt-3_-video_-personas_-and-voices-are-all-generated
already downloaded
2 https://mlart.co/item/an-interactive_-gradient-based-audio-and-video-alignment-of-thousands-of-covers-of-billie-eilish_s-_bad-guy
already downloaded
3 https://mlart.co/item/train-an-lstm-on-sacred-texts_-use-voice-synthesis-to-play-the-generated-scriptures-on-unsecured-surveillance-cameras-with-speakers
already downloaded
4 https://mlart.co/item/painting-by-viktor-vasnetsov_-animated-with-_animating-landscape_-and-multi-domain-multi-modality-i2i-translation
already downloaded
5 https://mlart.co/item/using-posenet-to-create-art-using-your-body-parts-like-eyes_-hands_-chest-and-knees
already downloaded
6 https://mlart.co/item/stylegan-interpolation_-trained-on-karl-blossfeldt_s-herbarium-_1928
already downloaded
7 https://

59 https://mlart.co/item/gan-interpolation
already downloaded
60 https://mlart.co/item/gan-generated-music-video
already downloaded
61 https://mlart.co/item/using-artbreeder-to-translate-statues-of-roman-emperors-to-latent-vectors_-and-adjusting-the-vector-to-make-them-real
already downloaded
62 https://mlart.co/item/gan-trained-on-personal-pictures-of-leafs
already downloaded
63 https://mlart.co/item/stylegan-trained-on-plasma-photos_-displayed-in-a-collage
already downloaded
64 https://mlart.co/item/video-transferred-into-next-frame-predictions-via-a-conditional-gan
already downloaded
65 https://mlart.co/item/train-a-stylegan-on-nature-images-and-interpolate-the-model-inside-of-a-vr-cinema
already downloaded
66 https://mlart.co/item/a-stylegan-trained-on-images-from-mars-mro-and-interpolated-frame-by-frame-to-create-a-3d-effect
already downloaded
67 https://mlart.co/item/display-installation-with-gan-interpolations-with-photos-of-eyes-with-makeup
already downloaded
68 https://mlart.c

127 https://mlart.co/item/dj-video-with-video-texture-advection-with-flipfluids-and-optical-flow-based-style-transfer
already downloaded
128 https://mlart.co/item/translate-a-painting-to-a-real-world-photograph-with-gaugan
already downloaded
129 https://mlart.co/item/use-photogrammetry-to-extract-vector-points-from-images_-and-apply-a-optical-flow-based-styletransfer-with-noj-barke_s-dot-paintings
already downloaded
130 https://mlart.co/item/start-with-a-video-clip-of-a-rollercoaster-and-then-predict-the-next-frame-with-a-pix2pix-model_-feeding-the-previous-frame
already downloaded
131 https://mlart.co/item/gan-generated-japanese-wood-prints_-and-then-print-them-using-the-same-technique
already downloaded
132 https://mlart.co/item/use-gpt-3-to-create-a-text-adventure
already downloaded
133 https://mlart.co/item/tom-cruise-deep-fake
already downloaded
134 https://mlart.co/item/extract-face-markers-with-a-camera-and-use-a-pix2pix-model-to-translate-the-face-markers-to-paintings
already d

197 https://mlart.co/item/reverse-super-resolution-by-turning-high-res-images-into-low-res
already downloaded
198 https://mlart.co/item/t-sne-exploration-in-vr
already downloaded
199 https://mlart.co/item/deepdream-applied-to-klimt_s-adele-bloch-bauer
already downloaded
200 https://mlart.co/item/an-rnn-analysis-sound-data-to-control-a-video-game
already downloaded
201 https://mlart.co/item/blend-two-melodies-using-the-latent-vectors-from-a-vae
already downloaded
202 https://mlart.co/item/blend-several-beats-melodies-by-mixing-the-latent-vector-of-midi-files
already downloaded
203 https://mlart.co/item/use-a-camera-to-find-objects-and-a-cnn-identifies-if-it_s-correct
already downloaded
204 https://mlart.co/item/an-rnn-that-generates-knitting-instructions_-then-a-knitter-follows-the-instructions
already downloaded
205 https://mlart.co/item/a-real-time-implementation-of-sketch-rnn-which-predicts-future-strokes-while-you-draw
already downloaded
206 https://mlart.co/item/word-association-ga

271 https://mlart.co/item/light-projections-on-a-sculpture-based-on-facial-expressions
already downloaded
272 https://mlart.co/item/train-gan-on-maps-and-project-on-naked-bodies
already downloaded
273 https://mlart.co/item/visualize-how-variables-in-a-music-vae-are-updated-on-a-2d-surface
already downloaded
274 https://mlart.co/item/create-a-video-with-a-conditional-gan-using-next-frame-predictions
already downloaded
275 https://mlart.co/item/generate-lyrics-with-an-vae-trained-with-seven-different-styles
already downloaded
276 https://mlart.co/item/translate-online-maps-to-19th-century-maps-in-real-time-with-cyclegan
already downloaded
277 https://mlart.co/item/vae-generated-3d-point-clouds-for-different-objects_-interpolated-with-a-midi-controller
already downloaded
278 https://mlart.co/item/extract-patches-from-an-image-to-generate-a-texture-with-a-gan-discriminator
already downloaded
279 https://mlart.co/item/rnn-generated-fake-articles-seeded-by-real-world-article-titles
already d

339 https://mlart.co/item/translate-image-features-from-a-graphical-user-interface-to-rnn-generated-website
already downloaded
340 https://mlart.co/item/use-a-cnn-to-classify-low-fidelity-wireframes-to-mobile-app-components
already downloaded
341 https://mlart.co/item/create-a-texture-by-combining-two-images-with-style-transfer
already downloaded
342 https://mlart.co/item/deleting-weights-in-gans-to-create-glitches
already downloaded
343 https://mlart.co/item/extract-face-markers-and-use-a-pix2pix-model-to-translate-the-face-markers-into-paintings
already downloaded
344 https://mlart.co/item/generate-a-pose_-use-a-pix2pix-model-to-turn-the-pose-into-an-artwork-and-then-paint-it-live
already downloaded
345 https://mlart.co/item/create-a-deep-fake-with-kellyanne-conway_s-by-translating-face-markers-into-a-video-of-a-person-speaking
already downloaded
346 https://mlart.co/item/organise-personal-art-collection-with-t-sne_-compose-music-according-to-each-clusters_-and-detect-each-clusters-w