In [1]:
# imports
import requests
import json
import os
import time
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options

In [2]:
# generic url scraper
def url_scraper(url,
                selenium = False,
                windowSize = "1280,720",
                headless = True,
                quitOnEnd = True,
                waitForElement = '',
                waitForId = '',
                waitForClass = '',
                pressLink = [],
                waitBetweenPress = 30,
                waitTimeout = 1000,
               ):
    """Fetch a page and return its HTML together with a small meta dict.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    selenium : bool
        Falsy: plain ``requests.get``. Truthy: load the page in Chrome via
        selenium so that javascript-generated content can be waited for.
    windowSize : str
        Passed to Chrome as ``--window-size=<windowSize>``.
    headless : bool
        Add the ``--headless`` Chrome argument when true.
    quitOnEnd : bool
        Shut the browser down before returning (also when an error occurs).
        When false the browser is left running.
    waitForElement, waitForId, waitForClass : str
        Tag name / element id / class name to wait for (visibility) after
        the page was requested. Empty string disables the respective wait.
    pressLink : list of str
        Link texts to click, in order. The default list is never mutated.
    waitBetweenPress : int
        Value handed to ``driver.implicitly_wait`` before links are clicked.
    waitTimeout : int
        Upper bound in seconds for every explicit wait.

    Returns
    -------
    dict
        ``{'meta': {'url': ..., 'status': ...}, 'html': ...}``. In the simple
        version ``status`` is the HTTP status code and ``html`` is
        ``page.content``; in the selenium version ``status`` is always 0 and
        ``html`` is ``driver.page_source``.
    """
    meta = {
        'url': url
    }

    # simple version
    if not selenium:
        page = requests.get(url)
        meta['status'] = page.status_code
        return {'meta': meta, 'html': page.content }

    # selenium version
    CHROMEDRIVER_PATH = './.selenium/chromedriver'

    chrome_options = Options()
    if headless:
        chrome_options.add_argument("--headless")
    chrome_options.add_argument("--window-size=%s" % windowSize)

    # NOTE(review): `executable_path` is the older selenium calling style;
    # newer releases expect a Service object - confirm against the installed version.
    driver = webdriver.Chrome(executable_path=CHROMEDRIVER_PATH, options=chrome_options)
    try:
        driver.get(url)

        # wait for a given tag / ID / class to be loaded (javascript generated code)
        locators = (
            (By.TAG_NAME, waitForElement),
            (By.ID, waitForId),
            (By.CLASS_NAME, waitForClass),
        )
        for strategy, value in locators:
            if value != '':
                try:
                    WebDriverWait(driver, waitTimeout).until(EC.visibility_of_element_located((strategy, value)))
                    print ("Page is ready!")
                except TimeoutException:
                    # best effort: keep going and return whatever has been rendered
                    print ("Loading took too much time!")

        # press link
        if len(pressLink)>0:
            # NOTE(review): this sets an implicit wait next to the explicit waits
            # below; the selenium docs advise against mixing the two.
            driver.implicitly_wait(waitBetweenPress)
            try:
                for link in pressLink:
                    print(link)
                    element = WebDriverWait(driver, waitTimeout).until(EC.visibility_of_element_located((By.LINK_TEXT, link)))
                    element.click()
            except TimeoutException:
                # the first link that does not show up aborts the remaining clicks
                print ("Loading took too much time!")

        html = driver.page_source
    finally:
        # quit() ends the whole browser session, close() would only close the
        # window; the finally makes sure this also happens when loading fails
        if quitOnEnd:
            driver.quit()

    meta['status'] = 0
    return {'meta': meta, 'html': html }

# smoke test: load one dataset page in a visible browser window
# (quitOnEnd = False leaves that window open for manual inspection)
url = 'https://www.kaggle.com'
link = '/austinreese/craigslist-carstrucks-data'
result = url_scraper(url+link, selenium=True, waitForClass='content-box', headless = False, quitOnEnd = False)
# only preview the beginning; the complete page source floods the notebook output
print(result['html'][:1000])

Page is ready!
    
    
    
    
    
        
        
    
    <script type="text/javascript" async="" src="https://www.google-analytics.com/gtm/js?id=GTM-52LNT9S&amp;t=gtag_UA_12629138_1&amp;cid=577171676.1607274543" nonce=""></script><script src="https://apis.google.com/_/scs/apps-static/_/js/k=oz.gapi.de.PjifxZUegZw.O/m=auth2,client/rt=j/sv=1/d=1/ed=1/am=wQE/rs=AGLTcCPka5e4c91jDFgpgP9kBnrqbmjxfA/cb=gapi.loaded_0" nonce="" async=""></script><script type="text/javascript" async="" src="https://www.google-analytics.com/analytics.js" nonce=""></script><script nonce="" type="text/javascript">
        if ('serviceWorker' in navigator) {
            navigator.serviceWorker.getRegistrations()
                .then(function(registrations) {
                    for (let registration of registrations) {
                        registration.unregister();
                    }
                })
                .catch(function(err) {
                    console.error("Service worker unregist

In [3]:
# generic store data to file function
def store_data(data, file, mode='w', toJson=False):
    """Write `data` to `file` as UTF-8 text and return what `write` returns.

    With `toJson` set, `data` is serialised with `json.dumps` first;
    otherwise it is written as given. `mode` is passed to `open`.
    """
    payload = json.dumps(data) if toJson else data
    with open(file, mode, encoding='utf-8') as handle:
        return handle.write(payload)
    
# generic load data from file function
def load_data(file, fromJson=False):
    """Read `file` as UTF-8 text, dropping bytes that cannot be decoded.

    Returns the text, or the result of `json.loads` on it when `fromJson`
    is set. If `file` is not an existing regular file, the string
    'file not found' is returned instead.
    """
    if not os.path.isfile(file):
        return 'file not found'

    with open(file, 'r', encoding='utf-8', errors="ignore") as handle:
        text = handle.read()

    return json.loads(text) if fromJson else text

# round trip with plain text
_text_file = '../data/repositories/mlart/test.txt'
print(store_data('Hello', _text_file))
print(load_data(_text_file))

# round trip with json
_json_file = '../data/repositories/mlart/test.json'
print(store_data({'msg':'Hello World'}, _json_file, toJson=True))
print(load_data(_json_file, fromJson=True))

5
Hello
22
{'msg': 'Hello World'}


In [4]:
# helper function to create folder create_folder
def create_folder(path):
    """Make sure the directory part of `path` exists.

    Only `os.path.dirname(path)` is created, so pass either a file path or
    a folder path with a trailing separator. Missing parent folders are
    created as well. Nothing happens when `path` has no directory part or
    when something already exists at that location.
    """
    directory = os.path.dirname(path)
    if not directory or os.path.exists(directory):
        return
    # exist_ok guards against the race where the folder appears between the
    # existence check above and this call
    os.makedirs(directory, exist_ok=True)
    print(path + ' created')

In [15]:
# get all sites from csv and store them for later feature scraping

# base url is set here as well, so this cell does not depend on the scraper
# demo cell having been executed before
url = 'https://www.kaggle.com'
csv = '../data/repositories/kaggle/kaggle_index.csv'
folder = '../data/repositories/kaggle/datasets'

# debugging aid: set to True to stop after the first two datasets
force_break = False

df = pd.read_csv(csv)

for i, link in enumerate(df['link']):
    print('###')
    print(i, link)
    # links look like '/<author>/<dataset-name>'
    parts = link.split('/')
    author = parts[1]
    name = parts[2]
    print(author, name)

    # one folder per dataset; the trailing separator makes create_folder
    # create the dataset folder itself (and the author folder above it)
    dataset_dir = os.path.join(folder, author, name)
    create_folder(os.path.join(dataset_dir, ''))

    # dataset page: scrape once, afterwards reuse the stored copy
    file = os.path.join(dataset_dir, 'dataset.html')
    if not os.path.isfile(file):
        result = url_scraper(url+link, selenium=True, waitForClass='content-box', headless = True)
        html = result['html']
        store_data(html, file)
    else:
        print('dataset.html already exists')
        html = load_data(file)

    # let's take a look, if there are notebooks for this dataset
    # notebooks make datasets ML use cases
    soup = BeautifulSoup(html, 'html.parser')
    counter = soup.select('span.pageheader__link-count > span')
    notebooks = int(counter[0].text.replace(',','')) if counter else 0
    print('notebooks:', notebooks)

    if notebooks > 0:
        file = os.path.join(dataset_dir, 'notebooks.html')
        if not os.path.isfile(file):
            result = url_scraper(url+link+'/notebooks', selenium=True, waitForClass='km-list', headless = True)
            if len(result['html']) == 0:
                # an empty page is never stored; stop the whole run instead
                print('Unknown Error')
                break
            store_data(result['html'], file)
        else:
            print('notebooks.html already exists')

    print('done:', link)

    if force_break and i >= 1:
        print('Forced Quit')
        break


###
0 /alexgude/california-traffic-collision-data-from-switrs
alexgude california-traffic-collision-data-from-switrs
dataset.html already exists
notebooks: 1
notebooks.html already exists
done: /alexgude/california-traffic-collision-data-from-switrs
###
1 /babyoda/women-entrepreneurship-and-labor-force
babyoda women-entrepreneurship-and-labor-force
dataset.html already exists
notebooks: 1
notebooks.html already exists
done: /babyoda/women-entrepreneurship-and-labor-force
###
2 /szymonjanowski/internet-articles-data-with-users-engagement
szymonjanowski internet-articles-data-with-users-engagement
dataset.html already exists
notebooks: 6
notebooks.html already exists
done: /szymonjanowski/internet-articles-data-with-users-engagement
###
3 /sakshigoyal7/credit-card-customers
sakshigoyal7 credit-card-customers
dataset.html already exists
notebooks: 1
notebooks.html already exists
done: /sakshigoyal7/credit-card-customers
###
4 /imoore/2020-us-general-election-turnout-rates
imoore 2020-us-g

Page is ready!
done: /jerzydziewierz/bee-vs-wasp
###
47 /unsdsn/world-happiness
unsdsn world-happiness
dataset.html already exists
notebooks: 2
Page is ready!
done: /unsdsn/world-happiness
###
48 /hacker-news/hacker-news
hacker-news hacker-news
dataset.html already exists
notebooks: 1937
Page is ready!
done: /hacker-news/hacker-news
###
49 /christianlillelund/donald-trumps-rallies
christianlillelund donald-trumps-rallies
dataset.html already exists
notebooks: 2
Page is ready!
done: /christianlillelund/donald-trumps-rallies
###
50 /sudalairajkumar/novel-corona-virus-2019-dataset
sudalairajkumar novel-corona-virus-2019-dataset
dataset.html already exists
notebooks: 6
Page is ready!
done: /sudalairajkumar/novel-corona-virus-2019-dataset
###
51 /uciml/iris
uciml iris
dataset.html already exists
notebooks: 1
Page is ready!
done: /uciml/iris
###
52 /allen-institute-for-ai/CORD-19-research-challenge
allen-institute-for-ai CORD-19-research-challenge
dataset.html already exists
notebooks: 17
Pa

notebooks: 3
Page is ready!
done: /rishidamarla/judicial-expenditures-across-all-50-states
###
95 /puneet6060/intel-image-classification
puneet6060 intel-image-classification
dataset.html already exists
notebooks: 1
Page is ready!
done: /puneet6060/intel-image-classification
###
96 /kwullum/fatal-police-shootings-in-the-us
kwullum fatal-police-shootings-in-the-us
dataset.html already exists
notebooks: 762
Page is ready!
done: /kwullum/fatal-police-shootings-in-the-us
###
97 /smithsonian/volcanic-eruptions
smithsonian volcanic-eruptions
dataset.html already exists
notebooks: 717
Page is ready!
done: /smithsonian/volcanic-eruptions
###
98 /mathan/fifa-2018-match-statistics
mathan fifa-2018-match-statistics
dataset.html already exists
notebooks: 407
Page is ready!
done: /mathan/fifa-2018-match-statistics
###
99 /theworldbank/world-bank-intl-education
theworldbank world-bank-intl-education
dataset.html already exists
notebooks: 264
Page is ready!
done: /theworldbank/world-bank-intl-educati

notebooks: 229
Page is ready!
done: /rush4ratio/video-game-sales-with-ratings
###
140 /leonardopena/top50spotify2019
leonardopena top50spotify2019
dataset.html already exists
notebooks: 5
Page is ready!
done: /leonardopena/top50spotify2019
###
141 /datatattle/covid-19-nlp-text-classification
datatattle covid-19-nlp-text-classification
dataset.html already exists
notebooks: 1
Page is ready!
done: /datatattle/covid-19-nlp-text-classification
###
142 /uciml/glass
uciml glass
dataset.html already exists
notebooks: 343
Page is ready!
done: /uciml/glass
###
143 /crowdflower/twitter-airline-sentiment
crowdflower twitter-airline-sentiment
dataset.html already exists
notebooks: 368
Page is ready!
done: /crowdflower/twitter-airline-sentiment
###
144 /camnugent/sandp500
camnugent sandp500
dataset.html already exists
notebooks: 1
Page is ready!
done: /camnugent/sandp500
###
145 /osmi/mental-health-in-tech-survey
osmi mental-health-in-tech-survey
dataset.html already exists
notebooks: 158
Page is r

KeyboardInterrupt: 

In [None]:
# this process is working, but it's too slow
# let's try to improve scraping
# but beware of multithreading or too many requests at once
# too many requests lead to a DDoS attack