In [1]:
# imports
import requests
#from requests_html import HTMLSession, AsyncHTMLSession
from bs4 import BeautifulSoup
import re
import json

In [4]:
# scrape the GitHub repository page for metadata
# the "last pushed" timestamp is added via JavaScript, so the page has to be rendered in a browser

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options

# Sample repositories used to try out the scraper by hand.
url1, url2, url3, url4, url5 = (
    'https://github.com/AlvaroMenduina/Jupyter_Notebooks',
    'https://github.com/k2inno-tech/jupyter-genomics',
    'https://github.com/PatrickJS/awesome-angular',
    'https://github.com/danielmachinelearning/HotelSpamDetection',
    'https://github.com/ankitkariryaa/ambulanceSiteLocation',
)

def _render_github_page(url, chromedriver_path, timeout):
    """Return the JavaScript-rendered HTML of `url` using headless Chrome.

    Waits up to `timeout` seconds for a visible <relative-time> element and
    prints whether the wait succeeded. The browser session is always shut
    down, even when loading the page raises.
    """
    window_size = "1920,1080"

    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--window-size=%s" % window_size)

    driver = webdriver.Chrome(executable_path=chromedriver_path, options=chrome_options)
    try:
        driver.get(url)
        # wait for the tag <relative-time>
        try:
            WebDriverWait(driver, timeout).until(
                EC.visibility_of_element_located((By.TAG_NAME, 'relative-time'))
            )
            print("Page is ready!")
        except TimeoutException:
            # Best effort: continue with whatever has been rendered so far.
            print("Loading took too much time!")
        return driver.page_source
    finally:
        # quit() ends the whole WebDriver session; close() would only close
        # the current window, and nothing ran at all if driver.get() raised.
        driver.quit()


def github_scraper(url, chromedriver_path='./.selenium/chromedriver', timeout=1000):
    """Scrape repository metadata from a rendered GitHub repository page.

    Args:
        url: Address of the repository page to load.
        chromedriver_path: Path to the chromedriver executable.
        timeout: Maximum number of seconds to wait for the <relative-time>
            element to become visible.

    Returns:
        A dict with two keys:
        'meta' -- dict with 'url', 'about', 'languages', 'stars',
                  'contributors', 'commits' and 'last_commit'; 'license' is
                  only present when a "Readme" entry is found among the
                  'div.mt-3' elements.
        'html' -- the rendered page source as a string.

    Raises:
        ValueError: if the page does not contain exactly three
            'a.social-count' elements.
        IndexError: if the page contains no 'span.Counter' element.
        AttributeError: if no span with an aria-label matching "Commits"
            is found.
    """
    meta = {
        'url': url
    }

    html = _render_github_page(url, chromedriver_path, timeout)
    soup = BeautifulSoup(html, 'html.parser')

    try:
        about = soup.select('p.f4')[0].text.strip()
        # remove emojis, check: https://stackoverflow.com/questions/33404752/removing-emojis-from-a-string-in-python
        about = about.encode('ascii', 'ignore').decode('ascii').strip()
    except IndexError:
        # The page has no 'p.f4' element, i.e. no description to read.
        about = ''
    meta['about'] = about

    # The license is read as the 'div.mt-3' entry that follows "Readme".
    lic = [elem.text.strip() for elem in soup.select('div.mt-3')]
    if len(lic) > 0 and "Readme" in lic:
        readme = lic.index("Readme")
        if readme == len(lic)-1:
            lic = 'Nan'
        else:
            lic = lic[readme+1]
        meta['license'] = lic

    lang = [elem.text.strip().replace('\n', ': ') for elem in soup.select('li.d-inline')]
    meta['languages'] = lang

    social = [elem.text.strip() for elem in soup.select('a.social-count')]
    _watch, stars, _forks = social
    if 'k' in stars:
        # Round before truncating so a float artefact such as 1099.9999...
        # cannot drop the count by one.
        stars = int(round(float(stars.replace('k','')) * 1000))
    meta['stars'] = stars

    contributors = soup.select('span.Counter')[-1].text.strip()
    meta['contributors'] = contributors

    commits = soup.find('span', {"aria-label" : re.compile("Commits")}).parent.text.strip().replace('\ncommits', '')
    commits = commits.replace(' ','').replace('\n','').replace('commits','')
    meta['commits'] = commits

    # <relative-time> is an ajax element which is not always fetched (the wait
    # above may have timed out), so fall back to 'NaN' instead of failing.
    last_commit = soup.find('relative-time')
    if last_commit is not None:
        last_commit = last_commit.get('datetime')
    else:
        last_commit = 'NaN'
    meta['last_commit'] = last_commit

    return {'meta': meta, 'html': html }

# Try the scraper on one sample repository and show the extracted metadata as JSON.
result = github_scraper(url=url3)
print(json.dumps(result.get('meta') if False else result['meta']))

Page is ready!
{"url": "https://github.com/PatrickJS/awesome-angular", "about": "A curated list of awesome Angular resources", "license": "CC0-1.0 License", "languages": ["HTML: 74.7%", "CSS: 23.2%", "Shell: 2.1%"], "stars": 7700, "contributors": "246", "commits": "592", "last_commit": "2020-11-29T03:38:20Z"}


In [10]:
# traverse csv and scrape all github-pages for meta-data
import os
import pandas as pd

# Files written into each repository folder by the scrape loop.
meta_store = '.meta/github_meta.json'
html_store = '.meta/github_raw.html'

# Input datasets this cell can traverse. Only the selected one is read, so the
# other CSV is no longer loaded just to be thrown away.
DATASETS = {
    'db': {
        'git_store': '../data/repositories/git/',
        'csv': '../data/database/db_03_cleanup.csv',
        'column': 'Base_URL',
        'sep': ',',
    },
    'blobcity': {
        'git_store': '../data/repositories/blobcity/git/',
        'csv': '../data/database/blobcity_01_index.csv',
        'column': 'github_link',
        'sep': ';',
    },
}
DATASET = 'blobcity'  # change to 'db' to traverse db_03_cleanup.csv instead

_dataset = DATASETS[DATASET]
git_store = _dataset['git_store']
csv = _dataset['csv']
column = _dataset['column']
df = pd.read_csv(csv, sep=_dataset['sep'])


#print(df.head())

# helper function: create the parent folder of a file path if it does not exist yet
def create_folder(path):
    """Make sure the parent folder of the file `path` exists.

    Prints '<path> created' when the folder had to be created. Does nothing
    when the folder already exists or when `path` has no folder component.

    Args:
        path: Path of a file that is about to be written.

    Raises:
        OSError: if the folder cannot be created.
    """
    folder = os.path.dirname(path)
    # A bare file name has no parent folder; os.makedirs('') would fail.
    if not folder or os.path.exists(folder):
        return
    # exist_ok=True guards against the folder being created by someone else
    # between the check above and this call.
    os.makedirs(folder, exist_ok=True)
    print(path + ' created')

# Walk over every link in the selected column and scrape each GitHub
# repository whose metadata file is not on disk yet.
for i, url in enumerate(df[column]):
    print(i, url)

    # Skip empty cells (NaN) and links that do not point to GitHub.
    if not (isinstance(url, str) and 'https://github.com/' in url):
        continue

    # '<owner>/<repo>/' below the repository store
    path = url.replace('https://github.com/','')+'/'
    path_meta = os.path.join(git_store, path, meta_store)
    path_raw = os.path.join(git_store, path, html_store)

    # The metadata file doubles as the "done" marker for a repository.
    already_scraped = os.path.isfile(path_meta)
    if already_scraped:
        print('already scraped')
        continue

    result = github_scraper(url)

    create_folder(path_raw)
    with open(path_raw, 'w', encoding='utf-8') as fp:
        fp.write(result['html'])

    create_folder(path_meta)
    with open(path_meta, 'w', encoding='utf-8') as fp:
        json.dump(result['meta'], fp)

0 https://github.com/dnc1994/Kaggle-Playground
already scraped
1 https://github.com/microsoft/cognitive-services-notebooks
already scraped
2 https://github.com/cantaro86/Financial-Models-Numerical-Methods
already scraped
3 https://github.com/gabehollombe-aws/jupyter-notebooks
already scraped
4 nan
5 https://github.com/jasonstrimpel/PyData-Meetup
already scraped
6 https://github.com/xtreamsrl/jupytemplate
already scraped
7 https://github.com/digitalearthafrica/deafrica-sandbox-notebooks
already scraped
8 https://github.com/jpcolino/IPython_notebooks
already scraped
9 https://github.com/jonkrohn/tf2
already scraped
10 https://github.com/prathameshtari/Predicting-Football-Match-Outcome-using-Machine-Learning
already scraped
11 https://github.com/1024hub/Go-Learning-Notebook
already scraped
12 https://github.com/oschuett/appmode
already scraped
13 https://github.com/zakiso/Postgraduate_notebook_for_SJTU_CS
already scraped
14 https://github.com/jldowns/google_earth_engine_notebook
already s

304 https://github.com/jseabold/csc432-notebooks
already scraped
305 https://github.com/mozilla/jupyter-notebook-gist
already scraped
306 https://github.com/okfn-brasil/notebooks
already scraped
307 https://github.com/abulbasar/machine-learning
already scraped
308 https://github.com/dataflowr/notebooks
already scraped
309 https://github.com/araffin/rl-tutorial-jnrr19
already scraped
310 https://github.com/gzc/CLRS
already scraped
311 https://github.com/beader/mlnotebook
already scraped
312 https://github.com/wizardforcel/data-science-notebook
already scraped
313 https://github.com/quantopian/qgrid-notebooks
already scraped
314 https://github.com/tirthajyoti/Machine-Learning-with-Python
already scraped
315 https://github.com/ageron/handson-ml
already scraped
316 https://github.com/brunow/NoteItIOS
already scraped
317 https://github.com/JohnLaTwC/Shared
already scraped
318 https://github.com/Axelrod-Python/Axelrod-notebooks
already scraped
319 https://github.com/michhar/python-jupyter-no

already scraped
512 https://github.com/MalayAgarwal-Lee/real_python_data_cleaning_tutorial
already scraped
513 https://github.com/thibo73800/tensorflow2.0-examples
already scraped
514 https://github.com/SELinuxProject/selinux-notebook
already scraped
515 https://github.com/asharifiz/Probability_Statistics
already scraped
516 https://github.com/JsonChao/Awesome-Android-Notebook
already scraped
517 https://github.com/feyeleanor/GoNotebook
already scraped
518 https://github.com/marrrcin/ml-twitter-sentiment-analysis
already scraped
519 https://github.com/dannguyen/python-notebooks-data-wrangling
already scraped
520 https://github.com/gururajang/Notebooks_SparkML
already scraped
521 https://github.com/bokeh/bokeh-notebooks
already scraped
522 https://github.com/polynote/polynote
already scraped
523 https://github.com/SheffieldML/notebook
already scraped
524 https://github.com/wulee510505/notebook
already scraped
525 https://github.com/WillKoehrsen/jupyter-notebook-extensions
already scrape

Page is ready!
635 https://github.com/aflaxman/pymc-examples
Page is ready!
636 https://github.com/influenist/Mi-NB-Gaming-Laptop-MacOS
Page is ready!
637 https://github.com/bhattbhavesh91/time_series_notebooks
Page is ready!
638 https://github.com/nakaizura/Source-Code-Notebook
Page is ready!
639 https://github.com/nicolaskruchten/jupyter_pivottablejs
Page is ready!
640 https://github.com/PegasusWang/notebooks
Page is ready!
641 https://github.com/aws-samples/sagemaker-run-notebook
Page is ready!
642 https://github.com/markjay4k/fourier-transform
Page is ready!
643 https://github.com/python-engineer/python-engineer-notebooks
Page is ready!
644 https://github.com/mimoralea/applied-reinforcement-learning
Page is ready!
645 https://github.com/sympy/quantum_notebooks
Page is ready!
646 https://github.com/hemanta212/blogger-cli
Page is ready!
647 https://github.com/erhwenkuo/deep-learning-with-keras-notebooks
Page is ready!
648 https://github.com/GEMScienceTools/notebooks
Page is ready!
64

Page is ready!
760 https://github.com/cgoliver/Notebooks
Page is ready!
761 https://github.com/minerandodados/mdrepo
Page is ready!
762 https://github.com/manujeevanprakash/Matplot-lib-Basics
Page is ready!
763 https://github.com/codenode/codenode
Page is ready!
764 https://github.com/plutov/notebook
Page is ready!
765 https://github.com/biplobsd/OneClickRun
Page is ready!
766 https://github.com/captainsafia/blazoract
Page is ready!
767 https://github.com/vsbuffalo/devnotes
Page is ready!
768 https://github.com/tesla809/intro-to-python-jupyter-notebooks
Page is ready!
769 https://github.com/rothnic/anaconda-notebook
Page is ready!
770 https://github.com/lyhue1991/spark_tutorial
Page is ready!
771 https://github.com/quantopian/qgrid
Page is ready!
772 https://github.com/eclarson/DataMiningNotebooks
Page is ready!
773 https://github.com/jmportilla/Udemy---Machine-Learning
Page is ready!
774 https://github.com/christophebourguignat/notebooks
Page is ready!
775 https://github.com/LearningJ

Page is ready!
886 https://github.com/dunovank/jupyter-themes
Page is ready!
887 https://github.com/deepmipt/dp_notebooks
Page is ready!
888 https://github.com/dudash/openshift-workshops
Page is ready!
889 https://github.com/rdipietro/jupyter-notebooks
Page is ready!
890 https://github.com/takluyver/nbopen
Page is ready!
891 https://github.com/Slicer/SlicerNotebooks
Page is ready!
892 https://github.com/kite8/Quantopian-lectures-notebook-translation
Page is ready!
893 https://github.com/ydixon/yolo_v3
Page is ready!
894 https://github.com/Germey/Python3NoteBooks
Page is ready!
895 https://github.com/PythonFreeCourse/Notebooks
Page is ready!
896 https://github.com/dennyglee/databricks
Page is ready!
897 https://github.com/dmonn/dcgan-oreilly
Page is ready!
898 https://github.com/chhayac/Machine-Learning-Notebooks
Page is ready!
899 https://github.com/hunkim/effective_python_notebook
Page is ready!
900 https://github.com/mwermelinger/Learn-to-code-for-data-analysis
Page is ready!
901 htt

Page is ready!
1010 https://github.com/solliancenet/microsoft-learning-paths-databricks-notebooks
Page is ready!
1011 https://github.com/urbica/gis-notebook
Page is ready!


In [11]:
# scrape github_api

# Sample repositories used to try out the API scraper by hand
# (the last one deliberately ends with a slash).
url1, url2, url3, url4, url5, url6 = (
    'https://github.com/AlvaroMenduina/Jupyter_Notebooks',
    'https://github.com/k2inno-tech/jupyter-genomics',
    'https://github.com/PatrickJS/awesome-angular',
    'https://github.com/danielmachinelearning/HotelSpamDetection',
    'https://github.com/ankitkariryaa/ambulanceSiteLocation',
    'https://github.com/TiesdeKok/Python_NLP_Tutorial/',
)

def github_api_scraper(url, timeout=(3.05, 27)):
    """Fetch repository metadata from the public GitHub REST API.

    Args:
        url: Repository page address ('https://github.com/<owner>/<repo>').
            Surrounding whitespace and slashes are ignored.
        timeout: Timeout passed to requests.get, either a number of seconds
            or a (connect, read) tuple.

    Returns:
        A dict with three keys:
        'status' -- HTTP status code of the API response.
        'meta'   -- selected fields of the response; fields missing from the
                    response are None, a missing license is 'NaN'.
        'raw'    -- the decoded JSON body of the response.

    Raises:
        requests.exceptions.RequestException: if the request fails or times out.
        ValueError: if the response body is not valid JSON.
    """
    api_url = 'https://api.github.com/repos/'
    meta = {}
    # Strip whitespace and slashes so that a trailing '/' in the page address
    # does not end up in the API path.
    url = api_url + url.replace('https://github.com/','').strip().strip('/')

    # A finite timeout keeps a stalled connection from blocking the whole run.
    page = requests.get(url, timeout=timeout)
    status = page.status_code
    raw = json.loads(page.content)

    # NOTE(review): 'url' holds the full API address and 'api_url' only the
    # API prefix. Left unchanged because records with this layout may already
    # be stored -- confirm whether 'url' was meant to be the GitHub page.
    meta['url'] = url
    meta['api_url'] = api_url
    meta['description'] = raw.get('description')
    meta['created_at'] = raw.get('created_at')
    meta['pushed_at'] = raw.get('pushed_at')
    meta['homepage'] = raw.get('homepage')
    meta['size'] = raw.get('size')
    meta['stars'] = raw.get('stargazers_count')
    meta['language'] = raw.get('language')
    if raw.get('license') is None:
        meta['license'] = 'NaN'
    else:
        meta['license'] = raw['license']['name']

    return {'status': status, 'meta': meta, 'raw': raw}

#result = github_api_scraper(url5)
#print(result['meta'])
#print(result['raw'])

In [36]:
# traverse csv and scrape github_api
# Attention: the unauthenticated public API is limited to 60 requests per hour.
# Use a VPN or a registered API key to get past the limit.

# Files written into each repository folder by the API scrape loop.
meta_store = '.meta/github_api_meta.json'
raw_store = '.meta/github_api_raw.json'

# Input datasets this cell can traverse. Only the selected one is read, so the
# other CSV is no longer loaded just to be thrown away.
DATASETS = {
    'db': {
        'git_store': '../data/repositories/git/',
        'csv': '../data/database/db_03_cleanup.csv',
        'column': 'Base_URL',
        'sep': ',',
    },
    'blobcity': {
        'git_store': '../data/repositories/blobcity/git/',
        'csv': '../data/database/blobcity_01_index.csv',
        'column': 'github_link',
        'sep': ';',
    },
}
DATASET = 'blobcity'  # change to 'db' to traverse db_03_cleanup.csv instead

_dataset = DATASETS[DATASET]
git_store = _dataset['git_store']
csv = _dataset['csv']
column = _dataset['column']
df = pd.read_csv(csv, sep=_dataset['sep'])

#print(df.head())

# helper function: create the parent folder of a file path if it does not exist yet
def create_folder(path):
    """Make sure the parent folder of the file `path` exists.

    Prints '<path> created' when the folder had to be created. Does nothing
    when the folder already exists or when `path` has no folder component.

    Args:
        path: Path of a file that is about to be written.

    Raises:
        OSError: if the folder cannot be created.
    """
    folder = os.path.dirname(path)
    # A bare file name has no parent folder; os.makedirs('') would fail.
    if not folder or os.path.exists(folder):
        return
    # exist_ok=True guards against the folder being created by someone else
    # between the check above and this call.
    os.makedirs(folder, exist_ok=True)
    print(path + ' created')
                
# Upper bound on the number of API requests issued by one run of this cell.
MAX_REQUESTS = 100

i = 0  # number of API requests issued so far
for j, url in enumerate(df[column]):
    # Skip empty cells (NaN) and links that do not point to GitHub.
    if isinstance(url, str) and 'https://github.com/' in url:
        path = url.replace('https://github.com/','')+'/' #.replace('/','\\')

        path_meta = os.path.join(git_store, path, meta_store)
        path_raw = os.path.join(git_store, path, raw_store)

        # The metadata file doubles as the "done" marker for a repository.
        already_scraped = os.path.isfile(path_meta)

        if not already_scraped:
            print('row:', j, 'item:', i, 'url:', url)
            i += 1
            result = github_api_scraper(url.strip())
            print('status-code:', result['status'])

            if result['status'] == 403:
                print('Quota exceeded')
                break

            if result['status'] == 404:
                print('No API Entry found')

            if result['status'] in (200, 404):
                # 404 is stored as well, so the repository is not requested
                # again on the next run.
                create_folder(path_meta)
                with open(path_meta, 'w', encoding='utf-8') as fp:
                    fp.write(json.dumps(result['meta']))

                create_folder(path_raw)
                with open(path_raw, 'w', encoding='utf-8') as fp:
                    fp.write(json.dumps(result['raw']))
            else:
                # Any other status (e.g. a server error) is not repository
                # data; storing it would mark the repository as scraped.
                print('Unexpected status, result not stored')

    if i >= MAX_REQUESTS:
        print('Quota exceeded')
        break

row: 996 item: 0 url: https://github.com/mrm8488/shared_colab_notebooks
status-code: 200
row: 997 item: 1 url: https://github.com/unpingco/Python-for-Signal-Processing
status-code: 200
row: 998 item: 2 url: https://github.com/nipy/niwidgets
status-code: 200
row: 999 item: 3 url: https://github.com/BenLangmead/ads1-notebooks
status-code: 200
row: 1000 item: 4 url: https://github.com/Azure-Samples/cosmos-notebooks
status-code: 200
row: 1001 item: 5 url: https://github.com/yanneta/ML-notebooks
status-code: 200
row: 1002 item: 6 url: https://github.com/juanklopper/JuliaCourseNotebooks
status-code: 200
row: 1003 item: 7 url: https://github.com/wrobstory/sticky
status-code: 200
row: 1004 item: 8 url: https://github.com/jiffyclub/ipythonblocks
status-code: 200
row: 1005 item: 9 url: https://github.com/plotly/IPython-plotly
status-code: 200
row: 1006 item: 10 url: https://github.com/rickecon/Notebooks
status-code: 200
row: 1007 item: 11 url: https://github.com/Lasagne/Recipes
status-code: 200
