In [1]:
# imports
import requests
from requests_html import HTMLSession, AsyncHTMLSession
from bs4 import BeautifulSoup
import re
import json

In [2]:
# scrape the GitHub repository page for metadata
# the last-pushed timestamp is injected via JavaScript, so the page must be rendered

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options

# Sample repository URLs for trying out github_scraper by hand;
# only url3 is used by the test call at the end of this cell.
url1 = 'https://github.com/AlvaroMenduina/Jupyter_Notebooks'
url2 = 'https://github.com/k2inno-tech/jupyter-genomics'
url3 = 'https://github.com/PatrickJS/awesome-angular'
url4 = 'https://github.com/danielmachinelearning/HotelSpamDetection'
url5 = 'https://github.com/ankitkariryaa/ambulanceSiteLocation'

def github_scraper(url, wait_timeout=1000):
    """Scrape repository metadata from a rendered GitHub repository page.

    The page is loaded in headless Chrome through Selenium because the
    last-commit timestamp (the ``<relative-time>`` tag) is added by
    JavaScript; earlier attempts in this notebook with plain ``requests``
    and ``requests-html`` often did not fetch it.

    Parameters
    ----------
    url : str
        URL of the repository page, e.g. ``https://github.com/<owner>/<repo>``.
    wait_timeout : int or float, optional
        Maximum number of seconds to wait for ``<relative-time>`` to become
        visible. Defaults to 1000, the value that used to be hard-coded.

    Returns
    -------
    dict
        ``{'meta': <dict>, 'html': <str>}``. ``meta`` holds the keys ``url``,
        ``about``, ``languages``, ``stars``, ``contributors``, ``commits``
        and ``last_commit``; ``license`` is only present when a "Readme"
        entry is found among the ``div.mt-3`` elements. ``html`` is the page
        source as returned by the driver.

    Raises
    ------
    IndexError
        If the page has no ``p.f4`` or no ``span.Counter`` element.
    ValueError
        If the page does not have exactly three ``a.social-count`` elements.
    AttributeError
        If no ``span`` with an ``aria-label`` matching "Commits" is found.
    """
    meta = {
        'url': url
    }

    CHROMEDRIVER_PATH = './.selenium/chromedriver'
    WINDOW_SIZE = "1920,1080"

    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--window-size=%s" % WINDOW_SIZE)

    driver = webdriver.Chrome(executable_path=CHROMEDRIVER_PATH, options=chrome_options)
    try:
        driver.get(url)
        # wait for the tag <relative-time>; on timeout we continue with
        # whatever has been rendered so far
        try:
            WebDriverWait(driver, wait_timeout).until(
                EC.visibility_of_element_located((By.TAG_NAME, 'relative-time'))
            )
            print("Page is ready!")
        except TimeoutException:
            print("Loading took too much time!")

        html = driver.page_source
    finally:
        # quit() (not close()) ends the whole browser session, and the
        # finally-clause guarantees this even if loading the page fails;
        # otherwise every failed URL would leave a headless Chrome behind.
        driver.quit()

    soup = BeautifulSoup(html, 'html.parser')

    about = soup.select('p.f4')[0].text.strip()
    # remove emojis, check: https://stackoverflow.com/questions/33404752/removing-emojis-from-a-string-in-python
    about = about.encode('ascii', 'ignore').decode('ascii').strip()
    meta['about'] = about

    # The license is taken to be the div.mt-3 entry following "Readme".
    lic = [elem.text.strip() for elem in soup.select('div.mt-3')]
    if "Readme" in lic:
        readme = lic.index("Readme")
        if readme == len(lic) - 1:
            # NOTE(review): spelled 'Nan' here but 'NaN' in github_api_scraper;
            # kept unchanged so newly written files match existing ones.
            lic = 'Nan'
        else:
            lic = lic[readme + 1]
        meta['license'] = lic

    lang = [elem.text.strip().replace('\n', ': ') for elem in soup.select('li.d-inline')]
    meta['languages'] = lang

    social = [elem.text.strip() for elem in soup.select('a.social-count')]
    # Expected order: watch, stars, forks; only the star count is stored.
    _, stars, _ = social
    meta['stars'] = stars

    contributors = soup.select('span.Counter')[-1].text.strip()
    meta['contributors'] = contributors

    commits = soup.find('span', {"aria-label" : re.compile("Commits")}).parent.text.strip().replace('\ncommits', '')
    meta['commits'] = commits

    # <relative-time> is an ajax element; if the wait above timed out it may
    # be missing, in which case 'NaN' is stored instead of raising.
    relative_time = soup.find('relative-time')
    last_commit = relative_time.get('datetime') if relative_time is not None else 'NaN'
    meta['last_commit'] = last_commit

    return {'meta': meta, 'html': html }

# Smoke test: scrape one sample repository and print its metadata as JSON.
result = github_scraper(url3)
print(json.dumps(result['meta']))

Page is ready!
{"url": "https://github.com/PatrickJS/awesome-angular", "about": "A curated list of awesome Angular resources", "license": "CC0-1.0 License", "languages": ["HTML: 74.7%", "CSS: 23.2%", "Shell: 2.1%"], "stars": "7.6k", "contributors": "246", "commits": "592", "last_commit": "2020-11-29T03:38:20Z"}


In [4]:
# traverse csv and scrape all github-pages for meta-data
import os
import pandas as pd

# Root folder; each repository's files go below it in a sub-path derived from its URL.
git_store = '../data/repositories/git/'
# Output files for the HTML scraper, relative to the repository folder.
meta_store = '.meta/github_meta.json'
html_store = '.meta/github_raw.html'
# CSV with the repository list; its 'Base_URL' column is iterated below.
# NOTE(review): the name `csv` would shadow the stdlib `csv` module if that were imported later.
csv = '../data/database/db_03_cleanup.csv'
df = pd.read_csv(csv)

#print(df.head())

# helper function: create the parent folder of a file path if it is missing
def create_folder(path):
    """Create the parent directory of ``path`` if it does not exist yet.

    Parameters
    ----------
    path : str
        Path of a file that is about to be written. Only its directory part
        (``os.path.dirname(path)``) is created, including intermediate
        directories. A path without a directory part is a no-op.

    Raises
    ------
    OSError
        If the directory cannot be created for a reason other than it
        already existing.
    """
    folder = os.path.dirname(path)
    # An empty dirname means the file lives in the current directory;
    # os.makedirs('') would fail, so there is nothing to do.
    if not folder or os.path.exists(folder):
        return
    # exist_ok=True covers the race where another process creates the folder
    # between the check above and this call (the old code tried to handle
    # this via errno.EEXIST, but errno was never imported).
    os.makedirs(folder, exist_ok=True)
    print(path + ' created')
                
# Walk over every repository URL in the CSV and scrape those GitHub pages
# that have no metadata file on disk yet.
for url in df['Base_URL']:
    # rows that do not point at github.com are ignored
    if 'https://github.com/' not in url:
        continue

    print(url)
    path = url.replace('https://github.com/','')+'/'

    path_meta = os.path.join(git_store, path, meta_store)
    path_raw = os.path.join(git_store, path, html_store)

    # the metadata file doubles as the "done" marker for a repository
    already_scraped = os.path.isfile(path_meta)
    if already_scraped:
        print('already scraped')
        continue

    result = github_scraper(url)

    # raw HTML first, then the extracted metadata
    create_folder(path_raw)
    with open(path_raw, 'w', encoding='utf-8') as fp:
        fp.write(result['html'])

    create_folder(path_meta)
    with open(path_meta, 'w', encoding='utf-8') as fp:
        fp.write(json.dumps(result['meta']))

https://github.com/bschreck/robo-chef
already scraped
https://github.com/Ankushr785/Food-amenities-demand-prediction
already scraped
https://github.com/catherhuang/FP3-recipe
already scraped
https://github.com/stratospark/food-101-keras
already scraped
https://github.com/Murgio/Food-Recipe-CNN
already scraped
https://github.com/jubins/DeepLearning-Food-Image-Recognition-And-Calorie-Estimation
already scraped
https://github.com/Architectshwet/Amazon-Fine-Food-Reviews
already scraped
https://github.com/altosaar/food2vec
already scraped
https://github.com/gabrielilharco/snap-n-eat
already scraped
https://github.com/npatta01/web-deep-learning-classifier
already scraped
https://github.com/krpiyush5/Amazon-Fine-Food-Review
already scraped
https://github.com/nd1/DC_RestaurantViolationForecasting
already scraped
https://github.com/alifier/Restaurant_success_model
already scraped
https://github.com/josephofiowa/dc-michelin-challenge
already scraped
https://github.com/gzsuyu/Data-Analysis-NYC-Re

Page is ready!
https://github.com/austinbrian/portfolio/
Page is ready!
https://github.com/kaumaron/Data_Science/
already scraped
https://github.com/okfn-brasil/perfil-politico
Page is ready!
https://github.com/ParticipaPY/politic-bots
Page is ready!
https://github.com/PrincetonUniversity/gerrymandertests
Page is ready!
https://github.com/JulianMar11/SentimentPoliticalCompass
Page is ready!
https://github.com/muntisa/Deep-Politics
Page is ready!
https://github.com/edmundooo/more-money-more-problems
Page is ready!
https://github.com/abhiagar90/power_networks
Page is ready!
https://github.com/philippschmalen/Project_tsds
Page is ready!
https://github.com/kkirchhoff01/DebateAnalysis
Page is ready!
https://github.com/davidjwiner/political_affiliation_prediction
Page is ready!
https://github.com/philiplbean/facebook_political_ads
Page is ready!
https://github.com/pgromano/Political-Identity-Analysis
Page is ready!
https://github.com/kmunger/YT_descriptive
Page is ready!
https://github.com/a

In [5]:
# scrape github_api

# Sample repository URLs for trying out github_api_scraper by hand;
# url6 has a trailing slash. Only url5 is used by the test call in this cell.
url1 = 'https://github.com/AlvaroMenduina/Jupyter_Notebooks'
url2 = 'https://github.com/k2inno-tech/jupyter-genomics'
url3 = 'https://github.com/PatrickJS/awesome-angular'
url4 = 'https://github.com/danielmachinelearning/HotelSpamDetection'
url5 = 'https://github.com/ankitkariryaa/ambulanceSiteLocation'
url6 = 'https://github.com/TiesdeKok/Python_NLP_Tutorial/'

def github_api_scraper(url, timeout=(3.05, 27)):
    """Fetch repository metadata from the public GitHub REST API.

    Parameters
    ----------
    url : str
        Repository page URL (``https://github.com/<owner>/<repo>``). The
        ``https://github.com/`` prefix is replaced by the API base URL.
    timeout : float or tuple, optional
        Passed to ``requests.get``; a ``(connect, read)`` tuple in seconds.
        The previous hard-coded ``timeout=None`` could block forever on a
        stalled connection. Pass ``None`` to restore that behaviour.

    Returns
    -------
    dict
        ``{'status': <HTTP status code>, 'meta': <dict>, 'raw': <decoded JSON>}``.
        ``meta`` values are read with ``raw.get`` and are therefore ``None``
        when the response lacks the field; ``meta['license']`` is ``'NaN'``
        when no license is reported.

    Raises
    ------
    requests.exceptions.RequestException
        On connection errors or when ``timeout`` is exceeded.
    ValueError
        If the response body is not valid JSON.
    """
    api_url = 'https://api.github.com/repos/'
    meta = {}
    url = api_url + url.replace('https://github.com/','')

    page = requests.get(url, timeout=timeout)
    status = page.status_code
    raw = json.loads(page.content)

    # NOTE(review): 'url' stores the full API URL and 'api_url' only the API
    # base; the naming looks swapped but is kept as-is to stay compatible
    # with metadata files that were already written.
    meta['url'] = url
    meta['api_url'] = api_url

    # plain copies of API fields
    for key in ('description', 'created_at', 'pushed_at', 'homepage', 'size'):
        meta[key] = raw.get(key)
    meta['stars'] = raw.get('stargazers_count')
    meta['language'] = raw.get('language')

    license_info = raw.get('license')
    if license_info is None:
        meta['license'] = 'NaN'
    else:
        meta['license'] = license_info['name']

    return {'status': status, 'meta': meta, 'raw': raw}

# Smoke test: query the API for one sample repository and show the extracted metadata.
result = github_api_scraper(url5)
print(result['meta'])
#print(result['raw'])

{'url': 'https://api.github.com/repos/ankitkariryaa/ambulanceSiteLocation', 'api_url': 'https://api.github.com/repos/', 'description': None, 'created_at': '2018-10-21T12:15:59Z', 'pushed_at': '2018-10-29T16:30:20Z', 'homepage': None, 'size': 27606, 'stars': 0, 'language': 'Jupyter Notebook', 'license': 'NaN'}


In [6]:
# traverse csv and scrape github_api
# attention! public api is limited to 60 requests / hour
# use VPN or a registered key to surpass the limit

# Root folder; each repository's files go below it in a sub-path derived from its URL.
git_store = '../data/repositories/git/'
# Output files for the API scraper, relative to the repository folder.
meta_store = '.meta/github_api_meta.json'
raw_store = '.meta/github_api_raw.json'
# CSV with the repository list; its 'Base_URL' column is iterated below.
# NOTE(review): the name `csv` would shadow the stdlib `csv` module if that were imported later.
csv = '../data/database/db_03_cleanup.csv'
df = pd.read_csv(csv)

#print(df.head())

# helper function: create the parent folder of a file path if it is missing
def create_folder(path):
    """Create the parent directory of ``path`` if it does not exist yet.

    Parameters
    ----------
    path : str
        Path of a file that is about to be written. Only its directory part
        (``os.path.dirname(path)``) is created, including intermediate
        directories. A path without a directory part is a no-op.

    Raises
    ------
    OSError
        If the directory cannot be created for a reason other than it
        already existing.
    """
    folder = os.path.dirname(path)
    # An empty dirname means the file lives in the current directory;
    # os.makedirs('') would fail, so there is nothing to do.
    if not folder or os.path.exists(folder):
        return
    # exist_ok=True covers the race where another process creates the folder
    # between the check above and this call (the old code tried to handle
    # this via errno.EEXIST, but errno was never imported).
    os.makedirs(folder, exist_ok=True)
    print(path + ' created')
                
# Query the GitHub API for every repository in the CSV that has no API
# metadata file yet. `i` counts the requests issued during this run.
i = 0
for url in df['Base_URL']:
    # rows that do not point at github.com are ignored
    if 'https://github.com/' not in url:
        continue

    path = url.replace('https://github.com/','')+'/'

    path_meta = os.path.join(git_store, path, meta_store)
    path_raw = os.path.join(git_store, path, raw_store)

    # the metadata file doubles as the "done" marker for a repository
    already_scraped = os.path.isfile(path_meta)
    if already_scraped:
        continue

    print(url)
    i += 1
    result = github_api_scraper(url.strip())
    print(result['status'])

    # a 404 is reported but its response is still stored below
    if result['status'] == 404:
        print('No API Entry found')

    # a 403 aborts the run before anything is written for this URL
    if result['status'] == 403:
        print('Quota exceeded')
        break

    # metadata first, then the raw API response
    for target, payload in ((path_meta, result['meta']), (path_raw, result['raw'])):
        create_folder(target)
        with open(target, 'w', encoding='utf-8') as fp:
            fp.write(json.dumps(payload))

    # self-imposed cap on the number of requests per run
    if i == 100:
        print('Quota exceeded')
        break