In [1]:
# imports
import requests
from requests_html import HTMLSession, AsyncHTMLSession
from bs4 import BeautifulSoup
import re
import json

In [2]:
# This cell defines github_scraper, which scrapes a GitHub repository page for metadata.
# url1..url5 are sample repository URLs for trying it out (url3 is used in the call below the function).

url1 = 'https://github.com/AlvaroMenduina/Jupyter_Notebooks'
url2 = 'https://github.com/k2inno-tech/jupyter-genomics'
url3 = 'https://github.com/PatrickJS/awesome-angular'
url4 = 'https://github.com/danielmachinelearning/HotelSpamDetection'
url5 = 'https://github.com/ankitkariryaa/ambulanceSiteLocation'

def github_scraper(url):
    """Scrape the HTML page of a GitHub repository for metadata.

    The page is fetched with requests-html and parsed with BeautifulSoup.
    Every value is read through a CSS selector / attribute lookup on the
    page markup and is returned as the string displayed on the page
    (e.g. "7.6k" stars, "HTML: 74.7%" as a language entry).

    Args:
        url: URL of the repository page, e.g. 'https://github.com/owner/repo'.

    Returns:
        dict with the keys
        'about'        -- description text with all non-ASCII characters removed
        'license'      -- sidebar entry that follows "Readme"; the key is only
                          set when a "Readme" entry exists
        'languages'    -- list of "Language: share" strings (may be empty)
        'stars'        -- star count as displayed
        'contributors' -- text of the last 'span.Counter' element
                          (presumably the contributors badge -- verify
                          against the current page layout)
        'commits'      -- commit count as displayed
        A value whose element is not found on the page is reported as 'Nan'
        instead of raising.

    Raises:
        requests.HTTPError: if the page is answered with a 4xx/5xx status.
    """
    # Sentinel for values missing on the page; same spelling this function
    # has always used for a missing license.
    missing = 'Nan'
    meta = {}

    # requests-html is slightly better than requests in catching ajax data.
    # The context manager makes sure the session is closed again.
    with HTMLSession() as session:
        page = session.get(url)
        # Fail loudly on error pages (404, rate limiting, ...) instead of
        # parsing them as if they were a repository page.
        page.raise_for_status()
        soup = BeautifulSoup(page.content, 'html.parser')

    # Repositories without a description have no 'p.f4' element.
    about_nodes = soup.select('p.f4')
    if about_nodes:
        about = about_nodes[0].text.strip()
        # Remove emojis by dropping everything that is not ASCII, see
        # https://stackoverflow.com/questions/33404752/removing-emojis-from-a-string-in-python
        # (this also drops other non-ASCII characters such as accented letters).
        about = about.encode('ascii', 'ignore').decode('ascii').strip()
    else:
        about = missing
    meta['about'] = about

    # The license is the sidebar entry directly after "Readme", if there is one.
    sidebar = [elem.text.strip() for elem in soup.select('div.mt-3')]
    if "Readme" in sidebar:
        after_readme = sidebar.index("Readme") + 1
        if after_readme < len(sidebar):
            meta['license'] = sidebar[after_readme]
        else:
            meta['license'] = missing

    meta['languages'] = [
        elem.text.strip().replace('\n', ': ')
        for elem in soup.select('li.d-inline')
    ]

    # The counters are expected in the order watch, stars, forks; only trust
    # the position when exactly these three are present.
    social = [elem.text.strip() for elem in soup.select('a.social-count')]
    meta['stars'] = social[1] if len(social) == 3 else missing

    counters = soup.select('span.Counter')
    meta['contributors'] = counters[-1].text.strip() if counters else missing

    commits_label = soup.find('span', {"aria-label": re.compile("Commits")})
    if commits_label is not None:
        commits = commits_label.parent.text.strip().replace('\ncommits', '')
    else:
        commits = missing
    meta['commits'] = commits

    # The date of the last commit is an ajax element which often is not
    # fetched, so it is not scraped here.

    return meta

# Try the scraper on one sample repository and print the result as a JSON string.
meta = github_scraper(url3)
print(json.dumps(meta))

{"about": "A curated list of awesome Angular resources", "license": "CC0-1.0 License", "languages": ["HTML: 74.7%", "CSS: 23.2%", "Shell: 2.1%"], "stars": "7.6k", "contributors": "245", "commits": "588"}


In [3]:
# traverse csv and scrape all github-repositories
import os
import pandas as pd

# git_store:  root folder; the loop below stores each repository's files under "<git_store>/<owner>/<repo>/"
# meta_store: location of the scraped metadata file relative to a repository's folder
# csv:        database export whose 'Base_URL' column is iterated by the loop below
git_store = '../data/repositories/git/'
meta_store = '.meta/github_meta.json'
csv = '../data/database/db_03_cleanup.csv'
df = pd.read_csv(csv)

#print(df.head())

# helper function: create the parent folder of a file path if it does not exist yet
def create_folder(path):
    """Make sure the parent folder of the file ``path`` exists.

    Missing intermediate folders are created as well. When the folder had to
    be created, ``path + ' created'`` is printed.

    Args:
        path: path of a file (not of the folder itself); only its directory
            part is created.
    """
    folder = os.path.dirname(path)
    # A bare file name has no parent folder to create.
    if not folder or os.path.exists(folder):
        return
    # exist_ok=True guards against the race where the folder appears between
    # the check above and this call (no errno handling needed).
    os.makedirs(folder, exist_ok=True)
    print(path + ' created')
                
# Scrape every GitHub repository listed in the CSV once; the metadata file on
# disk doubles as the "already done" marker.
for url in df['Base_URL']:
    if 'https://github.com/' not in url:
        continue

    print(url)

    # repository part of the URL ("owner/repo/"), used as sub-folder of git_store
    path = url.replace('https://github.com/', '') + '/'
    path_meta = os.path.join(git_store, path, meta_store)

    already_scraped = os.path.isfile(path_meta)
    if already_scraped:
        print('already scraped')
        continue

    meta = github_scraper(url)

    create_folder(path_meta)
    with open(path_meta, 'w') as fp:
        fp.write(json.dumps(meta))

https://github.com/bschreck/robo-chef
already scraped
https://github.com/Ankushr785/Food-amenities-demand-prediction
already scraped
https://github.com/catherhuang/FP3-recipe
already scraped
https://github.com/stratospark/food-101-keras
already scraped
https://github.com/Murgio/Food-Recipe-CNN
already scraped
https://github.com/jubins/DeepLearning-Food-Image-Recognition-And-Calorie-Estimation
already scraped
https://github.com/Architectshwet/Amazon-Fine-Food-Reviews
already scraped
https://github.com/altosaar/food2vec
already scraped
https://github.com/gabrielilharco/snap-n-eat
already scraped
https://github.com/npatta01/web-deep-learning-classifier
already scraped
https://github.com/krpiyush5/Amazon-Fine-Food-Review
already scraped
https://github.com/nd1/DC_RestaurantViolationForecasting
already scraped
https://github.com/alifier/Restaurant_success_model
already scraped
https://github.com/josephofiowa/dc-michelin-challenge
already scraped
https://github.com/gzsuyu/Data-Analysis-NYC-Re

https://github.com/ProximaDas/nlp-govt-regulations
https://github.com/philxchen/Clustering-Canadian-regulations
https://github.com/ds-modules/EEP-147
https://github.com/vsub21/systemic-risk-dashboard
https://github.com/raymond180/FINRA_TRACE
https://github.com/davidmasse/US-supreme-court-prediction
https://github.com/AccelAI/AI-Law-Minicourse/
https://github.com/GirrajMaheshwari/Legal-Analytics-project---Court-misclassification
https://github.com/whs2k/GPO-AI
https://github.com/brightmart/sentiment_analysis_fine_grain
https://github.com/Danila89/kaggle_mercedes
https://github.com/Meena-Mani/SECOM_class_imbalance
https://github.com/usnistgov/modelmeth
https://github.com/han-yan-ds/Kaggle-Bosch
https://github.com/Azure/lstms_for_predictive_maintenance
https://github.com/Samimust/predictive-maintenance
https://github.com/m-hoff/maintsim
https://github.com/LaranIkal/ProductAnomaliesDetection
https://github.com/IBM/iot-predictive-analytics
https://github.com/sharmaroshan/SECOM-Detecting-Def

https://github.com/tstreamDOTh/Instacart-Market-Basket-Analysis
https://github.com/SarahMestiri/online-retail-case
https://github.com/sharmaroshan/Online-Retail-Transactions-of-UK
https://github.com/IBM-DSE/CyberShop-Analytics
https://github.com/arvindkarir/retail
https://github.com/finnqiao/cohort_online_retail
https://github.com/datadesk/california-electricity-capacity-analysis
already scraped
https://github.com/PyPSA/WHOBS
already scraped
https://github.com/pipette/Electricity-load-disaggregation
already scraped
https://github.com/farwacheema/DA-electricity-price-forecasting
already scraped
https://github.com/gschivley/carbon-index
already scraped
https://github.com/hvantil/ElectricityDemandForecasting
already scraped
https://github.com/un-modelling/Electricity_Consumption_Surveys
already scraped
https://github.com/amirrezaeian/Individual-household-electric-power-consumption-Data-Set-
already scraped
https://github.com/Open-Power-System-Data/renewable_power_plants
already scraped
ht

In [43]:
# github_api_scraper: fetch repository metadata from the GitHub API (api.github.com) instead of parsing the HTML page.
# url1..url6 are sample repository URLs; note that url6 ends with a trailing slash.

url1 = 'https://github.com/AlvaroMenduina/Jupyter_Notebooks'
url2 = 'https://github.com/k2inno-tech/jupyter-genomics'
url3 = 'https://github.com/PatrickJS/awesome-angular'
url4 = 'https://github.com/danielmachinelearning/HotelSpamDetection'
url5 = 'https://github.com/ankitkariryaa/ambulanceSiteLocation'
url6 = 'https://github.com/TiesdeKok/Python_NLP_Tutorial/'

def github_api_scraper(url):
    """Fetch metadata of a GitHub repository from the GitHub API.

    Args:
        url: URL of the repository page, e.g. 'https://github.com/owner/repo'.
            A trailing slash is tolerated.

    Returns:
        dict with the keys 'description', 'created_at', 'pushed_at',
        'homepage', 'size', 'stars', 'language' and 'license', copied from
        the API response ('stars' is the response's 'stargazers_count',
        'license' is the license name or the string 'NaN' when the response
        carries no license).
        None if the API does not answer with HTTP status 200.
    """
    api_url = 'https://api.github.com/repos/'

    # Repository URLs may end with '/' (".../owner/repo/"); keep that slash
    # out of the API path so the request addresses ".../repos/owner/repo".
    repo = url.replace('https://github.com/', '').rstrip('/')

    page = requests.get(api_url + repo, timeout=None)
    if page.status_code != 200:
        return None

    raw = json.loads(page.content)
    repo_license = raw['license']

    return {
        'description': raw['description'],
        'created_at': raw['created_at'],
        'pushed_at': raw['pushed_at'],
        'homepage': raw['homepage'],
        'size': raw['size'],
        'stars': raw['stargazers_count'],
        'language': raw['language'],
        'license': 'NaN' if repo_license is None else repo_license['name'],
    }

# Try the API scraper on one sample repository; prints the metadata dict, or None if the request was not answered with status 200.
meta = github_api_scraper(url5)
print(meta)

{'description': None, 'created_at': '2018-10-21T12:15:59Z', 'pushed_at': '2018-10-29T16:30:20Z', 'homepage': None, 'size': 27606, 'stars': 0, 'language': 'Jupyter Notebook', 'license': 'NaN'}


In [66]:
# Same layout as for the HTML scraper, but the API metadata goes into its own file.
# NOTE: this cell relies on `os` and `pd` being imported by an earlier cell.
# git_store:  root folder; the loop below stores each repository's files under "<git_store>/<owner>/<repo>/"
# meta_store: location of the API metadata file relative to a repository's folder
# csv:        database export whose 'Base_URL' column is iterated by the loop below
git_store = '../data/repositories/git/'
meta_store = '.meta/github_api_meta.json'
csv = '../data/database/db_03_cleanup.csv'
df = pd.read_csv(csv)

#print(df.head())

# helper function: create the parent folder of a file path if it does not exist yet
def create_folder(path):
    """Make sure the parent folder of the file ``path`` exists.

    Missing intermediate folders are created as well. When the folder had to
    be created, ``path + ' created'`` is printed.

    Args:
        path: path of a file (not of the folder itself); only its directory
            part is created.
    """
    folder = os.path.dirname(path)
    # A bare file name has no parent folder to create.
    if not folder or os.path.exists(folder):
        return
    # exist_ok=True guards against the race where the folder appears between
    # the check above and this call (no errno handling needed).
    os.makedirs(folder, exist_ok=True)
    print(path + ' created')
                
# Query the API once for every GitHub repository in the CSV that has no
# metadata file yet. The run stops after 60 requests (presumably the API's
# request quota -- confirm against the current rate limits).
i = 0  # number of API requests issued in this run
for url in df['Base_URL']:
    if 'https://github.com/' in url:
        # repository part of the URL ("owner/repo/"), used as sub-folder of git_store
        path = url.replace('https://github.com/', '') + '/'
        path_meta = os.path.join(git_store, path, meta_store)

        already_scraped = os.path.isfile(path_meta)
        if not already_scraped:
            print(url)
            i += 1
            meta = github_api_scraper(url)

            if meta is None:
                # Failed request: an empty JSON object is stored anyway, so
                # the repository counts as scraped and is skipped next time.
                print('An Error occured')
                content = json.dumps({})
            else:
                content = json.dumps(meta)

            create_folder(path_meta)
            with open(path_meta, 'w') as fp:
                fp.write(content)

    if i == 60:
        print('Quota exceeded')
        break

https://github.com/akpen/Stockholm-0.1
https://github.com/crowdAI/train-schedule-optimisation-challenge-starter-kit
https://github.com/mratsim/McKinsey-SmartCities-Traffic-Prediction
https://github.com/Data4Democracy/crash-model
https://github.com/llSourcell/AI_Supply_Chain
https://github.com/cavaunpeu/flight-delays
https://github.com/pratishthakapoor/RetailReplenishement/
An Error occured
https://github.com/kralmachine/WholesaleCustomerAnalysis
https://github.com/Semionn/JB-wholesale-distribution-analysis
https://github.com/prakhardogra921/Clustering-Analysis-on-customers-of-a-wholesale-distributor
https://github.com/tstreamDOTh/Instacart-Market-Basket-Analysis
https://github.com/SarahMestiri/online-retail-case
https://github.com/sharmaroshan/Online-Retail-Transactions-of-UK
https://github.com/IBM-DSE/CyberShop-Analytics
https://github.com/arvindkarir/retail
https://github.com/finnqiao/cohort_online_retail
