## GitHub Meta Collector

This script collects all meta-information derived by the individuall parsers  
collected data will be stored in a .csv-file inside ./data/database/  

In [1]:
import os
import time
import json
import platform
import time
import math
import sys
import datetime
import numpy as np
import pandas as pd

In [2]:
# scoring function to get a score between 0...1 for integer-values, 0.5 should be at ~100
def score(n, precision=3):
    if isinstance(n, int) or isinstance(i, float):
        return round(1-1/math.pow(1+n, 0.15), precision)
    else:
        return 0

for n in [0,1,10,25,50,100,1000,10000]:
    print(score(n))

0.0
0.099
0.302
0.387
0.446
0.5
0.645
0.749


In [3]:
# get file modifictaion date
# https://stackoverflow.com/questions/237079/how-to-get-file-creation-modification-date-times-in-python

def creation_date(path_to_file, datetime = True):
    """
    Try to get the date that a file was created, falling back to when it was
    last modified if that isn't possible.
    See http://stackoverflow.com/a/39501288/1709587 for explanation.
    """
    if platform.system() == 'Windows':
        timestamp = os.path.getctime(path_to_file)
    else:
        stat = os.stat(path_to_file)
        try:
            timestamp = stat.st_birthtime
        except AttributeError:
            # We're probably on Linux. No easy way to get creation dates here,
            # so we'll settle for when its content was last modified.
            timestamp = stat.st_mtime
        
    if datetime == True:
        timestamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(timestamp))

    return timestamp
    
folder_base = '../data/repositories/kaggle/competitions/c/'
folder = '3d-object-detection-for-autonomous-vehicles/notebooks/asimandia/lyft3d-inference-kernel/'
notebook = 'notebook_02.html'
kernel = 'kernel.html'
print(creation_date(folder_base+folder+notebook))

2020-12-12 16:08:03


In [4]:
# improved version (full text scan)
# scan text for predefined terms

text = 'We use LSTM for anomaly and object detection. As Convolutional Neural Networks are great for ML.'

pd_ml_terms = pd.read_csv('../data/patterns/ml_terms.csv')
ml_terms = pd_ml_terms['Term'].tolist()
ml_slugs = pd_ml_terms['Slug'].tolist()
ml_slugs = [x for x in ml_slugs if str(x) != 'nan']
ml_tags = pd_ml_terms['Tag'].tolist()
ml_tags = [x for x in ml_tags if str(x) != 'nan']

#print(ml_tags)

ml_libs = pd.read_csv('../data/patterns/ml_libraries.csv')
ml_libs = ml_libs['Python Package'].tolist()

def match_text(haystack, needles, toLower = False, unique = True):
    
    if toLower == True:
        haystack = haystack.lower()
        needles = [x.lower() for x in needles]
    
    if unique == True:
        matches = {x for x in needles if x in haystack}
        matches = list(matches)
    else:
        matches = [x for x in needles if x in haystack]
    
    return matches

def match_tags(haystack):
    df = pd.read_csv('../data/patterns/ml_terms.csv')
    tags = []
    
    df.set_index('Term', inplace = True)
    for item in haystack:
        try:
            tag = df.loc[item].get('Tag').head(1)
            if not 'nan' in str(tag):
                tags.append(tag)
        except:
            pass
        
    df.set_index('Slug', inplace = True)
    for item in haystack:
        try:
            tag = df.loc[item]
            if not 'nan' in str(tag):
                tags.append(str(tag))
        except:
            pass
        
    #if 'ANN' in tags or 'CNN' in tags or 'RNN' in tags:
    #    tags.remove('NN')
    
    return list(tags)

#ml_slugs, ml_terms, ml_libs, match_text(haystack, needles, toLower = False, unique = True)
needles = {
    'ml_slugs': ml_slugs,
    'ml_terms': ml_terms,
    'ml_libs': ml_libs,
}
needles_need_str_lower = {
    'ml_slugs': False,
    'ml_terms': True,
    'ml_libs': False,
}

matches = []

matches.extend(match_text(text, ml_terms, True))
matches.extend(match_text(text, ml_slugs, False))
print('matches', matches)

tags = match_tags(matches)
print('tags', tags)

tags_test = ['ML', 'NLP', 'classif', 'convolutional neural network', 'decision tree', 'layer', 'machine learning', 'model', 'naive bayes', 'natural language processing', 'neural network', 'predict', 'regression', 'train']
print(match_tags(tags_test))

matches ['object detection', 'anomaly', 'neural network', 'detect', 'lstm', 'convolutional neural network', 'ML']
tags ['Tag    ML\nName: ML, dtype: object']
['Tag    ML\nName: ML, dtype: object', 'Tag    NLP\nName: NLP, dtype: object']


In [6]:
# bundle collected data and store in single json-file
# suitable for database building
dir_base = '../data/repositories/git/'

# blobcity
dir_base = '../data/repositories/blobcity/git/'

json_files = {
    'additional_info': '.meta/additional_info.json',
    # 'git_analyzer': '.meta/git_analyzer_results.json',
    'git_analyzer': '.meta/git_analyzer_results_v02.json',
    'github_api': '.meta/github_api_meta.json',
    'github_meta': '.meta/github_meta.json',
}
date_ref_file = '.meta/log_clone.txt'

meta_collection = '.meta/summary.json'

authors = os.listdir(dir_base)

p = 0

start = 0
end = 5000

runtime_start = time.time()

i = start
for author in authors:
    if os.path.isdir(os.path.join(dir_base, author)):
        
        projects = os.listdir(os.path.join(dir_base, author))
        #print(author, projects)
        for project in projects:
            #print(project)
            if p >= start:
                path = os.path.join(dir_base, author, project)
                if os.path.isdir(path):
                    print('### ' + str(i) + ': ' + author, project.strip())
                    raw_json = {}
                    
                    # load json files
                    for key in json_files:
                        #print(key, json_files[key])
                        json_fp = os.path.join(path, json_files[key])
                        if os.path.isfile(json_fp):
                            with open(json_fp, 'r') as fp:
                                data = fp.read()
                                raw_json[key] = json.loads(data)
                        else:
                            raw_json[key] = {}

                    # readme
                    readme_fp = os.path.join(path, 'README.md')
                    if os.path.isfile(readme_fp):
                        with open(readme_fp, 'r', encoding='utf-8', errors="ignore") as fp:
                            readme = fp.read()
                    else:
                        readme = ''
                        
                    # combine ml_libs and keywords
                    ml_libs = []
                    ml_terms = []
                    ml_tags = []
                    
                    for file in raw_json['git_analyzer']:
                        #print(file)
                        #print(file['meta']['ml_libs'])
                        if file['meta'].get('ml_libs'):
                            ml_libs.extend(file['meta'].get('ml_libs'))
                        if file['meta'].get('keywords'):
                            ml_terms.extend(file['meta'].get('keywords'))
                        if file['meta'].get('ml_tags'):
                            ml_tags.extend(file['meta'].get('ml_tags'))
                    
                    # make values unique
                    ml_libs = np.unique(ml_libs).tolist()
                    ml_terms = np.unique(ml_terms).tolist()
                    ml_tags = np.unique(ml_tags).tolist()
                    #ml_tags = match_tags(ml_terms)
                    
                    # parse created_at
                    created_at = raw_json['github_api'].get('created_at')
                    if created_at != None:
                        created_at = created_at.replace('T',' ').replace('Z','')
                    
                    # get pushed_at
                    pushed_at = raw_json['github_api'].get('pushed_at')
                    if pushed_at == None:
                        pushed_at = raw_json['github_meta'].get('last_commit')
                    if pushed_at == None:
                        pushed_at = ''
                    
                    # 2018-07-25T04:01:33Z
                    pushed_at = pushed_at.replace('T',' ').replace('Z','')
                    #date_time_str = pushed_at.split('Z')
                    #date_time_str = date_time_str[0].strip()
                    #date_time_obj = datetime.datetime.strptime(date_time_str, '%Y-%m-%dT%H:%M:%S')
                    #pushed_at = date_time_obj
                    #print(pushed_at)
                    
                    # get scraped_at
                    ref_fp = os.path.join(dir_base, author, project)
                    ref_fp += '/' + date_ref_file
                    if os.path.isfile(ref_fp):
                        scraped_at = creation_date(ref_fp)
                    else:
                        scraped_at = creation_date(path)
                    
                    # get primary language
                    language_primary = raw_json['github_api'].get('language')
                    if language_primary == None:
                        language_primary = raw_json['github_meta'].get('languages')
                        if language_primary != None and len(language_primary) > 0:
                            language_primary = language_primary[0].split(':')[0]
                        else:
                            language_primary = ''
                    #print(language_primary)
                    
                    # parse description
                    description = raw_json['github_meta'].get('about')
                    if description != None:
                        description = description.replace(';', '.')
                        description = description.replace('\n', ' ').replace('\r', '')
                        description = description.replace('<br/>', '').replace('<br>', '')
                        
                    # parse readme
                    if readme != None:
                        readme = readme.replace(';', '.')
                        readme = readme.replace('\n', ' ').replace('\r', '')
                        #readme = readme.replace('<br/>', '').replace('<br>', '')
                    
                    # build ml score
                    ml_detected = 0; #len(ml_libs) > 0 or len(keywords) > 0
                    if len(ml_terms) > 0:
                        ml_detected += 0.2
                    if len(ml_tags) > 0:
                        ml_detected += 0.3
                    if len(ml_libs) > 0:
                        ml_detected += 0.5
                        
                    # parse license
                    license = raw_json['github_meta'].get('license')
                    if license != None:
                        if 'Nan' in license:
                            license = ''
                        if 'View license' in license:
                            license = ''
                        if '+ ' in license:
                            license = ''
                        if 'License' in license:
                            license = license.replace('License','').strip()
                    
                    # build final json
                    final_json = {
                        'author': author,
                        'title': project,
                        'link': raw_json['github_meta'].get('url'),
                        'ml_detected': ml_detected,
                        'description': description,
                        'license': license,
                        'language_primary': language_primary,
                        'languages': raw_json['github_meta'].get('languages'),
                        'created_at': created_at,
                        'pushed_at': str(pushed_at),
                        'scraped_at': str(scraped_at),
                        'stars': raw_json['github_api'].get('stars'),
                        'stars_score': score(raw_json['github_api'].get('stars')),
                        'contributors': raw_json['github_meta'].get('contributors'),
                        'ml_libs': ml_libs,
                        'ml_tags': ml_tags,
                        'ml_terms': ml_terms,
                        'readme': readme,
                        'industry': raw_json['additional_info'].get('industry'),
                        'type': raw_json['additional_info'].get('type'),
                        'name': raw_json['additional_info'].get('name'),
                        'description2': raw_json['additional_info'].get('description'),
                        #'raw': raw_json,
                    }
                    #print(final_json)
                    
                    # store final json
                    with open(os.path.join(path, meta_collection), 'w') as fp:
                        fp.write(json.dumps(final_json))
                    
                # count parsed projects
                i += 1

            # count projects
            p += 1
        
    # break loop
    if i>=end and True:
        break
    
runtime_end = time.time()
print('### parsed {0} git-projects  in {1} seconds ###'.format(i-start, round(runtime_end - runtime_start, 3)))

### 0: 0xGG crossnote
### 1: 0xGG welcome-notebook
### 2: 1024hub Go-Learning-Notebook
### 3: 583 machine_learning_notebook
### 4: a415432669 -front_end_notebook
### 5: aaren notedown
### 6: abdullah768 icpc_notebook
### 7: abhinavsagar kaggle-notebooks
### 8: abhishekkrthakur colabcode
### 9: abhshkdz HackFlowy
### 10: abulbasar machine-learning
### 11: acamposxp Google-Colab-CloudTorrent
### 12: acgeospatial Satellite_Imagery_Python
### 13: aflaxman pymc-examples
### 14: agermanidis pigeon
### 15: ageron handson-ml
### 16: ageron handson-ml2
### 17: ageron julia_notebooks
### 18: ageron tf2_course
### 19: agile-geoscience notebooks
### 20: agiliq notebooks
### 21: AiswaryaSrinivas DataScienceWithPython
### 22: ajwdewit pcse_notebooks
### 23: akabe ocaml-jupyter
### 24: alchemyst Dynamics-and-Control
### 25: aleen42 PersonalWiki
### 26: alexamici covid-19-notebooks
### 27: alexcfleming Python_DL_Working_Notebooks
### 28: algorithmiaio awesome-ipython-notebooks
### 29: Alireza-Akhavan 

### 229: dudash openshift-workshops
### 230: dudeperf3ct DL_Notebooks
### 231: Dunedan mbp-2016-linux
### 232: dunovank jupyter-themes
### 233: dvorka mindforger
### 234: Dyakonov notebooks
### 235: dylanmei docker-zeppelin
### 236: dyweb papers-notebook
### 237: eclarson DataMiningNotebooks
### 238: eclarson MachineLearningNotebooks
### 239: ecmwf notebook-examples
### 240: ecosme38 Data-Assimilation-Notebooks
### 241: edbullen nltk
### 242: edvardHua Articles
### 243: eee2047s-uct notebooks
### 244: ehmatthes intro_programming
### 245: Einsteinish Artificial-Neural-Networks-with-Jupyter
### 246: Einsteinish bogotobogo-Machine-Learning
### 247: eka-foundation numerical-computing-is-fun
### 248: elabftw elabftw
### 249: elegant-scipy notebooks
### 250: elyra-ai elyra
### 251: elyra-ai kfp-notebook
### 252: Emergent-Behaviors-in-Biology mlreview_notebooks
### 253: emmettgb Emmetts-DS-NoteBooks
### 254: empathy87 The-Elements-of-Statistical-Learning-Python-Notebooks
### 255: empet Plotly

### 464: jupyter nbviewer
### 465: jupyter notebook
### 466: jupyter tmpnb
### 467: jupyter-attic docker-notebook
### 468: jupyter-attic jupyter-js-notebook
### 469: Jupyter-contrib jupyter_nbextensions_configurator
### 470: jupyter-on-openshift jupyter-notebooks
### 471: jupyter-widgets ipywidgets
### 472: jupytercalpoly jupyterlab-interactive-dashboard-editor
### 473: jupyterhub jupyter-server-proxy
### 474: jupyterhub jupyterhub
### 475: jupyterhub systemdspawner
### 476: jupyterlab debugger
### 477: jupyterlab jupyterlab-celltags
### 478: justmarkham pandas-videos
### 479: justmarkham scikit-learn-videos
### 480: jwagemann 2019_egu_workshop_jupyter_notebooks
### 481: jwkvam jupyterlab-vim
### 482: kaburelabs Datacamp-Courses
### 483: kadamwhite wp-notebook
### 484: kaixindelele tensorflow_notebook
### 485: kaleko CourseraML
### 486: karpathy randomfun
### 487: KeithGalli NumPy
### 488: KelvinJin iSwift
### 489: Kennytian learning-react-native
### 490: ketch numerical_linear_algebra

### 717: plotly IPython-plotly
### 718: plotly jupyter-dash
### 719: plutov notebook
### 720: pm58 notebook
### 721: pmservice ai-openscale-tutorials
### 722: pnavaro python-notebooks
### 723: polynote polynote
### 724: powerpak jupyter-dark-theme
### 725: practical-neuroimaging pna-notebooks
### 726: pranjalchaubey Deep-Learning-Notes
### 727: pratapvardhan notebooks
### 728: prathameshtari Predicting-Football-Match-Outcome-using-Machine-Learning
### 729: prathyvsh networked-notebooks
### 730: probcomp notebook
### 731: ProgramacionCompetitivaUFPS notebook
### 732: propublica il-tickets-notebooks
### 733: psinger notebooks
### 734: Pulkit-Khandelwal Reinforcement-Learning-Notebooks
### 735: puntofisso OSMnxNotebooks
### 736: pybokeh jupyter_notebooks
### 737: Pybonacci notebooks
### 738: pyecharts jupyter-echarts
### 739: pyecharts pyecharts-users-cases
### 740: PYFTS notebooks
### 741: pyHPC pyhpc-tutorial
### 742: pyrech composer-changelogs
### 743: pysal notebooks
### 744: python-e

### 944: vaibhavsagar notebooks
### 945: vaksakalli python_tutorials
### 946: Valassis-Digital-Media nbconflux
### 947: vatlab sos-notebook
### 948: vega ipyvega
### 949: velocyto-team velocyto-notebooks
### 950: viktyz iosnotebook
### 951: vinayak-mehta nbcommands
### 952: vincentherrmann wasserstein-notebook
### 953: vishwesh5 Quantum-Machine-Learning
### 954: voila-dashboards voila
### 955: vsbuffalo devnotes
### 956: vschaik Conjugate-Gradient
### 957: waltherg notebooks
### 958: WangYihang awesome-web-security
### 959: watson-developer-cloud assistant-improve-recommendations-notebook
### 960: wayanjimmy notebook
### 961: webdevmatics Entrust-Notebookapp
### 962: WeiFoo NoML
### 963: wenmin-wu jupyter-tabnine
### 964: wesm pydata-book
### 965: wilfredinni python-cheatsheet
### 966: willb fraud-notebooks
### 967: WillKoehrsen jupyter-notebook-extensions
### 968: WittyOrator JupyterNotebook
### 969: wix quix
### 970: wizardforcel data-science-notebook
### 971: WolframResearch Wolfram

In [8]:
# build single csv from bundled data for visualising purpose
dir_base = '../data/repositories/git/'
meta_collection = '.meta/summary.json'
output_csv = '../data/database/db_04_analyzed_v02.csv'

# blobcity
dir_base = '../data/repositories/blobcity/git/'
output_csv = '../data/database/blobcity_02_analyzed.csv'

ml_treshold = 0.5

authors = os.listdir(dir_base)

p = 0

start = 0
end = 5000

runtime_start = time.time()
df = pd.DataFrame()

i = start
for author in authors:
    if os.path.isdir(os.path.join(dir_base, author)):
        
        projects = os.listdir(os.path.join(dir_base, author))
        #print(author, projects)
        for project in projects:
            #print(project)
            if p >= start:
                path = os.path.join(dir_base, author, project)
                if os.path.isdir(path):
                    #print('### ' + str(i) + ': ' + author, project)
                    # store final json
                    with open(os.path.join(path, meta_collection), 'r') as fp:
                        data = fp.read()
                        data = json.loads(data)
                        if data['ml_detected'] > ml_treshold:
                            df = df.append(data, ignore_index=True)
                    
                # count parsed projects
                i += 1

            # count projects
            p += 1
        
    # break loop
    if i>=end and True:
        break
    
runtime_end = time.time()
print('### parsed {0} git-projects  in {1} seconds ###'.format(i-start, round(runtime_end - runtime_start, 3)))

# drop duplicates
df = df.drop_duplicates(['link'])

# drop rows without link
df = df.dropna(axis=0, subset=['link'])

print(df.head())

# drop columns
df.drop(columns=['readme'], inplace=True)

df.to_csv(output_csv, sep=';', index=False)

### parsed 1011 git-projects  in 4.761 seconds ###
         author contributors           created_at  \
0           583            0  2020-03-15 02:19:04   
1         aaren            6  2014-02-17 17:28:59   
2  abhinavsagar            0  2019-06-18 14:10:12   
3     abulbasar            2  2017-05-05 18:09:09   
4  acgeospatial            0  2018-01-28 16:21:21   

                                         description  \
0                                                      
1                      Markdown <=> IPython Notebook   
2           Sample notebooks for Kaggle competitions   
3  notebooks with example for machine learning ex...   
4  Sample sample scripts and notebooks on process...   

                                        description2 industry  \
0  Ã¦ÂÂºÃ¥ÂÂ¨Ã¥Â­Â¦Ã¤Â¹Â Ã§ÂºÂ¯Ã§Â®ÂÃ¦Â³ÂÃ¥Â®...     None   
1                      Markdown <=> IPython Notebook     None   
2           Sample notebooks for Kaggle competitions     None   
3  notebooks with example for mac