In [1]:
import os
import time
import json
import platform
import time
import math
import sys
import datetime
import numpy as np
import pandas as pd

In [2]:
# scoring function to get a score between 0...1 for integer-values, 0.5 should be at ~100
def score(n, precision=3):
    if isinstance(n, int) or isinstance(i, float):
        return round(1-1/math.pow(1+n, 0.15), precision)
    else:
        return 0

for n in [0,1,10,25,50,100,1000,10000]:
    print(score(n))

0.0
0.099
0.302
0.387
0.446
0.5
0.645
0.749


In [3]:
# get file modifictaion date
# https://stackoverflow.com/questions/237079/how-to-get-file-creation-modification-date-times-in-python

def creation_date(path_to_file, datetime = True):
    """
    Try to get the date that a file was created, falling back to when it was
    last modified if that isn't possible.
    See http://stackoverflow.com/a/39501288/1709587 for explanation.
    """
    if platform.system() == 'Windows':
        timestamp = os.path.getctime(path_to_file)
    else:
        stat = os.stat(path_to_file)
        try:
            timestamp = stat.st_birthtime
        except AttributeError:
            # We're probably on Linux. No easy way to get creation dates here,
            # so we'll settle for when its content was last modified.
            timestamp = stat.st_mtime
        
    if datetime == True:
        timestamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(timestamp))

    return timestamp
    
folder_base = '../data/repositories/kaggle/competitions/c/'
folder = '3d-object-detection-for-autonomous-vehicles/notebooks/asimandia/lyft3d-inference-kernel/'
notebook = 'notebook_02.html'
kernel = 'kernel.html'
print(creation_date(folder_base+folder+notebook))

2020-12-12 16:08:03


In [4]:
# improved version (full text scan)
# scan text for predefined terms

text = 'We use LSTM for anomaly and object detection. As Convolutional Neural Networks are great for ML.'

pd_ml_terms = pd.read_csv('../data/patterns/ml_terms.csv')
ml_terms = pd_ml_terms['Term'].tolist()
ml_slugs = pd_ml_terms['Slug'].tolist()
ml_slugs = [x for x in ml_slugs if str(x) != 'nan']
ml_tags = pd_ml_terms['Tag'].tolist()
ml_tags = [x for x in ml_tags if str(x) != 'nan']

#print(ml_tags)

ml_libs = pd.read_csv('../data/patterns/ml_libraries.csv')
ml_libs = ml_libs['Python Package'].tolist()

def match_text(haystack, needles, toLower = False, unique = True):
    
    if toLower == True:
        haystack = haystack.lower()
        needles = [x.lower() for x in needles]
    
    if unique == True:
        matches = {x for x in needles if x in haystack}
        matches = list(matches)
    else:
        matches = [x for x in needles if x in haystack]
    
    return matches

def match_tags(haystack):
    df = pd.read_csv('../data/patterns/ml_terms.csv')
    tags = []
    
    df.set_index('Term', inplace = True)
    for item in haystack:
        try:
            tag = df.loc[item].get('Tag').head(1)
            if not 'nan' in str(tag):
                tags.append(tag)
        except:
            pass
        
    df.set_index('Slug', inplace = True)
    for item in haystack:
        try:
            tag = df.loc[item]
            if not 'nan' in str(tag):
                tags.append(str(tag))
        except:
            pass
        
    #if 'ANN' in tags or 'CNN' in tags or 'RNN' in tags:
    #    tags.remove('NN')
    
    return list(tags)

#ml_slugs, ml_terms, ml_libs, match_text(haystack, needles, toLower = False, unique = True)
needles = {
    'ml_slugs': ml_slugs,
    'ml_terms': ml_terms,
    'ml_libs': ml_libs,
}
needles_need_str_lower = {
    'ml_slugs': False,
    'ml_terms': True,
    'ml_libs': False,
}

matches = []

matches.extend(match_text(text, ml_terms, True))
matches.extend(match_text(text, ml_slugs, False))
print('matches', matches)

tags = match_tags(matches)
print('tags', tags)

tags_test = ['ML', 'NLP', 'classif', 'convolutional neural network', 'decision tree', 'layer', 'machine learning', 'model', 'naive bayes', 'natural language processing', 'neural network', 'predict', 'regression', 'train']
print(match_tags(tags_test))

matches ['lstm', 'anomaly', 'convolutional neural network', 'neural network', 'detect', 'object detection', 'ML']
tags ['Tag    ML\nName: ML, dtype: object']
['Tag    ML\nName: ML, dtype: object', 'Tag    NLP\nName: NLP, dtype: object']


In [6]:
# bundle collected data and store in single json-file
# suitable for database building
dir_base = '../data/repositories/git/'
json_files = {
    'additional_info': '.meta/additional_info.json',
    # 'git_analyzer': '.meta/git_analyzer_results.json',
    'git_analyzer': '.meta/git_analyzer_results_v02.json',
    'github_api': '.meta/github_api_meta.json',
    'github_meta': '.meta/github_meta.json',
}
date_ref_file = '.meta/log_clone.txt'

meta_collection = '.meta/summary.json'

authors = os.listdir(dir_base)

p = 0

start = 0
end = 1000

runtime_start = time.time()

i = start
for author in authors:
    if os.path.isdir(os.path.join(dir_base, author)):
        
        projects = os.listdir(os.path.join(dir_base, author))
        #print(author, projects)
        for project in projects:
            #print(project)
            if p >= start:
                path = os.path.join(dir_base, author, project)
                if os.path.isdir(path):
                    print('### ' + str(i) + ': ' + author, project.strip())
                    raw_json = {}
                    
                    # load json files
                    for key in json_files:
                        #print(key, json_files[key])
                        json_fp = os.path.join(path, json_files[key])
                        if os.path.isfile(json_fp):
                            with open(json_fp, 'r') as fp:
                                data = fp.read()
                                raw_json[key] = json.loads(data)
                        else:
                            raw_json[key] = {}

                    # readme
                    readme_fp = os.path.join(path, 'README.md')
                    if os.path.isfile(readme_fp):
                        with open(readme_fp, 'r', encoding='utf-8', errors="ignore") as fp:
                            readme = fp.read()
                    else:
                        readme = ''
                        
                    # combine ml_libs and keywords
                    ml_libs = []
                    ml_terms = []
                    ml_tags = []
                    
                    for file in raw_json['git_analyzer']:
                        #print(file)
                        #print(file['meta']['ml_libs'])
                        if file['meta'].get('ml_libs'):
                            ml_libs.extend(file['meta'].get('ml_libs'))
                        if file['meta'].get('keywords'):
                            ml_terms.extend(file['meta'].get('keywords'))
                        if file['meta'].get('ml_tags'):
                            ml_tags.extend(file['meta'].get('ml_tags'))
                    
                    # make values unique
                    ml_libs = np.unique(ml_libs).tolist()
                    ml_terms = np.unique(ml_terms).tolist()
                    ml_tags = np.unique(ml_tags).tolist()
                    #ml_tags = match_tags(ml_terms)
                    
                    # parse created_at
                    created_at = raw_json['github_api'].get('created_at')
                    if created_at != None:
                        created_at = created_at.replace('T',' ').replace('Z','')
                    
                    # get pushed_at
                    pushed_at = raw_json['github_api'].get('pushed_at')
                    if pushed_at == None:
                        pushed_at = raw_json['github_meta'].get('last_commit')
                    if pushed_at == None:
                        pushed_at = ''
                    
                    # 2018-07-25T04:01:33Z
                    pushed_at = pushed_at.replace('T',' ').replace('Z','')
                    #date_time_str = pushed_at.split('Z')
                    #date_time_str = date_time_str[0].strip()
                    #date_time_obj = datetime.datetime.strptime(date_time_str, '%Y-%m-%dT%H:%M:%S')
                    #pushed_at = date_time_obj
                    #print(pushed_at)
                    
                    # get scraped_at
                    ref_fp = os.path.join(dir_base, author, project)
                    ref_fp += '/' + date_ref_file
                    if os.path.isfile(ref_fp):
                        scraped_at = creation_date(ref_fp)
                    else:
                        scraped_at = creation_date(path)
                    
                    # get primary language
                    language_primary = raw_json['github_api'].get('language')
                    if language_primary == None:
                        language_primary = raw_json['github_meta'].get('languages')
                        if language_primary != None and len(language_primary) > 0:
                            language_primary = language_primary[0].split(':')[0]
                        else:
                            language_primary = ''
                    #print(language_primary)
                    
                    # parse description
                    description = raw_json['github_meta'].get('about')
                    if description != None:
                        description = description.replace(';', '.')
                        description = description.replace('\n', ' ').replace('\r', '')
                        description = description.replace('<br/>', '').replace('<br>', '')
                        
                    # parse readme
                    if readme != None:
                        readme = readme.replace(';', '.')
                        readme = readme.replace('\n', ' ').replace('\r', '')
                        #readme = readme.replace('<br/>', '').replace('<br>', '')
                    
                    # build ml score
                    ml_detected = 0; #len(ml_libs) > 0 or len(keywords) > 0
                    if len(ml_terms) > 0:
                        ml_detected += 0.2
                    if len(ml_tags) > 0:
                        ml_detected += 0.3
                    if len(ml_libs) > 0:
                        ml_detected += 0.5
                        
                    # parse license
                    license = raw_json['github_meta'].get('license')
                    if license != None:
                        if 'Nan' in license:
                            license = ''
                        if 'View license' in license:
                            license = ''
                        if '+ ' in license:
                            license = ''
                        if 'License' in license:
                            license = license.replace('License','').strip()
                    
                    # build final json
                    final_json = {
                        'author': author,
                        'title': project,
                        'link': raw_json['github_meta'].get('url'),
                        'ml_detected': ml_detected,
                        'description': description,
                        'license': license,
                        'language_primary': language_primary,
                        'languages': raw_json['github_meta'].get('languages'),
                        'created_at': created_at,
                        'pushed_at': str(pushed_at),
                        'scraped_at': str(scraped_at),
                        'stars': raw_json['github_api'].get('stars'),
                        'stars_score': score(raw_json['github_api'].get('stars')),
                        'contributors': raw_json['github_meta'].get('contributors'),
                        'ml_libs': ml_libs,
                        'ml_tags': ml_tags,
                        'ml_terms': ml_terms,
                        'readme': readme,
                        'industry': raw_json['additional_info'].get('industry'),
                        'type': raw_json['additional_info'].get('type'),
                        'name': raw_json['additional_info'].get('name'),
                        'description2': raw_json['additional_info'].get('description'),
                        #'raw': raw_json,
                    }
                    #print(final_json)
                    
                    # store final json
                    with open(os.path.join(path, meta_collection), 'w') as fp:
                        fp.write(json.dumps(final_json))
                    
                # count parsed projects
                i += 1

            # count projects
            p += 1
        
    # break loop
    if i>=end and True:
        break
    
runtime_end = time.time()
print('### parsed {0} git-projects  in {1} seconds ###'.format(i-start, round(runtime_end - runtime_start, 3)))

### 0: 1davegalloway SchoolDistrictAnalysis
### 1: A7med01 Deep-learning-for-Animal-Identification
### 2: AayushG159 Plant-Leaf-Identification
### 3: aayushmudgal Reducing-Manufacturing-Failures
### 4: ab-bh Disease-Outbreak-Prediction
### 5: abhiagar90 power_networks
### 6: abjer sds
### 7: abuchowdhury Mortgage_Bank_Loan_Analtsics
### 8: AccelAI AI-Law-Minicourse
### 9: aditink EMSRouting
### 10: adrianakopf NJPublicSchools
### 11: aeronetlab emergency-mapping
### 12: akarazeev LegalTech
### 13: Akesari12 LS123_Data_Prediction_Law_Spring-2019
### 14: akpen Stockholm-0.1
### 15: AlanConstantine KDD-Cup-2019-CAMMTR
### 16: albahnsen ML_RiskManagement
### 17: albertwebson Political-Vector-Projector
### 18: albiboni AileronSimulation
### 19: alifier Restaurant_success_model
### 20: alistairwallace97 olympian-biotech
### 21: altosaar food2vec
### 22: aluo417 Financial-Engineering-Projects
### 23: AlvaroMenduina Jupyter_Notebooks
### 24: am-aditya Artificial-Intelligence-for-Banking
### 25

### 205: ishank011 gs-quantify-bond-prediction
### 206: iurisegtovich PyTherm-applied-thermodynamics
### 207: ivan-bilan Painting_Forensics
### 208: jamesypeng Smarter-Emergency-Dispatch
### 209: janzaib-masood Educational-Data-Mining
### 210: Jean-njoroge coal-exploratory
### 211: jellespijker pymech
### 212: jerryxyx EquineTrading
### 213: jfzhang95 LSTM-water-table-depth-prediction
### 214: jhconning Dev-II
### 215: jinsonfernandez NLP_School-Budget-Project
### 216: jjakimoto finance_ml
### 217: jlperla ECON407_2018
### 218: joelowj Machine-Learning-and-Reinforcement-Learning-in-Finance
### 219: johnfwhitesell CensusPull
### 220: johnpfay USWaterAccounting
### 221: JonathanREB Budget_SchoolsAnalysis
### 222: JorgeDeLosSantos nusa
### 223: jorgehas smart-defect-inspection
### 224: josephofiowa dc-michelin-challenge
### 225: jrieke lstm-biology
### 226: JSchelldorfer ActuarialDataScience
### 227: jstac econometrics
### 228: jstac quantecon_nyu_2016
### 229: jubins DeepLearning-Food-Im

### 411: ual rental-listings
### 412: uci-cbcl D-GEX
### 413: un-modelling Electricity_Consumption_Surveys
### 414: usnistgov modelmeth
### 415: vibrationtoolbox vibration_toolbox
### 416: vicelab slaer
### 417: victorpena1 Natural-Gas-Demand-Prediction
### 418: vikram-bhati PAASBAAN-crime-prediction
### 419: Vipul115 Statistical-Time-Series-Analysis-on-Agricultural-Commodity-Prices
### 420: viritaromero Plant-diseases-classifier
### 421: vsub21 systemic-risk-dashboard
### 422: vtyeh pandas-challenge
### 423: waldronlab AppStatBio
### 424: wassname pipe-segmentation
### 425: whs2k GPO-AI
### 426: whugue school-closure
### 427: widdowquinn Teaching-EMBL-Plant-Path-Genomics
### 428: williamadams1 natural-gas-consumption-forecasting
### 429: worldbank ML-classification-algorithms-poverty
### 430: xiaofei6677 TourismFlickrMiner
### 431: xinychen transdim
### 432: yajnab pySteel
### 433: yiaktan Secondhand_Concert_Tickets
### 434: YungChunLu UCI-Power-Plant
### 435: zhentaoshi econ5170
### 

In [9]:
# build single csv from bundled data for visualising purpose
dir_base = '../data/repositories/git/'
meta_collection = '.meta/summary.json'
output_csv = '../data/database/db_04_analyzed_v02.csv'

authors = os.listdir(dir_base)

p = 0

start = 0
end = 1000

runtime_start = time.time()
df = pd.DataFrame()

i = start
for author in authors:
    if os.path.isdir(os.path.join(dir_base, author)):
        
        projects = os.listdir(os.path.join(dir_base, author))
        #print(author, projects)
        for project in projects:
            #print(project)
            if p >= start:
                path = os.path.join(dir_base, author, project)
                if os.path.isdir(path):
                    #print('### ' + str(i) + ': ' + author, project)
                    # store final json
                    with open(os.path.join(path, meta_collection), 'r') as fp:
                        data = fp.read()
                        data = json.loads(data)
                        df = df.append(data, ignore_index=True)
                    
                # count parsed projects
                i += 1

            # count projects
            p += 1
        
    # break loop
    if i>=end and True:
        break
    
runtime_end = time.time()
print('### parsed {0} git-projects  in {1} seconds ###'.format(i-start, round(runtime_end - runtime_start, 3)))

# drop duplicates
df = df.drop_duplicates(['link'])

# drop rows without link
df = df.dropna(axis=0, subset=['link'])

print(df.head())

# drop columns
df.drop(columns=['readme'], inplace=True)

df.to_csv(output_csv, sep=';', index=False)

### parsed 437 git-projects  in 7.017 seconds ###
          author contributors           created_at  \
0  1davegalloway            0  2018-07-19 02:41:45   
1        A7med01            0  2018-11-04 12:14:31   
2     AayushG159            2  2018-04-13 14:07:44   
3   aayushmudgal            0  2016-12-21 21:11:52   
4          ab-bh            3                 None   

                                         description  \
0  This analysis investigates the data of test sc...   
1  Animal Identification with Deep Convolutional ...   
2  Identification of plants through plant leaves ...   
3  Reducing Manufacturing Failures - A Kaggle Cha...   
4  Innovaccer Hackercamp '17 Project for predicti...   

                                        description2  \
0                   School budget vs school results.   
1           Deep learning for animal identification.   
2  Identification of plants through plant leaves ...   
3                   Reducing manufacturing failures.   
4  Machi