In [1]:
# imports
import requests
import json
import os
import platform
import time
import math
import datetime
import pandas as pd
import numpy as np

from bs4 import BeautifulSoup

In [2]:
# generic store data to file function
def store_data(data, file, mode='w', toJson=False):
    """
    Write data to a UTF-8 text file.

    data:   text to write, or any JSON-serialisable object when toJson is set.
    file:   path of the target file.
    mode:   mode string handed to open() ('w' overwrites, 'a' appends).
    toJson: serialise data with json.dumps() before writing.

    Returns whatever the file object's write() call returns.
    """
    payload = json.dumps(data) if toJson else data
    with open(file, mode, encoding='utf-8') as handle:
        return handle.write(payload)
    
# generic load data from file function
def load_data(file, fromJson=False):
    """
    Read a UTF-8 text file and return its content.

    file:     path of the file to read.
    fromJson: parse the content with json.loads() before returning it.

    Returns the file content (str, or the parsed JSON value), or the string
    'file not found' when file is not an existing regular file.
    Undecodable bytes are dropped (errors="ignore").
    """
    if not os.path.isfile(file):
        return 'file not found'

    with open(file, 'r', encoding='utf-8', errors="ignore") as handle:
        content = handle.read()

    return json.loads(content) if fromJson else content

# test text
#print(store_data('Hello', '../data/repositories/mlart/test.txt'))
#print(load_data('../data/repositories/mlart/test.txt'))

# test json
#print(store_data({'msg':'Hello World'}, '../data/repositories/mlart/test.json', toJson=True))
#print(load_data('../data/repositories/mlart/test.json', fromJson=True))

In [3]:
# helper function to create the parent folder of a file path
def create_folder(path):
    """
    Create the parent directory of path (including intermediate directories)
    if it does not exist yet.

    path: file path whose directory part (os.path.dirname) should exist.

    Prints '<path> created' when the directory had to be created.
    Returns None. Errors from os.makedirs other than "already exists"
    (e.g. missing permissions) propagate as OSError.
    """
    directory = os.path.dirname(path)

    # A bare file name has no directory part, so there is nothing to create.
    if not directory or os.path.exists(directory):
        return

    # exist_ok=True covers the race where the directory appears between the
    # check above and this call, without needing the errno module.
    os.makedirs(directory, exist_ok=True)
    print(path + ' created')

In [4]:
# scan text for predefined terms

# Sample sentence for trying out the term matcher.
text = 'We use CNN for anomaly detection. As Convolutional Neural Networks are great for ML.'

# Term table: the 'Slug' column feeds ml_slugs (entries whose string form is
# 'nan' are dropped), the 'Term' column feeds ml_terms.
terms_frame = pd.read_csv('../data/patterns/ml_terms.csv')
ml_slugs = [slug for slug in terms_frame['Slug'].tolist() if str(slug) != 'nan']
ml_terms = terms_frame['Term'].tolist()

# Library table: only the 'Python Package' column is used.
ml_libs = pd.read_csv('../data/patterns/ml_libraries.csv')['Python Package'].tolist()

def match_text(haystack, needles, toLower = False, unique = True, wholeWord = False):
    """
    Return the needles that occur in haystack.

    haystack:  text to search in.
    needles:   iterable of search strings. Entries that are not strings
               (e.g. NaN from an empty CSV cell) or that are empty are skipped.
    toLower:   lower-case haystack and needles first (case-insensitive match).
    unique:    report every needle at most once.
    wholeWord: only accept a needle that is not directly preceded or followed
               by a word character. The default (False) is a plain substring
               test, so 'ai' also matches inside 'train'. Note that with
               wholeWord set, 'neural network' does not match 'neural networks'.

    Returns a list of matching needles in the order they appear in needles
    (lower-cased when toLower is set).
    """
    import re

    needles = [x for x in needles if isinstance(x, str) and x != '']

    if toLower:
        haystack = haystack.lower()
        needles = [x.lower() for x in needles]

    if wholeWord:
        def found(needle):
            pattern = r'(?<!\w)' + re.escape(needle) + r'(?!\w)'
            return re.search(pattern, haystack) is not None
    else:
        def found(needle):
            return needle in haystack

    matches = [x for x in needles if found(x)]

    if unique:
        # dict.fromkeys() removes duplicates but keeps first-seen order, so the
        # result is identical between runs (a set would not guarantee that).
        matches = list(dict.fromkeys(matches))

    return matches

#ml_slugs, ml_terms, ml_libs, match_text(haystack, needles, toLower = False, unique = True)
# Pattern lists keyed by the name under which their matches are reported.
needles = dict(ml_slugs=ml_slugs, ml_terms=ml_terms, ml_libs=ml_libs)

# Quick check: case-insensitive term matches in the sample sentence.
sample_matches = match_text(text, ml_terms, True)
print(sample_matches)

['convolutional neural network', 'neural network', 'anomaly detection']


In [5]:
# get file creation date (falls back to the modification date)
# https://stackoverflow.com/questions/237079/how-to-get-file-creation-modification-date-times-in-python

def creation_date(path_to_file, datetime = True):
    """
    Return when a file was created, or when it was last modified if the
    platform does not expose a creation time.
    See http://stackoverflow.com/a/39501288/1709587 for explanation.

    path_to_file: path of the file to inspect.
    datetime:     when equal to True the result is formatted as
                  'YYYY-MM-DD HH:MM:SS' in local time; otherwise the raw
                  timestamp is returned.
    """
    if platform.system() != 'Windows':
        info = os.stat(path_to_file)
        # st_birthtime is missing on some platforms (probably Linux); the
        # time of the last content modification is the fallback there.
        timestamp = getattr(info, 'st_birthtime', info.st_mtime)
    else:
        timestamp = os.path.getctime(path_to_file)

    if datetime == True:
        return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(timestamp))

    return timestamp
    
# Sample files of one scraped notebook, used to try out creation_date().
folder_base, folder = (
    '../data/repositories/kaggle/competitions/c/',
    '3d-object-detection-for-autonomous-vehicles/notebooks/asimandia/lyft3d-inference-kernel/',
)
notebook, kernel = 'notebook_02.html', 'kernel.html'

sample_path = ''.join((folder_base, folder, notebook))
print(creation_date(sample_path))

2020-12-12 16:08:03


In [64]:
# scrape competition

# Location of one saved competition page: base folder, competition folder, file name.
folder_base, folder, dataset = (
    '../data/repositories/kaggle/competitions/c/',
    '3d-object-detection-for-autonomous-vehicles/',
    'dataset.html',
)

def scrape_competition(html):
    """
    Extract the metadata of one saved Kaggle competition page.

    html: markup of the competition page as a string.

    Returns a dict with
      - 'title', 'subtitle', 'type', 'organisation', 'description':
        stripped element text, or '' when the element is not in the page;
      - 'teams', 'date_closed': only set when a matching header list item
        is found ('date_closed' is the raw title attribute of the date span);
      - 'tags': list of stripped tag names (possibly empty).
    """
    soup = BeautifulSoup(html, 'html.parser')

    def text_of(tag, css_class):
        # find() returns None for a missing element; map that to '' for every
        # field, the way the organisation field was already handled.
        node = soup.find(tag, class_=css_class)
        if node is None:
            return ''
        return node.text.strip()

    meta = {}
    meta['title'] = text_of('h1', "competition-header__title")
    meta['subtitle'] = text_of('h2', "competition-header__subtitle")
    meta['type'] = text_of('p', "competition-header__classification-text")
    meta['organisation'] = text_of('span', "competition-header__organization-name")

    header_items = soup.find_all('li', class_="horizontal-list-item horizontal-list-item--bullet horizontal-list-item--default")
    for item in header_items:
        label = item.text
        if 'team' in label:
            meta['teams'] = label.replace('teams','').replace('team','').strip()
        if 'ago' in label:
            # The relative date ("... ago") carries the absolute date in the
            # title attribute of a nested span; skip items without that span.
            date_spans = item.select('li>span>span')
            if date_spans:
                meta['date_closed'] = date_spans[0].get('title')

    meta['description'] = text_of('div', "competition-overview__content")
    # NOTE(review): this class name looks auto-generated and may change
    # between page versions - confirm against current pages.
    meta['tags'] = [x.text.strip() for x in soup.find_all('span', class_="CategoryButton_CategoryName-sc-c10946 jFsDhg")]

    return meta

html = load_data(folder_base+folder+dataset)
# load_data() returns exactly 'file not found' for a missing file. Compare for
# equality (a substring test would also fire on pages that merely contain the
# phrase) and do not try to scrape the sentinel.
if html == 'file not found':
    print(html)
else:
    meta = scrape_competition(html)
    print(meta)
#store_data(links, folder_base+folder+out, toJson=True)

{'title': 'Lyft 3D Object Detection for Autonomous Vehicles', 'subtitle': 'Can you advance the state of the art in 3D object detection?', 'type': 'Featured prediction Competition', 'organisation': 'Lyft', 'teams': '547', 'date_closed': 'Wed Nov 13 2019 00:59:00 GMT+0100 (Mitteleuropäische Normalzeit)', 'description': 'Self-driving technology presents a rare opportunity to improve the quality of life in many of our communities. Avoidable collisions, single-occupant commuters, and vehicle emissions are choking cities, while infrastructure strains under rapid urban growth. Autonomous vehicles are expected to redefine transportation and unlock a myriad of societal, environmental, and economic benefits. You can apply your data analysis skills in this competition to advance the state of self-driving technology.\nLyft, whose mission is to improve people’s lives with the world’s best transportation, is investing in the future of self-driving vehicles. Level 5, their self-driving division, is w

In [70]:
# iterate competitions

# Scrape every saved competition page below folder_base and store the result
# as meta.json next to it.
url = 'https://www.kaggle.com/c/'
folder_base = '../data/repositories/kaggle/competitions/c/'
dataset = 'dataset.html'
file_out = 'meta.json'

quit = 0 # quit after n folders visited / 0 ... no limit

folders = os.listdir(folder_base)
print('folder:', len(folders))
i = 0

runtime_start = time.time()

for folder in folders:
    print('subfolder:', i, len(folders))
    path = os.path.join(folder_base,folder,dataset)
    i += 1

    # Entries without a saved competition page (including stray plain files
    # in folder_base) simply fail this check and are skipped.
    if os.path.exists(path):
        print(path)
        html = load_data(path)
        # load_data() signals a missing file with exactly this string; a
        # substring test would also match pages that contain the phrase.
        if html == 'file not found':
            print(html)
        else:
            meta = scrape_competition(html)
            meta['link'] = url+folder
            #print(meta)
            store_data(meta, os.path.join(folder_base,folder,file_out), toJson=True)

    # i was already incremented for this folder, so >= stops after exactly
    # `quit` folders.
    if quit!=0 and i>=quit:
        break

runtime_end = time.time()
# Report this cell's own counter; the previously used `j` is not defined
# before this cell runs on a fresh kernel.
print('runtime:', round(runtime_end - runtime_start, 3), 'seconds for', i, 'items')

folder: 419
subfolder: 0 419
../data/repositories/kaggle/competitions/c/15-071x-the-analytics-edge-competition-spring-2015\dataset.html
subfolder: 1 419
../data/repositories/kaggle/competitions/c/15-071x-the-analytics-edge-spring-20152\dataset.html
subfolder: 2 419
../data/repositories/kaggle/competitions/c/20-newsgroups-ciphertext-challenge\dataset.html
subfolder: 3 419
../data/repositories/kaggle/competitions/c/3d-object-detection-for-autonomous-vehicles\dataset.html
subfolder: 4 419
../data/repositories/kaggle/competitions/c/abstraction-and-reasoning-challenge\dataset.html
subfolder: 5 419
../data/repositories/kaggle/competitions/c/accelerometer-biometric-competition\dataset.html
subfolder: 6 419
../data/repositories/kaggle/competitions/c/acm-sf-chapter-hackathon-big\dataset.html
subfolder: 7 419
../data/repositories/kaggle/competitions/c/acm-sf-chapter-hackathon-small\dataset.html
subfolder: 8 419
../data/repositories/kaggle/competitions/c/acquire-valued-shoppers-challenge\dataset.

subfolder: 80 419
../data/repositories/kaggle/competitions/c/data-science-bowl-2018\dataset.html
subfolder: 81 419
../data/repositories/kaggle/competitions/c/data-science-bowl-2019\dataset.html
subfolder: 82 419
../data/repositories/kaggle/competitions/c/data-science-for-good-careervillage\dataset.html
subfolder: 83 419
../data/repositories/kaggle/competitions/c/data-science-for-good-city-of-los-angeles\dataset.html
subfolder: 84 419
../data/repositories/kaggle/competitions/c/data-science-london-scikit-learn\dataset.html
subfolder: 85 419
../data/repositories/kaggle/competitions/c/datasciencebowl\dataset.html
subfolder: 86 419
../data/repositories/kaggle/competitions/c/dato-native\dataset.html
subfolder: 87 419
../data/repositories/kaggle/competitions/c/decoding-the-human-brain\dataset.html
subfolder: 88 419
../data/repositories/kaggle/competitions/c/deepfake-detection-challenge\dataset.html
subfolder: 89 419
../data/repositories/kaggle/competitions/c/deloitte-churn-prediction\dataset.

subfolder: 164 419
../data/repositories/kaggle/competitions/c/home-depot-product-search-relevance\dataset.html
subfolder: 165 419
../data/repositories/kaggle/competitions/c/homesite-quote-conversion\dataset.html
subfolder: 166 419
../data/repositories/kaggle/competitions/c/hospital\dataset.html
subfolder: 167 419
../data/repositories/kaggle/competitions/c/how-much-did-it-rain\dataset.html
subfolder: 168 419
../data/repositories/kaggle/competitions/c/how-much-did-it-rain-ii\dataset.html
subfolder: 169 419
../data/repositories/kaggle/competitions/c/human-protein-atlas-image-classification\dataset.html
subfolder: 170 419
../data/repositories/kaggle/competitions/c/humpback-whale-identification\dataset.html
subfolder: 171 419
../data/repositories/kaggle/competitions/c/icdar2013-gender-prediction-from-handwriting\dataset.html
subfolder: 172 419
../data/repositories/kaggle/competitions/c/icdar2013-stroke-recovery-from-offline-data\dataset.html
subfolder: 173 419
../data/repositories/kaggle/co

subfolder: 245 419
../data/repositories/kaggle/competitions/c/mens-machine-learning-competition-2019\dataset.html
subfolder: 246 419
../data/repositories/kaggle/competitions/c/mercari-price-suggestion-challenge\dataset.html
subfolder: 247 419
../data/repositories/kaggle/competitions/c/mercedes-benz-greener-manufacturing\dataset.html
subfolder: 248 419
../data/repositories/kaggle/competitions/c/MerckActivity\dataset.html
subfolder: 249 419
../data/repositories/kaggle/competitions/c/microsoft-malware-prediction\dataset.html
subfolder: 250 419
../data/repositories/kaggle/competitions/c/mlsp-2013-birds\dataset.html
subfolder: 251 419
../data/repositories/kaggle/competitions/c/mlsp-2014-mri\dataset.html
subfolder: 252 419
../data/repositories/kaggle/competitions/c/movie-review-sentiment-analysis-kernels-only\dataset.html
subfolder: 253 419
../data/repositories/kaggle/competitions/c/msdchallenge\dataset.html
subfolder: 254 419
../data/repositories/kaggle/competitions/c/msk-redefining-cancer-

subfolder: 323 419
../data/repositories/kaggle/competitions/c/rsna-str-pulmonary-embolism-detection\dataset.html
subfolder: 324 419
../data/repositories/kaggle/competitions/c/RTA\dataset.html
subfolder: 325 419
../data/repositories/kaggle/competitions/c/RxVolumePrediction\dataset.html
subfolder: 326 419
../data/repositories/kaggle/competitions/c/santa-2019-revenge-of-the-accountants\dataset.html
subfolder: 327 419
../data/repositories/kaggle/competitions/c/santa-gift-matching\dataset.html
subfolder: 328 419
../data/repositories/kaggle/competitions/c/santa-workshop-tour-2019\dataset.html
subfolder: 329 419
../data/repositories/kaggle/competitions/c/santander-customer-satisfaction\dataset.html
subfolder: 330 419
../data/repositories/kaggle/competitions/c/santander-customer-transaction-prediction\dataset.html
subfolder: 331 419
../data/repositories/kaggle/competitions/c/santander-product-recommendation\dataset.html
subfolder: 332 419
../data/repositories/kaggle/competitions/c/santander-va

subfolder: 402 419
../data/repositories/kaggle/competitions/c/whats-cooking-kernels-only\dataset.html
subfolder: 403 419
../data/repositories/kaggle/competitions/c/WIC2011\dataset.html
subfolder: 404 419
../data/repositories/kaggle/competitions/c/wikichallenge\dataset.html
subfolder: 405 419
../data/repositories/kaggle/competitions/c/wise-2014\dataset.html
subfolder: 406 419
../data/repositories/kaggle/competitions/c/womens-machine-learning-competition-2018\dataset.html
subfolder: 407 419
../data/repositories/kaggle/competitions/c/womens-machine-learning-competition-2019\dataset.html
subfolder: 408 419
../data/repositories/kaggle/competitions/c/word2vec-nlp-tutorial\dataset.html
subfolder: 409 419
../data/repositories/kaggle/competitions/c/worldcup2010\dataset.html
subfolder: 410 419
../data/repositories/kaggle/competitions/c/worldcupconf\dataset.html
subfolder: 411 419
../data/repositories/kaggle/competitions/c/yandex-personalized-web-search-challenge\dataset.html
subfolder: 412 419
.

In [73]:
# collect all meta.json into a single csv

# Read every meta.json below folder_base and write them as one CSV file.
folder_base = '../data/repositories/kaggle/competitions/c/'
file_meta = 'meta.json'
fp_csv = '../data/database/kaggle_competitions.csv'

quit = 0 # quit after n folders visited / 0 ... no limit

folders = os.listdir(folder_base)
print('folder:', len(folders))
i = 0

# Rows are collected in a list and turned into a DataFrame once at the end:
# DataFrame.append() was removed in pandas 2.0 and copied the frame per row.
records = []

runtime_start = time.time()

for folder in folders:
    print('subfolder:', i, len(folders))
    path = os.path.join(folder_base,folder,file_meta)
    i += 1

    # Entries without a meta.json (including stray plain files in
    # folder_base) fail this check and are skipped.
    if os.path.exists(path):
        print(path)
        data = load_data(path, fromJson=True)

        if isinstance(data, dict):
            data['description'] = data['description'].replace('\n', ' ').replace('\r', '').replace('¶', '').strip()

            # date (ignoring GMT+x); the key is absent or empty when the
            # scraper found no closing date on the page
            date_closed = data.get('date_closed')
            if date_closed:
                date_time_str = date_closed.split('GMT')[0].strip()
                data['date_closed'] = datetime.datetime.strptime(date_time_str, '%a %b %d %Y %H:%M:%S')
            else:
                data['date_closed'] = None

            records.append(data)
        else:
            # 'file not found' sentinel or JSON that is not an object
            print(data)

    # i was already incremented for this folder, so >= stops after exactly
    # `quit` folders.
    if quit!=0 and i>=quit:
        break

df = pd.DataFrame(records)

runtime_end = time.time()
# Report this cell's own counter; the previously used `j` is not defined
# before this cell runs on a fresh kernel.
print('runtime:', round(runtime_end - runtime_start, 3), 'seconds for', i, 'items')
print(df.shape)
print(df.head())

# drop columns
#df.drop(columns=['author', 'submission'], inplace=True)

df.to_csv(fp_csv, sep=';', index=False)

folder: 419
subfolder: 0 419
../data/repositories/kaggle/competitions/c/15-071x-the-analytics-edge-competition-spring-2015\meta.json
subfolder: 1 419
../data/repositories/kaggle/competitions/c/15-071x-the-analytics-edge-spring-20152\meta.json
subfolder: 2 419
../data/repositories/kaggle/competitions/c/20-newsgroups-ciphertext-challenge\meta.json
subfolder: 3 419
../data/repositories/kaggle/competitions/c/3d-object-detection-for-autonomous-vehicles\meta.json
subfolder: 4 419
../data/repositories/kaggle/competitions/c/abstraction-and-reasoning-challenge\meta.json
subfolder: 5 419
../data/repositories/kaggle/competitions/c/accelerometer-biometric-competition\meta.json
subfolder: 6 419
../data/repositories/kaggle/competitions/c/acm-sf-chapter-hackathon-big\meta.json
subfolder: 7 419
../data/repositories/kaggle/competitions/c/acm-sf-chapter-hackathon-small\meta.json
subfolder: 8 419
../data/repositories/kaggle/competitions/c/acquire-valued-shoppers-challenge\meta.json
subfolder: 9 419
../da

../data/repositories/kaggle/competitions/c/deloitte-western-australia-rental-prices\meta.json
subfolder: 91 419
../data/repositories/kaggle/competitions/c/demand-forecasting-kernels-only\meta.json
subfolder: 92 419
../data/repositories/kaggle/competitions/c/denoising-dirty-documents\meta.json
subfolder: 93 419
../data/repositories/kaggle/competitions/c/detecting-insults-in-social-commentary\meta.json
subfolder: 94 419
../data/repositories/kaggle/competitions/c/diabetic-retinopathy-detection\meta.json
subfolder: 95 419
../data/repositories/kaggle/competitions/c/dog-breed-identification\meta.json
subfolder: 96 419
../data/repositories/kaggle/competitions/c/dogs-vs-cats\meta.json
subfolder: 97 419
../data/repositories/kaggle/competitions/c/dogs-vs-cats-redux-kernels-edition\meta.json
subfolder: 98 419
../data/repositories/kaggle/competitions/c/donorschoose-application-screening\meta.json
subfolder: 99 419
../data/repositories/kaggle/competitions/c/dont-call-me-turkey\meta.json
subfolder: 

../data/repositories/kaggle/competitions/c/imaterialist-challenge-FGVC2017\meta.json
subfolder: 178 419
../data/repositories/kaggle/competitions/c/imaterialist-challenge-furniture-2018\meta.json
subfolder: 179 419
../data/repositories/kaggle/competitions/c/imaterialist-fashion-2019-FGVC6\meta.json
subfolder: 180 419
../data/repositories/kaggle/competitions/c/imaterialist-fashion-2020-fgvc7\meta.json
subfolder: 181 419
../data/repositories/kaggle/competitions/c/imet-2019-fgvc6\meta.json
subfolder: 182 419
../data/repositories/kaggle/competitions/c/imet-2020-fgvc7\meta.json
subfolder: 183 419
../data/repositories/kaggle/competitions/c/inaturalist-2018\meta.json
subfolder: 184 419
../data/repositories/kaggle/competitions/c/inaturalist-2019-fgvc6\meta.json
subfolder: 185 419
../data/repositories/kaggle/competitions/c/inaturalist-challenge-at-fgvc-2017\meta.json
subfolder: 186 419
../data/repositories/kaggle/competitions/c/inclusive-images-challenge\meta.json
subfolder: 187 419
../data/repo

../data/repositories/kaggle/competitions/c/NFL-Punt-Analytics-Competition\meta.json
subfolder: 262 419
../data/repositories/kaggle/competitions/c/nips-2017-defense-against-adversarial-attack\meta.json
subfolder: 263 419
../data/repositories/kaggle/competitions/c/nips-2017-non-targeted-adversarial-attack\meta.json
subfolder: 264 419
../data/repositories/kaggle/competitions/c/nips-2017-targeted-adversarial-attack\meta.json
subfolder: 265 419
../data/repositories/kaggle/competitions/c/noaa-fisheries-steller-sea-lion-population-count\meta.json
subfolder: 266 419
../data/repositories/kaggle/competitions/c/noaa-right-whale-recognition\meta.json
subfolder: 267 419
../data/repositories/kaggle/competitions/c/nomad2018-predict-transparent-conductors\meta.json
subfolder: 268 419
../data/repositories/kaggle/competitions/c/nyc-taxi-trip-duration\meta.json
subfolder: 269 419
../data/repositories/kaggle/competitions/c/online-sales\meta.json
subfolder: 270 419
../data/repositories/kaggle/competitions/

subfolder: 346 419
../data/repositories/kaggle/competitions/c/siim-isic-melanoma-classification\meta.json
subfolder: 347 419
../data/repositories/kaggle/competitions/c/socialNetwork\meta.json
subfolder: 348 419
../data/repositories/kaggle/competitions/c/sp-society-camera-model-identification\meta.json
subfolder: 349 419
../data/repositories/kaggle/competitions/c/spooky-author-identification\meta.json
subfolder: 350 419
../data/repositories/kaggle/competitions/c/springleaf-marketing-response\meta.json
subfolder: 351 419
../data/repositories/kaggle/competitions/c/stanford-covid-vaccine\meta.json
subfolder: 352 419
../data/repositories/kaggle/competitions/c/state-farm-distracted-driver-detection\meta.json
subfolder: 353 419
../data/repositories/kaggle/competitions/c/statoil-iceberg-classifier-challenge\meta.json
subfolder: 354 419
../data/repositories/kaggle/competitions/c/stayalert\meta.json
subfolder: 355 419
../data/repositories/kaggle/competitions/c/street-view-getting-started-with-ju

In [6]:
# scrape notebook content

# Location of one saved notebook page: base folder, notebook folder, file name.
folder_base, folder, notebook = (
    '../data/repositories/kaggle/competitions/c/',
    '3d-object-detection-for-autonomous-vehicles/notebooks/asimandia/lyft3d-inference-kernel/',
    'notebook_02.html',
)

def scrape_notebook_content(html):
    """
    Extract the metadata of one saved Kaggle notebook page.

    html: markup of the notebook page as a string.

    Returns a dict with
      - 'author', 'title', 'type', 'sources', 'tags', 'votes': taken from
        elements that must be present in the page;
      - 'submission', 'license': text of the first matching element, or ''
        when there is none;
      - 'score_private', 'score_public': only set when the page holds a
        first / second score value;
      - 'views', 'date': only set when found in the subtitle spans.

    Raises AttributeError when one of the required elements is missing,
    because find() returns None for it.
    """
    soup = BeautifulSoup(html, 'html.parser')
    meta = {}

    def first_text(selector):
        # Text of the first element matching the CSS selector, '' if none.
        # (Keeps an empty result list out of the returned dict.)
        found = soup.select(selector)
        return found[0].text if len(found) > 0 else ''

    # NOTE(review): several class names below look auto-generated and may
    # change between page versions - confirm against current pages.
    meta['author'] = soup.find('a', class_="sc-paWCZ jvKDQp").get('href').replace('/','')
    meta['title'] = soup.find('a', class_="KernelViewerContext_KernelTitle-sc-rdaqnd").text
    meta['type'] = soup.find('span', class_="KernelViewerContext_KernelTypeInfo-sc-1l6fza6 kqxzvL").text.replace('using data from','').strip()
    meta['sources'] = soup.find('a', class_="KernelViewerContext_DataSourceUrl-sc-1dm3ij9 lpoMHV").text.strip()
    meta['tags'] = soup.find('span', class_="KernelViewerContext_CategoriesWrapper-sc-8yrjj NgcTE").text.split('·')
    meta['tags'] = list(filter(None, meta['tags']))
    meta['votes'] = soup.find('span', class_="vote-button__vote-count").text
    meta['submission'] = first_text('div.kernel-code-pane__submission-info-content')
    meta['license'] = first_text('div.kernel-code-pane__subtitle>a')

    # First score value is stored as private, second as public; each index
    # is checked on its own so a page with a single value does not raise.
    score = soup.select('div.kernel-code-pane__submission-score-value')
    if len(score) > 0:
        meta['score_private'] = score[0].text
    if len(score) > 1:
        meta['score_public'] = score[1].text

    subtitle_spans = soup.find('span', class_="KernelViewerContext_KernelSubtitle-sc-rltxca esPWpV").select('span')
    for item in subtitle_spans:
        if 'views' in item.text:
            # keep only the digits of e.g. "2,595 views"
            meta['views'] = int(''.join(filter(str.isdigit, item.text)))
        if 'GMT' in item.get('title', 'nan'):
            meta['date'] = item.get('title')

    return meta

html = load_data(folder_base+folder+notebook)
# load_data() returns exactly 'file not found' for a missing file. Compare for
# equality (a substring test would also fire on pages that merely contain the
# phrase) and do not try to scrape the sentinel.
if html == 'file not found':
    print(html)
else:
    meta = scrape_notebook_content(html)
    print(meta)
#store_data(links, folder_base+folder+out, toJson=True)

{'author': 'asimandia', 'title': 'lyft3d inference kernel', 'type': 'Python notebook', 'sources': 'multiple data sources', 'tags': ['gpu'], 'votes': '38', 'submission': 'Best Submission SuccessfulSubmitted by [ods.ai] blonde & asimandia in Kitti lands a year ago', 'license': 'Apache 2.0', 'score_private': '0.040', 'score_public': '0.040', 'views': 2595, 'date': 'Tue Oct 29 2019 16:27:40 GMT+0100 (Mitteleuropäische Normalzeit)'}


In [7]:
# scrape kernel content

folder_base = '../data/repositories/kaggle/competitions/c/'
# Two sample notebook folders.
folder1, folder2 = (
    '3d-object-detection-for-autonomous-vehicles/notebooks/asimandia/lyft3d-inference-kernel/',
    'airbnb-recruiting-new-user-bookings/notebooks/datadave/ndcg-score-r/',
)
# File names of the saved notebook page and the saved kernel page.
notebook, kernel = 'notebook_02.html', 'kernel.html'

def scrape_kernel_content(html, needles, snippet = False):
    """
    Search the text of a saved kernel/notebook page for known patterns.

    html:    markup of the page as a string.
    needles: dict mapping a result key to a list of search strings.
    snippet: when equal to True only the text of the element with id "code"
             is searched, otherwise the text of the whole <body>.

    Returns a dict with one entry per needles key (the case-insensitive
    matches reported by match_text) plus 'description', the stripped text of
    the first 'div.text_cell_render' element or '' when there is none.
    """
    soup = BeautifulSoup(html, 'html.parser')

    container = soup.find('div', {"id": "code"}) if snippet == True else soup.find('body')
    code = container.text

    meta = {label: match_text(code, patterns, True) for label, patterns in needles.items()}

    text_cells = soup.select('div.text_cell_render')
    meta['description'] = text_cells[0].text.strip() if len(text_cells) > 0 else ''

    return meta

# Two test inputs:
#  1. non-kernel-file ('code' embedded in notebook.html)
#  2. kernel-file ('code' in kernel.html)
for sample_path in (folder_base+folder2+notebook, folder_base+folder1+kernel):
    html = load_data(sample_path)
    # load_data() returns exactly 'file not found' for a missing file; a
    # substring test would also fire on pages that contain the phrase.
    if html == 'file not found':
        print(html)
        continue
    meta = scrape_kernel_content(html, needles)
    print(meta)


#store_data(links, folder_base+folder+out, toJson=True)

{'ml_slugs': ['nlp', 'nn', 'ner', 'ml', 'ai'], 'ml_terms': ['rank', 'predict', 'deep learning', 'training data', 'classification', 'filter', 'train'], 'ml_libs': [], 'description': ''}
{'ml_slugs': ['dl', 'cv', 'nn', 'ann', 'ner', 'rl', 'ai'], 'ml_terms': ['epoch', 'convolutional neural network', 'recommend', 'layer', 'neural network', 'test data', 'predict', 'label', 'u-net', 'image segmentation', 'loss', 'model', 'activation function', 'filter', 'train', 'relu'], 'ml_libs': ['pytorch'], 'description': "Please check out Guido's excellent kernel here. In this kernel i show how to perform inference on test set using the trained model.\nI just added RaDAM optimzer and got some better score.\nYou can find the BEV of the test set here.\nUpdates:¶\nCorrected yaw calculation\nUsed category height information"}


In [8]:
# iterate all folders and compose results into meta.json

url = 'https://www.kaggle.com/'
folder_base = '../data/repositories/kaggle/competitions/'
file_notebook = 'notebook_02.html'
file_kernel = 'kernel.html'
file_out = 'meta.json'

skip = True # if true skip meta collection / set to false to recreate {file_out} from scratch
quit = 10000 # quit after n files processed (checked after each author)
breakOnError = False

folders = os.listdir(folder_base)
print('folder:', len(folders))
i = 0
j = 0

for folder in folders:
    subfolders = os.listdir(os.path.join(folder_base,folder))
    print('subfolder:', len(subfolders))
    for subfolder in subfolders:
        print('###', i, os.path.join(folder_base,folder,subfolder))
        path = os.path.join(folder_base,folder,subfolder, 'notebooks/')
        i += 1

        if os.path.exists(path):
            projects = os.listdir(path)

            for author in projects:
                #print('author:', author)
                items = os.listdir(os.path.join(path, author))

                for notebook in items:
                    j+=1
                    print(' - ', j, 'author:', author, '/ notebook:', notebook)
                    path_notebook = os.path.join(path, author, notebook, file_notebook)
                    path_kernel = os.path.join(path, author, notebook, file_kernel)
                    path_out = os.path.join(path, author, notebook, file_out)

                    meta = {}

                    if skip and os.path.isfile(path_out):
                        print('skipped')

                    else:
                        try:
                            # File presence is decided with isfile() only.
                            # Testing the loaded HTML for the substring
                            # 'file not found' also matched real pages that
                            # contain the phrase.
                            has_notebook = os.path.isfile(path_notebook)
                            has_kernel = os.path.isfile(path_kernel)

                            if not (has_notebook or has_kernel):
                                print('notebook not found')
                                # skip only this item, not the author's
                                # remaining notebooks
                                continue

                            # get meta-data
                            notebook_html = None
                            if has_notebook:
                                meta['scraped_at'] = creation_date(path_notebook)
                                meta['link'] = f'{url}{author}/{notebook}'
                                meta['reference'] = f'{url}{folder}/{subfolder}'
                                notebook_html = load_data(path_notebook, fromJson=False)
                                meta.update(scrape_notebook_content(notebook_html))

                            # get code patterns: from the kernel page if there
                            # is one, otherwise from the notebook page that
                            # was already loaded above
                            if has_kernel:
                                code_html = load_data(path_kernel, fromJson=False)
                            else:
                                code_html = notebook_html
                            meta.update(scrape_kernel_content(code_html, needles))

                            #print(meta)
                            store_data(meta, path_out, toJson=True)

                        except Exception as e:
                            print("Oops!", e.__class__, "occurred.")
                            print(e)
                            if breakOnError:
                                # NOTE: this only leaves the loop over the
                                # current author's notebooks
                                break

                # j counts the notebooks seen so far, so >= stops once at
                # least `quit` of them were handled.
                if j>=quit:
                    break
            if j>=quit:
                break
    if j>=quit:
        break

folder: 1
subfolder: 1
### 0 ../data/repositories/kaggle/competitions/c\15-071x-the-analytics-edge-competition-spring-2015
### 1 ../data/repositories/kaggle/competitions/c\15-071x-the-analytics-edge-spring-20152
### 2 ../data/repositories/kaggle/competitions/c\20-newsgroups-ciphertext-challenge
 -  1 author: a45632 / notebook: classification-tfidf-svm-2-0
skipped
 -  2 author: amansohane / notebook: level-3-with-partial-deciphering-0-94-level-3
skipped
 -  3 author: ananthu017 / notebook: classification-tfidf-logistic
skipped
 -  4 author: ashishpatel26 / notebook: attension-layer-basic-for-nlp
skipped
 -  5 author: ashishpatel26 / notebook: beginner-to-intermediate-nlp-tutorial
skipped
 -  6 author: ashishpatel26 / notebook: everything-you-want-to-know-about-20-ngctc
skipped
 -  7 author: ashishpatel26 / notebook: stratified-kfold-hyperparameter-tuning
skipped
 -  8 author: delayedkarma / notebook: some-basic-explorations-lgb-baseline
skipped
 -  9 author: interneuron / notebook: diff

 -  109 author: inversion / notebook: run-length-decoding-quick-start
skipped
 -  110 author: julian3833 / notebook: 2-understanding-and-plotting-rle-bounding-boxes
skipped
 -  111 author: julian3833 / notebook: 4-exploring-public-models
skipped
 -  112 author: kmader / notebook: baseline-u-net-model-part-1
skipped
 -  113 author: kmader / notebook: from-trained-u-net-to-submission-part-2
skipped
 -  114 author: kmader / notebook: transfer-learning-for-boat-or-no-boat
skipped
 -  115 author: kotarojp / notebook: first-step-for-submission-u-net-tta
skipped
 -  116 author: leighplt / notebook: pytorch-tutorial-dataset-data-preparetion-stage
skipped
 -  117 author: meaninglesslives / notebook: airbus-ship-detection-data-visualization
skipped
 -  118 author: npatta01 / notebook: naive-model
skipped
 -  119 author: rackovic1994 / notebook: convolutional-neural-network
skipped
 -  120 author: voglinio / notebook: from-masks-to-bounding-boxes
skipped
 -  121 author: windsurfer / notebook: bas

 -  230 author: abhishek / notebook: beating-the-benchmark-1
skipped
 -  231 author: adamwong / notebook: script-aminos-glm-logit
skipped
 -  232 author: alledluviette / notebook: ctr-test
skipped
 -  233 author: brianbai / notebook: inspect-tables-with-pandas
skipped
 -  234 author: btwardow / notebook: r-dplyr-sqlite
skipped
 -  235 author: datayo / notebook: python-data-explorer
skipped
 -  236 author: jeffmoser / notebook: database-schema
skipped
 -  237 author: jeffmoser / notebook: sample-rows-from-each-table
skipped
 -  238 author: olivermeyfarth / notebook: logistic-regression-on-histctr
skipped
 -  239 author: rahulpatel11315 / notebook: read-data-from-tsv-file-using-pandas-dataframe
skipped
 -  240 author: roenbaeck / notebook: number-of-records-in-every-table
skipped
 -  241 author: rootua / notebook: apache-spark-scala-logistic-regression
skipped
 -  242 author: ryutek / notebook: join-test
skipped
 -  243 author: satomacoto / notebook: logistic-regression
skipped
 -  244 a

notebook not found
 -  364 author: navinmundhra / notebook: birdcall-starter-tablular-image-data
skipped
 -  365 author: parulpandey / notebook: eda-and-audio-processing-with-python
skipped
 -  366 author: pavansanagapati / notebook: birds-sounds-eda-spotify-urban-sound-eda
skipped
 -  367 author: rohanrao / notebook: birdcall-eda-chirp-hoot-and-flutter
skipped
 -  368 author: rohitsingh9990 / notebook: eda-visualizations-simple-baseline
skipped
 -  369 author: shahules / notebook: bird-watch-complete-eda-fe
skipped
 -  370 author: shonenkov / notebook: competition-metrics
skipped
 -  371 author: shonenkov / notebook: sample-submission-using-custom-check
skipped
 -  372 author: tanulsingh077 / notebook: audio-albumentations-transform-your-audio
skipped
 -  373 author: tarunpaparaju / notebook: birdcall-identification-spectrogram-resnet
skipped
 -  374 author: ttahara / notebook: inference-birdsong-baseline-resnest50-fast
skipped
 -  375 author: ttahara / notebook: training-birdsong-bas

 -  517 author: bguberfain / notebook: just-showing-a-few-images
skipped
 -  518 author: bguberfain / notebook: naive-keras-cdiscount
skipped
 -  519 author: bguberfain / notebook: naive-statistics
skipped
 -  520 author: bguberfain / notebook: not-so-naive-way-to-convert-bson-to-files
skipped
 -  521 author: blazeka / notebook: multi-gpu-tensorflow-convnet-0-65
skipped
 -  522 author: blazeka / notebook: validate-download-with-sha256-hash
skipped
 -  523 author: cerebrium / notebook: multi-class-logistic-regression-using-tensor-flow
skipped
 -  524 author: ezietsman / notebook: inception-v3-finetune
skipped
 -  525 author: humananalog / notebook: keras-generator-for-reading-directly-from-bson
skipped
 -  526 author: inversion / notebook: processing-bson-files
skipped
 -  527 author: lamdang / notebook: fast-shuffle-bson-generator-for-keras
skipped
 -  528 author: mihaskalic / notebook: keras-xception-model-0-68-on-pl-weights
skipped
 -  529 author: mpekalski / notebook: convert-bson-t

 -  635 author: sanikamal / notebook: ciphertext-challenge-iii
skipped
 -  636 author: seriousran / notebook: only-length-0-00000
skipped
 -  637 author: smlopezza / notebook: ciphertext-challenge-iii-v3
skipped
 -  638 author: sunandosamaddar / notebook: scratch-8urykgfg
skipped
 -  639 author: tarobxl / notebook: cipher-level-1-2-simple-eda-on-level-3-4
skipped
 -  640 author: tauffer / notebook: dumb-luck
skipped
 -  641 author: tenffe / notebook: ciphertext-eda-and-baseline
skipped
### 59 ../data/repositories/kaggle/competitions/c\cir-prospect
### 60 ../data/repositories/kaggle/competitions/c\ClaimPredictionChallenge
### 61 ../data/repositories/kaggle/competitions/c\connectomics
### 62 ../data/repositories/kaggle/competitions/c\conway-s-reverse-game-of-life
 -  642 author: ptyshevs / notebook: cnn-for-reversing-game-of-life
skipped
 -  643 author: ruchibahl18 / notebook: starting-of-an-end-game
skipped
### 63 ../data/repositories/kaggle/competitions/c\conways-reverse-game-of-life-2

skipped
 -  741 author: rohanrao / notebook: covid-19-w2-lgb-mad
skipped
 -  742 author: sambitmukherjee / notebook: covid-19-data-adding-world-development-indicators
skipped
 -  743 author: sanskrutipanda / notebook: covid-19-data-visualizations-with-plotly
skipped
### 68 ../data/repositories/kaggle/competitions/c\covid19-global-forecasting-week-3
 -  744 author: abhijithchandradas / notebook: sir-model-don-t-understand-calculus-don-t-worry
skipped
 -  745 author: aerdem4 / notebook: covid-19-basic-model-not-leaky
skipped
 -  746 author: anjum48 / notebook: seir-hcd-model
skipped
 -  747 author: arpandas65 / notebook: covid-19-projection-using-lstm
skipped
 -  748 author: corochann / notebook: covid-19-effect-of-temperature-humidity
skipped
 -  749 author: corochann / notebook: covid-19-spread-situation-by-prefecture-in-japan
skipped
 -  750 author: davidbnn92 / notebook: weather-data
skipped
 -  751 author: dferhadi / notebook: covid-19-predictions-growth-factor-and-calculus
skipped


 -  884 author: vanausloos / notebook: full-preprocessing-tutorial
skipped
 -  885 author: yoshcakes / notebook: full-preprocessing-in-r-with-3d-visualizations
skipped
 -  886 author: zfturbo / notebook: keras-vs-cancer
skipped
### 80 ../data/repositories/kaggle/competitions/c\data-science-bowl-2018
 -  887 author: akshayt19nayak / notebook: getting-started-image-processing-basics
skipped
 -  888 author: bonlime / notebook: train-test-image-mosaic
skipped
 -  889 author: gaborvecsei / notebook: basic-pure-computer-vision-segmentation-lb-0-229
skipped
 -  890 author: infernop / notebook: object-detection-techniques
skipped
 -  891 author: jakubczakon / notebook: morphological-postprocessing-on-unet-lb-0-429
skipped
 -  892 author: jerrythomas / notebook: exploratory-analysis
skipped
 -  893 author: keegil / notebook: keras-u-net-starter-lb-0-277
skipped
 -  894 author: kmader / notebook: normalizing-brightfield-stained-and-fluorescence
skipped
 -  895 author: kmader / notebook: nuclei-o

 -  1007 author: duccao / notebook: outlier-treatment
skipped
### 91 ../data/repositories/kaggle/competitions/c\demand-forecasting-kernels-only
 -  1008 author: abhilashawasthi / notebook: feature-engineering-lgb-model
skipped
 -  1009 author: adityaecdrid / notebook: my-first-time-series-comp-added-prophet
skipped
 -  1010 author: arindamgot / notebook: eda-prophet-mlp-neural-network-forecasting
skipped
 -  1011 author: ashishpatel26 / notebook: keeping-it-simple-by-xyzt
skipped
 -  1012 author: ashishpatel26 / notebook: light-gbm-demand-forecasting
skipped
 -  1013 author: ashishpatel26 / notebook: lstm-demand-forecasting
skipped
 -  1014 author: ashishpatel26 / notebook: store-item-demand-using-using-arima
skipped
 -  1015 author: cworsnup / notebook: backtesting-cross-validation-for-timeseries
skipped
 -  1016 author: danofer / notebook: getting-started-with-time-series-features
skipped
 -  1017 author: darshanadiga / notebook: time-series-data-exploration
skipped
 -  1018 author: 

 -  1119 author: kbhits / notebook: tensorflow-starter-kit-fixed
skipped
 -  1120 author: rajmehra03 / notebook: a-comprehensive-guide-to-transfer-learning
skipped
 -  1121 author: sanchitvj / notebook: cat-or-dog-transfer-learning-using-resnets
skipped
 -  1122 author: sarvajna / notebook: dogs-vs-cats-keras-solution
skipped
 -  1123 author: sentdex / notebook: full-classification-example-with-convnet
skipped
 -  1124 author: shaochuanwang / notebook: keras-warm-up-cats-vs-dogs-cnn-with-vgg16
skipped
 -  1125 author: shivamb / notebook: cnn-architectures-vgg-resnet-inception-tl
skipped
 -  1126 author: suniliitb96 / notebook: tutorial-keras-transfer-learning-with-resnet50
skipped
 -  1127 author: tunguz / notebook: cats-and-dogs-with-rapids-t-sne
skipped
 -  1128 author: yassineghouzam / notebook: dogs-cats-aren-t-enough-object-recognition
skipped
### 98 ../data/repositories/kaggle/competitions/c\donorschoose-application-screening
 -  1129 author: ambarish / notebook: eda-fe-xgb-glm-m

 -  1239 author: ragnar123 / notebook: exploratory-data-analysis-and-factor-model-idea
skipped
 -  1240 author: raviyadav2398 / notebook: ds4g-emission-factor
skipped
 -  1241 author: tiurii / notebook: ds4g-modelling-of-emissions-of-power-plants
skipped
 -  1242 author: vlarmet / notebook: an-r-notebook-for-no2-emission-factor
skipped
 -  1243 author: vpatricio / notebook: ds4g-where-does-the-no2-come-from
skipped
### 104 ../data/repositories/kaggle/competitions/c\dsg-hackathon
 -  1244 author: sanikamal / notebook: air-quality-prediction-eda
skipped
### 105 ../data/repositories/kaggle/competitions/c\dstl-satellite-imagery-feature-detection
 -  1245 author: aamaia / notebook: rgb-using-m-bands-example
skipped
 -  1246 author: aamaia / notebook: small-vehicles
skipped
 -  1247 author: aamaia / notebook: trees-are-red-buildings-are-blue-sort-of
skipped
 -  1248 author: amanbh / notebook: eda-the-scouring-of-the-shire
skipped
 -  1249 author: amanbh / notebook: visualize-polygons-and-ima

 -  1356 author: fmak95 / notebook: facial-keypoint-detection
skipped
 -  1357 author: gakshaygupta / notebook: real-time-cnn-architecture
skipped
 -  1358 author: karanjakhar / notebook: facial-keypoint-detection
skipped
 -  1359 author: liudmyla / notebook: easy-keras-facial-keypoint-detection
skipped
 -  1360 author: madhawav / notebook: basic-fully-connected-nn
skipped
 -  1361 author: mannsingh / notebook: facial-keypoints
skipped
 -  1362 author: mirmahathirmohammad / notebook: kaggle-facial-keypoint-detection
skipped
 -  1363 author: mirodil / notebook: facial-keypoints-detection
skipped
 -  1364 author: negi009 / notebook: facial-keypoint-detection
skipped
 -  1365 author: nitron / notebook: facial-keypoints-fastai-image-regression
skipped
 -  1366 author: obione26 / notebook: facial-keypoints-detection-keras-albumentations
skipped
 -  1367 author: phylake1337 / notebook: 2-15-loss-simple-split-trick
skipped
 -  1368 author: sshikamaru / notebook: keras-cnn-starter
skipped
 -  

 -  1552 author: atogni85 / notebook: galaxy-convnet
skipped
 -  1553 author: helmehelmuto / notebook: keras-cnn
skipped
 -  1554 author: mgambati / notebook: galaxy-challenge
skipped
 -  1555 author: mgambati / notebook: teste-classifica-o-de-gal-xias
skipped
 -  1556 author: zhuangjw / notebook: compress-galaxy-test-data
skipped
 -  1557 author: zhuangjw / notebook: compress-galaxy-train-data
skipped
### 135 ../data/repositories/kaggle/competitions/c\GEF2012-wind-forecasting
### 136 ../data/repositories/kaggle/competitions/c\gendered-pronoun-resolution
 -  1558 author: ashishpatel26 / notebook: research-summary-with-co-reference-resolutions
skipped
 -  1559 author: bguberfain / notebook: sentences-with-highlight
skipped
 -  1560 author: ceshine / notebook: pytorch-bert-baseline-public-score-0-54
skipped
 -  1561 author: ceshine / notebook: pytorch-bert-endpointspanextractor-kfold
skipped
 -  1562 author: chanhu / notebook: bert-score-layer-kfold-weightdecay-0-486
skipped
 -  1563 aut

 -  1692 author: latimerb / notebook: 2020-model-comparison-no-leak-submission
skipped
 -  1693 author: lucabasa / notebook: are-men-s-and-women-s-tournaments-different
skipped
 -  1694 author: nxrprime / notebook: right-left-shoot-march-madness-eda-and-analysis
skipped
 -  1695 author: paulorzp / notebook: kenpom-scraper-2020
skipped
 -  1696 author: ratan123 / notebook: march-madness-2020-ncaam-simple-lightgbm-on-kfold
skipped
 -  1697 author: robikscube / notebook: 2020-march-madness-data-first-look-eda
skipped
 -  1698 author: robikscube / notebook: ncaa-basketball-court-plot-helper-functions
skipped
 -  1699 author: vbmokin / notebook: mm-ncaam-no-leaks-lgb-xgb-logreg
skipped
 -  1700 author: warkingleo2000 / notebook: eda-with-sparse-matrix
skipped
### 149 ../data/repositories/kaggle/competitions/c\google-cloud-ncaa-march-madness-2020-division-1-womens-tournament
 -  1701 author: a45632 / notebook: 2020-starter-kernel-women-improved
skipped
 -  1702 author: anshumoudgil / noteboo

skipped
 -  1891 author: hellozeyu / notebook: test-script-1
skipped
 -  1892 author: keithtrnka / notebook: data-exploration
skipped
 -  1893 author: kelexu / notebook: word-cloud
skipped
 -  1894 author: khaledfayed / notebook: rf-mean-squared-error
skipped
 -  1895 author: khaoticmind / notebook: deep-learning-regression
skipped
 -  1896 author: olest1980 / notebook: clustering-of-product-descriptions
skipped
 -  1897 author: remap1 / notebook: get-train-validation-indices
skipped
 -  1898 author: rlkuhn / notebook: modified-benchark
skipped
 -  1899 author: ryabokonroman / notebook: thehomedepot
skipped
 -  1900 author: shawamar / notebook: product-recommendation-system-for-e-commerce
skipped
 -  1901 author: steubk / notebook: fixing-typos
skipped
 -  1902 author: tennissuperstar / notebook: data-exploration-1
skipped
 -  1903 author: thakurrajanand / notebook: gbm-beat-the-benchmark
skipped
 -  1904 author: uditsaini / notebook: exploring-the-home-depot-data
skipped
 -  1905 auth

skipped
 -  2029 author: davidcairuz / notebook: feature-engineering-lightgbm
skipped
 -  2030 author: jazivxt / notebook: safe-box
skipped
 -  2031 author: jesucristo / notebook: fraud-complete-eda
skipped
 -  2032 author: kabure / notebook: almost-complete-feature-engineering-ieee-data
skipped
 -  2033 author: kabure / notebook: extensive-eda-and-modeling-xgb-hyperopt
skipped
 -  2034 author: kyakovlev / notebook: ieee-fe-with-some-eda
skipped
 -  2035 author: kyakovlev / notebook: ieee-gb-2-make-amount-useful-again
skipped
 -  2036 author: kyakovlev / notebook: ieee-internal-blend
skipped
 -  2037 author: kyakovlev / notebook: ieee-lgbm-with-groupkfold-cv
skipped
 -  2038 author: nroman / notebook: eda-for-cis-fraud-detection
skipped
 -  2039 author: nroman / notebook: lgb-single-model-lb-0-9419
skipped
 -  2040 author: paulorzp / notebook: gmean-of-low-correlation-lb-0-952x
skipped
 -  2041 author: robikscube / notebook: ieee-fraud-detection-first-look-and-eda
skipped
 -  2042 auth

 -  2178 author: lowecoryr / notebook: learn-from-other-kernels-fork-from-me
skipped
 -  2179 author: macaodha / notebook: basic-inat2019-data-exploration
skipped
 -  2180 author: praxitelisk / notebook: inaturalist-2019-eda-dl
skipped
 -  2181 author: s3chwartz / notebook: inaturalist-2019-at-fgvc6
skipped
 -  2182 author: sujoykg / notebook: xception-keras
skipped
 -  2183 author: zfturbo / notebook: benchmark-2019-speed-of-image-reading
skipped
### 185 ../data/repositories/kaggle/competitions/c\inaturalist-challenge-at-fgvc-2017
 -  2184 author: jihyeseo / notebook: image-jpeg
skipped
### 186 ../data/repositories/kaggle/competitions/c\inclusive-images-challenge
 -  2185 author: abosol / notebook: understanding-a-little-what-are-the-inputs
skipped
 -  2186 author: alexanderliao / notebook: inclusive-images-stage-2
skipped
 -  2187 author: daikinban / notebook: baby-step
skipped
 -  2188 author: duboviy / notebook: basic-eda-with-images
skipped
 -  2189 author: gpreda / notebook: last

skipped
 -  2298 author: michalthedude / notebook: kaggle-users-by-location
skipped
 -  2299 author: miniushkin / notebook: jitter-test-for-overfitting-notebook
skipped
 -  2300 author: mylesoneill / notebook: normalized-kaggle-medal-count-by-country
skipped
 -  2301 author: nagadomi / notebook: list-of-installed-packages
skipped
 -  2302 author: tanitter / notebook: grid-search-xgboost-with-scikit-learn
skipped
 -  2303 author: toshik / notebook: splines-with-r
skipped
 -  2304 author: triskelion / notebook: connected-particles-iii-bl-ocks
skipped
### 194 ../data/repositories/kaggle/competitions/c\invasive-species-monitoring
 -  2305 author: algila / notebook: inception-v3-and-k-fold-in-python-0-98996
skipped
 -  2306 author: ambarish / notebook: invasive-species-monitoring-analysis
skipped
 -  2307 author: amlacorp / notebook: keras-starter-fork
skipped
 -  2308 author: ardiya / notebook: tensorflow-vgg-pretrained
skipped
 -  2309 author: chmaxx / notebook: finetune-vgg16-0-97-with-m

 -  2425 author: augustodenevreze / notebook: users-jobs-exploration
skipped
### 201 ../data/repositories/kaggle/competitions/c\job-salary-prediction
### 202 ../data/repositories/kaggle/competitions/c\just-the-basics-strata-2013
### 203 ../data/repositories/kaggle/competitions/c\just-the-basics-the-after-party
### 204 ../data/repositories/kaggle/competitions/c\kaggle-survey-2019
 -  2426 author: amiiiney / notebook: student-community-on-kaggle
skipped
 -  2427 author: andresionek / notebook: how-to-create-award-winning-data-visualizations
skipped
 -  2428 author: andresionek / notebook: is-there-any-job-out-there-kaggle-vs-glassdoor
skipped
 -  2429 author: artgor / notebook: a-look-at-russian-kagglers-over-time
skipped
 -  2430 author: artvolgin / notebook: exploring-phd-community-with-network-analysis
skipped
 -  2431 author: dataraj / notebook: tools-and-tools
skipped
 -  2432 author: etsc9287 / notebook: python-vs-r-the-data-science-rivalry
skipped
 -  2433 author: fatihbilgin / no

### 216 ../data/repositories/kaggle/competitions/c\landmark-recognition-2020
 -  2567 author: akensert / notebook: glrec-resnet50-arcface-tf2-2
notebook not found
 -  2568 author: andypenrose / notebook: pytorch-training-inference-efficientnet-b4
skipped
 -  2569 author: anshuls235 / notebook: google-landmark-recognition-eda
skipped
 -  2570 author: azaemon / notebook: eda-data-augmentation-for-beginners
skipped
 -  2571 author: camaskew / notebook: host-baseline-example
skipped
 -  2572 author: chandanverma / notebook: baseline-landmark-recognition-0-4832
skipped
 -  2573 author: chirag9073 / notebook: landmark-recognition-exploratory-data-analysis
skipped
 -  2574 author: chumajin / notebook: eda-for-biginner-updated-to-english-ver
skipped
 -  2575 author: jagdmir / notebook: google-landmark-prediction-2020
skipped
 -  2576 author: mohammedessam97 / notebook: organizer-s-code-submission
skipped
 -  2577 author: namanj27 / notebook: eda-google-landmark-recognition-2020
skipped
 -  257

skipped
 -  2664 author: scirpus / notebook: andrews-script-plus-a-genetic-program-model
skipped
 -  2665 author: tarunpaparaju / notebook: lanl-earthquake-prediction-signal-denoising
skipped
 -  2666 author: vettejeep / notebook: masters-final-project-eda
skipped
 -  2667 author: vettejeep / notebook: masters-final-project-model-lb-1-392
skipped
 -  2668 author: zikazika / notebook: memory-problems
skipped
 -  2669 author: zikazika / notebook: useful-new-features-and-a-optimised-model
skipped
### 222 ../data/repositories/kaggle/competitions/c\leaf-classification
 -  2670 author: abhmul / notebook: keras-convnet-lb-0-0052-w-visualization
skipped
 -  2671 author: alexanderlazarev / notebook: simple-keras-1d-cnn-features-split
skipped
 -  2672 author: anilnarassiguin / notebook: ml-classic-pipeline-python-xgboost
skipped
 -  2673 author: asparago / notebook: 3-basic-classifiers-and-features-correlation
skipped
 -  2674 author: bmetka / notebook: logistic-regression
skipped
 -  2675 autho

 -  2773 author: pestipeti / notebook: lyft-l5kit-unofficial-fix
skipped
 -  2774 author: pestipeti / notebook: pytorch-baseline-inference
skipped
 -  2775 author: pestipeti / notebook: pytorch-baseline-train
skipped
 -  2776 author: ryches / notebook: lyft-constant-velocity-extrapolation-baseline
skipped
 -  2777 author: tuckerarrants / notebook: lyft-ensembling-raster-sizes
skipped
### 232 ../data/repositories/kaggle/competitions/c\m5-forecasting-accuracy
 -  2778 author: anshuls235 / notebook: time-series-forecasting-eda-fe-modelling
skipped
 -  2779 author: girmdshinsei / notebook: for-japanese-beginner-with-wrmsse-in-lgbm
skipped
 -  2780 author: harupy / notebook: m5-baseline
skipped
 -  2781 author: headsortails / notebook: back-to-predict-the-future-interactive-m5-eda
skipped
 -  2782 author: kneroma / notebook: m5-first-public-notebook-under-0-50
skipped
 -  2783 author: kneroma / notebook: m5-forecast-v2-python
skipped
 -  2784 author: kyakovlev / notebook: m5-custom-features

 -  2916 author: vincento / notebook: 0-6-lb
skipped
 -  2917 author: zfturbo / notebook: seizure-boost-0-6-lb
skipped
### 244 ../data/repositories/kaggle/competitions/c\mens-machine-learning-competition-2018
 -  2918 author: aashita / notebook: feature-engineering-for-march-madness
skipped
 -  2919 author: aphaniteja / notebook: exploratory-madness
skipped
 -  2920 author: captcalculator / notebook: a-very-extensive-ncaa-exploratory-analysis
skipped
 -  2921 author: circle811 / notebook: simple-logistic-regression
skipped
 -  2922 author: dicksonchin93 / notebook: collaborative-filtering
skipped
 -  2923 author: eaturner / notebook: logistics-and-basic-stats-lb-public-0-046
skipped
 -  2924 author: gaborfodor / notebook: i-don-t-always-read-the-rules
skipped
 -  2925 author: juliaelliott / notebook: basic-starter-kernel-ncaa-men-s-dataset
skipped
 -  2926 author: lnatml / notebook: feature-engineering-with-advanced-stats
skipped
 -  2927 author: lpkirwin / notebook: fivethirtyeight-el

Oops! <class 'AttributeError'> occurred.
'NoneType' object has no attribute 'get'
 -  3064 author: amar09 / notebook: fare-prediction-stacked-ensemble-xgboost-lgbm
skipped
 -  3065 author: breemen / notebook: nyc-taxi-fare-data-exploration
skipped
 -  3066 author: btyuhas / notebook: bayesian-optimization-with-xgboost
skipped
 -  3067 author: dimitreoliveira / notebook: taxi-fare-prediction-with-keras-deep-learning
skipped
 -  3068 author: dimitreoliveira / notebook: tensorflow-dnn-coursera-ml-course-tutorial
skipped
 -  3069 author: dster / notebook: nyc-taxi-fare-starter-kernel-simple-linear-model
skipped
 -  3070 author: gunbl4d3 / notebook: xgboost-ing-taxi-fares
skipped
 -  3071 author: jsylas / notebook: python-version-of-top-ten-rank-r-22-m-2-88
skipped
 -  3072 author: jsylas / notebook: top-ten-rank-r-22m-rows-2-90-lightgbm
skipped
 -  3073 author: justjun0321 / notebook: exploratory-geoclustering-to-modeling
skipped
 -  3074 author: madhurisivalenka / notebook: cleansing-eda-

skipped
### 266 ../data/repositories/kaggle/competitions/c\noaa-right-whale-recognition
 -  3191 author: solyoh21 / notebook: fish-1-test
skipped
### 267 ../data/repositories/kaggle/competitions/c\nomad2018-predict-transparent-conductors
 -  3192 author: cbartel / notebook: random-forest-using-elemental-properties
skipped
 -  3193 author: giginim / notebook: tensorflow-neural-network
skipped
 -  3194 author: haimfeld87 / notebook: simple-catboost
skipped
 -  3195 author: headsortails / notebook: resistance-is-futile-transparent-conductors-eda
skipped
 -  3196 author: hireme / notebook: two-outputs-regressor-with-lightgbm
skipped
 -  3197 author: holar9 / notebook: hands-on-cubist-brnn
skipped
 -  3198 author: janpreets / notebook: using-the-atomic-coordinates-for-prediction
skipped
 -  3199 author: johnfarrell / notebook: nomad2018-simple-lgbm-starter
skipped
 -  3200 author: kemuel / notebook: python-exploration-with-domain-knowledge
skipped
 -  3201 author: leo1988 / notebook: explor

skipped
 -  3304 author: maunish / notebook: osic-super-cool-eda-and-pytorch-baseline
skipped
 -  3305 author: miklgr500 / notebook: linear-decay-based-on-resnet-cnn
skipped
 -  3306 author: nxrprime / notebook: fibrosis-eda-fast-ai
skipped
 -  3307 author: piantic / notebook: osic-pulmonary-fibrosis-progression-basic-eda
skipped
 -  3308 author: rohanrao / notebook: osic-understanding-laplace-log-likelihood
skipped
 -  3309 author: titericz / notebook: tabular-simple-eda-linear-model
skipped
 -  3310 author: twinkle0705 / notebook: your-starter-notebook-for-osic
skipped
 -  3311 author: ulrich07 / notebook: osic-multiple-quantile-regression-starter
skipped
 -  3312 author: vbmokin / notebook: higher-lb-score-by-tuning-mloss-upgrade-visual
skipped
 -  3313 author: yasufuminakama / notebook: osic-lgb-baseline
skipped
### 276 ../data/repositories/kaggle/competitions/c\otto-group-product-classification-challenge
 -  3314 author: abhishek / notebook: beating-the-benchmark-v2-0
skipped
 -  

skipped
 -  3421 author: jihyeseo / notebook: handle-rds-and-zip-eda
skipped
 -  3422 author: joncle / notebook: notebook0f2646ced6
skipped
 -  3423 author: joncle / notebook: notebookc2931820a9
skipped
 -  3424 author: joncle / notebook: test1
skipped
 -  3425 author: mcwitt / notebook: heatmap
skipped
 -  3426 author: nabilblk / notebook: last-location-benchmark
skipped
 -  3427 author: prayas / notebook: last-location-benchmark
skipped
 -  3428 author: raymonmina / notebook: notebook98c3fcc906
skipped
 -  3429 author: sircausticmail / notebook: getting-the-most-visited-places
skipped
 -  3430 author: sivaram123 / notebook: visualization-of-taxi-trip-end-points
skipped
 -  3431 author: thomas92 / notebook: plot-of-trips
skipped
 -  3432 author: urayukitaka / notebook: prediction-taxi-trajectory
skipped
 -  3433 author: willieliao / notebook: test-set-sampling-cutoff
skipped
### 289 ../data/repositories/kaggle/competitions/c\pkdd-15-taxi-trip-time-prediction-ii
 -  3434 author: benham

skipped
 -  3550 author: rejpalcz / notebook: feature-extraction-using-period-analysis
skipped
 -  3551 author: rooshroosh / notebook: fork-simple-mlp-for-time-series-classification
skipped
 -  3552 author: sergeylebedev / notebook: light-curve-equalization
skipped
 -  3553 author: yuval6967 / notebook: 3rd-place-cnn
skipped
### 295 ../data/repositories/kaggle/competitions/c\poker-rule-induction
 -  3554 author: kenkpixdev / notebook: poker-comb-without-ml-model-accuracy-1-00
skipped
 -  3555 author: prakharrathi25 / notebook: iterative-proker-hand-prediction
skipped
### 296 ../data/repositories/kaggle/competitions/c\porto-seguro-safe-driver-prediction
 -  3556 author: aharless / notebook: xgboost-cv-lb-284
skipped
 -  3557 author: andrewmvd / notebook: lightgbm-in-r
skipped
 -  3558 author: aquatic / notebook: entity-embedding-neural-net
skipped
 -  3559 author: arthurtok / notebook: interactive-porto-insights-a-plot-ly-tutorial
skipped
 -  3560 author: batzner / notebook: gini-coeffi

 -  3673 author: kamalchhirang / notebook: 5th-place-solution-0-0184-score
skipped
 -  3674 author: mm5631 / notebook: ml-workflow-data-science-approach
skipped
 -  3675 author: plasticgrammer / notebook: pubg-finish-placement-prediction-playground
skipped
 -  3676 author: rejasupotaro / notebook: cheaters-and-zombies
skipped
 -  3677 author: rejasupotaro / notebook: effective-feature-engineering
skipped
 -  3678 author: shahules / notebook: feature-engineering-and-model-stacking
skipped
 -  3679 author: slmf1995 / notebook: exploring-pubg-match-statistics-rankings
skipped
### 306 ../data/repositories/kaggle/competitions/c\pycon-2015-tutorial-predict-closed-questions-on-stack-overflow
### 307 ../data/repositories/kaggle/competitions/c\quickdraw-doodle-recognition
 -  3680 author: amneves / notebook: quick-draw-keras-cnn-model
skipped
 -  3681 author: gaborfodor / notebook: black-white-cnn-lb-0-77
skipped
 -  3682 author: gaborfodor / notebook: data-reggeli
skipped
 -  3683 author: gabo

### 315 ../data/repositories/kaggle/competitions/c\recruit-restaurant-visitor-forecasting
 -  3780 author: aless80 / notebook: sarimax-on-mean-visits
skipped
 -  3781 author: asindico / notebook: a-japanese-journey
skipped
 -  3782 author: breakfastpirate / notebook: weeks-before-after-golden-week-2016
skipped
 -  3783 author: captcalculator / notebook: a-very-extensive-recruit-exploratory-analysis
skipped
 -  3784 author: dongxu027 / notebook: mean-mix-math-geo-harmonic-lb-0-493
skipped
 -  3785 author: fabiendaniel / notebook: recruit-restaurant-eda
skipped
 -  3786 author: headsortails / notebook: be-my-guest-recruit-restaurant-eda
skipped
 -  3787 author: huntermcgushion / notebook: exhaustive-weather-eda-file-overview
skipped
 -  3788 author: huntermcgushion / notebook: weather-station-location-eda
skipped
 -  3789 author: ievgenvp / notebook: lstm-encoder-decoder-via-keras-lb-0-5
skipped
 -  3790 author: nitinsurya / notebook: surprise-me-2-neural-networks-keras
skipped
 -  3791 

 -  3901 author: ashishpatel26 / notebook: chexnet-batch-normalization-hyparameter-tuning
skipped
 -  3902 author: ashishpatel26 / notebook: chexnet-radiologist-level-pneumonia-detection
skipped
 -  3903 author: chenyc15 / notebook: mean-average-precision-metric
skipped
 -  3904 author: drt2290078 / notebook: mask-rcnn-sample-starter-code
skipped
 -  3905 author: eduardomineo / notebook: u-net-lung-segmentation-montgomery-shenzhen
skipped
 -  3906 author: giuliasavorgnan / notebook: start-here-beginner-intro-to-lung-opacity-s1
skipped
 -  3907 author: gpreda / notebook: rsna-pneumonia-detection-eda
skipped
 -  3908 author: hmendonca / notebook: mask-rcnn-and-coco-transfer-learning-lb-0-155
skipped
 -  3909 author: hmendonca / notebook: mask-rcnn-with-submission
skipped
 -  3910 author: jonnedtc / notebook: cnn-segmentation-connected-components
skipped
 -  3911 author: jtlowery / notebook: intro-eda-with-dicom-metadata
skipped
 -  3912 author: kmader / notebook: lung-opacity-classificat

 -  4045 author: sudalairajkumar / notebook: maximum-possible-score
skipped
 -  4046 author: sudalairajkumar / notebook: simple-exploration-notebook-v3-0
skipped
 -  4047 author: sudalairajkumar / notebook: when-less-is-more
skipped
 -  4048 author: tezdhar / notebook: when-less-is-more-extended
skipped
 -  4049 author: yifanxie / notebook: santander-products-visualisation
skipped
 -  4050 author: zfturbo / notebook: mass-hashes
skipped
 -  4051 author: zfturbo / notebook: santander-battle
skipped
### 332 ../data/repositories/kaggle/competitions/c\santander-value-prediction-challenge
 -  4052 author: alexpengxiao / notebook: preprocessing-model-averaging-by-xgb-lgb-1-39
skipped
 -  4053 author: asydorchuk / notebook: save-98-of-ram
skipped
 -  4054 author: bminixhofer / notebook: a-different-validation-technique
skipped
 -  4055 author: headsortails / notebook: breaking-bank-santander-eda
skipped
 -  4056 author: johnfarrell / notebook: baseline-with-lag-select-fake-rows-dropped
skippe

 -  4170 author: benhamner / notebook: frequency-of-crimes-in-san-francisco
notebook not found
 -  4171 author: captcalculator / notebook: function-to-map-any-crime
notebook not found
 -  4172 author: dbennett / notebook: test-map
notebook not found
 -  4173 author: eyecjay / notebook: vehicle-thefts-or-jerry-rice-jubilation
notebook not found
 -  4174 author: ifness / notebook: prevalent-crimes-in-san-francisco
notebook not found
 -  4175 author: lesibius / notebook: crime-scene-exploration-and-model-fit
notebook not found
 -  4176 author: luventu / notebook: title
notebook not found
 -  4177 author: mchirico / notebook: vehicle-thefts-drops-off-after-2006
notebook not found
 -  4178 author: mircat / notebook: violent-crime-mapping
notebook not found
 -  4179 author: nitinvijay23 / notebook: predict-the-crime-category-knn-logistic
notebook not found
 -  4180 author: petercooman / notebook: histograms-of-crime-category-by-day
notebook not found
 -  4181 author: sanghan / notebook: crim

notebook not found
 -  4280 author: hidehisaarai1213 / notebook: openvaccine-checkout-bpps
notebook not found
 -  4281 author: isaienkov / notebook: openvaccine-eda-feature-engineering-modeling
notebook not found
 -  4282 author: its7171 / notebook: dangerous-features
notebook not found
 -  4283 author: mrkmakr / notebook: covid-ae-pretrain-gnn-attn-cnn
notebook not found
 -  4284 author: nasirkhalid24 / notebook: cnn-transformer-enc-rnn-feature-eng-data-aug
notebook not found
 -  4285 author: ragnar123 / notebook: wavenet-gru-baseline
notebook not found
 -  4286 author: robikscube / notebook: openvaccine-covid-19-mrna-starter-eda
notebook not found
 -  4287 author: symyksr / notebook: openvaccine-deepergcn
notebook not found
 -  4288 author: t88take / notebook: openvaccine-simple-lgb-baseline
notebook not found
 -  4289 author: takadaat / notebook: openvaccine-pytorch-ae-pretrain
notebook not found
 -  4290 author: tatsuya214355 / notebook: stanford-covid-vaccine
notebook not found
 -

 -  4398 author: prokaj / notebook: bert-joint-baseline-notebook
notebook not found
 -  4399 author: ragnar123 / notebook: exploratory-data-analysis-and-baseline
notebook not found
 -  4400 author: sakami / notebook: tfqa-pytorch-baseline
notebook not found
 -  4401 author: seesee / notebook: submit-full
notebook not found
 -  4402 author: xhlulu / notebook: tf-qa-jsonl-to-dataframe
notebook not found
 -  4403 author: yihdarshieh / notebook: inference-use-hugging-face-models
notebook not found
 -  4404 author: yutanakamura / notebook: nlp-express-0-data-loading-visualization
notebook not found
### 362 ../data/repositories/kaggle/competitions/c\text-normalization-challenge-english-language
 -  4405 author: allunia / notebook: eda-en-text-normalization
notebook not found
 -  4406 author: alphasis / notebook: bigdata-trick-or-treat-lb-0-9954
notebook not found
 -  4407 author: alvira12 / notebook: class-wise-processing-lb-0-992-new-dataset
notebook not found
 -  4408 author: amitabhac / n

skipped
 -  4507 author: wesamelshamy / notebook: trackml-problem-explanation-and-data-exploration
skipped
 -  4508 author: yuval6967 / notebook: 7th-place-clustering-extending-ml-merging-0-75
skipped
### 376 ../data/repositories/kaggle/competitions/c\tradeshift-text-classification
 -  4509 author: tarunaryyan / notebook: imputation-for-missing-values-in-features
skipped
### 377 ../data/repositories/kaggle/competitions/c\transfer-learning-on-stack-exchange-tags
 -  4510 author: akshatpathak / notebook: text-data-clustering
skipped
 -  4511 author: anokas / notebook: frequent-words-model-v2
skipped
 -  4512 author: charlescostello / notebook: transfer-learning-on-stack-exchange-tags
skipped
 -  4513 author: eliotbarr / notebook: word-clouds
skipped
 -  4514 author: katarz / notebook: tags-exploration
skipped
 -  4515 author: l3nnys / notebook: useful-text-preprocessing-on-the-datasets
skipped
 -  4516 author: mrtroll / notebook: analying-tfidf-biology-corpus
skipped
 -  4517 author: mrt

skipped
 -  4638 author: willieliao / notebook: et1-ridge3-med-adj
skipped
 -  4639 author: ymcdull / notebook: ridge-lb-0-0100659
skipped
 -  4640 author: ysidhu / notebook: two-sigma-portfolio-returns-eda
skipped
### 387 ../data/repositories/kaggle/competitions/c\two-sigma-financial-news
 -  4641 author: artgor / notebook: eda-feature-engineering-and-everything
skipped
 -  4642 author: ashishpatel26 / notebook: attension-layer-basic-for-nlp
skipped
 -  4643 author: ashishpatel26 / notebook: bird-eye-view-of-two-sigma-nn-approach
skipped
 -  4644 author: bguberfain / notebook: a-simple-model-using-the-market-and-news-data
skipped
 -  4645 author: chocozzz / notebook: two-sigma-news-simple-eda-prophet-nlp
skipped
 -  4646 author: christofhenkel / notebook: market-data-nn-baseline
skipped
 -  4647 author: danielson / notebook: cleaning-up-market-data-errors-and-stock-splits
skipped
 -  4648 author: dmdm02 / notebook: complete-eda-voting-lightgbm
skipped
 -  4649 author: dmitrypukhov / n

 -  4749 author: dishask99 / notebook: notebook8bb2428ec0
skipped
 -  4750 author: sandeepsingh3480 / notebook: notebook8ecd7cbceb
skipped
### 397 ../data/repositories/kaggle/competitions/c\web-traffic-time-series-forecasting
 -  4751 author: arjunsurendran / notebook: using-lstm-on-training-data
skipped
 -  4752 author: attollos / notebook: time-series-forecast-example-with-prophet
skipped
 -  4753 author: chechir / notebook: weekend-flag-median-with-wiggle
skipped
 -  4754 author: clustifier / notebook: weekend-weekdays
skipped
 -  4755 author: cpmpml / notebook: smape-weirdness
skipped
 -  4756 author: dextrousjinx / notebook: brief-insight-on-web-traffic-time-series
skipped
 -  4757 author: gvyshnya / notebook: parallel-operations-over-a-pandas-df
skipped
 -  4758 author: gvyshnya / notebook: prophet-class-wrapper
skipped
 -  4759 author: headsortails / notebook: wiki-traffic-forecast-exploration-wtf-eda
skipped
 -  4760 author: merckel / notebook: preliminary-investigation-holtwin

 -  4889 author: xchmiao / notebook: popcorn-rnn-model
skipped
 -  4890 author: yepp2411 / notebook: baseline-model-using-nn-for-movie-review
skipped
### 409 ../data/repositories/kaggle/competitions/c\worldcup2010
### 410 ../data/repositories/kaggle/competitions/c\worldcupconf
### 411 ../data/repositories/kaggle/competitions/c\yandex-personalized-web-search-challenge
### 412 ../data/repositories/kaggle/competitions/c\yelp-recruiting
### 413 ../data/repositories/kaggle/competitions/c\yelp-recsys-2013
### 414 ../data/repositories/kaggle/competitions/c\yelp-restaurant-photo-classification
 -  4891 author: anokas / notebook: patch-features-rfr
skipped
 -  4892 author: aparajit0511 / notebook: yelp-restaurant
skipped
 -  4893 author: ashbunny / notebook: kernel909af544ff
skipped
 -  4894 author: benhamner / notebook: sample-photos
skipped
 -  4895 author: dmytrolystopad / notebook: imagine-an-image1
skipped
 -  4896 author: enerrio / notebook: data-exploration-yelp-classification
skipped
 -

In [9]:
# scoring function to get a score between 0...1 for integer-values, 0.5 should be at ~100
def score(n, precision=3):
    """Map a count onto a saturating score between 0 and 1.

    The score is ``1 - 1 / (1 + n) ** 0.15`` rounded to ``precision`` decimals,
    so 0 yields 0.0 and a value of about 100 yields 0.5.

    Args:
        n: The value to score. ints and floats are used as they are; anything
            else (e.g. a numeric string) is converted with ``int()``.
        precision: Number of decimal places handed to ``round``.

    Returns:
        The rounded score, or ``0`` when ``n`` cannot be converted to an int.
    """
    # Both numeric checks must look at `n` itself; testing any other name makes
    # the result depend on unrelated notebook state.
    if not isinstance(n, (int, float)):
        try:
            n = int(n)
        except (TypeError, ValueError):
            # not a number at all (e.g. 'a' or None) -> neutral score
            return 0

    return round(1 - 1 / math.pow(1 + n, 0.15), precision)

# quick sanity check of score(): a range of integers first,
# then a numeric string and a non-numeric string
for n in (0, 1, 10, 25, 50, 100, 1000, 10000):
    print(score(n))

print(score('3'), score('a'), sep='\n')

0.0
0.099
0.302
0.387
0.446
0.5
0.645
0.749
0.188
0


In [80]:
# Throw all parsed meta-data together into CSV files.
# Expected layout below folder_base:
#   <folder>/<subfolder>/notebooks/<author>/<notebook>/meta.json
# Records that pass the ML filter go to fp_csv, all other records go to fp_research.

folder_base = '../data/repositories/kaggle/competitions/'
file_json = 'meta.json'
fp_csv = '../data/database/kaggle_competitions_01_original.csv'
fp_research = '../data/database/kaggle_competitions_02_research.csv'

# NOTE(review): `quit` shadows the interpreter's built-in quit helper; the name is
# kept because other cells may rely on it.
quit = 0 # quit after n files processed / 0 ... no limit

folders = os.listdir(folder_base)
print('folder:', len(folders))
i = 0  # counts visited sub-folders
j = 0  # counts visited notebook folders

runtime_start = time.time()
records_ml = []     # records that pass the ML filter
records_other = []  # every other record

for folder in folders:
    subfolders = os.listdir(os.path.join(folder_base, folder))
    # report the size of the list that is iterated next (not of `folders`)
    print('subfolder:', len(subfolders))
    for subfolder in subfolders:
        #print('###', i, os.path.join(folder_base,folder,subfolder))
        path = os.path.join(folder_base, folder, subfolder, 'notebooks/')
        i += 1

        if os.path.exists(path):
            projects = os.listdir(path)

            for author in projects:
                #print('author:', author)
                items = os.listdir(os.path.join(path, author))

                for notebook in items:
                    j += 1
                    #print(' - ', j, 'author:', author, '/ notebook:', notebook)
                    fp_json = os.path.join(path, author, notebook, file_json)

                    if os.path.isfile(fp_json):
                        data = load_data(fp_json, fromJson=True)
                        data['score_votes'] = score(data['votes'])
                        data['score_views'] = score(data['views'])

                        # weighted ML indicator: slugs 0.2 + terms 0.3 + libraries 0.5
                        ml_score = 0
                        if len(data['ml_slugs']) > 0:
                            ml_score += 0.2
                        if len(data['ml_terms']) > 0:
                            ml_score += 0.3
                        if len(data['ml_libs']) > 0:
                            ml_score += 0.5
                        data['ml_detected'] = ml_score

                        # flatten the description to a single line
                        data['description'] = data['description'].replace('\n', ' ').replace('\r', '').replace('¶', '').strip()

                        # date (ignoring GMT+x)
                        # Wed Dec 19 2018 14:42:40 GMT+0100 (Mitteleuropäische Normalzeit)
                        # https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior
                        date_time_str = data['date'].split('GMT')
                        date_time_str = date_time_str[0].strip()
                        date_time_obj = datetime.datetime.strptime(date_time_str, '%a %b %d %Y %H:%M:%S')
                        data['date'] = date_time_obj

                        # keep as ML case only items with:
                        # - ml_score > 0.5, i.e. an ML library plus at least one slug or term match
                        # - a description of more than 5 space-separated words
                        # (a "best submission" check on data['submission'] is currently not applied)
                        words = data['description'].split(' ')
                        if ml_score > 0.5 and len(words) > 5:
                            records_ml.append(data)
                        else:
                            #if 'R' in data['type']:
                            records_other.append(data)

                    #if j % 100 == 0:
                    #    print('folder', i, '/ notebook', j)

                # the limit is checked after each author and propagated outwards
                if quit != 0 and j > quit:
                    break
            if quit != 0 and j > quit:
                break
    if quit != 0 and j > quit:
        break

# Build both frames once instead of appending row by row: DataFrame.append copies the
# whole frame on every call and is not available in pandas >= 2.0.
# Columns are sorted to keep the alphabetical column layout of the earlier exports.
df = pd.DataFrame(records_ml)
df2 = pd.DataFrame(records_other)
df = df.reindex(columns=sorted(df.columns))
df2 = df2.reindex(columns=sorted(df2.columns))

# drop duplicates (only possible if at least one record provided a link)
if 'link' in df.columns:
    df = df.drop_duplicates(['link'])

runtime_end = time.time()
print('runtime:', round(runtime_end - runtime_start, 3), 'seconds for', j, 'items')
print(df.shape)
print(df.head())

# drop columns that must not be exported; errors='ignore' keeps empty results from failing
df = df.drop(columns=['author', 'submission'], errors='ignore')
df2 = df2.drop(columns=['author', 'submission'], errors='ignore')

df.to_csv(fp_csv, sep=';', index=False)
df2.to_csv(fp_research, sep=';', index=False)

folder: 1
subfolder: 1
runtime: 51.662 seconds for 5072 items
(1822, 22)
          author                date  \
0         a45632 2019-01-11 10:49:10   
1     amansohane 2018-12-24 16:27:39   
2  ashishpatel26 2019-01-04 08:43:54   
3  ashishpatel26 2018-12-27 10:13:57   
4  ashishpatel26 2018-12-19 14:42:40   

                                         description     license  \
0  Hybrid solution update using multiple sources....  Apache 2.0   
1  This is an attempt to solve difficulty 3. It b...  Apache 2.0   
2  What is Attention? Attention is simply a vecto...  Apache 2.0   
3  Beginner to Intermediate Natural Language Proc...  Apache 2.0   
4  This a fork of @opanichev 's great kernel: htt...  Apache 2.0   

                                                link  ml_detected  \
0  https://www.kaggle.com/a45632/classification-t...          1.0   
1  https://www.kaggle.com/amansohane/level-3-with...          1.0   
2  https://www.kaggle.com/ashishpatel26/attension...          1.0   
3