In [2]:
# imports
import requests
import json
import os
import sys
import platform
import time
import math
import datetime
import pandas as pd
import numpy as np

from bs4 import BeautifulSoup

In [3]:
# generic store data to file function
def store_data(data, file, mode='w', toJson=False):
    if toJson:
        data = json.dumps(data)
    with open(file, mode, encoding='utf-8') as fp:
        result = fp.write(data)
        return result
    
# generic load data from file function
def load_data(file, fromJson=False):
    if os.path.isfile(file):
        with open(file, 'r', encoding='utf-8', errors="ignore") as fp:
            data = fp.read()
            if fromJson:
                data = json.loads(data)
            return data
    else:
        return 'file not found'

# test text
#print(store_data('Hello', '../data/repositories/mlart/test.txt'))
#print(load_data('../data/repositories/mlart/test.txt'))

# test json
#print(store_data({'msg':'Hello World'}, '../data/repositories/mlart/test.json', toJson=True))
#print(load_data('../data/repositories/mlart/test.json', fromJson=True))

In [4]:
# helper function to create folder create_folder
def create_folder(path):
    if not os.path.exists(os.path.dirname(path)):
        try:
            os.makedirs(os.path.dirname(path))
            print(path + ' created')
        except OSError as exc: # Guard against race condition
            if exc.errno != errno.EEXIST:
                raise

In [5]:
# scan text for predefined terms

text = 'We use LSTM for anomaly and object detection. As Convolutional Neural Networks are great for ML.'

pd_ml_terms = pd.read_csv('../data/patterns/ml_terms.csv')
ml_terms = pd_ml_terms['Term'].tolist()
ml_slugs = pd_ml_terms['Slug'].tolist()
ml_slugs = [x for x in ml_slugs if str(x) != 'nan']
ml_tags = pd_ml_terms['Tag'].tolist()
ml_tags = [x for x in ml_tags if str(x) != 'nan']

#print(ml_tags)

ml_libs = pd.read_csv('../data/patterns/ml_libraries.csv')
ml_libs = ml_libs['Python Package'].tolist()

def match_text(haystack, needles, toLower = False, unique = True):
    
    if toLower == True:
        haystack = haystack.lower()
        needles = [x.lower() for x in needles]
    
    if unique == True:
        matches = {x for x in needles if x in haystack}
        matches = list(matches)
    else:
        matches = [x for x in needles if x in haystack]
    
    return matches

def match_tags(haystack):
    df = pd.read_csv('../data/patterns/ml_terms.csv')
    tags = []
    
    df.set_index('Term', inplace = True)
    for item in haystack:
        try:
            tag = df.loc[item].get('Tag')
            if not 'nan' in str(tag):
                tags.append(tag)
        except:
            pass
        
    df.set_index('Slug', inplace = True)
    for item in haystack:
        try:
            tag = df.loc[item].get('Tag')
            if not 'nan' in str(tag):
                tags.append(str(tag))
        except:
            pass
        
    #if 'ANN' in tags or 'CNN' in tags or 'RNN' in tags:
    #    tags.remove('NN')
    
    return list(tags)

#ml_slugs, ml_terms, ml_libs, match_text(haystack, needles, toLower = False, unique = True)
needles = {
    'ml_slugs': ml_slugs,
    'ml_terms': ml_terms,
    'ml_libs': ml_libs,
}
needles_need_str_lower = {
    'ml_slugs': False,
    'ml_terms': True,
    'ml_libs': False,
}

matches = []

matches.extend(match_text(text, ml_terms, True))
matches.extend(match_text(text, ml_slugs, False))
print('matches', matches)

tags = match_tags(matches)
print('tags', tags)

matches ['detect', 'object detection', 'anomaly', 'lstm', 'convolutional neural network', 'neural network', 'ML']
tags ['Object Detection', 'LSTM', 'CNN', 'NN', 'ML']


In [6]:
# get file modifictaion date
# https://stackoverflow.com/questions/237079/how-to-get-file-creation-modification-date-times-in-python

def creation_date(path_to_file, datetime = True):
    """
    Try to get the date that a file was created, falling back to when it was
    last modified if that isn't possible.
    See http://stackoverflow.com/a/39501288/1709587 for explanation.
    """
    if platform.system() == 'Windows':
        timestamp = os.path.getctime(path_to_file)
    else:
        stat = os.stat(path_to_file)
        try:
            timestamp = stat.st_birthtime
        except AttributeError:
            # We're probably on Linux. No easy way to get creation dates here,
            # so we'll settle for when its content was last modified.
            timestamp = stat.st_mtime
        
    if datetime == True:
        timestamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(timestamp))

    return timestamp
    
folder_base = '../data/repositories/kaggle/competitions/c/'
folder = '3d-object-detection-for-autonomous-vehicles/notebooks/asimandia/lyft3d-inference-kernel/'
notebook = 'notebook_02.html'
kernel = 'kernel.html'
print(creation_date(folder_base+folder+notebook))

2020-12-12 16:08:03


In [7]:
# clear text formatting
def clear_text(text):
    return text.replace('\n',' ').replace('\r','').replace('¶','').strip()

In [7]:
# scrape competition

folder_base = '../data/repositories/kaggle/competitions/c/'
folder = '3d-object-detection-for-autonomous-vehicles/'
dataset = 'dataset.html'

def scrape_competition(html):
    soup = BeautifulSoup(html, 'html.parser')
    meta = {}
    
    meta['title'] = soup.find('h1', class_="competition-header__title").text.strip()
    meta['subtitle'] = soup.find('h2', class_="competition-header__subtitle").text.strip()
    meta['type'] = soup.find('p', class_="competition-header__classification-text").text.strip()
    meta['organisation'] = soup.find('span', class_="competition-header__organization-name")
    if meta['organisation'] == None:
        meta['organisation'] = ''
    else:
        meta['organisation'] = meta['organisation'].text.strip()
    temp = soup.find_all('li', class_="horizontal-list-item horizontal-list-item--bullet horizontal-list-item--default")
    for item in temp:
        #print(item.text)
        if 'team' in item.text:
            meta['teams'] = item.text.replace('teams','').replace('team','').strip()
        if 'ago' in item.text:
            #item = BeautifulSoup(item, 'html.parser')
            item = item.select('li>span>span')[0]
            meta['date_closed'] = item.get('title')
    
    meta['description'] = clear_text(soup.find('div', class_="competition-overview__content").text)
    meta['tags'] = [x.text.strip() for x in soup.find_all('span', class_="CategoryButton_CategoryName-sc-c10946 jFsDhg")]
    
    
    

    return meta

html = load_data(folder_base+folder+dataset)
if 'file not found' in html:
    print(html)
meta = scrape_competition(html)
print(meta)
#store_data(links, folder_base+folder+out, toJson=True)

{'title': 'Lyft 3D Object Detection for Autonomous Vehicles', 'subtitle': 'Can you advance the state of the art in 3D object detection?', 'type': 'Featured prediction Competition', 'organisation': 'Lyft', 'teams': '547', 'date_closed': 'Wed Nov 13 2019 00:59:00 GMT+0100 (Mitteleuropäische Normalzeit)', 'description': 'Self-driving technology presents a rare opportunity to improve the quality of life in many of our communities. Avoidable collisions, single-occupant commuters, and vehicle emissions are choking cities, while infrastructure strains under rapid urban growth. Autonomous vehicles are expected to redefine transportation and unlock a myriad of societal, environmental, and economic benefits. You can apply your data analysis skills in this competition to advance the state of self-driving technology. Lyft, whose mission is to improve people’s lives with the world’s best transportation, is investing in the future of self-driving vehicles. Level 5, their self-driving division, is wo

In [None]:
# iterate competions

url = 'https://www.kaggle.com/c/'
folder_base = '../data/repositories/kaggle/competitions/c/'
folder = '3d-object-detection-for-autonomous-vehicles/'
dataset = 'dataset.html'
file_out = 'meta.json'

quit = 0 # quit after n files processed / 0 ... no limit

folders = os.listdir(folder_base)
print('folder:', len(folders))
i = 0

runtime_start = time.time()

for folder in folders:
    subfolders = os.listdir(os.path.join(folder_base,folder))
    print('subfolder:', i, len(folders))
    #print(folder)
    #print('###', i, os.path.join(folder_base,folder,subfolder))
    path = os.path.join(folder_base,folder,dataset)
    i += 1

    if os.path.exists(path):
        print(path)
        html = load_data(path)
        if 'file not found' in html:
            print(html)
        meta = scrape_competition(html)
        meta['link'] = url+folder
        #print(meta)
        store_data(meta, os.path.join(folder_base,folder,file_out), toJson=True)
            
    if quit!=0 and i>quit:
        break

runtime_end = time.time()
print('runtime:', round(runtime_end - runtime_start, 3), 'seconds for', i, 'items')

In [12]:
# scrape datasets

folder_base = '../data/repositories/kaggle/datasets/'
folder = '4quant/depth-generation-lightfield-imaging/'
dataset = 'dataset.html'

def scrape_datasets(html):
    soup = BeautifulSoup(html, 'html.parser')
    meta = {}
    
    try:
        meta['title'] = soup.find('h1', class_="dataset-header-v2__title").text.strip()
        meta['subtitle'] = soup.find('h2', class_="dataset-header-v2__subtitle").text.strip()
        meta['date'] = soup.find('time').get('datetime')
        meta['type'] = soup.find('p', class_="dataset-header-v2__classification-text").text.strip()
        meta['organisation'] = soup.find('div', class_="dataset-header-v2__collaborators-wrapper")
        if meta['organisation'] == None:
            meta['organisation'] = ''
        else:
            meta['organisation'] = meta['organisation'].text.strip()
        temp = soup.find_all('li', class_="horizontal-list-item horizontal-list-item--bullet horizontal-list-item--default")
        for item in temp:
            #print(item.text)
            if 'team' in item.text:
                meta['teams'] = item.text.replace('teams','').replace('team','').strip()
            if 'ago' in item.text:
                #item = BeautifulSoup(item, 'html.parser')
                item = item.select('li>span>span')[0]
                meta['date_closed'] = item.get('title')

        meta['description'] = clear_text(soup.find('div', class_="Description_ContentWrapper-sc-xv68c4").text)
        meta['usability'] = soup.find('p', {'data-test': 'rating'}).text.strip()
        meta['license'] = soup.find('a', class_="QuickInfo_A-sc-wkm1up").text.strip()
        meta['tags'] = [x.text.strip() for x in soup.find_all('a', {'class': 'SimpleTag_Tag-sc-1m0qa3y'})]

        footer = soup.find('div', {'class', 'Footer_Wrapper-sc-1fi89lh'})
        items = footer.find_all('li')
        for item in items:
            if 'view' in item.text:
                meta['views'] = int(item.text.replace('view','').replace('s','').replace(',','').strip())
            if 'download' in item.text:
                meta['downloads'] = int(item.text.replace('download','').replace('s','').replace(',','').strip())
            if 'notebook' in item.text:
                meta['notebooks'] = int(item.text.replace('notebook','').replace('s','').replace(',','').strip())
            if 'topic' in item.text:
                meta['topics'] = int(item.text.replace('topic','').replace('s','').replace(',','').strip())
    except:
        pass
    
    return meta

html = load_data(folder_base+folder+dataset)
if 'file not found' in html:
    print(html)
meta = scrape_datasets(html)
print(meta)
#store_data(links, folder_base+folder+out, toJson=True)

{'title': 'Computational Imaging', 'subtitle': 'Lightfield (Lytro) and Stereo (Project Tango) Datasets', 'date': '2018-07-10T16:58:22.407Z', 'type': 'Dataset', 'organisation': '4Quant', 'description': 'Context The data is based on images I have taken with my Lytro Illum camera (https://pictures.lytro.com/ksmader) they have been exported as image data and depth maps. The idea is to make and build tools for looking at Lytro Image data and improving the results Content The data are from the Lytro Illum and captured as 40MP images which are then converted to 5MP RGB+D images. All of the required data for several test images is provided The second datasets come from the Lenovo Phab2 (Project Tango) which utilizes dual image sensors to recreate point clouds of large 3D structures. These are provided as .ply and .obj datasets Acknowledgements The data is based on images I have taken with my Lytro Illum camera (https://pictures.lytro.com/ksmader). Inspiration  Build a neural network which auto

In [None]:
# iterate datasets

url = 'https://www.kaggle.com/'
folder_base = '../data/repositories/kaggle/datasets/'
folder = '3d-object-detection-for-autonomous-vehicles/'
dataset = 'dataset.html'
file_out = 'meta.json'

quit = 0 # quit after n files processed / 0 ... no limit

folders = os.listdir(folder_base)
print('folder:', len(folders))
i = 0

runtime_start = time.time()

for folder in folders:
    projects = os.listdir(os.path.join(folder_base, folder))
    
    for project in projects:
        #subfolders = os.listdir(os.path.join(folder_base,folder,project))
        print('subfolder:', i, len(folders))
        #print(folder)
        #print('###', i, os.path.join(folder_base,folder,subfolder))
        path = os.path.join(folder_base,folder,project,dataset)
        i += 1

        if os.path.exists(path):
            print(path)
            html = load_data(path)
            if 'file not found' in html:
                print(html)
            meta = scrape_datasets(html)
            meta['link'] = url+folder+'/'+project
            #print(meta)
            #path = os.path.join(folder_base,folder,project,file_out)
            #print(path)
            store_data(meta, os.path.join(folder_base,folder,project,file_out), toJson=True)

        if quit!=0 and i>quit:
            break
    if quit!=0 and i>quit:
        break

runtime_end = time.time()
print('runtime:', round(runtime_end - runtime_start, 3), 'seconds for', i-1, 'items')

In [None]:
# collect all meta.json into a single csv

folder_base = '../data/repositories/kaggle/competitions/'
file_meta = 'meta.json'
fp_csv = '../data/database/kaggle_competitions.csv'

# datasets
folder_base = '../data/repositories/kaggle/datasets/'
file_meta = 'meta.json'
fp_csv = '../data/database/kaggle_datasets.csv'

quit = 0 # quit after n files processed / 0 ... no limit

folders = os.listdir(folder_base)
print('folder:', len(folders))
i = 0

runtime_start = time.time()
df = pd.DataFrame()

runtime_start = time.time()
print('folder:', len(folders))
for folder in folders:
    projects = os.listdir(os.path.join(folder_base,folder))
    for project in projects:
        #subfolders = os.listdir(os.path.join(folder_base,folder))
        print(i)
        #print(folder)
        #print('###', i, os.path.join(folder_base,folder,subfolder))
        path = os.path.join(folder_base,folder,project,file_meta)
        i += 1

        if os.path.exists(path):
            print(path)
            data = load_data(path, fromJson=True)
            if 'file not found' in data:
                print(html)

            try:
                data['description'] = clear_text(data['description'])
            except:
                data['description'] = ''

            if 'date_closed' in data:
                # date (ignoring GMT+x)
                date_time_str = data['date_closed'].split('GMT')
                date_time_str = date_time_str[0].strip()
                date_time_obj = datetime.datetime.strptime(date_time_str, '%a %b %d %Y %H:%M:%S')
                data['date_closed'] = date_time_obj
            
            if 'date' in data:
                # date (ignoring GMT+x)
                date_time_str = data['date'].split('.')
                date_time_str = date_time_str[0].strip().replace('T',' ').replace('Z','')
                date_time_obj = datetime.datetime.strptime(date_time_str, '%Y-%m-%d %H:%M:%S')
                data['date'] = date_time_obj

            df = df.append(data, ignore_index=True)

        if quit!=0 and i>quit:
            break
    if quit!=0 and i>quit:
        break
        
runtime_end = time.time()
print('runtime:', round(runtime_end - runtime_start, 3), 'seconds for', i, 'items')
print(df.shape)
print(df.head())
        
# drop columns
#df.drop(columns=['author', 'submission'], inplace=True)

df.to_csv(fp_csv, sep=';', index=False)

In [40]:
# scrape notebook content

folder_base = '../data/repositories/kaggle/competitions/c/'
folder1 = '3d-object-detection-for-autonomous-vehicles/notebooks/asimandia/lyft3d-inference-kernel/'
folder2 = 'aerial-cactus-identification/notebooks/abhinand05/in-depth-guide-to-convolutional-neural-networks/'
notebook = 'notebook_02.html'

def scrape_notebook_content(html):
    soup = BeautifulSoup(html, 'html.parser')
    meta = {}
    
    error = soup.find('h1', class_='not-found__header')
    if not error == None:
        meta['error'] = 'not found'
        return meta
    
    try:
        meta['author'] = meta['author'].get('href').replace('/','')
    except:
        meta['author'] = ''
        
    try:
        meta['title'] = soup.find('a', class_="KernelViewerContext_KernelTitle-sc-rdaqnd").text
    except:
        meta['title'] = ''
    
    try:
        meta['type'] = soup.find('span', class_="KernelViewerContext_KernelTypeInfo-sc-1l6fza6 kqxzvL").text.replace('using data from','').strip()
    except:
        meta['type'] = ''
    
    try:
        meta['sources'] = soup.find('a', class_="KernelViewerContext_DataSourceUrl-sc-1dm3ij9 lpoMHV").text.strip()
    except:
        meta['sources'] = ''
    
    try:
        meta['tags'] = soup.find('span', class_="KernelViewerContext_CategoriesWrapper-sc-8yrjj NgcTE").text.split('·')
        meta['tags'] = list(filter(None, meta['tags']))
        meta['tags'] = [x.replace('+1 more','') for x in meta['tags']]
        meta['tags'] = [x.replace('+2 more','') for x in meta['tags']]
        meta['tags'] = [x.replace('+3 more','') for x in meta['tags']]
        meta['tags'] = [x.replace('+4 more','') for x in meta['tags']]
    except:
        meta['tags'] = ''
    
    try:
        meta['votes'] = soup.find('span', class_="vote-button__vote-count").text
    except:
        meta['votes'] = ''
        
    try:
        meta['submission'] = soup.select('div.kernel-code-pane__submission-info-content')
        if len(meta['submission']) > 0:
            meta['submission'] = meta['submission'][0].text
    except:
        meta['submission'] = ''
        
    #meta['votes'] = soup.find('span', class_="vote-button__vote-count").text
    try:
        meta['license'] = soup.select('div.kernel-code-pane__subtitle>a')
        if len(meta['license']) > 0:
            meta['license'] = meta['license'][0].text
    except:
        meta['license'] = ''
    
    try:
        score = soup.select('div.kernel-code-pane__submission-score-value')
        if len(score) > 0:
            meta['score_private'] = score[0].text
        if len(score) > 1:
            meta['score_public'] = score[1].text
    except:
        pass
    
    try:
        temp = soup.find('span', class_="KernelViewerContext_KernelSubtitle-sc-rltxca esPWpV").select('span') #.text.split('·')
        #print (temp)
        for item in temp:
            #print(item)
            if 'views' in item.text:
                meta['views'] = int(''.join(filter(str.isdigit, item.text)))
            if 'GMT' in item.get('title', 'nan'):
                meta['date'] = item.get('title')
    except:
        pass
    
    return meta

html = load_data(folder_base+folder2+notebook)
if 'file not found' in html:
    print(html)
meta = scrape_notebook_content(html)
print(meta)
#store_data(links, folder_base+folder+out, toJson=True)

{'author': '', 'title': 'In-Depth Guide to Convolutional Neural Networks', 'type': 'Python notebook', 'sources': 'Aerial Cactus Identification', 'tags': ['beginner, exploratory data analysis, deep learning, cnn'], 'votes': '62', 'submission': 'Best Submission SuccessfulSubmitted by Abhinand a year ago', 'license': 'Apache 2.0', 'score_private': '0.9981', 'score_public': '0.9981', 'views': 4474, 'date': 'Sun Jun 16 2019 19:46:53 GMT+0200 (Mitteleuropäische Sommerzeit)'}


In [42]:
# scrape kernel content

folder_base = '../data/repositories/kaggle/competitions/c/'
folder1 = '3d-object-detection-for-autonomous-vehicles/notebooks/asimandia/lyft3d-inference-kernel/'
folder2 = 'airbnb-recruiting-new-user-bookings/notebooks/datadave/ndcg-score-r/'
notebook = 'notebook_02.html'
kernel = 'kernel.html'

def scrape_kernel_content(html, needles, snippet = False):
    soup = BeautifulSoup(html, 'html.parser')
    
    meta = {}
    
    try:
        if snippet == True:
            code = soup.find('div', {"id": "code"}).text
        else:
            code = soup.find('body').text
        #print(code)

        for key, value in needles.items():
            meta[key] = match_text(code, value, needles_need_str_lower[key])

        meta['ml_tags'] = match_tags(meta['ml_terms'] + meta['ml_slugs'])

        meta['description'] = soup.select('div.text_cell_render')
        #print(meta['description'])
        if len(meta['description']) > 0:
            meta['description'] = clear_text(meta['description'][0].text)
        else:
            meta['description'] = ''
    except:
        pass
    
    return meta

# test for non-kernel-file ('code' embedded in notebook.html)
html = load_data(folder_base+folder2+notebook)
if 'file not found' in html:
    print(html)
meta = scrape_kernel_content(html, needles)
print(meta)

# test for kernel-file ('code' in kernel.html)
html = load_data(folder_base+folder1+kernel)
if 'file not found' in html:
    print(html)
meta = scrape_kernel_content(html, needles)
print(meta)


#store_data(links, folder_base+folder+out, toJson=True)

{'ml_slugs': [], 'ml_terms': ['rank', 'deep learning', 'detect', 'classif', 'fit', 'filter', 'predict', 'train', 'training data'], 'ml_libs': [], 'ml_tags': ['DL', 'Classification'], 'description': ''}
{'ml_slugs': ['AI'], 'ml_terms': ['epoch', 'model', 'label', 'u-net', 'relu', 'layer', 'detect', 'loss', 'activation function', 'neural network', 'test data', 'fit', 'filter', 'predict', 'recommend', 'train', 'convolutional neural network', 'image segmentation'], 'ml_libs': ['pytorch'], 'ml_tags': ['U-Net', 'ReLu', 'NN', 'Recommender', 'CNN', 'Image Segmentation', 'AI'], 'description': "Please check out Guido's excellent kernel here. In this kernel i show how to perform inference on test set using the trained model. I just added RaDAM optimzer and got some better score. You can find the BEV of the test set here. Updates: Corrected yaw calculation Used category height information"}


In [None]:
# iterate all folders and compose results into meta.json

url = 'https://www.kaggle.com/'
#folder_base = '../data/repositories/kaggle/competitions/'
folder_base = '../data/repositories/kaggle/datasets/'
file_notebook = 'notebook_02.html'
file_kernel = 'kernel.html'
file_out = 'meta.json'

skip = True # if true skip meta collection / set to false to recreate {file_out} from scratch
quit = 100000 # quit after n files processed
breakOnError = False

folders = os.listdir(folder_base)
print('folder:', len(folders))
i = 0
j = 0

for folder in folders:
    subfolders = os.listdir(os.path.join(folder_base,folder))
    print('subfolder:', len(folders))
    for subfolder in subfolders:
        print('###', i, os.path.join(folder_base,folder,subfolder))
        path = os.path.join(folder_base,folder,subfolder, 'notebooks/')
        i += 1
        
        if os.path.exists(path):
            projects = os.listdir(path)

            for author in projects:
                #print('author:', author)
                items = os.listdir(os.path.join(folder_base,folder,subfolder, 'notebooks/', author))

                for notebook in items:
                    j+=1
                    print(' - ', j, 'author:', author, '/ notebook:', notebook)
                    path_notebook = os.path.join(folder_base,folder,subfolder, 'notebooks/', author, notebook, file_notebook)
                    path_kernel = os.path.join(folder_base,folder,subfolder, 'notebooks/', author, notebook, file_kernel)
                    path_out = os.path.join(folder_base,folder,subfolder, 'notebooks/', author, notebook, file_out)
                    
                    meta = {}
                    
                    if skip and os.path.isfile(path_out):
                        print('skipped')
                            
                    else:
                        #try:
                        if True:
                            # get meta-data
                            if os.path.isfile(path_notebook):
                                #print('notebook found')
                                meta['scraped_at'] = creation_date(path_notebook)
                                meta['link'] = f'{url}{author}/{notebook}'
                                meta['reference'] = f'{url}{folder}/{subfolder}'
                                html = load_data(path_notebook, fromJson=False)
                                if 'file not found' in html:
                                    print('notebook not found')
                                    break
                                meta.update(scrape_notebook_content(html))

                            # get meta-data
                            if os.path.isfile(path_kernel):
                                #print('kernel found')
                                html = load_data(path_kernel, fromJson=False)
                                if 'file not found' in html:
                                    print('kernel not found')
                                    break
                                meta.update(scrape_kernel_content(html, needles))
                            else:
                                #print('kernel not found')
                                html = load_data(path_notebook, fromJson=False)
                                if 'file not found' in html:
                                    print('notebook not found')
                                    break
                                meta.update(scrape_kernel_content(html, needles))

                            #print(meta)
                            store_data(meta, path_out, toJson=True)
                            
                        #except Exception as e:
                        #    print("Oops!", e.__class__, "occurred.")
                        #    print(e)
                        #    if breakOnError:
                        #        sys.exit()
                
                if j>quit:
                    break
            if j>quit:
                break
    if j>quit:
        break

In [46]:
# scoring function to get a score between 0...1 for integer-values, 0.5 should be at ~100
def score(n, precision=3):
    if isinstance(n, int) or isinstance(i, float):
        return round(1-1/math.pow(1+n, 0.15), precision)
    else:
        try:
            n = int(n)
        except:
            return 0
        
    return round(1-1/math.pow(1+n, 0.15), precision)

for n in [0,1,10,25,50,100,1000,10000]:
    print(score(n))
    
print(score('3'))
print(score('a'))

0.0
0.099
0.302
0.387
0.446
0.5
0.645
0.749
0.188
0


In [None]:
# throw all parsed meta-data together in a single csv
# select only true ML cases

folder_base = '../data/repositories/kaggle/competitions/'
file_json = 'meta.json'
fp_csv = '../data/database/kaggle_competitions_01_original.csv'
fp_research = '../data/database/kaggle_competitions_02_research.csv'

#datasets
folder_base = '../data/repositories/kaggle/datasets/'
fp_csv = '../data/database/kaggle_datasets_01_original.csv'
fp_research = '../data/database/kaggle_datasets_02_research.csv'

quit = 0 # quit after n files processed / 0 ... no limit

folders = os.listdir(folder_base)
print('folder:', len(folders))
i = 0
j = 0

runtime_start = time.time()
df = pd.DataFrame()
df2 = pd.DataFrame()

for folder in folders:
    subfolders = os.listdir(os.path.join(folder_base,folder))
    print('subfolder:', len(folders))
    for subfolder in subfolders:
        #print('###', i, os.path.join(folder_base,folder,subfolder))
        path = os.path.join(folder_base,folder,subfolder, 'notebooks/')
        i += 1
        
        if os.path.exists(path):
            projects = os.listdir(path)

            for author in projects:
                #print('author:', author)
                items = os.listdir(os.path.join(folder_base,folder,subfolder, 'notebooks/', author))

                for notebook in items:
                    j+=1
                    #print(' - ', j, 'author:', author, '/ notebook:', notebook)
                    fp_json = os.path.join(folder_base,folder,subfolder, 'notebooks/', author, notebook, file_json)
                    
                    if os.path.isfile(fp_json):
                        data = load_data(fp_json, fromJson=True)
                        #print(data)
                        if 'score_votes' in data:
                            data['score_votes'] = score(data['votes'])
                        if 'score_views' in data:
                            data['score_views'] = score(data['views'])
                            
                        if not 'ml_slugs' in data:
                            data['ml_slugs'] = ''
                        if not 'ml_terms' in data:
                            data['ml_terms'] = ''
                        if not 'ml_libs' in data:
                            data['ml_libs'] = ''
                        
                        ml_score = 0
                        if len(data['ml_slugs']) > 0 or len(data['ml_terms']) > 0:
                            ml_score += 0.2
                        if 'ml_tags' in data and len(data['ml_tags']) > 0:
                            ml_score += 0.3
                        if len(data['ml_libs']) > 0:
                            ml_score += 0.5
                        data['ml_detected'] = ml_score
                        
                        if 'description' in data:
                            data['description'] = data['description'].replace('\n', ' ').replace('\r', '').replace('¶', '').strip()
                        else:
                            data['description'] = ''
                        
                        if 'date' in data:
                            # date (ignoring GMT+x)
                            # Wed Dec 19 2018 14:42:40 GMT+0100 (Mitteleuropäische Normalzeit)
                            # https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior
                            date_time_str = data['date'].split('GMT')
                            date_time_str = date_time_str[0].strip()
                            date_time_obj = datetime.datetime.strptime(date_time_str, '%a %b %d %Y %H:%M:%S')

                            #print('Date:', date_time_obj.date())
                            #print('Time:', date_time_obj.time())
                            #print('Date-time:', date_time_obj)
                            data['date'] = date_time_obj
                        
                        # store only items with:
                        # - ml_score >= 0.5
                        # - are indicated as "best submission" # and 'Best Submission' in data['submission'] 
                        # - description > 1 word
                        words = data['description'].split(' ')
                        if ml_score >= 0.5 and len(words) > 5:
                            df = df.append(data, ignore_index=True)
                            print(data['link'], 'is ML use case')
                        
                        else:
                            #if 'R' in data['type']:
                            df2 = df2.append(data, ignore_index=True)
                            print(data['link'], 'is not ML use case')
                            
                    #if j % 100 == 0:
                    #    print('folder', i, '/ notebook', j)
                        
                if quit!=0 and j>quit:
                    break
            if quit!=0 and j>quit:
                break
    if quit!=0 and j>quit:
        break
        
# drop duplicates
df = df.drop_duplicates(['link'])

runtime_end = time.time()
print('runtime:', round(runtime_end - runtime_start, 3), 'seconds for', j, 'items')
print(df.shape)
print(df.head())
        
# drop columns
df.drop(columns=['author', 'submission'], inplace=True)
df2.drop(columns=['author', 'submission'], inplace=True)

df.to_csv(fp_csv, sep=';', index=False)
df2.to_csv(fp_research, sep=';', index=False)

In [51]:
# print size of gathered csv
print(df.shape)
print(df2.shape)

(16730, 19)
(26744, 20)


In [52]:
# string to list

def str_to_list(s):
    s = s.replace("'", "").replace(' ,', ',').replace(
        '[', '').replace(']', '').split(',')
    s = [i for i in s if i]
    return s

In [58]:
# correlate competitions/datasets with attached notebooks

folder_base = '../data/repositories/kaggle/competitions/'
file_json = 'meta.json'
fp_csv_notebooks = '../data/database/kaggle_competitions_01_original.csv'
fp_csv_out = '../data/database/kaggle_competitions_correlated_01.csv'

# datasets
folder_base = '../data/repositories/kaggle/datasets/'
fp_csv_notebooks = '../data/database/kaggle_datasets_01_original.csv'
fp_csv_out = '../data/database/kaggle_datasets_correlated_01.csv'

quit = 0 # quit after n files processed / 0 ... no limit

folders = os.listdir(folder_base)
print('folder:', len(folders))
i = 0

runtime_start = time.time()
df = pd.DataFrame()
df_n = pd.read_csv(fp_csv_notebooks, sep=';')
#print(df_n.head)

print('folder:', len(folders))
for folder in folders:
    subfolders = os.listdir(os.path.join(folder_base,folder))
    for subfolder in subfolders:
        i += 1
        #print('###', i, os.path.join(folder_base,folder,subfolder))
        
        # read in dataset
        data = load_data(os.path.join(folder_base,folder,subfolder,file_json), fromJson=True)
        if 'title' in data:
            title = data['title'].strip()
            #print(dataset)
        else:
            data['title'] = ''
        print(i, title)
        
        #rows = []
    
        #df_n.set_index('sources', inplace = True)
        #rows = df_n.loc[title]
        #rows = df[df_n['sources'] == title]
        rows = df_n.loc[df_n['sources'] == title]
            
        #print(rows.shape[0])
        
        # if one notebook with ml is found the dataset can be considered as ml-dataset
        if rows.shape[0] > 0:
            #print('attached', data)
            
            # parse date
            if 'date_closed' in data:
                # Thu Jan 17 2019 00:59:00 GMT+0100
                date_time_str = data['date_closed'].split('GMT')
                date_time_str = date_time_str[0].strip()
                date_time_obj = datetime.datetime.strptime(date_time_str, '%a %b %d %Y %H:%M:%S')
                data['date_closed'] = date_time_obj
            
            # parse score
            
            if 'teams' in data:
                teams = data.get('teams')
                teams = int(teams.replace(',',''))
                data['teams'] = teams
                data['teams_score'] = score(data['teams'])
            else:
                data['teams_score'] = 0
            
            # parse tags
            tags = []
            for tag in rows['tags'].tolist():
                #print(tag)
                tag = str_to_list(tag)
                tags.extend(tag)
                
            tags = [x.replace('+1 more','') for x in tags]
            tags = [x.replace('+2 more','') for x in tags]
            tags = [x.replace('+3 more','') for x in tags]
            tags = [x.replace('+4 more','') for x in tags]
            tags = list(map(str.strip, tags))
            
            tags = list(set(tags))
            data['tags'] = tags
            
            #print(list(set(tags)))
            df = df.append(data, ignore_index=True)
        
        if quit!=0 and i>quit:
            break
             
    if quit!=0 and i>quit:
        break
        
# drop duplicates
#df = df.drop_duplicates(['link'])

runtime_end = time.time()
print('runtime:', round(runtime_end - runtime_start, 3), 'seconds for', j, 'items')
print(df.shape)
print(df.head())
        
# drop columns
#df.drop(columns=['description'], inplace=True)

df.to_csv(fp_csv_out, sep=';', index=False)

folder: 8467
folder: 8467
1 Computational Imaging
2 Eye Gaze
3 Finding Bubbles in Foam
4 Segmenting Soft Tissue Sarcomas
5 catdoginfo
6 Basic Arabic Vocal Emotions Dataset
7 roberta_an_an_an
8 Military Aircraft Detection Dataset
9 Bert0.01
10 bert0.2
11 bert0.3
12 Calories Burned During Exercise and Activities
13 Captcha Images
14 PUBG Weapon Stats
15 Valorant Weapon Stats
16 IPL 2008 - 2017 Predictor
17 FIA F1 (Formula 1) 1950-2020 data
18 YouTube VP and Presidential Debate Comments
19 Tarantino Scripts
20 Divorce/marriage dataset with birth dates
21 Top crypto exchanges BTC cold wallets transactions
22 Simple Source
23 Devpost Project Data
24 Jester Collaborative Filtering Dataset
25 BIG MART SALES DATASET
26 Churn_Modelling.csv
27 Saudi Hotels in Booking.com
28 AAU RainSnow Traffic Surveillance Dataset
29 AAU Zebrafish Re-Identification Dataset
30 The Brackish Dataset
31 Thermal soccer dataset
32 AAU VAP Trimodal People Segmentation Dataset
33 Demand Dataset
34 Hypertension Data
35 

282 Albert-Large-v2
283 Face Mask Dataset (YOLO  Format)
284 Attend2You - InstaPic Dataset
285 Premier League Detailed Team Data
286 Retail Analysis with Walmart Data
287 Face Recognition
288 Factors Affecting Early Adult Lung Function 2
289 Factors Affecting Early Adult Lung Function 2
290 All India Mobile Data Speed for July 2020 dataset
291 NCAA BASKETBALL MEN percentage win
292 Bike Crash Data 2007-2018
293 European Football Database 2019/2020
294 100,000 UK Used Car Data set
295 AdmissionPrediction.csv
296 Amsterdam - AirBnb
297 Latest IMDB
298 Flick 30k Dataset
299 Flickr 8k Dataset
300 Allergen Data Based on Geographic Location
301 Jojo's Bizarre Adventure
302 S&P index historical Data
303 IPL Player Stats (2008 to 2020)
304 COVID-19 Dataset for California Counties
305 COVID-19 Dataset for Michigan Counties
306 StatsBomb data
307 Births in U.S 1994 to 2003
308 IMDB movies metadata
309 Sample Stock Test
310 Koeln Strafzettel 2015
311 extradata
312 RePaUfra_v1
313 Obesity Stats
31

604 The Price and Sales of Avocado
605 ticks: bitcoin, ethereum,litecoin, ripple
606 Game of Thrones Script All Seasons
607 Laminar Coflow Flame (CRECK-C1C3HT) CH4/N2 65/35
608 Laminar Coflow Flame (DRM19) CH4/N2 65/35
609 bert-v2
610 word2vec_v2
611 word2vec-results
612 word2vec-total-results
613 AZLyrics song lyrics
614 Australian Weather Observation Data
615 google quest lstm model trained in keras
616 Data_files
617 Model Zoo utility files for object detection task
618 Philippine Voters Profile
619 Saudi Daily Stocks History (Tadawul)
620 Facebook Anti-Vaccine Post Data (Scaled Features)
621 Panda_Dataset_final_36_256_256
622 La Liga Santander 2019-2020
623 User's Age by Expenses
624 Rent Prices - Spain
625 Industrial Production Index in USA
626 Leading Indicators OECD
627 Trade of Goods Indexes by Country
628 World Bank's Global Economic Prospects Data
629 World Economic Indiactors by OECD
630 World National and Real GDP (Annualy/Quaterly)
631 World Unemployment Rate from OECD
632

890 12 Months Financial Data
891 Análisis Mercado Libre - Categoría Refrigeradores
892 Fatal US Police Shootings Data
893 Youtube Video Statistics
894 Solar and Meter Readings 15mins
895 Titanic
896 history_weather_munich
897 Linear Regression
898 2020 Cost of Living
899 Bing, NRC, Afinn Lexicons
900 Birdcall Recognition Data
901 Country Mapping - ISO, Continent, Region
902 GTZAN Dataset - Music Genre Classification
903 ISO Country Codes - Global
904 Pulmonary Fibrosis Prep Data
905 Rick&Morty Scripts
906 Riiid! Answer Correctness Prediction - rapids
907 Estrutura Educacional X Desempenho ENEM
908 Twitch Social Networks
909 Spanish Poetry Dataset
910 Swedish NER corpus
911 International airline passengers
912 Acres burned in forest fires in Canada, 1918-1988
913 breast_cancer
914 Duke Breast Cancer Dataset
915 Sales Of Shampoo
916 Species Classification
917 Campeonato Brasileiro 2009-2018
918 resnest269
919 Data Jobs Listings - Glassdoor
920 COVID-19 Mexico Clean & Order by States
921 

1140 Milan Airbnb Open Data (only entire apartments)
1141 Words in Portuguese (BR) and Emotions
1142 COVID-19 Community Data Collection
1143 Malicious_n_Non-Malicious URL
1144 Hotel review
1145 Spooky Author
1146 Indian Economical Data  1990 to 2019
1147 Car damage detection
1148 Uber Request Data.csv
1149 emotion_analysis
1150 Liver_patient
1151 Million Song Data Set Subset
1152 Mobile Data Speeds of All India during March, 2018
1153 Alphabet Stocks
1154 Restaurant_Reviews
1155 newdata
1156 DeepSlice & Secure5G - 5G & LTE Wireless Dataset
1157 Data Analytics to study music streaming patterns
1158 cells_recog
1159 NETWORK ANAMOLY DETECTION
1160 california housing value
1161 Recruit Restaurant Visitor Forecasting Data
1162 ocr_test
1163 ensemble3
1164 American Sexual Health Association
1165 Cotas Atividade Parlamentar dos Senadores (CEAPS)
1166 nepal json file
1167 NLL Statistics
1168 San Francisco Building Permits
1169 The Paleobiology Database
1170 Mental Health Dataset
1171 Profile o

1418 Lord of the rings text
1419 Loan Prediction
1420 Test_A102.csv
1421 Training set
1422 Millets
1423 COVID-19 INDIA DATA
1424 COVID-19 SOUTHAFRICA
1425 Mental Health Poll
1426 Palmer Penguins Dataset-Alternative Iris Dataset
1427 SMILE Twitter Emotion Dataset
1428 OpenPowerlifting
1429 California_fire_perimeters
1430 Random Aircraft Information
1431 Devanagari Character Dataset
1432 Cloud and Non-Cloud Images(Anomaly Detection)
1433 Major countries nCoV Data Credibility Tracking
1434 ANZ Data@ANZ Program
1435 KPMG Virtual Internship
1436 Quantium Data Analytics Virtual Experience Program
1437 online_class_prediction
1438 Samsung Mobile Price
1439 Ask Reddit Dataset
1440 COVID-19 Coronavirus Complete Dataset
1441 Titanic data files
1442 Surat Traffic vehicle data
1443 ASII.jk
1444 Melanoma TFRecord 256x256 ATelea
1445 lyft training tfrecord 224 - 0
1446 lyft training tfrecord 224 - 1
1447 lyft training tfrecord 224 - 2
1448 lyft validation tfrecord 224
1449 Fire and Smoke dataset
145

1701 lyft-tfrecords-uint8-part5
1702 lyft-tfrecords-uint8-part6
1703 lyft-tfrecords-uint8-part7
1704 lyft-tfrecords-uint8-train-chopped100-part0
1705 lyft-tfrecords-uint8-train-chopped100-part1
1706 lyft-tfrecords-uint8-train-chopped100-part2
1707 lyft-tfrecords-uint8-train-chopped100-part3
1708 lyft-tfrecords-uint8-train-chopped100-part4
1709 lyft-tfrecords-uint8-train-chopped100-part5
1710 lyft-tfrecords-uint8-train-chopped100-part6
1711 lyft-tfrecords-uint8-train-chopped100-part7
1712 lyft-tfrecords-uint8-val-part0
1713 lyft-tfrecords-uint8-val-part1
1714 lyft-tfrecords-uint8-val-part2
1715 lyft-tfrecords-uint8-val-part3
1716 lyft-tfrecords-uint8-val-part4
1717 lyft-tfrecords-uint8-val-part5
1718 lyft-tfrecords-uint8-val-part6
1719 lyft-tfrecords-uint8-val-part7
1720 lyft-tfrecords-uint8-valid-chopped100-part0
1721 lyft-tfrecords-uint8-valid-chopped100-part1
1722 lyft-tfrecords-uint8-valid-chopped100-part2
1723 lyft-tfrecords-uint8-valid-chopped100-part3
1724 lyft-tfrecords-uint8-va

1965 JHU Coronavirus COVID-19 Global Cases, by country
1966 COVID19 - The New York Times
1967 covid19-public-forecasts
1968 USAFacts US Coronavirus Database
1969 Cooperative Patent Classification (CPC) Data
1970 Bitcoin Cash Blockchain
1971 Dash Crypto Blockchain
1972 Dogecoin Crypto Blockchain
1973 Ethereum Classic Blockchain
1974 Litecoin Crypto Blockchain
1975 Zcash Crypto Blockchain
1976 Disclosed Standard Essential Patents (dSEP) Data
1977 ChEMBL EBI Small Molecules Database
1978 Ethereum Blockchain
1979 1000 Cannabis Genomes Project
1980 geo-openstreetmap
1981 Google Analytics Sample
1982 Google Patents Research Data
1983 MAtrixware REsearch Collection (MAREC) Data
1984 Open Images
1985 Google Patents Public Data
1986 PatentsView Data
1987 BigQuery Sample Tables
1988 Medicare and Medicaid enrollment
1989 Medicare and Medicaid Services
1990 Point-in-Time Homelessness Count
1991 SEC Filings
1992 Intellectual Property Investigations by the USITC
1993 USPTO OCE Patent Assignment Data

2204 Uber and Lyft Dataset Boston, MA
2205 Build Bridges, Not Walls
2206 EEG data from basic sensory task in Schizophrenia
2207 California Kindergarten Immunization Rates
2208 Denver AirBNB
2209 Weekly CDC Pneumonia Cases
2210 titanic
2211 YouTube Video Categories
2212 Massdrop Catalog
2213 finetuneunm
2214 unm10000II
2215 goodreads books/author data
2216 transactions
2217 Sushi Sandwich
2218 Intraday market data
2219 headline
2220 gdp_per_capita
2221 titanic_train
2222 testqqw
2223 Horse Colic Dataset (with test file)
2224 Dados hidrometeorológicos do Brasil
2225 carpeta
2226 Brain cancer gene expression - CuMiDa
2227 Breast cancer gene expression - CuMiDa
2228 Leukemia gene expression - CuMiDa
2229 formation-machine-learning
2230 IBOVESPA Futures Contracts
2231 BM&FBovespa Time Series 1986-2019
2232 Foreign Exchange Rates 2000-2019
2233 Abstract Art Gallery
2234 Case Study: Applicants for a Gold Digger position
2235 CAC40 Stocks Dataset
2236 FIFA21 OFFICIAL DATASET
2237 Dataset of En

2474 ISIC 2019 TFRecords 192x192
2475 ISIC 2019 TFRecords 256x256
2476 ISIC 2019 TFRecords 384x384
2477 ISIC 2019 TFRecords 512x512
2478 ISIC 2019 TFRecords 768x768
2479 JPEG ISIC 2019 1024x1024
2480 JPEG ISIC 2019 128x128
2481 JPEG ISIC 2019 192x192
2482 JPEG ISIC 2019 256x256
2483 JPEG ISIC 2019 384x384
2484 JPEG ISIC 2019 512x512
2485 JPEG ISIC 2019 768x768
2486 JPEG Melanoma 1024x1024
2487 JPEG Melanoma 128x128
2488 JPEG Melanoma 192x192
2489 JPEG Melanoma 256x256
2490 JPEG Melanoma 384x384
2491 JPEG Melanoma 512x512
2492 JPEG Melanoma 768x768
2493 Malignant TFRecords 128x128
2494 Malignant TFRecords 192x192
2495 Malignant TFRecords 256x256
2496 Malignant TFRecords 384x384
2497 Malignant TFRecords 512x512
2498 Malignant TFRecords 768x768
2499 Malignant-v2 TFRecords 1024x1024
2500 Malignant-v2 TFRecords 128x128
2501 Malignant-v2 TFRecords 192x192
2502 Malignant-v2 TFRecords 256x256
2503 Malignant-v2 TFRecords 384x384
2504 Malignant-v2 TFRecords 512x512
2505 Malignant-v2 TFRecords 76

2749 Chicago Towing Records
2750 Speed Camera Violations in Chicago, 2014-2016
2751 preprocessing
2752 train_tatanic
2753 modelcmp
2754 philippines
2755 Taiwanese Bankruptcy Prediction
2756 China Mobile User Gemographics
2757 China City Names with province
2758 jiejingshibie
2759 Facebook_dataset
2760 ODI cricket runs
2761 Air Passengers
2762 Europe Economic Crisis
2763 faceRecognition
2764 facial emotion recognition
2765 bank details
2766 BigMartSale
2767 research_student.csv
2768 Stanford Open Policing Project
2769 Ta Feng Grocery Dataset
2770 Africa Economic, Banking and Systemic Crisis Data
2771 African Country Recession Dataset (2000 to 2017)
2772 our_submissions
2773 Annotated Potholes Image Dataset
2774 bert_cosreg
2775 roberta_large
2776 Covid-19 Cases Blumenau-SC
2777 atec_nlp_sim
2778 waldo_locations
2779 Esophageal Endoscopy Images
2780 EfficientNetB0-B7 Keras Weights
2781 America's Top College Rankings 2019 (Forbes)
2782 Airlines Tweets Sentiments
2783 2018 calorie, exercis

2993 Los Angeles Vehicle & Pedestrian Stop Data
2994 Los Angeles Water Capture by Method
2995 Los Angeles Water Main Breaks per Month
2996 Los Angeles Zoning Reference Table
2997 What's Happening LA Calendar Dataset
2998 OAK License Plate Reader (PLR) 12/2010 to 5/2014
2999 Oakland 5-Year Moratorium for City Streets
3000 Oakland Adopt a Drain Program Adoptions
3001 Oakland Call Center & Public Work Service Requests
3002 Oakland Campaign Finance FPPC Forms Data
3003 Oakland Certified Green Businesses
3004 Oakland City 5-Year Financial Forecast 2016-20
3005 Oakland City Boards & Commissions Contact List
3006 Oakland Coliseum Event Ticket Distribution 2011-18
3007 Oakland Crime, 911 Calls, Gun Incidents
3008 Oakland Crime Statistics 2011 to 2016
3009 Oakland Crime Statistics 4/2013 to 8/2016
3010 Oakland Cultural Funding Program Grant Recipients
3011 Oakland Equal Access Accommodations
3012 Oakland Equity Indicator Scores and Conversions
3013 Oakland Equity Indicators - Health & Wellness


3244 Salary Data
3245 The Marvel Universe Social Network
3246 Titanic Training Dataset
3247 Air pollution impact
3248 Fire-detection-model-Keras for video
3249 London Borough and Ward Boundaries up to 2014
3250 Pokemon Gen VII Pokedex with Moves
3251 Kiva loan mpi geocodes
3252 Fairy Tales
3253 20201109TabNetTest2
3254 Crimes in Chicago
3255 skewed
3256 RSNA Train/Test 256 Sub Window
3257 Cook Partisan Voter Index (PVI) 2017
3258 Cook Partisan Voter Index (PVI) for States (2017)
3259 Nebraska Football Box Scores 1962-2019
3260 NFL Team Stats 2002-2019 (ESPN)
3261 US House Cook Ratings & Election Results 2002-2018
3262 US Senate Cook Rating & Election Results 1976-2018
3263 NYSE from 1/1/05 to 1/16/18
3264 NTSB Accident Reports
3265 Ocean Ship Logbooks (1750-1850)
3266 Tectonic Plate Boundaries
3267 tweet-sentiment-extraction-2020-complete-pseudo
3268 Skill Mapping Dataset
3269 2016 MLB Season
3270 MTA Turnstile Traffic
3271 Actor's Dataset
3272 baidu-competition_final_data
3273 Diabeti

3504 Face Detection in Images
3505 Resume Entities for NER
3506 Vehicle Number Plate Detection
3507 Raven_distribute_nine
3508 SEC (EDGAR) Company Names & CIK Keys
3509 Hearthstone: Heroes of Warcraft Standard Cards
3510 UK Daily Confirmed Cases
3511 1.6 million UK traffic accidents
3512 NBA Finals Team Stats
3513 Publicly Supported Symbols of the Confederacy
3514 ND-GAIN: UAA Dataset
3515 Jigsaw Multilingual IPA-transliterated dataset
3516 Weather Data for COVID-19 Data Analysis
3517 USDA Barley Data
3518 Predicting Student Grades
3519 volcano stft data
3520 volcano test features
3521 volcano train fts
3522 predicting churn using balanced data set
3523 Twitter Bots Accounts
3524 Prime Ministers of Spain
3525 CORD-19 fastText Vectors
3526 CORD-19 Full Index
3527 CORD-19 QA
3528 CORD-19 Study Design
3529 COVID-19 National responses dataset
3530 trabajo
3531 All Roy Rogers Locations (May 2020)
3532 Word Cloud Mask
3533 Privatization of Space
3534 Days not Spent at School
3535 GitHub Issu

3783 Credit card fraud detection
3784 Logistic regression To predict heart disease
3785 MNIST-Handwritten Digit Recognition Problem
3786 made_cv_hw2
3787 Cassava Leaf Disease TFRecords 128x128
3788 Cassava Leaf Disease TFRecords 256x256
3789 Cassava Leaf Disease TFRecords 384x384
3790 Cassava Leaf Disease TFRecords 512x512
3791 Cassava Leaf Disease TFRecords classes 512x512
3792 housePrices_log
3793 Monet TFRecords 256x256
3794 TFRecords Monet paintings 256x256
3795 North Dakota Mile Markers
3796 Ponniyan selvan Tamil Book for NLP
3797 Skin Cancer MNIST: HAM10000
3798 Sri Lankan State Universities, Faculties & Degrees
3799 COVID-19 Stats and Mobility Trends
3800 Notícias publicadas no Brasil
3801 COVID-19 Indonesian Tweets
3802 IMDb Indonesian Movies
3803 Indonesian Batik Motifs
3804 Indonesian Names
3805 countries
3806 Student Grade Prediction
3807 Play Store Game Reviews
3808 Text Classification on Emails
3809 Parkinson's Disease (PD) classification
3810 Cloud GPU provider cost and s

4071 Deaths INEGI 2014
4072 WPP2019POP Annual population by age group
4073 Titanic data
4074 Eclipse Megamovie
4075 Global Terrorism DB
4076 Mother Jones Mass Shootings
4077 School Shootings US 1990-present
4078 climate change
4079 Obras Odebrecht Latam
4080 Ranking presidentes
4081 19,000 Spotify Songs
4082 TabNetFe7Seeds424810Folds
4083 Bolivia COVID19 datasets
4084 London Bike Sharing System
4085 150K Lyrics Labeled with Spotify Valence
4086 ESPN NBA Players Data
4087 Fortune Global 500
4088 Jordan vs Lebron
4089 Disease Predictor
4090 Generation 8 Pokemon
4091 Superalloys
4092 Wood texture samples
4093 AdaBelief
4094 pytorch tabnet
4095 Overwatch
4096 DHT11 Temperature and Humidity Sensor
4097 ATP matches dataset
4098 Twitter Climate Change Sentiment Dataset
4099 Tesla Vehicle Sales by Quarters
4100 json_femurn
4101 Exoplanets Database
4102 Brazilian dams and Brumadinho households
4103 Quality Prediction in a Mining Process
4104 COVID-19 dataset from Región de Murcia, Spain
4105 Op

4348 ISEAR Dataset
4349 Autism Screening
4350 Books_with_description
4351 US Election 2020 - County Results + Metadata
4352 Captcha 2 text
4353 Rush Tour Date and Locations
4354 Skin Cancer: Malignant vs. Benign
4355 500 Cities: Local Data for Better Health
4356 fdsfsdf
4357 Wikiquote Short English Quotes
4358 NYC WFLS
4359 Predicting Fraud for Mobile Payment Services
4360 College data
4361 Drinks by country
4362 Multidigit MNIST(M2NIST)
4363 50 Startups
4364 Census Income
4365 Position_Salaries
4366 Social Network Ads
4367 Blue Book for Bulldozers
4368 Sentiment Analysis dataset-Google Play App Reviews
4369 Social Media Prediction
4370 Twitter Black Panther
4371 Life Expectancy by Country and Year Gapminder
4372 Urdu Aphabets (MNIST)
4373 Linear Regression Content
4374 numberplate 2
4375 Turkish Super League Matches (1959-2020)
4376 bert_inference
4377 bertinference1
4378 inference
4379 OpenVaccine Competition gold submission files.
4380 housing_data
4381 Boston House-Predict
4382 Mar

4569 League of Legends Champions Abilities 10.15
4570 League of Legends High Elo Patch 10.16
4571 CursoPandas
4572 Bitcoin News
4573 Influenza and Env Factors
4574 Buenos Aires Public Bike System Dataset
4575 Truck Breadcrumb information
4576 housing prices
4577 GNP Per Industry in Mexico
4578 The SDOBenchmark Dataset
4579 Combustible Price Brasil
4580 How ISIS Uses Twitter
4581 Religious Texts Used By ISIS
4582 The Global Avian Invasions Atlas
4583 United States Commutes
4584 20 newsgroup preprocessed
4585 Bovespa Index - IBOV composition stocks data
4586 Crunchyroll animes database
4587 Deep Learning A-Z - ANN dataset
4588 Medical NER Spacy Model
4589 NASDAQ financial fundamentals
4590 Financials as Reported 2010-2020 - SEC Filings
4591 SEC Filings 1994-2020
4592 S&P 500 futures tick data (SP)
4593 tuyutext
4594 Employee-review
4595 2020 Atlantic Hurricanes/Storms (preeliminary)
4596 World Health Organization COVID-19 data
4597 US counties COVID 19 dataset
4598 Yelp dataset
4599 Open

4840 OC2Emission
4841 Melbourne Housing Snapshot
4842 Flower images dataset
4843 Environmental Sensor Telemetry Data
4844 LAN Network Stability
4845 faces_data_new
4846 usapolygones4
4847 Bank Marketing Campaigh Dataset
4848 Payroll dataset from Campo Alegre - AL - Brazil
4849 Brazilian IBOV Historical Data from 1992 to 2019
4850 Practice Dataset Predict Customer Churn (Telecom)
4851 YouTube Scrapped Data
4852 IPL 2020
4853 Kaggle Competions, Rankings and Kernels
4854 Top Indian Educational apps reviews
4855 imdb dataset
4856 Victoria State Accident DataSet
4857 ToyotaCorrolla Data-Set .
4858 Me_Jarvis Lichess Matches
4859 L&T Vehicle Loan Default Prediction
4860 Airline Passenger Traffic
4861 Bank Data : Testing
4862 Bank Telemarketing Campaign Case
4863 Bike Sharing
4864 Categorical : Country Geotags
4865 Real-time Covid 19 Data
4866 Cricket
4867 Diabetes (●'◡'●)
4868 Exchange Rate
4869 Financial Banking Marketing Campaign
4870 HELP International
4871 India Press
4872 Loan Defaulter


5156 trainingdata_7_8_9
5157 trainingsample8_9
5158 Canadian Underwriter insPRESS Article Headlines
5159 Criptocurrencies
5160 Covid19 Spain report
5161 What people purchase
5162 Encuesta Longitudinal Empresarial 5ta vers CHILE
5163 Complete IMDB Movies Dataset
5164 takee2
5165 title12
5166 usedCarPrices
5167 San Franscisco Checkin
5168 Data_set
5169 Crimes Committed in France
5170 Open Data 500 Companies
5171 Stock Holdings of Elected Officials (Virginia)
5172 rvl-cdip
5173 Anna University results May-June 2016
5174 HeartDisease
5175 Heartdiseasepredictiion
5176 Car Data
5177 knime_cat_publ
5178 USA Housing dataset
5179 230 Bird Species
5180 Detect Autism from a facial image
5181 BBC YouTube Videos Metadata
5182 Bird Songs Recordings from United States
5183 China Regions Map
5184 Chinese MNIST
5185 COVID19 Daily Updates
5186 COVID-19 Romania - County level
5187 COVID19 Tweets
5188 Doctoral Boards in Military Academies in Romania
5189 Elementary school admission Romania 2014
5190 EU St

5432 Income Prediction using python
5433 INTELLIGENT IRRIGATION SYSTEM
5434 Indian Start-ups Funding Weekly Dataset 2016-2020
5435 US PRESIDENTS
5436 Suicides in India
5437 Coursera Face Recognition
5438 Recipe for Creating Word Embedings word2vec
5439 Tech News Articles
5440 Faces with Masks
5441 House Hold Price
5442 Temperature in Ann Arbour, Michigan (2005-2015)
5443 Epileptic Seizure Recognition
5444 Poems Categorization Datasets
5445 Titanic Solution for Beginner's Guide
5446 Harvard Course Enrollments, Fall 2015
5447 Electoral Integrity in 2016 US Election
5448 Harvard Tuition
5449 Azerbaijan Datasets
5450 Covid 19 - GER
5451 read-for-training
5452 Car Price Prediction
5453 NSL-KDD
5454 EurUsd 60 Min
5455 ATIS Airline Travel Information System
5456 Corona Virus Cases in Pakistan
5457 Customer Churn
5458 Intent Classification
5459 Fake News
5460 Tunisia 2020 Projects
5461 kaggle
5462 Mushroom Classification Updated Dataset
5463 Bollywood Celebrity Faces
5464 US President Campaign

5703 Price of flats in Moscow
5704 Playing Cards Labelized Dataset
5705 Parkinson Replicated Acoustic Features Dataset
5706 COVID-19-Research
5707 Titanic1
5708 sales_train.csv
5709 Workers Browser Activity in CrowdFlower Tasks
5710 Semantic segmentation of aerial imagery
5711 Breast Cancer
5712 MNIST npy and JPEG
5713 Hung Data
5714 Data sample
5715 iterative_stratification
5716 json_string
5717 Top Chess Players
5718 Weather Data for Recruit Restaurant Competition
5719 crime  senior citizen
5720 Abalone UCI
5721 LendingClub Issued Loans
5722 dear genie kickstarter
5723 Fake news dataset for politics and fashion
5724 Historical Botnet Attack in Malaysia
5725 Twitter Emotion cryptocurrency
5726 InlandSea Tsunami Damage
5727 Wine Quality by UCI
5728 Users mobile banking transaction frequency
5729 cars.csv
5730 Landscape Classification
5731 DeepSolar dataset
5732 COVID-19 Literature
5733 Horse Racing
5734 Tennis 2011-2019
5735 Twitter Friends
5736 Australian Childcare List 2020
5737 #432

5983 Medicare Spending Per Patient across Hospitals
5984 Amazon Commerce Reviews
5985 Amazon COVID-19 Predatory Pricing Data
5986 Amphibians Data Set
5987 Analytics Vidhya Insurance Cross Sell Prediction
5988 Bank Marketing Data Set
5989 Early Stage Diabetes Risk Prediction Dataset
5990 Facebook Large Page-Page Network Data Set
5991 Forest Fires Data Set Portugal
5992 Hepatitis C Virus for Egyptian Patients Data Set
5993 Machine Hack Housing Price Prediction
5994 QSAR Androgen Receptor Data Set
5995 QSAR Bioconcentration Classes Data Set
5996 QSAR Fish Toxicity Data Set
5997 Real Time Election Results: Portugal 2019 Data Set
5998 Student Performance Data Set
5999 Study of Asian Religious & Biblical Texts Data Set
6000 Uber Travel Movement Data [2 Billion+ Trips]
6001 Europe Hotel Satisfaction Score
6002 Emotions in text
6003 Predict HDI of villages using Satellite imagery
6004 List of English contractions
6005 COVID-19 useful features by country
6006 Covid-19 India Dataset
6007 Social 

6255 Mapping the KKK 1921-1940
6256 MRI and Alzheimers
6257 Home Mortgage Disclosure Act Data, NY, 2015
6258 Obama White House Budgets
6259 Penn World Table
6260 Scientific Researcher Migrations
6261 SCOTUS Opinions Corpus
6262 SF Beaches Water Quality
6263 SF Street Trees
6264 Spy Plane Finder
6265 Firearms Provisions in US States
6266 Trial and Terror
6267 Who's the Boss? People with Significant Control
6268 US Permanent Visa Applications
6269 US Traffic, 2015
6270 Traditional Flute Dataset for Score Alignment
6271 Cities 2020 for CDP
6272 Social indicators
6273 Spoken Verbs
6274 Synthetic Speech Commands Dataset
6275 Spotify Daily Top 200 Tracks in the Philippines
6276 roberta_large_squad2
6277 Kaggle Digit Recognition Dataset Reformatted
6278 Easily find ISO country codes (multilanguage)
6279 COVID-19 by country - Daily update
6280 COVID-19 Lockdown dates by country
6281 Japan Population Data
6282 MOA_transformations
6283 GloVe: Global Vectors for Word Representation
6284 BioBERT P

6534 Train/Test Hotel Data
6535 Dating App User Profiles' stats - Lovoo v3
6536 E-commerce - Users of a French C2C fashion store
6537 Influencers' profile descriptions - C2C Ecommerce
6538 Lottery features for time series Machine Learning
6539 Products and ratings of E-commerce - NewChic.com
6540 Sales of summer clothes in E-commerce Wish
6541 Adoptable Dogs
6542 Triple Crown Races (2005 - 2019)
6543 Morocco-coronavirus
6544 My watched movies list
6545 H-1B Visas
6546 PGA Tour Data
6547 Stock Index Prediction (Both Labels and Features)
6548 Agricultural Land Values (1997-2017)
6549 pokemon.com
6550 Facebook Stock
6551 MovieLens
6552 improved_sub.csv
6553 TCGA COAD MSI vs MSS Prediction (JPG)
6554 ENADE - Microdados - 2016, 2017, 2018
6555 IGM - Indice Governanca Municipal
6556 Adoro Cinema Movies Dataset
6557 WTA Matches and Rankings
6558 São Paulo Bus System
6559 Titanic
6560 Pink Floyd Lyrics
6561 Football Events Dictionary
6562 Philippines DepEd List of Senior High Schools
6563 3000

6811 Insurance Churn Prediction : Weekend Hackathon
6812 apex compiled for GPU for specific docker version
6813 jigsaw_multilingula_toxicity_token_encoded
6814 toxic multi-language trained torch model
6815 UDAS loan information
6816 German Credit Risk - With Target
6817 dataset
6818 Denver Hourly Weather
6819 adasdasd
6820 South Africa Consumer Price Index (CPI)
6821 japanese_bert_pretrained_model
6822 1000 Camera Specs
6823 80 Cereals
6824 2018 NCAA Solution File - Men's
6825 2018 NCAA Solution File - Women's
6826 US Dept of Education: College Scorecard
6827 Hillary Clinton's Emails
6828 Kaggle Blog: Winners' Posts
6829 2017 Kaggle ML & DS Survey
6830 2018 Kaggle ML & DS Survey
6831 Meta Kaggle
6832 No Data Sources
6833 Recipe Ingredients Dataset
6834 San Francisco Crime Classification
6835 SF Salaries
6836 US Baby Names
6837 iterative-stratification
6838 TitanicTrainingData
6839 My dataset
6840 NYC Housing Data 2003-2019
6841 BTS Lyrics
6842 APTOS2019-Processed-Images
6843 Find Waldo

7090 House Sales in King County, USA
7091 Access To Electricity
7092 Dhaka - AI
7093 NIH Chest X ray 14 (224x224 resized)
7094 dash_packages
7095 tz_ccode_latlong_locations_data
7096 Star-bucks coffee making
7097 big_data
7098 CSGO Guns Dataset
7099 city_scape
7100 freebitco.in
7101 AAPL.csv
7102 bengaliaicv19_trainedmodels
7103 COVID-19 Xray Dataset (Train & Test Sets)
7104 Jigsaw XLM-Roberta MLM Trained Weights
7105 M5 Forecasting Ensemble Data
7106 OSIC Model Weights
7107 SIIM-ISIC Submission Files
7108 XLNet Weights
7109 2020 Indonesian University Ranking
7110 cc.id.300.vec.gz
7111 English Sarcasm
7112 Rainfall_Data
7113 digit-recognizer
7114 Gold Futures Historical Data
7115 sheep_goat
7116 Eurovision 2018 Detailed Results with Jury Info
7117 Aviation Accident Database & Synopses
7118 ICES_Catch_Dataset
7119 Solar Flares from RHESSI Mission
7120 UCDP Georeferenced Event Dataset
7121 Gender Voice Prediction--Decision tree modeling
7122 MAN_WOMAN_DATA
7123 image-fmix
7124 kh-rsna-mo

7325 Party strength in each US state
7326 Sounding rocket and MSIS temperature dataset
7327 UCI Communities and Crime Unnormalized Data Set
7328 UCI Communities and Crime Unnormalized Data Set
7329 Titanic open Research dataset
7330 Black Friday Sales Prediction
7331 Air Quality Data - Brisbane CBD
7332 All Datasets for Practicing ML
7333 Australian Forest Fire Area _2019-2020
7334 Earthquakes Data NZ
7335 Effects of COVID19 on Trade
7336 Facial Keypoint Detection SampleDS
7337 Hindi English Sentence Pairs
7338 Fine Dust Measurement [2018-2019]
7339 Impact of COVID on Black Communities
7340 Marathi English Sentence Pairs
7341 Mask Wearing
7342 Mild TBI & Concussion Claims
7343 Object Detection Sample Images
7344 Queensland Flora Census - 2019
7345 Random Image for Testing Classification
7346 TACO Dataset
7347 WORD DIFFICULTY PREDICTION
7348 APPLYfdfd
7349 Movie Lens dataset
7350 scores in leaderboard
7351 Activity Logs for smart phone
7352 Armanik Patient Drug-Switch
7353 Corona_Data
7

7618 Weekly Influenza Reports by Country
7619 World population demographics by age 2019
7620 roberta_large_squard2
7621 Identifying Influential Bloggers: Techcrunch
7622 Identifying Influential Bloggers: The TUAW Dataset
7623 Product Classification and Clustering
7624 Product Clustering, Matching & Classification
7625 Crack Segmentation Dataset
7626 Bike Sharing Dataset
7627 gold price dataset
7628 IMDB Dataset of 50K Movie Reviews
7629 Online retail dataset
7630 Santander Customer Transaction Prediction Dataset
7631 India - Trade Data
7632 diabetes prediction
7633 Tom-The-Cat
7634 COVID-19 MX
7635 Haarcascades
7636 LBPcascade
7637 Combined_candy_usip
7638 US_Industrial_Prodcution
7639 Population vs profit made by restuarant
7640 Cryptocurrency Dataset
7641 Mexico road accidents
7642 State wise tree cover India
7643 Urteile des eidg. Bundesgerichts
7644 NYC_Flight_Delay
7645 celeba-hq
7646 Dataset Metnum
7647 Education and COVID-19
7648 Handwriting Recognition
7649 Dummy Datasets
7650 

7917 Gemstones Images
7918 Genome Information by Organism
7919 IMS Daily Northern Hemisphere Sea Ice Coverage
7920 Solid Waste and Recycling Collection Routes
7921 Tarot Deck
7922 Weight vs Age of chicks on different diets
7923 WHO - Immunization coverage estimates by country
7924 Plain Text Wikipedia 2020-11
7925 CPU Specifications
7926 Playing Card
7927 speech commands tensorflow
7928 notMNIST dataset
7929 Energy consumption of the Netherlands
7930 baseline_weights
7931 winequality-red
7932 PT_ASAG_2018
7933 16 Factor Personality Test Responses
7934 Analog to Multiple Broadband Inventories Responses
7935 Big 5 Personality Scores
7936 Consideration of Future Consequences Scale Data
7937 Depression Anxiety Stress Scales Responses
7938 Duckworth Grit Scale Responses
7939 Empathizing-Systemizing Test Responses
7940 Experiences in Close Relationships Scale Responses
7941 Exposure Based Face Memory Test Responses
7942 Feminist Perspectives Scale Responses
7943 Fisher Temperament Inventory 

8168 South Asian Churn dataset
8169 Abalone
8170 final_train
8171 train_v2
8172 wine quality selection
8173 T5_base_pytorch
8174 tmdb_5000_movies
8175 slices
8176 Cassava Leaf Disease Models
8177 Indian Hindi film music
8178 Graduate school admission data
8179 World_Happiness Report_2017
8180 Starbucks Customer Survey
8181 MotionSense Dataset : Smartphone Sensor Data - HAR
8182 python-datatable
8183 Agri-Data-India
8184 All_India Pincode
8185 L&T Vehicle Loan Default Prediction
8186 IPL Batting First Wins Dataset
8187 imdb_movies_data
8188 Quotes- 500k
8189 Lego Images-200 most common pieces
8190 Indian Premier League (Cricket)
8191 minst fashion dataset
8192 DR Congo Conflicts, 1997-2020
8193 DonorsChoose.org Application Screening
8194 Wine Quality analytical data
8195 mercari-train
8196 COVID-19 UK Dataset
8197 UK NHS Hospital Database
8198 US Election 2020 Tweets
8199 2020 World Development Indicators
8200 Gamedevmap
8201 Signs Detection Dataset
8202 Women's Big Bash League matches


8468 Census (Augmented)
8469 Cholesterol
8470 Christianity_by_country
8471 cities by tourist
8472 Citrus_production
8473 Tree Growth
8474 Climate_Change_Performance_Index
8475 cloud dataset
8476 CMU MOSI
8477 Commercial_sorghum
8478 Composite_Index_of_National_Capability
8479 Console Purchases
8480 Coronavirus until 2020-09-01
8481 Corporal_punishment
8482 Corruption_Perceptions_Index
8483 Countries With Most Air Departures 1970 - 2017
8484 Country by International tourism, number of arriv
8485 Country goals in FIFA WorldCup
8486 Country Life Expectancy 1960-2017
8487 Country Production of Wheat 1960 2019
8488 Country Total Patent
8489 COVID-19 image dataset collection (volumes folder)
8490 CPI 1998-2020
8491 Credit (Augmented)
8492 Cricket_World_Cup
8493 Crime Rate in the US
8494 Cross-country_skiing_at_the_Winter_Olympics
8495 Cross-country_skiing_at_the_Winter_Paralympics
8496 Crude Oil Production by Country
8497 Curling_at_the_Winter_Olympics
8498 Cycling_at_the_Summer_Olympics
849

8748 List_of_Austrian_states_by_GDP
8749 List_of_banned_films
8750 List_of_banned_video_games
8751 List_of_Belgian_provinces_by_GDP
8752 List_of_Beyblade_episodes
8753 List_of_books_banned_by_governments
8754 List_of_Brazilian_films_of_the_1980s
8755 List_of_Bulgarian_provinces_by_GDP
8756 List_of_Caribbean_countries_by_population
8757 List_of_Chinese_administrative_divisions_by_GDP
8758 List-of-cities-in-Italy
8759 List_of_Colombian_departments_by_GDP
8760 List_of_continents_by_population
8761 List_of_counties_of_Kenya_by_GDP
8762 List_of_countries_and_dependencies_by_area
8763 List_of_countries_and_dependencies_by_population
8764 List_of_countries_and_territories_by_land_borders
8765 List_of_countries_by_4G_LTE_penetration
8766 List_of_countries_by_age_at_first_marriage
8767 List_of_countries_by_age_structure
8768 List_of_countries_by_aircraft_component_exports
8769 List_of_countries_by_aluminium_exports
8770 List_of_countries_by_aluminium_oxide_production
8771 List_of_countries_by_a

8995 List_of_Indian_states_and_union_territories_by_GDP
8996 List_of_Indonesian_provinces_by_GRP_per_capita
8997 List_of_islands_by_population
8998 List_of_Israeli_films_before_1960
8999 List_of_Italian_regions_by_GDP
9000 List_of_Italian_regions_by_GRP_per_capita
9001 List_of_Japanese_prefectures_by_GDP
9002 List_of_Japanese_prefectures_by_GDP_per_capita
9003 List_of_largest_cities
9004 List_of_largest_empires
9005 List_of_Latin_American_countries_by_population
9006 List_of_Malaysian_states_by_GDP
9007 List_of_Mexican_states_by_GDP
9008 List_of_Mexican_states_by_GDP_per_capita
9009 List_of_Middle_Eastern_countries_by_population
9010 List_of_minimum_wages_by_country
9011 List_of_national_capitals_by_population
9012 List_of_national_legal_systems
9013 List_of_Nigerian_states_by_GDP
9014 List_of_Nobel_laureates_by_country
9015 List_of_North_American_countries_by_GDP_-nominal-
9016 List_of_Oceanian_countries_by_GDP_-nominal-
9017 List_of_Oceanian_countries_by_population
9018 List_of_OECD_

9266 World_population
9267 World_Rowing_Championships
9268 World_Rugby_Rankings
9269 World_Sprint_Speed_Skating_Championships_for_Men
9270 World_Sprint_Speed_Skating_Championships_for_Women
9271 World_Squash_Championships
9272 World_Taekwondo_Championships
9273 World_Tourism_rankings
9274 World_Weightlifting_Championships
9275 World_Wrestling_Championships
9276 World Death Rate by Country
9277 World Literacy Rate Adult Total by Country
9278 World Mortality Rate Infant by Country
9279 World Top 10 Manufacturers 1976-2020
9280 Wrestling_at_the_Summer_Olympics
9281 WSF_World_Team_Squash_Championships
9282 WTBA_World_Tenpin_Bowling_Championships
9283 XBox Data
9284 xlm-mlm-tlm-xnli15-1024
9285 xprophetnet-large-wiki100-cased-xglue-ntg
9286 zindi-spot-the-mask-challenge
9287 Feature Engineering Data
9288 NLP Course
9289 plotly countries
9290 pytorch-pretrained-BERT
9291 Mines vs Rocks
9292 Statlog (Shuttle) Dataset, UCI Data Repo.
9293 iMet Version17
9294 Gen 1 Pokémon Google Scrape
9295 re

9553 Celebrities & Famous People, and their Properties
9554 Finnish Locative Cases for Nouns
9555 Finnish Words and their Concreteness Values
9556 Movie Title Puns
9557 SemFi: Finnish Semantics with Syntactic Relations
9558 The Best Sarcasm Annotated Dataset in Spanish
9559 bert-pretrained-models
9560 GPT-2-BPE
9561 pascal_fastai_version
9562 Bert Tiny
9563 CoronaWhy Plus
9564 hyperion-power-bi
9565 United States crime rates by county
9566 Diversity Index of US counties
9567 Unemployment by County
9568 Wikipedia Sentences
9569 HDMA Washington State Home Loans, 2016
9570 Colorado Fourteeners
9571 Costco Warehouse Information
9572 Kinopoisk's movies reviews
9573 rocketcam
9574 ALBERT pretrained parametrs for Tensorflow
9575 [Jigsaw] Multilingual swear profanity
9576 Jigsaw Train Multilingual Coments (Google API)
9577 OSIC Pulmonary Fibrosis Progression Lungs Mask
9578 Adult Census Income Data
9579 Sample dataset for OLS Reg. Model (Statistics)
9580 test_sample
9581 AvitoFurnitureSet
9582

9840 Yellow Pages of Pakistan
9841 Fake-Real pairs in DFDC test videos
9842 U.S. Last WWII Veterans
9843 Strict or Lenient? COVID-19 Lockdowns Compared
9844 Covid-19 Vaccine by Developmental Phase
9845 U.S. Adults Susceptible to Severe Covid-19
9846 Pandemic's Racial Disparity
9847 Nationalities  Eager To Take Covid-19 Vaccine
9848 Americans Fear for Democracy 2020 Election
9849 U.S. Elections Waiting for Results
9850 Abortion Statistics
9851 COVID-19 Allocations
9852 Anxiety and Depression Psychological Therapies
9853 Bioethics: Trials and Results
9854 Bangladesh: Quarantine for COVID-19
9855 COVID-19 Risk Index
9856 Gas emissions (CO2-e) by transport sector.
9857 COVID-19 Sex-Disaggregated Data
9858 Domestic Violence Incidents
9859 Earnings of females and males employees.
9860 Covid19 Economic Exposure
9861 Fuel poverty
9862 Funeral data.
9863 Digital altimetric data information - GPS
9864 TB Immunity
9865 Women Inventors
9866 Jack-o'-lantern - Trick or Treat!
9867 Healthy Life Expec

10122 National Education Longitudinal Survey, 1998
10123 test25
10124 covid-19 India cleaned data
10125 ICMR Testing Data
10126 Indian Postal Codes
10127 Chicago Crime from 01JAN2001 to 22JUL2020
10128 Novel Coronavirus COVID-19 (2019-nCoV)
10129 COVID-19 Patients Lungs X Ray Images 10000
10130 Ford Cars in Iowa
10131 test.csv
10132 COCO2014
10133 Cars brands +28K
10134 Top Spotify Tracks of 2018
10135 Top Spotify Tracks of 2019
10136 Top Spotify Tracks of 2017
10137 Melbourne Airbnb 2020
10138 Ted Talks Main CSV
10139 Dataset for Predictive Maintenance
10140 ds4gconfiguration
10141 Australian Bush fire satellite data (NASA)
10142 Inventory
10143 Movie Ratings
10144 Particle Identification from Detector Responses
10145 Geeks for Geeks Articles Dataset
10146 Bank card
10147 Basic Needs Basic Rights Kenya
10148 brain image
10149 lion image
10150 LR_labels
10151 OCR working in progress
10152 Segmented images of the skin cancer  dataset
10153 skin cancer
10154 Corona Tunisia Sentiment Anal

10388 NY OATH Hearings Division Case Status
10389 NY Open Parking and Camera Violations
10390 NY Parking Violations Issued
10391 NY Prenatal care services (monthly income levels)
10392 NY Prequalified Firms
10393 NY Property Data (Buildings Information System)
10394 NY Public Recycling Bins
10395 NY Rodent Inspection
10396 NY School Attendance and Enrollment
10397 NY School Demographics and Accountability Snapshot
10398 NY Sidewalk Cafe Licenses and Applications
10399 NY Street Hail Livery (SHL)
10400 NY Tax Lien Sale Lists
10401 NY Taxi Improvement Fund (TIF) Medallion Payments
10402 NY Traffic Volume Counts (2012-2013)
10403 NY Union Square Partnership (USP) Business List
10404 NY Upcoming contracts to be awarded
10405 NY Water Consumption In The New York City
10406 NY Watershed Water Quality Data
10407 NY Work Order Management Module
10408 NY Workforce1 Jobs and Events
10409 NY Zip code breakdowns
10410 NYC ACRIS Codes
10411 NYC Baby Names
10412 NYC Building Complaint Disposition Co

10600 NYS Jail Population By County: Beginning  1997
10601 NYS Jobs By Industry:  Beginning 2012
10602 NYS Key Credit Collection: Beginning 2010
10603 NYS Labor Market Analysts
10604 NYS Law Enforcement Personnel by Agency
10605 NYS License, Permit, Non-Driver ID Cards Issued
10606 NYS License Event Notification Service Customers
10607 NYS Liquor Authority - Active Licenses and Permits
10608 NYS Liquor Authority Brand Label and Wholesaler
10609 NYS Lobbying Clients and Lobbyist Data
10610 NYS Local Area Unemployment Statistics (LAUS)
10611 NYS Local Government Efficiency Program Grants
10612 NYS Local Mental Health Programs
10613 NYS Lottery Information and Data
10614 NYS Manufactured Home Park Registrations
10615 NYS Master Contract Value Added Resellers Report
10616 NYS Meals Served by the Office for the Aging
10617 NYS Medical Care Facilities Financing Agency Data
10618 NYS Mental Health Information
10619 NYS Metropolitan Transport Authority (MTA) Data
10620 NYS Mined Land Permits: 

10813 AGE, GENDER AND ETHNICITY (FACE DATA) CSV
10814 train_churn_pred_av
10815 Amtrak timeseries
10816 Missing Cars
10817 Online Retail Dataset
10818 melanoma-1024x1024
10819 melanoma-1024x1024
10820 IPL-2020-Dataset
10821 Summiteers of Mt. Everest till December 2017
10822 solasta
10823 Letter-Recognition
10824 MESSI goals vs Real Madrid 2005-2017
10825 Socio-Economic Country Profiles
10826 very small  test  data
10827 Japan Real Estate Prices
10828 NOPE_dicom_dtaset_fold0_0_fig
10829 NOPE_dicom_dtaset_fold1_0_fig
10830 NOPE_dicom_dtaset_fold2_0_fig
10831 NOPE_dicom_dtaset_fold3_0_fig
10832 NOPE_dicom_dtaset_fold4_0_fig
10833 Panda_Dataset_medium_256_256_2
10834 Panda_Dataset_medium_256_256_3
10835 PE_dicom_dtaset_3d_fold0
10836 PE_dicom_dtaset_3d_fold0_nope
10837 PE_dicom_dtaset_3d_fold0_pe
10838 PE_dicom_dtaset_3d_fold1_nope
10839 PE_dicom_dtaset_3d_fold1_pe
10840 PE_dicom_dtaset_3d_fold2_nope
10841 PE_dicom_dtaset_3d_fold2_pe
10842 PE_dicom_dtaset_3d_fold3_nope
10843 PE_dicom_dtase

11084 Horse Breeds
11085 HTML Recipes
11086 NY School Districts
11087 Object Detection
11088 OSM Russia. Central District
11089 Python Recipes
11090 Style Color Images
11091 SVHN Preprocessed Fragments
11092 Traditional Decor Patterns
11093 Yale Face Database
11094 Brazilian E-Commerce Public Dataset by Olist
11095 Marketing Funnel by Olist
11096 COVID-19 data from ministry of health in Brazil
11097 Poems in Portuguese
11098 countryinfo
11099 StanfordExtra Dogs Dataset
11100 French ski resort snow data 2010 - 2018
11101 Geographical information for ski resort locations
11102 500 Greatest Songs of All Time
11103 IMDB top 1000
11104 Wuzzuf Jobs in Egypt
11105 football database
11106 Survey: students wage expectation
11107 LinkedIn Profile Data
11108 Mask RCNN PyTorch weights
11109 Historic gold prices
11110 OtoscopeData
11111 USA States to region
11112 Global Terrorism Database (Turkish)
11113 Turkish Market Sales Dataset With 9.000+Items
11114 Tweets about the Top Companies from 2015 to

11363 COVID Tracking Project: Racial Data Tracker
11364 COVID-19 containment and mitigation measures
11365 COVID-19 SOLES Tables
11366 CVPR 2019 Papers
11367 Denver Crime Data
11368 Segmentation of OCT images (AMD)
11369 Retinal OCT Images (optical coherence tomography)
11370 Latitude and Longitude for Every Country and State
11371 LitCovid
11372 Lord of the Rings Data
11373 Medical Speech, Transcription, and Intent
11374 Minneapolis Police Stops and Police Violence
11375 The Mueller Report
11376 nytimes covid-19 data
11377 Open Elections Data: USA
11378 OXFORD COVID-19 GOVERNMENT RESPONSE TRACKER
11379 Pain Pills in the USA
11380 Percent Black Population for Every State in USA
11381 Percent Voting for Democratic Party by State
11382 PhD Stipends, Salaries, and LW Ratios
11383 Song Lyrics
11384 Denver Police Pedestrian Stops and Vehicle Stops
11385 PTM Strava Data
11386 Python Developers Survey (2017 & 2018)
11387 Repository of Coronavirus Genomes
11388 Response Counts for the 2020 Kag

11617 NJ Transit + Amtrak (NEC) Rail Performance
11618 Energy Market Price Time Series
11619 Tennis Weather
11620 Airplanes Dataset for R-CNN
11621 Brain-Tumor-Features(Extracted)
11622 Covid-19 Image Dataset
11623 Employee attrition
11624 Facemask Detection Dataset 20,000 Images
11625 COVID19TestingData
11626 TestData
11627 Breastcancer Dataset
11628 toxic_model_data
11629 efficientd5_dataset
11630 efficientd6_dataset
11631 Loan_Accept_Dataset
11632 boombiledata
11633 bank marketing
11634 assignmentlink
11635 assignmentnode
11636 followerlink
11637 followerlinkdataset
11638 followinglink
11639 followinglink1
11640 followingNode
11641 Animal Crossing: New Horizons
11642 Spotify global 2019 most-streamed tracks
11643 Novel Coronavirus Epidemic Dataset
11644 Game of Thrones
11645 Ionosphere
11646 The Simpsons Dataset
11647 Chest X-ray (Covid-19 & Pneumonia)
11648 Supply chain management for Car
11649 efficientnet_pytorch
11650 Toxic-file
11651 Toxic-files
11652 A-L-R-S
11653 amazon stock

11876 MPLArchitecture
11877 numpyslice
11878 Spain geojson
11879 The 50 Plot Challenge
11880 trick58data
11881 Home Price Index
11882 Airbnb from insiderairbnb
11883 AlexNet
11884 DenseNet-121
11885 DenseNet-161
11886 DenseNet-169
11887 DenseNet-201
11888 InceptionV3
11889 ResNet-101
11890 ResNet-152
11891 ResNet-18
11892 ResNet-34
11893 ResNet-50
11894 SqueezeNet 1.0
11895 SqueezeNet 1.1
11896 VGG-11
11897 VGG-11 with batch normalization
11898 VGG-13
11899 VGG-13 with batch normalization
11900 VGG-16
11901 VGG-16 with batch normalization
11902 VGG-19
11903 VGG-19 with batch normalization
11904 Software Architectural Styles
11905 Charts
11906 bitcoin_prices_coinbase_USD
11907 license-plate
11908 PIA Airline Reviews
11909 my_bert_uncased
11910 my_roberta_large
11911 DengAI Dataset
11912 final_bert_uncased
11913 WM-811K wafer map
11914 global-wheat-detection-extend
11915 albert xlarge v1
11916 BART Large pretrained cache for pytorch
11917 create data for jigsaw tpu bert keras demo v2
119

12147 Deodorant Instant Liking Data
12148 Predicting Food Delivery Time
12149 Twitter Tweets Data for Sentiment Analysis
12150 Taiwan PTT stock topics and intraday trading chats
12151 seaborn_tips_dataset
12152 fifa2017
12153 prediction_template
12154 Esports Earnings 1998 - 2020
12155 Top games on Twitch 2016 - 2020
12156 Philadelphia Government Employee Salary
12157 Sentimental Analysis trained Model on ML.Net
12158 BERT BASE UNCASED - July 2020
12159 E commerce Purchase Dataset
12160 Success of Bank Telemarketing Data
12161 da da rao1
12162 stvcurry
12163 Charlotte NC Traffic Accidents 2018-2019
12164 stanly-county-housing-data2018
12165 U.S. Presidential Election Tweets 2020
12166 CrowdAI Plant Disease Dataset
12167 Titanic: Machine Learning from Disaster
12168 jigsawdatasets
12169 Banking Dataset Classification
12170 Arabic-Handwritten-Chars
12171 Oil and Gas
12172 UCI Poker Hand Dataset
12173 Transjakarta Bus GPS Data
12174 US Chronic Respiratory Disease Mortality Rates
12175 ele

12430 Text Similarity
12431 car prediction
12432 Countrywise Population & Density
12433 Database of Top Indian cities
12434 bach-tfrecords
12435 COVID-19 complete genomes and protein sequences
12436 InternShala Internships
12437 TMDb Datasets
12438 flickr30k-histograms
12439 Bank Note Authentication UCI data
12440 Beer Beer Beer
12441 COVID-19 🦠 Vaccine Tweets
12442 FIFA20 Players Dataset with Stats | Images
12443 Pokemon Dataset with Stats
12444 Spotify Top 50+ Most Streamed Songs
12445 Trending TV Shows on Netflix
12446 COVID-19 Mexico Patient Health Dataset
12447 Indian Premier League Data from 2008 to 2020
12448 ship-imageclassify
12449 Car Logo
12450 ucmsplittedgis
12451 Prediction of Loan status
12452 Top 10000 Movies Based On Ratings
12453 Zomato Kolkata Dataset
12454 HappyDB
12455 Fire Detection from CCTV
12456 Ekosistem JavaScript di Indonesia 2019
12457 mydataset
12458 Unzipped Oil csv
12459 Usable Oil Prices: Simple Price Imputation
12460 alcohol consumption
12461 testdatase

12681 Animal Bites
12682 Atlas of Pidgin and Creole Language Structures
12683 Between Our Worlds: An Anime Ontology
12684 Blog Authorship Corpus
12685 Brazilian Portuguese Literature Corpus
12686 British Birdsong Dataset
12687 Character Encoding Examples
12688 Chocolate Bar Ratings
12689 Clap Emoji in Tweets
12690 Cleaned VA Sheep Livestock Data
12691 CMU Pronouncing Dictionary
12692 Colonia Corpus of Historical Portuguese
12693 Color terms dataset
12694 Corpus of bilingual children's speech
12695 Corpus of Brazilian Portuguese Literature
12696 Deceptive Opinion Spam Corpus
12697 Delpher Dutch Newspaper Archive (1618-1699)
12698 Dictionary of American Regional English (DAREDS)
12699 Did it rain in Seattle? (1948-2017)
12700 Digimon Database
12701 Diplomacy Betrayal Dataset
12702 Discourse Acts on Reddit
12703 Do Conference Livetweets Get More Traffic?
12704 EmojiNet
12705 English Word Frequency
12706 Eurfa Welsh Dictionary
12707 Eurovision YouTube Comments
12708 Every Pub in England
12

12903 Swiggy : Bangalore delivery outlet data
12904 Pokemon with stats. Generation 7
12905 Pokemon with stats. Generation 8
12906 ICU availability by country and region
12907 State of CSS
12908 Breast Cancer Dataset
12909 haberman dataset
12910 A-Z Handwritten Alphabets in .csv format
12911 Pothole Image Data-Set
12912 India States Covid Data
12913 Car Sales
12914 Performance Prediction
12915 Room Occupancy
12916 assignment1
12917 MPD AIO
12918 MPD converted
12919 MPD Extracted
12920 Spotify Million Playlist Dataset (MPD)
12921 Wine Customer Segmentation
12922 Covid19 forecasting data with containment measures
12923 world population by (country, state)
12924 Vale - Semantic Terrain Segmentation
12925 Bike Sales  in  Europe
12926 Suicides Rate 1985 to 2016
12927 Bangla News Articles
12928 Dhaka stock exchange day end archive
12929 DSE News Archive
12930 INSA ML DEEP Project
12931 Sadukie Sensors
12932 GTSDB - German Traffic Sign Detection Benchmark
12933 Safecast Radiation Measurements


13165 SF Privately Owned Public Open Spaces
13166 SF Purchasing Commodity Data
13167 SF Recreation & Park Department Park Info Dataset
13168 SF Registered Business Locations - San Francisco
13169 SF Residential Projects-Inclusionary Requirements
13170 SF Restaurant Scores - LIVES Standard
13171 SF Retiree Pensions Annual Benefit Received
13172 SF Right of Way Exception Codes and Data
13173 SF Salary Ranges by Job Classification
13174 SF Scorecard Measures
13175 SF SF Civic Art Collection
13176 SF SFEC 3.216(d) Gifts of Travel Filings
13177 SF SFMTA-Enforced Temporary Tow Zones
13178 SF SFO Gate and Stand Assignment Information
13179 SF Spending and Revenue
13180 SF Statement of Economic Interests Form 700
13181 SF Stormwater inlets, drains and catch basins
13182 SF Street Acceptance Data
13183 SF Street Names
13184 SF Street Segment and Intersection Change Log
13185 SF Street Tree List
13186 SF Street-Use Permits
13187 SF Supervisor District to ZIP Code Crosswalk
13188 SF Surface Mount

13432 0.85933376.csv
13433 diabetes_columns
13434 Consumer Complaint Database
13435 OECD Housing Prices
13436 OECD Tax on personal income
13437 OECD Unemployment After 2000
13438 COVID-19 worldometer daily snapshots
13439 Historical Hourly Weather Data 2012-2017
13440 Single Neurons as Deep Nets - NMDA test data
13441 YouTube Faces With Facial Keypoints
13442 Environment Impact of Food Production
13443 Irish Data
13444 Predict the Happiness
13445 camera_dataset
13446 cereals dataset
13447 pima-indians-diabetes
13448 Turkish Airlines daily stock prices since 2013
13449 Predict'em All
13450 Software Defect Prediction
13451 Presidential Cabinet Nominations
13452 Stanford Cars (Folder, Crop, Segment)
13453 Movielens (Small)
13454 Short Track Speed Skating Database
13455 APIS_test_data
13456 kings-reign
13457 puzzle-sales
13458 Proton Exchange Membrane (PEM) Fuel Cell Dataset
13459 Single DBFC Dataset
13460 SEPTA - Regional Rail
13461 100K Coursera's Course Reviews Dataset
13462 Compiled da

13772 Vehicle Registered In India
13773 Cyber crime
13774 Amazon Earphones Reviews
13775 kc_house_data
13776 bmw cars prices
13777 Analog-Clocks
13778 Jigsaw-Puzzle
13779 Iris_data
13780 Diamonds
13781 Time sheet data
13782 List of US ZipCodes for DonorsChoose Comp
13783 Company Acquisitions Data
13784 External Data for DonorsChoose RecSys
13785 Ideal Student Life Survey
13786 Netflix Movies and TV Shows
13787 [Real or Fake] Fake JobPosting Prediction
13788 BBC Full Text Document Classification
13789 Novel Coronavirus Aggregated Data, by JHU CSSE
13790 Traffic Sign Data set
13791 Air Quality Data in India
13792 Big Mart Sales Prediction Datasets
13793 Churn Modeling Dataset
13794 Novel Corona Virus 2019 Dataset
13795 AV : Healthcare Analytics
13796 HR Analysis Case Study
13797 Academic ranking of world universities Analytics
13798 Timm Pytorch models
13799 BERT-tiny
13800 COVID-19 India
13801 ./train.csv
13802 Stock Pricing
13803 DataCo SMART SUPPLY CHAIN FOR BIG DATA ANALYSIS
13804 St

14047 Diabetic_Dataset
14048 NYC Dog Licenses
14049 College
14050 Volcanic Eruptions in the Holocene Period
14051 Pneumothorax Dataset
14052 birdcall, background, concat
14053 birdcall, no background, concat, a - b
14054 birdcall, no background, concat, c - f
14055 birdcall, no background, concat, g-m
14056 birdcall, no background, concat, n-r
14057 birdcall, no background, concat, s-y
14058 Detect Fraudulent Transactions
14059 SG Complaint Status
14060 Faze Clan Valorant Invitational
14061 advise
14062 Turkey Football Matches with ELO ratings
14063 9322 letras de rap en español
14064 cloud5fold
14065 panda-bboxes-1
14066 Amazon Fine Food Reviews
14067 SNAP Memetracker
14068 All the news
14069 The Billboard 200 acoustic data
14070 Indie Map
14071 Investing Program Type Prediction
14072 Food Recipe dataset
14073 testses
14074 traines
14075 California housing data set 1990
14076 Coronavirus Italy SARS-CoV-2
14077 Missing Migrants Project
14078 Stack Overflow Annual Developer Survey
14079

14316 electra_small_torch_eng
14317 Stanford Open Policing Project - Bundle 1
14318 Stanford Open Policing Project - Bundle 2
14319 Stanford Open Policing Project - California
14320 Stanford Open Policing Project - Florida
14321 Stanford Open Policing Project - Illinois
14322 Stanford Open Policing Project - North Carolina
14323 Stanford Open Policing Project - Ohio
14324 Stanford Open Policing Project - South Carolina
14325 Stanford Open Policing Project - Texas
14326 Stanford Open Policing Project - Washington State
14327 Dockerfiles
14328 Stanford Natural Language Inference Corpus
14329 Stanford Question Answering Dataset
14330 Street View House Numbers (SVHN)
14331 HK macroeconomics data
14332 RSNA-STR 256x256 Tfrecords
14333 Bears fastai course
14334 US state county name & codes
14335 Nutrition facts for Starbucks Menu
14336 Starbucks Locations Worldwide
14337 Corona Analysis files
14338 Here We Grow cocosynth + mask r_cnn
14339 The Algae Testbed Public-Private Partnership ATP3
14

14565 #SSR tweets dataset
14566 Standard Metropolitan Areas Dataset
14567 NB_SVM
14568 Lyrics of Xu Song Songs from NetEase Cloud Music
14569 Japan LOTO7 raw data
14570 COVID-19 tweets afternoon 27.03.2020.
14571 COVID-19 tweets 26.03.2020.
14572 COVID-19 tweets morning 01.04.2020.
14573 COVID-19 tweets afternoon 28.03.2020.
14574 COVID-19 tweets afternoon 30.03.2020.
14575 COVID-19 tweets afternoon 31.03.2020.
14576 COVID-19 tweets morning 27.03.2020.
14577 COVID-19 tweets morning 28.03.2020.
14578 COVID-19 tweets morning 29.03.2020.
14579 COVID-19 tweets morning 30.03.2020.
14580 COVID-19 tweets morning 31.03.2020.
14581 Gold Price Data (1950 - 2020) in INR & USD
14582 Israel Public Holidays 2016-2018
14583 lastdayone2155
14584 lastonedaysub215
14585 Netflix Original Movies
14586 Heart Sound Database
14587 Music chatbot dataset : intent & entity
14588 Restaurant chatbot dataset : intent & entity
14589 Weather chatbot dataset : intent & entity
14590 Clash royale Dataset
14591 Instrume

14852 grgrtergtr
14853 400+ crypto currency pairs at 1-minute resolution
14854 XLNet_large
14855 Google Trends 2017 Plumber Search Oakland
14856 housing
14857 MrRooters Twitter Daily Follower Counts
14858 Amnesty International "Halt The Hate" Dataset
14859 EMPRES Global Animal Disease Surveillance
14860 Good Morning Tweets
14861 Gravity Spy (Gravitational waves)
14862 Movement coordination in trawling bats
14863 Religious and philosophical texts
14864 SETI Data
14865 Deep Learning Literature: Daily Update
14866 Yemen Data
14867 dictionary
14868 oxfordDict
14869 antiviral
14870 COVID-19's Impact on Airport Traffic
14871 Hazardous Driving Spots Around the World
14872 Historical Weed Stock Prices
14873 Parking Statistics in North America
14874 Malaysia GE14 Election Results
14875 TSA Claims Database
14876 Gutenberg Poetry Dataset
14877 Pokemon- Weedle's Cave
14878 Airlines Passenger Data
14879 IMDB Movie Reviews Classification
14880 You-tube Spam Collections
14881 ner_data
14882 albert_ch

15109 Bigg Boss India - Hindi Telugu Tamil Kannada
15110 Titanic Data
15111 German Fake Companies
15112 September 2018 Donald Trump-Related Tweets
15113 Fantasy Premier League - 2017/18
15114 University course, grade and organizational data
15115 All injuries In Cinematography 1914-2019
15116 High Rated Handguns
15117 Highly Rated Children Books And Stories
15118 Popular Halloween 2020  Costumes Amazon Reviews
15119 The Chase(Game Show) Israel episode data
15120 Top 1000 Patreons
15121 Top 270 Computer Science / Programing Books
15122 World War II Aircrafts
15123 example-fasta
15124 Alphabet Characters Fonts Dataset
15125 understanding asdfmovie
15126 Emoji sentiment data
15127 cleanTrain
15128 Los Angeles 1992 Riot Deaths from LA Times
15129 Housing Simple Regression
15130 TV,Radio,Newspaper-Advertising
15131 Russian_twitter_sentiment
15132 Customer Support on Twitter
15133 Podcast Reviews
15134 Anime dataset
15135 Food data
15136 BashLogs
15137 stacking_feature_nn_v2
15138 Notas Sare

15387 Flights and Airports Data
15388 Melbourne Airbnb Open Data
15389 Bike Ads (images, prices, specifications)
15390 Ice Cream Dataset
15391 xlnetmodel
15392 countries.geo.json
15393 opsenx
15394 steel-fpn-b4
15395 Amazon sales rank data for print and kindle books
15396 Dice: d4, d6, d8, d10, d12, d20 Images
15397 Anna University - Results Dataset
15398 Adult Census Income
15399 Air pressure system failures in Scania trucks
15400 Auto-mpg dataset
15401 Bioassay Datasets
15402 Biomechanical features of orthopedic patients
15403 Breast Cancer Wisconsin (Diagnostic) Data Set
15404 Caravan Insurance Challenge
15405 CAT Scan Localization
15406 Default of Credit Card Clients Dataset
15407 El Nino Dataset
15408 Household Electric Power Consumption
15409 Faulty Steel Plates
15410 Forest Cover Type Dataset
15411 German Credit Risk
15412 Glass Classification
15413 Horse Colic Dataset
15414 Human Activity Recognition with Smartphones
15415 Identifying Interesting Web Pages
15416 Indian Liver Pa

15662 Coronavirus Cases Karnataka
15663 coronavirus-cases-in-india
15664 DSL Corpus Collection (DSLCC)
15665 IEEE COVID-19 Tweets Dataset
15666 Transcripts Press Conferences NL on COVID-19
15667 H1B 2016
15668 Month-wise COVID-19 related tweets
15669 BERTModel
15670 BIOBERTCODE
15671 biobertconfig
15672 biobertmodel2
15673 Code repo for all code
15674 ZippedFolder
15675 imdb Dataset
15676 Employee Salary Dataset
15677 email msg spam detection
15678 Starwars Survey in USA
15679 SpringOpen 406 Books Details
15680 WikiHow Summarization
15681 Pisa 2018 Worldwide Ranking
15682 Sweden COVID-19 Data
15683 UK COVID-19 Data
15684 quora question pair
15685 DAX-index
15686 dax-index2
15687 FTSE 100 index
15688 FTSE 100 index2
15689 ftse100
15690 ftse1-predictions
15691 gold_new
15692 gold_new3
15693 gold_new4
15694 GOLD-index
15695 index_sp
15696 new_data1
15697 sales_train
15698 S&P_500
15699 S&P500
15700 sp500_index
15701 Skylines 12
15702 Predicting Coupon Redemption
15703 Face Recognition Dat

15943 Run or Walk (reduced)
15944 go-nuts archive
15945 California Study
15946 LC50 Data
15947 Okun's Law within the United States
15948 Boba Shops in the Bay Area
15949 wheat-spike-dataset
15950 wheat-dataset-original
15951 lossData
15952 bert_en_cased_L-12_H-768_A-12_1
15953 Bank marketing campaigns dataset | Opening Deposit
15954 Heart_Disease
15955 Coffee Quality database from CQI
15956 Temperature Time-Series for some Brazilian cities
15957 Bolsas da Capes
15958 Weather Istanbul Data 2009-2019
15959 Congressional Voting Records
15960 Nails segmentation
15961 kaggle-porto-seguro-submissions
15962 Annotated NER for Indian language
15963 Janatahack: Customer Segmentation
15964 Datasets used in my study of target encodings
15965 createdbyvrptester
15966 CVE (Common Vulnerabilities and Exposures)
15967 Classified Ads for Cars - unique maker/model/year
15968 Ukraine Register of Individual Entrepreneurs
15969 FastText
15970 Sports Car Choice data
15971 Disaster Tweets
15972 UARU_Str_DB
1

16230 Michael Jordan, Kobe Bryant and Lebron James stats
16231 Star Wars Movie Scripts
16232 The Cure discography
16233 Tweets during Cavaliers vs Warriors
16234 Tweets during Nintendo E3 2018 Conference
16235 Tweets during Real Madrid vs Liverpool
16236 Dataset malware/beningn permissions Android
16237 Malicious and Benign Websites
16238 Network Traffic Android Malware
16239 Cell_segmentation
16240 Steam Sale
16241 Australia Saturday Xlotto Jackpot Numbers
16242 Turkey COVID 19 Complete Dataset
16243 FIFA 21 messy, raw dataset for cleaning/ exploring
16244 Bangladesh Weather Dataset
16245 Most Popular Soccer Leagues
16246 Spotify Dataset 1921-2020, 160k+ Tracks
16247 World Happiness Report (Preprocessed)
16248 RIIID Questions, tags, lectures expanded metadata
16249 Exchange Rates
16250 Top8USACities_PropertySales
16251 NYC Uber Pickups with Weather and Holidays
16252 modelchexnet
16253 bart_code
16254 bart_large
16255 bert_base_pretrained
16256 fairseq_hacked
16257 gpt2bpe
16258 quest

16509 Python package Datatable
16510 A/B testing
16511 Final_Capsule
16512 Final_EfficientNetB5
16513 TitanicDatasets
16514 Defects location for metal surface
16515 driving data
16516 Heart.csv
16517 Dhaka Stock Exchange Broad Historical Data
16518 Global Aquaculture Imports and Exports
16519 wine-price-rating
16520 toxic_comments_fastText
16521 Loan  Data
16522 Technical Indicator Backtest
16523 bert_pretrained_model
16524 bert_pretrained_model_uncased
16525 tensorflow_hub_bert
16526 omegaconf
16527 ernie_model
16528 model_src
16529 rbt3_chinese_bert
16530 roberta-large-wwm-chinese
16531 bank_data_loan_default
16532 lajitraindata
16533 The World Factbook by CIA
16534 United Nations Crime-Data
16535 Divorce dataset
16536 Median Listing Price (1 Bedroom)
16537 Zillow Rent Index, 2010-Present
16538 Zillow Economics Data
16539 Gold Prices London Market 1950 - Present
16540 IATA Airport Codes
16541 Major World Cities in the World
16542 S&P 500 Companies with Financial Information
16543 COV