## Kaggle index scraper

This notebook is a scraping pipeline for Kaggle.  

First off, we try to build an automation that scrapes the index, to get an up-to-date representation of it.  
In dedicated scripts we will then scrape and preprocess the content.  

In [1]:
# kaggle.com offers an api to download data
# however we are not interested in the data itself
# but the projects and notebooks associated with it
# so we need to find a way to scrape the data from the website

In [2]:
# imports

import json
import os
import re

import pandas as pd
from bs4 import BeautifulSoup
from requests import Session

In [3]:
# unfortunately the page relies heavily on javascript and ajax calls
# a simple attempt to mimic the XMLHttpRequest calls to harvest the data failed

# one session object is used for both requests below
session = Session()

# a HEAD request asks for the headers only, which is all that is needed
# to grab the session cookie
session.head('https://www.kaggle.com/')

# mimic the search request for page 290 of the dataset listing
response = session.post(
    'https://www.kaggle.com/requests/SearchDatasetsRequest',
    data={'page': 290},
    headers={'Referer': 'https://www.kaggle.com/'},
)

print(response.text)

{"errors":["Bad request"],"wasSuccessful":false}


In [4]:
# so I manually downloaded the listing pages, split into different subsections
# this process is time-consuming and does not provide a full list of all projects
# let's scrape the data and parse it into csv files

In [5]:
# let's reuse some generic functions

# generic store data to file function
def store_data(data, file, mode='w', toJson=False):
    """Write ``data`` to ``file`` as UTF-8 text.

    Parameters
    ----------
    data : the text to write, or (with ``toJson=True``) an object that
        ``json.dumps`` can serialise.
    file : path of the target file.
    mode : mode handed to ``open`` (defaults to ``'w'``).
    toJson : serialise ``data`` with ``json.dumps`` before writing.

    Returns
    -------
    Whatever the file object's ``write`` call returns.
    """
    payload = json.dumps(data) if toJson else data
    with open(file, mode, encoding='utf-8') as handle:
        return handle.write(payload)
    
# generic load data from file function
def load_data(file, fromJson=False):
    """Read ``file`` as UTF-8 text, ignoring undecodable bytes.

    Parameters
    ----------
    file : path of the file to read.
    fromJson : parse the text with ``json.loads`` before returning it.

    Returns
    -------
    The file content (parsed if ``fromJson`` is set), or the sentinel string
    ``'file not found'`` when ``file`` is not an existing regular file.
    """
    if not os.path.isfile(file):
        return 'file not found'
    with open(file, 'r', encoding='utf-8', errors="ignore") as handle:
        text = handle.read()
    return json.loads(text) if fromJson else text

In [6]:
# load list of manually downloaded html-files

path = '../data/repositories/kaggle/index/html/'
files_all = os.listdir(path)

# keep only regular files whose name mentions 'dataset'
files = []
for file in files_all:
    if 'dataset' not in file:
        continue
    if os.path.isfile(path + file):
        files.append(file)

print(files)

['datasets_all_incomplete.html', 'datasets_bigquery_cc_66.html', 'datasets_bigquery_cc_66_temp.html', 'datasets_bigquery_odb.html', 'datasets_bigquery_other.html', 'datasets_cc_newest.html', 'datasets_csv_cc_7678_incomplete.html', 'datasets_csv_cc_7678_temp.html', 'datasets_csv_gpl_385.html', 'datasets_csv_odb_945.html', 'datasets_json_cc_1106.html', 'datasets_json_gpl_39.html', 'datasets_json_odb_136.html', 'datasets_json_other_2488_temp.html', 'datasets_sqlite_cc_39.html', 'datasets_sqlite_odb.html', 'datasets_sqlite_other_35.html']


In [None]:
# scrape datasets from html-code

# directory the listing pages are read from
html_path = '../data/repositories/kaggle/index/html/'
# directory the csv files are written to (one csv per listing page)
csv_path = '../data/repositories/kaggle/index/csv_datasets/'
# hand-picked pages -- presumably used to try the scraper on a single file
html_files = [
    'datasets_bigquery_cc_66-2.html',
    #'datasets_csv_gpl_385.html',
    #'datasets_bigquery_odb.html',
]

# NOTE: this overrides the hand-picked list above with `files`, the list of
# dataset pages built by the file-listing cell; that cell has to run first
html_files = files

# a number directly followed by a byte unit, e.g. "25 MB", "3 kB" or "512 B";
# a plain `'B' in cell` test would also match cells such as "1 File (BigQuery)"
_KAGGLE_FILESIZE = re.compile(r'\d\s*[kKMGTP]?i?B')


def _kaggle_parse_dataset_item(item):
    """Extract the fields of one dataset list entry.

    Parameters
    ----------
    item : the BeautifulSoup tag of a single ``<li role="listitem">``.

    Returns
    -------
    dict with the string fields ``link``, ``name``, ``updated``,
    ``usability``, ``files``, ``filesize``, ``tasks``, ``score``, ``badge``.
    Optional fields (usability, files, filesize, tasks, badge) are ``''``
    when they are not present.

    Raises
    ------
    IndexError, AttributeError
        When a mandatory element (link, name, update date, subline, score)
        is missing from ``item``.
    """
    link = item.select('a.sc-qcrOD')[0].get('href')
    name = item.select('div.sc-Axmtr')[0].text.strip()

    # keep only the part of the title attribute before the first ' ('
    updated = item.select('span.sc-fzqBkg > span')[0].get('title')
    updated = updated.split(' (')[0]

    usability = item.select('div.sc-qQYBZ > span')
    usability = usability[0].text.strip() if usability else ''

    # the second span carries the '·'-separated facts as its own direct text
    subline = item.select('span.sc-fzqBkg')[1].find(text=True, recursive=False)
    files = filesize = tasks = ''
    for cell in subline.split('·'):
        cell = cell.strip()
        if 'File' in cell:
            files = cell
        if _KAGGLE_FILESIZE.search(cell):
            filesize = cell
        if 'Task' in cell:
            tasks = cell

    score = item.select('span.sc-fzoxKX')[0].text.strip()

    badge = item.select('span.sc-qanSb')
    badge = badge[0].text.strip() if badge else ''

    return {
        'link': link,
        'name': name,
        'updated': updated,
        'usability': usability,
        'files': files,
        'filesize': filesize,
        'tasks': tasks,
        'score': score,
        'badge': badge,
    }


def kaggle_scrape_datasets(html):
    """Parse a saved Kaggle dataset listing page into a list of records.

    Parameters
    ----------
    html : the html source of the listing page.

    Returns
    -------
    list of dicts, one per list item that could be parsed (see
    ``_kaggle_parse_dataset_item`` for the keys). Items that cannot be
    parsed are reported on stdout and skipped, so a single malformed entry
    no longer discards everything that follows it.

    Raises
    ------
    ValueError
        When the page does not contain the dataset list container at all.
    """
    soup = BeautifulSoup(html, 'html.parser')

    partial = soup.find('div', class_="km-list km-list--three-line")
    if partial is None:
        # soup.find returns None when nothing matches; fail with a clear message
        raise ValueError('dataset list container not found in html')

    result = []
    for i, item in enumerate(partial.find_all('li', {"role": "listitem"})):
        try:
            result.append(_kaggle_parse_dataset_item(item))
        except (AttributeError, IndexError, TypeError) as error:
            # best effort: report the broken item and carry on with the next one
            print('an error occurred in item', i, ':', repr(error))

    return result

# scrape every listing page and write one csv per page
for file in html_files:

    print(file)

    html = load_data(html_path + file)
    if html == 'file not found':
        # load_data reports a missing file with this sentinel string
        print(html)
        break

    result = kaggle_scrape_datasets(html)
    print(len(result))

    # the csv is named after the html file it was scraped from
    csv_file = file.replace('.html', '.csv')
    df = pd.DataFrame(result)
    df.to_csv(csv_path + csv_file, encoding='utf-8-sig')

In [None]:
# bundle csv and drop duplicates

path = '../data/repositories/kaggle/index/csv_datasets/'
# os.listdir returns entries in arbitrary order and may include entries that
# are not csv files (e.g. a checkpoint directory); filter and sort them so
# that read_csv only sees csv files and the duplicate kept by drop_duplicates
# below does not depend on the file system
files_all = sorted(
    name for name in os.listdir(path)
    if name.endswith('.csv') and os.path.isfile(os.path.join(path, name))
)
csv = '../data/repositories/kaggle/kaggle_index.csv'

li = []

for file in files_all:
    df = pd.read_csv(os.path.join(path, file), index_col=None, header=0)
    li.append(df)

df = pd.concat(li, axis=0, ignore_index=True)
# the per-page csv files were written with their index, which read_csv
# exposes as 'Unnamed: 0'; do not fail if a file was written without it
df = df.drop('Unnamed: 0', axis=1, errors='ignore')

print('count (with duplicates):', len(df))

# the same dataset can appear on several listing pages; keep its first row
df = df.drop_duplicates(['link'])
print('count (without duplicates):', len(df))

df.to_csv(csv, encoding='utf-8-sig', index=False)

print(df.head())

In [37]:
# scrape competitions from html-code

# directory the listing pages are read from
html_path = '../data/repositories/kaggle/index/html/'
# NOTE: despite its name this is the path of the single output csv file;
# the dataset cell binds the same name to a directory
csv_path = '../data/repositories/kaggle/kaggle_competitions.csv'
# listing pages to scrape in this run
html_files = [
    'competitions_completed.html',
]

#html_files = files

def _kaggle_parse_competition_item(item):
    """Extract the fields of one competition list entry.

    Parameters
    ----------
    item : the BeautifulSoup tag of a single ``<li role="listitem">``.

    Returns
    -------
    dict with the string fields ``link``, ``name``, ``description``,
    ``category``, ``date``, ``teams``, ``kind``, ``prize``; ``date``,
    ``teams`` and ``kind`` are ``''`` when the subline does not mention them.

    Raises
    ------
    IndexError, AttributeError
        When a mandatory element is missing from ``item``.
    """
    link = item.select('a.sc-pdOLj')[0].get('href')
    name = item.select('div.sc-Axmtr')[0].text.strip()

    sublines = item.select('span.sc-fznzOf')
    description = sublines[0].text.strip()

    # e.g. 'Research • 2 months ago • Code Competition • 1636 Teams'
    cells = [cell.strip() for cell in sublines[1].text.strip().split('•')]
    # the first cell is always taken as the category
    category = cells[0]
    date = teams = kind = ''
    for cell in cells:
        if 'ago' in cell:
            date = cell
        if 'Code Competition' in cell:
            kind = cell
        if 'Teams' in cell:
            teams = cell

    prize = item.select('div.sc-pjUyM')[0].text.strip()

    return {
        'link': link,
        'name': name,
        'description': description,
        'category': category,
        'date': date,
        'teams': teams,
        'kind': kind,
        'prize': prize,
    }


def kaggle_scrape_competitions(html):
    """Parse a saved Kaggle competition listing page into a list of records.

    The entries are read from the last ``<ul class="mdc-list">`` of the page.

    Parameters
    ----------
    html : the html source of the listing page.

    Returns
    -------
    list of dicts, one per list item (see ``_kaggle_parse_competition_item``
    for the keys).

    Raises
    ------
    ValueError
        When the page contains no competition list, or when a list item
        lacks a mandatory element; the message names the offending item.
    """
    soup = BeautifulSoup(html, 'html.parser')

    lists = soup.find_all('ul', class_="mdc-list")
    if not lists:
        raise ValueError('competition list not found in html')
    partial = lists[-1]

    result = []
    for i, item in enumerate(partial.find_all('li', {"role": "listitem"})):
        try:
            result.append(_kaggle_parse_competition_item(item))
        except (AttributeError, IndexError) as error:
            # parsing stays strict here; only add the position of the bad item
            raise ValueError('could not parse competition item %d' % i) from error
    return result

# collect the records of all listing pages first and write the csv once;
# writing inside the loop would overwrite csv_path for every page, so only
# the last page would survive when html_files holds more than one file
competitions = []
files_scraped = 0

for file in html_files:

    print(file)

    html = load_data(html_path+file)

    if html == 'file not found':
        # load_data reports a missing file with this sentinel string
        print(html)
        break

    result = kaggle_scrape_competitions(html)

    print(len(result))

    competitions.extend(result)
    files_scraped += 1

# only touch the output file when at least one page was actually scraped
if files_scraped:
    df = pd.DataFrame(competitions)
    df.to_csv(csv_path,encoding='utf-8-sig')
    print(df.head())

competitions_completed.html
419
                                            link  \
0                      /c/stanford-covid-vaccine   
1       /c/rsna-str-pulmonary-embolism-detection   
2                                    /c/lish-moa   
3           /c/conways-reverse-game-of-life-2020   
4  /c/lyft-motion-prediction-autonomous-vehicles   

                                                name  \
0  OpenVaccine: COVID-19 mRNA Vaccine Degradation...   
1              RSNA STR Pulmonary Embolism Detection   
2              Mechanisms of Action (MoA) Prediction   
3                 Conway's Reverse Game of Life 2020   
4     Lyft Motion Prediction for Autonomous Vehicles   

                                         description    category  \
0  Urgent need to bring the COVID-19 vaccine to m...    Research   
1  Classify Pulmonary Embolism cases in chest CT ...    Featured   
2  Can you improve the algorithm that classifies ...    Research   
3      Reverse the arrow of time in the Game o