In [None]:
# Installs the kaggle library
!pip3 install kaggle

In [None]:
# Unlike the other API notebooks, there is no need to import a .py file holding the API token:
# for Kaggle, the credentials live locally in the ~/.kaggle/kaggle.json file

To use the Kaggle API, sign up for a Kaggle account at https://www.kaggle.com. Then go to the 'Account' tab of your user profile (https://www.kaggle.com/<username>/account) and select 'Create API Token'. This will trigger the download of kaggle.json, a file containing your API credentials. Place this file in the location ~/.kaggle/kaggle.json (on Windows in the location C:\Users\<Windows-username>\.kaggle\kaggle.json - you can check the exact location, sans drive, with echo %HOMEPATH%). You can define a shell environment variable KAGGLE_CONFIG_DIR to change this location to $KAGGLE_CONFIG_DIR/kaggle.json (on Windows it will be %KAGGLE_CONFIG_DIR%\kaggle.json).

In [None]:
import kaggle

In [None]:
# There are Pythonic options, but it seems easiest to just run terminal commands from within the notebook.

# Example: list the competitions matching the search term "health"
! kaggle competitions list -s health

In [None]:
# Search the datasets for a term
! kaggle datasets list -s "machine learning"

# By default this returns one page of results at a time (it matches page 1 of a manual search on the datasets page).
# We need to extract the "ref" variable, which has the form <owner>/<dataset-name>.

In [None]:
# Searching datasets is a start, but we'll also want to search notebooks, competitions, etc.
# It looks like we can search both datasets and kernels,
# and get the related files for each.

In [None]:
# Once we have the dataset refs in the format <owner>/<dataset-name>, we can list the files of each dataset.
# Example:
! kaggle datasets files kaggle/kaggle-survey-2018 

In [None]:
# We can get the metadata for a dataset.
# By default it is downloaded into the current working directory
# (-p presumably selects a different output folder - TODO confirm).
# TODO: need a way to store the metadata in memory and iterate over all datasets.
! kaggle datasets metadata -p /Kaggle_results/ kaggle/kaggle-survey-2018 

# FIXME: this currently fails. It seems related to where the output is being put, but the cause is not yet understood:
# TypeError: unsupported operand type(s) for +: 'NoneType' and 'str'

# Pythonic-ish Approach

## Setup

In [None]:
import subprocess # Used to run unix commands
import pandas as pd # For storing/manipulating command data
from io import StringIO # Lets us read csv string output from command into DataFrame
import json # Reading back the metadata files
from tqdm import tqdm

In [None]:
# Not deprecated, just currently unused; kept because it took time to write and may be needed later
def extract_dataset_paths(output):
    '''
    Takes the output from our kaggle run command and returns the dataset paths
    Params:
    - output (str): command output to be parsed
    Returns:
    - dataset_paths (list): list of dataset paths present in output,
      in the order in which they appear
    '''
    # str.split() without a separator splits on any run of whitespace (spaces,
    # tabs and newlines). Deleting the newlines first would glue the last token
    # of one line onto the first token of the next line and corrupt the paths.
    tokens = output.split()

    # Keeps every token that contains a '/' character
    # (/ used exclusively in dataset names at time of writing)
    dataset_paths = [token for token in tokens if '/' in token]
    return dataset_paths

## Data wrangling

In [None]:
# Output of the dataset search command that marks the end of the result pages;
# the paging loop keeps requesting pages until it sees exactly this text.
DATA_SEARCH_COMPLETION = "No datasets found\n"

### Getting/extracting dataset names

In [None]:
# Terms to search the Kaggle datasets for (one paged search is run per term)
search_terms = [
    "korea",
]

In [None]:
%%time
page_idx = 1
dataset_output = ''
cumulative_output = ''
dataset_paths = []

for search_term in tqdm(search_terms):
    while dataset_output != DATA_SEARCH_COMPLETION:
        # Pulls the records for a single page of datasets for the given search term
        # Runs the command, captures the output in stdout, reads it from stdout, and decodes it to str from binary
        dataset_output = subprocess.run(['kaggle', 'datasets', 'list', '-v',
                                         '-s', f'"{search_term}"', 
                                         '-p', str(page_idx)], 
                                        capture_output=True).stdout.decode()
        
        # Accumulate the output
        cumulative_output = cumulative_output + dataset_output

        # Increments the page count for searching
        page_idx += 1

In [None]:
# Create DataFrame of results & clean it up
# The StringIO wrapper is created inline instead of rebinding cumulative_output,
# so this cell can be re-run without failing on an already-wrapped value.
dataset_df = pd.read_csv(StringIO(cumulative_output))

if 'ref' in dataset_df.columns:
    # Every page after the first repeats the CSV header as a data row.
    # drop_duplicates() alone would still keep the first of those repeats.
    is_dataset_row = dataset_df['ref'].notna() & (dataset_df['ref'] != 'ref')
    dataset_df = dataset_df[is_dataset_row]

    # The stopping message is parsed as a row that has a value in the first
    # column only. Dropping just those rows (instead of any row with a missing
    # value) keeps real datasets that have an empty field.
    other_columns = [column for column in dataset_df.columns if column != 'ref']
    dataset_df = dataset_df.dropna(how='all', subset=other_columns)
else:
    # No results page was returned at all (e.g. nothing matched), so there is
    # no CSV header; fall back to an empty table that still has a 'ref' column.
    dataset_df = pd.DataFrame(columns=['ref'])

# This also remedies situations where the same entry comes back for multiple search terms
dataset_df = dataset_df.drop_duplicates().reset_index(drop=True)

In [None]:
# Display the cleaned table of search results
dataset_df

### Pulling dataset metadata

Note: Unable to find a way to store metadata in memory as opposed to saving file, but this workaround appears to be functional.

In [None]:
# Column names, in order, used for the DataFrame that stores the dataset metadata
JSON_HEADERS = [
    'id',
    'id_no',
    'datasetId',
    'datasetSlug',
    'ownerUser',
    'usabilityRating',
    'totalViews',
    'totalVotes',
    'totalDownloads',
    'title',
    'subtitle',
    'description',
    'isPrivate',
    'keywords',
    'licenses',
    'collaborators',
    'data',
]

In [None]:
%%time
# Create DataFrame to store metadata in
metadata_df = pd.DataFrame(columns=JSON_HEADERS, dtype=object)

# Pulls metadata information for each dataset found above
for data_path in tqdm(dataset_df.ref):
    # Download the metadata
    subprocess.run(['kaggle', 'datasets', 'metadata', data_path])
    
    # Access the metadata and load it in as a dictionary
    with open('dataset-metadata.json') as file:
        json_data = json.load(file)
        
    # Store the metadata into our DataFrame created above
    metadata_df = metadata_df.append(json_data, ignore_index=True)

In [None]:
# Display the collected dataset metadata
metadata_df