In [1]:
# Installs the kaggle library
!pip3 install kaggle



In [2]:
#unlike other API notebooks, no need to import a .py file with API token info
#for kaggle, lives locally in ~/.kaggle/kaggle.json file

To use the Kaggle API, sign up for a Kaggle account at https://www.kaggle.com. Then go to the 'Account' tab of your user profile (https://www.kaggle.com/<username>/account) and select 'Create API Token'. This will trigger the download of kaggle.json, a file containing your API credentials. Place this file in the location ~/.kaggle/kaggle.json (on Windows in the location C:\Users\<Windows-username>\.kaggle\kaggle.json - you can check the exact location, sans drive, with echo %HOMEPATH%). You can define a shell environment variable KAGGLE_CONFIG_DIR to change this location to $KAGGLE_CONFIG_DIR/kaggle.json (on Windows it will be %KAGGLE_CONFIG_DIR%\kaggle.json).

In [3]:
import kaggle

In [4]:
#there are Pythonic options, but seems easiest to just run terminal commands from within notebook

#example:
! kaggle competitions list -s health

ref                                            deadline             category       reward  teamCount  userHasEntered  
---------------------------------------------  -------------------  ----------  ---------  ---------  --------------  
hhp                                            2013-04-04 07:00:00  Featured     $500,000       1350           False  
datasciencebowl                                2015-03-16 23:59:00  Featured     $175,000       1049           False  
stanford-covid-vaccine                         2020-10-06 23:59:00  Research      $25,000       1636           False  
hubmap-kidney-segmentation                     2021-05-10 23:59:00  Research      $60,000       1200           False  
osic-pulmonary-fibrosis-progression            2020-10-06 23:59:00  Featured      $55,000       2097           False  
histopathologic-cancer-detection               2019-03-30 23:59:00  Playground  Knowledge       1149           False  
rsna-str-pulmonary-embolism-detection   

In [5]:
#search datasets for term
! kaggle datasets list -s "machine learning"

#this returns by default one page of results at a time (matches manual search of datasets page 1)
#need to extract "ref" variable, which is <owner>/<dataset-name>

ref                                                   title                                                size  lastUpdated          downloadCount  voteCount  usabilityRating  
----------------------------------------------------  --------------------------------------------------  -----  -------------------  -------------  ---------  ---------------  
kaggle/kaggle-survey-2018                             2018 Kaggle Machine Learning & Data Science Survey    4MB  2018-11-03 22:35:07          15168        974  0.85294116       
kaggle/kaggle-survey-2017                             2017 Kaggle Machine Learning & Data Science Survey    4MB  2017-10-27 22:03:03          22736        826  0.8235294        
alopez247/pokemon                                     Pokémon for Data Mining and Machine Learning        715KB  2017-03-05 15:01:26          10366        239  0.85294116       
kashnitsky/mlcourse                                   mlcourse.ai                                        

In [6]:
#searching datasets is a start, but we'll also want to search notebooks, competitions, etc
#looks like can search datasets and kernels
#and get related files for each

In [7]:
#once get list of files in format <owner>/<dataset-name>, can get list of files
#example:
! kaggle datasets files kaggle/kaggle-survey-2018 

name                         size  creationDate         
---------------------------  ----  -------------------  
SurveySchema.csv              8KB  2018-11-03 22:35:07  
freeFormResponses.csv         1MB  2018-11-03 22:35:07  
multipleChoiceResponses.csv  39MB  2018-11-03 22:35:07  


In [8]:
#can get metadata for a dataset
#default downloads into current working directory
#need a way to store in memory and iterate for all files
! kaggle datasets metadata -p /Kaggle_results/ kaggle/kaggle-survey-2018 

#this currently barfs, not sure why - something with where putting output,b ut not sure why
#TypeError: unsupported operand type(s) for +: 'NoneType' and 'str'

Traceback (most recent call last):
  File "/Users/michaelbaluja/opt/anaconda3/bin/kaggle", line 8, in <module>
    sys.exit(main())
  File "/Users/michaelbaluja/opt/anaconda3/lib/python3.8/site-packages/kaggle/cli.py", line 67, in main
    out = args.func(**command_args)
  File "/Users/michaelbaluja/opt/anaconda3/lib/python3.8/site-packages/kaggle/api/kaggle_api_extended.py", line 1065, in dataset_metadata_cli
    meta_file = self.dataset_metadata(dataset, path)
  File "/Users/michaelbaluja/opt/anaconda3/lib/python3.8/site-packages/kaggle/api/kaggle_api_extended.py", line 1043, in dataset_metadata
    os.makedirs(effective_path)
  File "/Users/michaelbaluja/opt/anaconda3/lib/python3.8/os.py", line 223, in makedirs
    mkdir(name, mode)
OSError: [Errno 30] Read-only file system: '/Kaggle_results/'


# Pythonic-ish Approach

## Setup

In [9]:
import subprocess # Used to run unix commands
import pandas as pd # For storing/manipulating command data
from io import StringIO # Lets us read csv string output from command into DataFrame
import json # Reading back the metadata files
from tqdm import tqdm # Gives status bar on loop completion

In [10]:
# Not depracated, but not needed, but spent time to write so saving in case needed later
def extract_dataset_paths(output):
    '''
    Takes the output from our kaggle run command and returns the dataset paths
    Params:
    - output (str): command output to be parsed
    Returns:
    - dataset_paths (list): list of dataset paths present in output
    '''
    # Remove newline characters
    output = output.replace('\n', '')
    
    # Extracts all instances of output that contain a '/' character
    # (/ used exclusively in dataset names at time of writing)
    dataset_paths = [out for out in output.split(' ') if '/' in out]
    return dataset_paths

## Data wrangling

### Getting/extracting dataset names

In [11]:
def get_search_output(search_terms, search_type):
    '''
    Calls the Kaggle API with the specified query terms and returns the search output results.
    
    Params:
    - search_terms (str/list-like): string or list of strings that should be searched for
    - search_type (str): objects to search over (must be either datasets or kernels)
    
    Returns:
    - cumulative_output (str): output from all searches
    '''
    # Make sure our input is valid
    assert len(search_terms) > 0, 'Please enter non-empty search terms'
    assert search_type in ('datasets', 'kernels'), 'Search can only be conducted over datasets or kernels'
    
    # If a singular search term is provided as a string, need to wrap it in a list
    if type(search_terms) == str:
        search_terms = [search_terms]
    
    # Set search parameters
    page_idx = 1
    search_output = ''
    cumulative_output = ''
    completion_phrase = f'No {search_type} found\n'
    
    # Search for each term
    for search_term in tqdm(search_terms):
        # Pulls the records for a single page of datasets for the given search term
        # Runs the command, captures the output in stdout, reads it from stdout, and decodes it to str from binary
        search_output = subprocess.run(['kaggle', search_type, 'list', '-v',
                                         '-s', f'"{search_term}"', 
                                         '-p', str(page_idx)], 
                                        capture_output=True).stdout.decode()
        # Once we no longer see new output, we stop
        while search_output != completion_phrase:
            # Accumulate the output
            cumulative_output = cumulative_output + search_output

            # Increments the page count for searching
            page_idx += 1
            
            # Pulls the records for a single page of datasets for the given search term
            # Runs the command, captures the output in stdout, reads it from stdout, and decodes it to str from binary
            search_output = subprocess.run(['kaggle', search_type, 'list', '-v',
                                             '-s', f'"{search_term}"', 
                                             '-p', str(page_idx)], 
                                            capture_output=True).stdout.decode()
        
    return cumulative_output

In [12]:
def convert_string_csv_output_to_dataframe(output):
    '''
    Given a string variable in csv format, returns a Pandas DataFrame
    
    Params:
    - output (str): csv-styled string to be converted
    
    Returns:
    - df (pandas.DataFrame): DataFrame consisting of data from 'output' string variable
    '''
    # Create DataFrame of results
    output = StringIO(output)
    df = pd.read_csv(output)
    
    # Every page of results will append a row of header values to the DataFrame, so we remove them
    df = df[df.ref != 'ref'].reset_index(drop=True)
    
    return df

In [13]:
search_terms = ['svm']
search_type = 'datasets'

In [14]:
search_output = get_search_output(search_terms, search_type)
search_output_df = convert_string_csv_output_to_dataframe(search_output)

100%|██████████| 1/1 [00:05<00:00,  5.10s/it]


In [15]:
search_output_df

Unnamed: 0,ref,title,size,lastUpdated,downloadCount,voteCount,usabilityRating
0,vinod00725/svm-classification,SVM Classification,72KB,2019-06-28 11:02:07,1125,20,0.29411766
1,nishan192/letterrecognition-using-svm,Letter-Recognition,214KB,2019-04-13 05:42:06,1191,15,0.64705884
2,sarahvch/predicting-who-pays-back-loans,Predicting Who Pays Back Loans,80MB,2017-03-02 21:07:38,1715,18,0.8235294
3,hugomathien/soccer,European Soccer Database,33MB,2016-10-23 22:31:38,133155,3442,0.7058824
4,elikplim/forest-fires-data-set,Forest Fires Data Set,7KB,2017-09-04 04:08:24,5082,72,0.64705884
...,...,...,...,...,...,...,...
106,suyiyang/nb-svm,NB_SVM,10MB,2018-02-26 19:24:26,32,0,0.23529412
107,nishithasaravanan/digit-svm,Digit_svm,9MB,2021-03-11 16:33:02,1,0,0.11764706
108,bharathmukka/segmentation-svm,Segmentation_SVM,94KB,2020-06-08 16:16:33,6,0,0.23529412
109,lilylc/gesture-mod,Gesture_mod,6MB,2020-10-30 14:59:57,0,0,0.125


### Pulling dataset metadata

Note: Unable to find a way to store metadata in memory as opposed to saving file, but this workaround appears to be functional.

In [16]:
def _retrieve_metadata_json(dataset_path):
    '''
    Queries Kaggle for metadata json file & returns the json data as a dictionary
    
    Params:
    - dataset_path (str): path for the dataset
    
    Returns:
    - metadata_dict (dict): dictionary containing json metadata
    '''
    # Download the metadata
    subprocess.run(['kaggle', 'datasets', 'metadata', dataset_path])

    # Access the metadata and load it in as a dictionary
    with open('dataset-metadata.json') as file:
        json_data = json.load(file)
        
    return json_data

In [17]:
def get_metadata(dataset_paths):
    '''
    Retrieves the metadata for the file/files listed in dataset_paths
    
    Params:
    - dataset_paths (str/list-like): string or list of strings containing the paths for the datasets
    
    Returns:
    - metadata_df (pandas.DataFrame): DataFrame containing metadata for the requested datasets
    '''
    # Make sure our input is valid
    assert len(dataset_paths) > 0, 'Please enter at least one dataset path'
    
    # If a singular search term is provided as a string, need to wrap it in a list
    if type(dataset_paths) == str:
        dataset_paths = [dataset_paths]
        
    # Run first query
    json_data = _retrieve_metadata_json(dataset_paths[0])
        
    # Create DataFrame to store metadata in, using columns found in first query, and then add query info
    metadata_df = pd.DataFrame(columns=json_data.keys(), dtype=object)
    metadata_df = metadata_df.append(json_data, ignore_index=True)
        
    # Pulls metadata information for each dataset found above
    for dataset_path in tqdm(dataset_paths[1::]):
        # Download & load the metadata
        json_data = _retrieve_metadata_json(dataset_path)

        # Store the metadata into our DataFrame created above
        metadata_df = metadata_df.append(json_data, ignore_index=True)
        
    return metadata_df

In [18]:
dataset_paths = search_output_df.ref.values
metadata_df = get_metadata(dataset_paths)

100%|██████████| 110/110 [02:04<00:00,  1.13s/it]


In [19]:
metadata_df

Unnamed: 0,id,id_no,datasetId,datasetSlug,ownerUser,usabilityRating,totalViews,totalVotes,totalDownloads,title,subtitle,description,isPrivate,keywords,licenses,collaborators,data
0,vinod00725/svm-classification,249595,249595,svm-classification,vinod00725,0.294118,6520,20,1125,SVM Classification,,,False,[computer science],[{'name': 'unknown'}],[],[]
1,nishan192/letterrecognition-using-svm,164400,164400,letterrecognition-using-svm,nishan192,0.647059,10733,15,1191,Letter-Recognition,Letter Recognition using SVM,To solve an interesting letter recognition pro...,False,"[computer science, image data]",[{'name': 'unknown'}],[],[]
2,sarahvch/predicting-who-pays-back-loans,906,906,predicting-who-pays-back-loans,sarahvch,0.823529,13420,18,1715,Predicting Who Pays Back Loans,Prediction using SVM on Lending Club data 2007...,# Context \nThe data being used for this analy...,False,[lending],[{'name': 'CC0-1.0'}],[],[]
3,hugomathien/soccer,63,63,soccer,hugomathien,0.705882,1078997,3442,133155,European Soccer Database,"25k+ matches, players & teams attributes for E...",The ultimate Soccer database for data analysis...,False,"[games, video games, football, europe, sports,...",[{'name': 'ODbL-1.0'}],[],[]
4,elikplim/forest-fires-data-set,2323,2323,forest-fires-data-set,elikplim,0.647059,62871,72,5082,Forest Fires Data Set,predict the burned area of forest fires using ...,Source: https://archive.ics.uci.edu/ml/dataset...,False,"[earth and nature, earth science, business]",[{'name': 'unknown'}],[],[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,suyiyang/nb-svm,14242,14242,nb-svm,suyiyang,0.235294,742,0,32,NB_SVM,,,False,[],[{'name': 'DbCL-1.0'}],[],[]
107,nishithasaravanan/digit-svm,1205518,1205518,digit-svm,nishithasaravanan,0.117647,24,0,1,Digit_svm,,,False,[],[{'name': 'unknown'}],[],[]
108,bharathmukka/segmentation-svm,701586,701586,segmentation-svm,bharathmukka,0.235294,185,0,6,Segmentation_SVM,,,False,[],[{'name': 'CC0-1.0'}],[],[]
109,lilylc/gesture-mod,947411,947411,gesture-mod,lilylc,0.125000,164,0,0,Gesture_mod,,,False,[],[{'name': 'unknown'}],[],[]
