In [None]:
import os
import pytz
import pickle
import time
import subprocess
from datetime import datetime, timedelta

import numpy as np
import pandas as pd

from IPython.display import clear_output

In [None]:
# Kaggle competition slug; interpolated into the submissions CLI command and
# into the pickle cache filename by the polling cell further down.
competition_name = "jigsaw-agile-community-rules"

In [None]:
def save_obj(obj, filename, verbose=True):
    """Pickle ``obj`` into the file at ``filename``.

    Args:
        obj: Any picklable object.
        filename: Destination path; the file is opened in binary write mode,
            so existing content is overwritten.
        verbose: When true, print a ``Saved: <filename>`` confirmation.

    Returns:
        None
    """
    with open(filename, mode="wb") as sink:
        pickle.dump(obj, sink)

    if not verbose:
        return None

    print(f"Saved: {filename}")
    return None


def load_obj(filename, verbose=True):
    """Read back a pickled object from ``filename``.

    Note: unpickling can execute arbitrary code, so only use this on files
    that this notebook (or another trusted source) produced.

    Args:
        filename: Path of the pickle file; opened in binary read mode.
        verbose: When true, print a ``Loaded: <filename>`` confirmation.

    Returns:
        The object stored in the file.
    """
    with open(filename, mode="rb") as source:
        restored = pickle.load(source)

    if verbose:
        print(f"Loaded: {filename}")

    return restored

In [None]:
def format_dt(dt):
    """Render a ``datetime.timedelta`` as a zero-padded ``HH:MM:SS`` string.

    Args:
        dt: A ``datetime.timedelta``, or ``None``.

    Returns:
        ``""`` when ``dt`` is ``None``. Otherwise ``HH:MM:SS`` where the hour
        field is NOT wrapped at 24 (1 day + 1 hour -> ``"25:00:00"``).
        Sub-second precision is truncated. Negative durations are shown with
        a leading ``-``.
    """
    if dt is None:
        return ""

    # total_seconds() includes the `days` component; `dt.seconds` alone only
    # holds the remainder within a day and would wrap every 24 hours.
    total = int(dt.total_seconds())
    sign = "-" if total < 0 else ""

    hours, remainder = divmod(abs(total), 3600)
    minutes, seconds = divmod(remainder, 60)

    return f"{sign}{hours:02d}:{minutes:02d}:{seconds:02d}"

In [None]:
def format_line(line, rm_priv_score_col=True):
    """Trim fixed-width margins off one line of text.

    The first 16 characters are always dropped; when ``rm_priv_score_col`` is
    true the last 14 characters are dropped as well.
    (Presumably these widths match the leading and trailing columns of the
    Kaggle CLI submissions table -- TODO confirm against the CLI output.)

    Args:
        line: The raw text line.
        rm_priv_score_col: Whether to also cut the trailing 14 characters.

    Returns:
        The trimmed string (empty if the line is shorter than the cut).
    """
    trimmed = line[16:]
    return trimmed[:-14] if rm_priv_score_col else trimmed

In [None]:
# import kaggle
# from kaggle.api.kaggle_api_extended import Submission
# raw_resp = kaggle.api.competitions_submissions_list_with_http_info(id=competition_name)
# all_subs_v = [Submission(s) for s in kaggle.api.process_response(raw_resp)]

In [None]:
# Submission monitor: repeatedly runs `kaggle competitions submissions`,
# re-renders the returned table with an extra "Elapsed" column, and remembers
# the last elapsed time seen while a submission was pending in a pickle cache.
cache_filepath = f"./{competition_name}-subs-cache.pickle"
no_pending_stop = True  # while True, the loop condition below always holds
rm_private_score = True  # forwarded to format_line(rm_priv_score_col=...)

# Elapsed times from earlier runs, keyed by the submission timestamp string.
pending_dt_d = (
    load_obj(cache_filepath, verbose=False) if os.path.exists(cache_filepath) else {}
)
there_is_a_pending = True

utc = pytz.timezone("UTC")
list_cmd = f"kaggle competitions submissions {competition_name}".split()

while no_pending_stop or there_is_a_pending:
    there_is_a_pending = False

    # Retry once a minute until the CLI call (and decoding) succeeds.
    while True:
        try:
            # raw_table = kaggle.api.competition_submissions_cli(competition_name)
            raw_table = subprocess.check_output(list_cmd).decode()
            break
        except Exception as err:
            print(f'WARNING, An error occured: "{err}"')
            time.sleep(60)

    rendered = []
    for row_idx, row in enumerate(raw_table.split("\n")):
        trimmed = format_line(row, rm_priv_score_col=rm_private_score)

        if row_idx == 0:
            # First line is treated as the table header.
            rendered.append(trimmed + "Elapsed")
        elif row_idx == 1:
            # Second line is treated as the header underline.
            rendered.append(trimmed + "-" * 8)
        elif "submission.csv" not in row:
            # Anything that is not a submission row is passed through as-is.
            rendered.append(trimmed)
        else:
            fields = row.split()

            # Tokens 1 and 2 are joined and any fractional seconds dropped
            # (presumably the date and time columns -- confirm).
            started_key = f"{fields[1]} {fields[2]}".split(".")[0]
            started_at = utc.localize(
                datetime.strptime(started_key, "%Y-%m-%d %H:%M:%S")
            )
            elapsed = datetime.now(tz=utc) - started_at

            # Only rows whose last token contains "PENDING" refresh the cache,
            # so other rows keep showing the last value recorded while pending.
            if "PENDING" in fields[-1]:
                pending_dt_d[started_key] = elapsed
                save_obj(pending_dt_d, cache_filepath, verbose=False)
                there_is_a_pending = True

            rendered.append(trimmed + format_dt(pending_dt_d.get(started_key)))

    clear_output(wait=True)
    print("".join(text + "\n" for text in rendered))

    if no_pending_stop or there_is_a_pending:
        time.sleep(2 * 60)

print("\nNo more pending submissions left!!!")

In [None]:
import subprocess, re
import pandas as pd
import datetime

# Collect the data rows of `kaggle competitions files` across all result pages.
# NOTE(review): this rebinds the notebook-wide `competition_name`; cells above
# that read it will see this value if they are re-run afterwards.
competition_name = "physionet-ecg-image-digitization"
n_procs = 4
lines = []
token = ""

# Argument list (no shell), so neither the competition name nor the page token
# is ever interpreted by a shell. Built once; only the page token varies.
command = ["kaggle", "competitions", "files", "--page-size", "2000"]

while token is not None:
    page_args = ["--page-token", token] if token else []
    out = (
        subprocess.check_output(command + page_args + [competition_name])
        .decode()
        .strip()
        .split("\n")
    )

    # When more results exist the CLI is expected to print a leading
    # "Next Page Token = <token>" line before the table -- confirm against the
    # installed kaggle version. Keying off that line, instead of guessing from
    # the number of output lines, avoids mistaking the table header for a
    # token when the final page happens to be completely full.
    if "next page token" in out[0].lower():
        token = out[0].split()[-1]
        out = out[1:]
    else:
        token = None

    # Skip the table header and its underline; keep only the data rows.
    lines.extend(out[2:])


# columns = ["name", "size", "date", "time"]
# count = 0
# while True:
#     count += 1
#     if lines[count][0:3] == "---":
#         break

# lines = lines[count + 1:]
# file_df = pd.DataFrame([re.split("\s+", line.strip()) for line in lines], columns = columns)

# file_df ['size'] = pd.to_numeric(file_df['size'])
# file_df ['date'] = pd.to_datetime(file_df['date'])
# file_df ['time'] = pd.to_datetime(file_df['time']).dt.time

# file_df = file_df.sort_values(by = "size", ignore_index=True)
# file_df['split'] = file_df.index % n_procs

# file_df.info()

In [None]:
from kaggle import KaggleApi
from time import sleep

# Competition slug passed to `api.competition_list_files` inside main().
COMPETITION = "csiro-biomass"


def main():
    """List every file of the ``COMPETITION`` competition, following pagination.

    Authenticates a ``KaggleApi`` client, requests the file listing 200
    entries per page, and keeps requesting pages for as long as the response
    carries a next-page token. The size of each page is printed as progress.

    Returns:
        list: The ``files`` entries of all pages, in the order received.
    """
    api = KaggleApi()
    api.authenticate()
    page_size = 200
    all_files = []
    token = None

    while True:
        resp = api.competition_list_files(
            COMPETITION, page_size=page_size, page_token=token
        )
        d = resp.to_dict()
        files = d.get("files") or []
        print(len(files))
        all_files.extend(files)

        # Always take the token from the *current* response: a missing or
        # empty value means this was the last page. Reusing a stale token
        # would request the same page again and never terminate.
        # NOTE(review): assumes to_dict() exposes the token under the key
        # "nextPageToken" -- confirm against the installed kaggle version.
        token = d.get("nextPageToken")
        if not token:
            break

        # Pause between consecutive page requests.
        sleep(5)

    return all_files


# Run the listing; as the cell's last expression, whatever main() returns
# becomes the cell output.
main()

In [None]:
import subprocess, re
import pandas as pd
import datetime

# Parse the table printed by `kaggle datasets files` into a DataFrame sorted
# by file size.
# NOTE(review): despite its name this holds a dataset slug ("owner/dataset"),
# and it rebinds the notebook-wide `competition_name` used by earlier cells.
competition_name = "morizin/jigsaw-syn"

# Argument list (no shell), so the slug is never interpreted by a shell.
lines = (
    subprocess.check_output(["kaggle", "datasets", "files", competition_name])
    .decode()
    .strip()
    .split("\n")
)
columns = ["name", "size", "date", "time"]

# Drop the table header and its underline.
lines = lines[2:]

# Split from the right into at most len(columns) fields: the last three
# whitespace-separated tokens become size/date/time and everything before them
# stays together as the name, so file names containing spaces remain intact.
file_df = pd.DataFrame(
    [line.strip().rsplit(maxsplit=len(columns) - 1) for line in lines],
    columns=columns,
)

file_df["size"] = pd.to_numeric(file_df["size"])
file_df["date"] = pd.to_datetime(file_df["date"])
file_df["time"] = pd.to_datetime(file_df["time"]).dt.time

file_df = file_df.sort_values(by="size")

file_df.info()

In [None]:
import os

# NOTE(review): hard-coded, machine-specific absolute path -- this cell only
# works where that directory exists. The chdir runs before the import below,
# presumably so that the `src` package resolves from the project root -- confirm.
os.chdir("/Users/morizin/Documents/Code/jigsaw-competition")
from src.jigsaw.utils.common import load_csv

In [None]:
# Load the raw training CSV through the project's load_csv helper; the path is
# relative, so it presumably depends on the current working directory -- confirm.
data = load_csv("input/data/raw/train.csv")
# Bare last expression: the notebook renders `data` as the cell output.
data