# Download All Colabs

In [98]:
# Service-account key file. It is read by authenticate_with_service_account()
# and also handed to rclone as its `service_account_file` option further down.
# The path is relative, so it is resolved against the kernel's working directory.
SERVICE_ACCOUNT_FILE = 'turing-delivery-g-ga-e36eb2300714.json'

# Scopes requested for the credentials: full Drive access plus Sheets access,
# so one credentials object can back both clients built below.
SCOPES = [
    "https://www.googleapis.com/auth/drive",
    "https://www.googleapis.com/auth/spreadsheets",
]

def authenticate_with_service_account():
    """Load service-account credentials from the configured key file.

    The key is read from ``SERVICE_ACCOUNT_FILE`` and the credentials are
    scoped to the entries of ``SCOPES``.

    Returns
    -------
    The object returned by
    ``service_account.Credentials.from_service_account_file``.
    """
    return service_account.Credentials.from_service_account_file(
        SERVICE_ACCOUNT_FILE, scopes=SCOPES
    )

# Load the service-account credentials once; the same object is shared by the
# Drive client and the gspread client below.
credentials = authenticate_with_service_account()

# Drive API v3 client built on those credentials
drive_service = build("drive", "v3", credentials=credentials)

# gspread (Sheets) client authorised with the same credentials
gspread_client = gspread.authorize(credentials)

print("✅ Successfully authenticated for both Google Drive and Google Sheets!\nUse drive_service and gspread_client variables")

REMOTE_NAME = "gdrive"

if not rclone.is_installed():
    !curl https://rclone.org/install.sh | sudo bash
    print("rclone Downloaded and Installed!")
else:
    print("rclone already installed")

# --- Define the configuration parameters for the rclone remote ---
# The keys here correspond to rclone's configuration options.
config = {
    "type": "drive",
    "scope": "drive.readonly",  # Use 'drive' for read/write or 'drive.readonly'
    "service_account_file": str(SERVICE_ACCOUNT_FILE),
    # Add other options here if needed (see advanced section below)
}

# Force recreate the remote to ensure it uses the correct service account file
if rclone.check_remote_existing(REMOTE_NAME):
    try:
        # Delete the existing remote using subprocess
        import subprocess
        result = subprocess.run(['rclone', 'config', 'delete', REMOTE_NAME], 
                              capture_output=True, text=True, input='y\n')
        if result.returncode == 0:
            print(f"Deleted existing remote '{REMOTE_NAME}' to refresh configuration")
        else:
            print(f"Warning: Could not delete existing remote: {result.stderr}")
    except Exception as e:
        print(f"Warning: Could not delete existing remote: {e}")

try:
    # The create_remote function writes to the rclone.conf file.
    rclone.create_remote(
        remote_name=REMOTE_NAME,
        remote_type=RemoteTypes.drive, # This is equivalent to "drive"
        **config # Pass the config dictionary as keyword arguments
    )
    print(f"Success! Remote '{REMOTE_NAME}' created with service account: {SERVICE_ACCOUNT_FILE}")

except Exception as e:
    print(f"An error occurred during rclone setup: {e}")
    # If it still fails, let's try to use the existing remote
    print(f"Will proceed with existing remote configuration")

✅ Successfully authenticated for both Google Drive and Google Sheets!
Use drive_service and gspread_client variables
rclone already installed
Deleted existing remote 'gdrive' to refresh configuration


Success! Remote 'gdrive' created with service account: turing-delivery-g-ga-e36eb2300714.json


In [99]:
def reset_directory(DIR_TO_RESET):
    """Wipe ``DIR_TO_RESET`` (if it exists) and recreate it as an empty directory.

    Parameters
    ----------
    DIR_TO_RESET : path of the directory to empty; missing parent directories
                   are created as needed.
    """
    target = Path(DIR_TO_RESET)

    # Remove the whole tree first so no stale files survive the reset.
    if target.exists():
        shutil.rmtree(DIR_TO_RESET)

    target.mkdir(parents=True, exist_ok=True)
    print(f"Directory {DIR_TO_RESET} Cleaned up!")

In [100]:
def download_colabs(
    colab_df,
    *,
    dest_dir: str = "notebooks",
    remote: str  = "gdrive:",
    concurrency: int = 16,
    show_progress: bool = True,
    reset_dest_directory: bool = True
):
    """
    Fetch every Colab notebook listed in `colab_df` via rclone backend copyid.

    Parameters
    ----------
    colab_df     : DataFrame with at least ['colab_id', 'colab_name']
    dest_dir     : local folder to place the .ipynb files (created if missing)
    remote       : rclone remote name that points to Google Drive; surrounding
                   whitespace is stripped and a trailing ':' is added if absent
    concurrency  : number of parallel rclone workers ( -> xargs -P N )
    show_progress: if True add '--progress' to rclone so you see bars
    reset_dest_directory : if True, wipe and recreate `dest_dir` before
                   downloading (this happens even when `colab_df` is empty)

    Returns
    -------
    A copy of `colab_df` plus a 'local_path' column holding the expected local
    file for each row. For an empty input the column is present but empty.

    Raises
    ------
    ValueError : if `remote` is empty or only whitespace.

    Notes
    -----
    Individual download failures do not raise: the xargs/rclone exit status is
    reported in the summary and the DataFrame is still returned, so the caller
    can inspect 'local_path' to see which files are missing.
    """
    # Normalise the remote once and reject an empty one (previously an empty
    # string crashed with IndexError and stray whitespace was kept).
    remote = remote.strip()
    if not remote:
        raise ValueError("remote must be a non-empty rclone remote name")
    if not remote.endswith(":"):
        remote += ":"

    if reset_dest_directory:
        reset_directory(dest_dir)
    if colab_df.empty:
        print("Nothing to download – DataFrame is empty.")
        # Still honour the documented contract: the result has 'local_path'.
        return colab_df.assign(local_path=None)

    # ‣ 1.  Preparation -------------------------------------------------------
    dest = pathlib.Path(dest_dir)
    dest.mkdir(parents=True, exist_ok=True)

    ids   = colab_df["colab_id"].tolist()
    names = colab_df["colab_name"].fillna("").astype(str).tolist()
    total = len(ids)

    # Pre-add the local_path column so the caller can use it even if the
    # download later fails for some items.
    colab_df = colab_df.copy()
    colab_df["local_path"] = [
        str(dest / (n if n.endswith(".ipynb") else f"{n}.ipynb")) if n else
        str(dest / f"{fid}.ipynb")
        for fid, n in zip(ids, names)
    ]

    # IDs actually sent to rclone: drop missing/blank values, coerce to str and
    # de-duplicate (order preserved) so two workers never write the same file.
    id_series = colab_df["colab_id"].dropna().astype(str).str.strip()
    unique_ids = list(dict.fromkeys(fid for fid in id_series if fid))

    # Quick “before” summary
    print("─── Download plan ────────────────────────────────────────────")
    print(f"Target directory        : {dest.resolve()}")
    print(f"Remote (Drive)          : {remote}")
    print(f"Concurrent rclone jobs  : {concurrency}")
    print(f"Colabs to download      : {total:,}")
    print("──────────────────────────────────────────────────────────────")

    # ‣ 2.  Build the xargs / rclone command ---------------------------------
    cmd = [
        "xargs", "-P", str(concurrency), "-I{}",      # fan-out
        "rclone", "backend", "copyid",
        "--drive-shared-with-me",
        *(["--progress"] if show_progress else []),
        remote,          # ① remote
        "{}",            # ② placeholder → file-ID
        f"{dest}/",      # ③ destination (trailing / keeps original names)
    ]

    # ‣ 3.  Execute and time it ----------------------------------------------
    t0 = time.perf_counter()
    returncode = 0
    if unique_ids:
        id_stream = "\n".join(unique_ids).encode()
        # check=False: a single failed copy makes xargs exit non-zero, which
        # must not abort the run before the summary / return below.
        returncode = subprocess.run(cmd, input=id_stream, check=False).returncode
    elapsed = time.perf_counter() - t0

    # ‣ 4.  Post-run summary -------------------------------------------------
    # Count how many of the expected files now exist locally
    ok = sum(pathlib.Path(p).exists() for p in colab_df["local_path"])

    size_str = ""
    try:
        total_bytes = sum(p.stat().st_size for p in dest.glob("*.ipynb"))
        size_str = f"  (≈{total_bytes/1e6:,.1f} MB)"
    except OSError:
        pass                                          # ignore permission errors

    print("\n─── Download completed ───────────────────────────────────────")
    print(f"Downloaded successfully   : {ok:,} / {total:,}{size_str}")
    print(f"Elapsed time              : {elapsed:,.1f} s")
    if returncode != 0:
        print(f"xargs/rclone exit status  : {returncode}")
    if ok < total or returncode != 0:
        print("⚠️  Some notebooks may have failed – check the rclone logs.")
    print("──────────────────────────────────────────────────────────────")

    return colab_df

In [101]:
# Local folder that receives the downloaded notebooks (relative path, resolved
# against the kernel's working directory).
NOTEBOOKS_DIR = "notebooks"

# `colabs_df` is not defined in the cells shown here — presumably built earlier
# in the notebook; it must provide 'colab_id' and 'colab_name' columns.
# REMOTE_NAME carries no trailing ':'; download_colabs appends it.
# reset_dest_directory is left at its default (True), so NOTEBOOKS_DIR is wiped
# before the download starts.
# NOTE(review): with 16 parallel workers the per-process --progress output
# interleaves in the cell output; consider show_progress=False.
colabs_down_df = download_colabs(
    colabs_df,
    dest_dir=NOTEBOOKS_DIR,
    remote=REMOTE_NAME,
    concurrency=16,
    show_progress=True
)

Directory notebooks Cleaned up!
─── Download plan ────────────────────────────────────────────
Target directory        : /Users/nabeel/PycharmProjects/e2e_sanity_checks/notebooks
Remote (Drive)          : gdrive:
Concurrent rclone jobs  : 16
Colabs to download      : 2,547
──────────────────────────────────────────────────────────────
Transferred:   	          0 B / 0 B, -, 0 B/s, ETA -
Elapsed time:         0.0sTransferred:   	          0 B / 0 B, -, 0 B/s, ETA -
Elapsed time:         0.0sTransferred:   	          0 B / 0 B, -, 0 B/s, ETA -
Elapsed time:         0.0sTransferred:   	          0 B / 0 B, -, 0 B/s, ETA -
Elapsed time:         0.0sTransferred:   	          0 B / 0 B, -, 0 B/s, ETA -
Elapsed time:         0.0sTransferred:   	          0 B / 0 B, -, 0 B/s, ETA -
Elapsed time:         0.0sTransferred:   	          0 B / 0 B, -, 0 B/s, ETA -
Elapsed time:         0.0sTransferred:   	          0 B / 0 B, -, 0 B/s, ETA -
Elapsed time:         0.0sTransferred:   	          0 B /