# COMPARE LOCAL FILES TO NCEI FILES
The purpose of this script is to be able to provide a single place for checking whether the files that exist locally, also exist on NCEI. This script also gives you the option of uploading files that are not in NCEI to GCP for intermittent storage.
Just follow along with the instructions in the script to be able to compare and upload files to GCP.

NOTE: This script will compare based on echosounder folders. E.g. Reuben_Lasker/RL2107/EK80/... This is done to help with chunking the survey into manageable pieces.

## Step 1: Fill Out These Variables To Help

In [77]:
# --- VARIABLES TO FILL OUT ---
ship_name = "Reuben Lasker"
cruise_name = "RL2107"
echosounder = "EK80"
local_echosounder_directory_path = "../Reuben_Lasker/RL2107/EK80/"
# Set to True if you want to cache files that do not exist in NCEI by uploading
# them to GCP.
upload_non_s3_files_to_gcp = True
gcp_project_id = "ggn-nmfs-aa-dev-1"
gcp_bucket_name = "ggn-nmfs-aa-dev-1-data"
# --- VARIABLES TO FILL OUT ---

## Step 2: Run, But Ignore, The Cells Below

In [84]:
# --- HELPER FUNCTIONS ---

import os
import hashlib
from typing import Tuple, List, Union
import boto3
from botocore import UNSIGNED
from botocore.client import Config
import glob
import string
from difflib import get_close_matches
import pandas as pd
from google.cloud import storage
import traceback


RAW_DATA_FILE_TYPES = ["raw", "idx", "bot"]
CONVERTED_DATA_FILE_TYPES = ["netcdf", "nc"]


def setup_gcp_storage_objs(
    project_id: str = "ggn-nmfs-aa-dev-1",
    gcp_bucket_name: str = "ggn-nmfs-aa-dev-1-data",
) -> Tuple[storage.Client, str, storage.Client.bucket]:
    """Sets up Google Cloud Platform storage objects for use in accessing and
    modifying storage buckets.

    Args:
        project_id (str, optional): The project id of the project you want to
            access. Defaults to "ggn-nmfs-aa-dev-1".
        gcp_bucket_name (str, optional): The name of the exact bucket you want
            to access. Defaults to "ggn-nmfs-aa-dev-1-data".

    Returns:
        Tuple[storage.Client, str, storage.Client.bucket]: The storage client,
            followed by the GCP bucket name (str) and then the actual bucket
            object itself (which will be executing the commands used in this
            api).
    """

    gcp_stor_client = storage.Client(project=project_id)

    gcp_bucket = gcp_stor_client.bucket(gcp_bucket_name)

    return (gcp_stor_client, gcp_bucket_name, gcp_bucket)


def create_s3_objs(bucket_name: str = "noaa-wcsd-pds") -> Tuple:
    """Creates the s3 objects needed for using boto3 for a particular bucket.

    Args:
        bucket_name (str, optional): The bucket you want to refer to. The
            default points to the NCEI bucket. Defaults to "noaa-wcsd-pds".

    Returns:
        Tuple: The s3 client (used for certain portions of the boto3 api), the
            s3 resource (newer, more used object for accessing s3 buckets), and
            the actual s3 bucket itself.
    """
    # Setup access to S3 bucket as an anonymous user
    s3_client = boto3.client(
        "s3",
        aws_access_key_id="",
        aws_secret_access_key="",
        config=Config(signature_version=UNSIGNED),
    )
    s3_resource = boto3.resource(
        "s3",
        aws_access_key_id="",
        aws_secret_access_key="",
        config=Config(signature_version=UNSIGNED),
    )
    s3_bucket = s3_resource.Bucket(bucket_name)
    return s3_client, s3_resource, s3_bucket


def get_all_file_names_in_a_surveys_echosounder_folder(
    ship_name: str = "",
    survey_name: str = "",
    echosounder: str = "",
    s3_resource: boto3.resource = None,
    return_full_paths: bool = False,
) -> List[str]:
    """Gets all of the file names from a particular NCEI survey's echosounder
    folder.

    Args:
        ship_name (str, optional): The ship's name you want to get all surveys
            from. Defaults to None.
            NOTE: The ship's name MUST be spelled exactly as it is in NCEI. Use
            the `get_all_ship_names_in_ncei` function to see all possible NCEI
            ship names.
        survey_name (str, optional): The survey name exactly as it is in NCEI.
            Defaults to "".
        echosounder (str, optional): The echosounder used. Defaults to "".
        s3_resource (boto3.resource, optional): The resource used to perform
            this operation. Defaults to None, but creates a client for you
            instead.
        return_full_paths (bool, optional): Whether or not you want a full
            path from bucket root to the subdirectory returned. Set to false
            if you only want the subdirectory names listed. Defaults to False.

    Returns:
        List[str]: A list of strings, each being the file name. Whether
            these are full paths or just file names are specified by the
            `return_full_paths` parameter.
    """

    survey_prefix = f"data/raw/{ship_name}/{survey_name}/{echosounder}/"
    all_files = list_all_objects_in_s3_bucket_location(
        prefix=survey_prefix,
        s3_resource=s3_resource,
        return_full_paths=return_full_paths,
    )
    return all_files


def list_all_objects_in_s3_bucket_location(
    prefix: str = "",
    s3_resource: boto3.resource = None,
    return_full_paths: bool = False,
    bucket_name: str = "noaa-wcsd-pds",
) -> List[str]:
    """Lists all of the objects in a s3 bucket location denoted by `prefix`.
    Returns a list containing str. You get full paths if you specify the
    `return_full_paths` parameter.

    Args:
        prefix (str, optional): The bucket location. Defaults to "".
        s3_resource (boto3.resource, optional): The bucket resource object.
            Defaults to None.
        return_full_paths (bool, optional): Whether or not you want a full
            path from bucket root to the subdirectory returned. Set to false
            if you only want the subdirectory names listed. Defaults to False.
        bucket_name (str, optional): The bucket name. Defaults to
            "noaa-wcsd-pds".

    Returns:
        List[str]: A list of strings containing either the objects name or
            path, dependent on the `return_full_paths` parameter.
    """
    if not s3_resource:
        _, s3_resource, _ = create_s3_objs(bucket_name)

    object_keys = set()
    bucket = s3_resource.Bucket(bucket_name)
    for obj in bucket.objects.filter(Prefix=prefix):
        if return_full_paths:
            object_keys.add(obj.key)
        else:
            object_keys.add(obj.key.split("/")[-1])

    return list(object_keys)


def check_if_file_exists_in_s3(
    object_key: str = "",
    s3_resource: boto3.resource = None,
    s3_bucket_name: str = "",
) -> bool:
    """Checks to see if a file exists in an s3 bucket. Intended for use with
    NCEI, but will work with other s3 buckets as well.

    Args:
        object_key (str, optional): The object key (location of the object).
            Defaults to "".
        s3_resource (boto3.resource, optional): The boto3 resource for this
            particular bucket. Defaults to None.
        s3_bucket_name (str, optional): The bucket name. Defaults to "".

    Returns:
        bool: True if the file exists within the bucket. False otherwise.
    """

    try:
        s3_resource.Object(s3_bucket_name, object_key).load()
        return True
    except Exception:
        # object key does not exist.
        # print(e)
        return False


def get_file_size_from_s3(object_key, s3_resource):
    """Gets the file size of an object in s3."""
    obj = s3_resource.Object("noaa-wcsd-pds", object_key)
    file_size = obj.content_length
    return file_size


def get_checksum_sha256_from_s3(object_key, s3_resource):
    """Gets the SHA-256 checksum of the s3 object."""
    obj = s3_resource.Object("noaa-wcsd-pds", object_key)
    checksum = obj.checksum_sha256
    return checksum


def get_local_file_size(local_file_path: str) -> int:
    """Gets the size of a local file in bytes.

    Args:
        local_file_path (str): The local file path.

    Returns:
        int: The size of the file in bytes.
    """
    return os.path.getsize(local_file_path)


def get_local_sha256_checksum(local_file_path, chunk_size=65536) -> str:
    """
    Calculates the SHA256 checksum of a file.

    Args:
        local_file_path (str): The path to the file.
        chunk_size (int): The size of chunks to read the file in (in bytes).
                          Larger chunks can be more efficient for large files.

    Returns:
        str: The SHA256 checksum of the file as a hexadecimal string.
    """

    sha256_hash = hashlib.sha256()
    try:
        with open(local_file_path, "rb") as f:
            # Read the file in chunks to handle large files efficiently
            for chunk in iter(lambda: f.read(chunk_size), b""):
                sha256_hash.update(chunk)
        return sha256_hash.hexdigest()
    except FileNotFoundError:
        return "File not found."
    except Exception as e:
        return f"An error occurred: {e}"


def get_closest_ncei_formatted_ship_name(
    ship_name: str = "",
    s3_client: boto3.client = None,
) -> Union[str, None]:
    """Gets the closest NCEI formatted ship name to the given ship name.
    NOTE: Only use if the `data_source`=="NCEI".

    Args:
        ship_name (str, optional): The ship name to search the closest match
            for.
            Defaults to "".
        s3_client (boto3.client, optional): The client used to perform this
            operation. Defaults to None, but creates a client for you instead.

    Returns:
        Union[str, None]: The NCEI formatted ship name or None, if none
            matched.
    """

    # Create client objects if they dont exist.
    if s3_client is None:
        s3_client, _, _ = create_s3_objs()

    all_ship_names = get_all_ship_names_in_ncei(
        normalize=False, s3_client=s3_client, return_full_paths=False
    )
    close_matches = get_close_matches(
        ship_name, all_ship_names, n=3, cutoff=0.85
    )
    if len(close_matches) >= 1:
        return close_matches[0]
    else:
        return None


def get_all_ship_names_in_ncei(
    normalize: bool = False,
    s3_client: boto3.client = None,
    return_full_paths: bool = False,
):
    """Gets all of the ship names from NCEI. This is based on all of the
    folders listed under the `data/raw/` prefix.

    Args:
        normalize (bool, optional): Whether or not to normalize the ship_name
            attribute to how GCP stores it. Defaults to False.
        s3_client (boto3.client, optional): The client used to perform this
            operation. Defaults to None, but creates a client for you instead.
        return_full_paths (bool, optional): Whether or not you want a full
            path from bucket root to the subdirectory returned. Set to false
            if you only want the subdirectory names listed. Defaults to False.
    """

    # Create client objects if they dont exist.
    if s3_client is None:
        s3_client, _, _ = create_s3_objs()

    # Get the initial subdirs
    prefix = "data/raw/"
    subdirs = get_subdirectories_in_s3_bucket_location(
        prefix=prefix, s3_client=s3_client, return_full_paths=return_full_paths
    )
    if normalize:
        subdirs = [normalize_ship_name(ship_name=subdir) for subdir in subdirs]
    return subdirs


def get_subdirectories_in_s3_bucket_location(
    prefix: str = "",
    s3_client: boto3.client = None,
    return_full_paths: bool = False,
    bucket_name: str = "noaa-wcsd-pds",
) -> List[str]:
    """Gets a list of all the subdirectories in a specific bucket location
    (called a prefix). The return can be with full paths (root to folder
    inclusive), or just the folder names.

    Args:
        prefix (str, optional): The bucket folder location. Defaults to "".
        s3_client (boto3.client, optional): The bucket client object.
            Defaults to None.
        return_full_paths (bool, optional): Whether or not you want a full
            path from bucket root to the subdirectory returned. Set to false
            if you only want the subdirectory names listed. Defaults to False.
        bucket_name (str, optional): The bucket name. Defaults to
            "noaa-wcsd-pds".

    Returns:
        List[str]: A list of strings, each being the subdirectory. Whether
            these are full paths or just folder names are specified by the
            `return_full_paths` parameter.
    """
    if not s3_client:
        s3_client, _, _ = create_s3_objs(bucket_name)

    subdirs = set()
    result = s3_client.list_objects(
        Bucket=bucket_name, Prefix=prefix, Delimiter="/"
    )
    for o in result.get("CommonPrefixes"):
        subdir_full_path_from_prefix = o.get("Prefix")
        if return_full_paths:
            subdir = subdir_full_path_from_prefix
        else:
            subdir = subdir_full_path_from_prefix.replace(prefix, "")
            subdir = subdir.replace("/", "")
        subdirs.add(subdir)
    return list(subdirs)


def normalize_ship_name(ship_name: str = "") -> str:
    """Normalizes a ship's name. This is necessary for creating a deterministic
    file structure within our GCP storage bucket.
    The ship name is returned as a Title_Cased_And_Snake_Cased ship name, with
    no punctuation.
    Ex. `HENRY B. BIGELOW` will return `Henry_B_Bigelow`

    Args:
        ship_name (str, optional): The ship name string. Defaults to "".

    Returns:
        str: The formatted and normalized version of the ship name.
    """

    # Lower case the string
    ship_name = ship_name.lower()
    # Un-normalize (replace `_` with ` ` to help further processing)
    # In the edge-case that users include an underscore.
    ship_name = ship_name.replace("_", " ")
    # Remove all punctuation.
    ship_name = "".join(
        [char for char in ship_name if char not in string.punctuation]
    )
    # Title-case it
    ship_name = ship_name.title()
    # Snake-case it
    ship_name = ship_name.replace(" ", "_")

    return ship_name


def upload_file_to_gcp_bucket(
    bucket: storage.Client.bucket,
    blob_file_path: str,
    local_file_path: str,
):
    """Uploads a file to the blob storage bucket.

    Args:
        bucket (storage.Client.bucket): The bucket object used for uploading.
        blob_file_path (str): The blob's file path.
            Ex. "data/itds/logs/execute_code_files/temp.csv"
            NOTE: This must include the file name as well as the extension.
        local_file_path (str): The local file path you wish to upload to the
            blob.
    """

    if not bucket:
        _, _, bucket = setup_gcp_storage_objs()

    blob = bucket.blob(blob_file_path, chunk_size=1024 * 1024 * 1)
    # Upload a new blob
    try:
        blob.upload_from_filename(local_file_path)
    except Exception:
        print(traceback.format_exc())
        raise


def parse_correct_gcp_storage_bucket_location(
    file_name: str = "",
    file_type: str = "",
    ship_name: str = "",
    survey_name: str = "",
    echosounder: str = "",
    data_source: str = "",
    is_metadata: bool = False,
    is_survey_metadata: bool = False,
    debug: bool = False,
) -> str:
    """Calculates the correct gcp storage location based on data source, file
    type, and if the file is metadata or not.

    Args:
        file_name (str, optional): The file name (includes extension).
            Defaults to "".
        file_type (str, optional): The file type (not include the dot ".").
            Defaults to "".
        ship_name (str, optional): The ship name associated with this survey.
            Defaults to "".
        survey_name (str, optional): The survey name/identifier. Defaults
            to "".
        echosounder (str, optional): The echosounder used to gather the data.
            Defaults to "".
        data_source (str, optional): The source of the data. Can be one of
            ["NCEI", "OMAO"]. Defaults to "".
        is_metadata (bool, optional): Whether or not the file is a metadata
            file. Necessary since files that are considered metadata (metadata
            json, or readmes) are stored in a separate directory. Defaults to
            False.
        is_survey_metadata (bool, optional): Whether or not the file is a
            metadata file associated with a survey. The files are stored at
            the survey level, in the `metadata/` folder. Defaults to False.
        debug (bool, optional): Whether or not to print debug statements.
            Defaults to False.

    Returns:
        str: The correctly parsed GCP storage bucket location.
    """

    assert (
        (is_metadata and is_survey_metadata is False)
        or (is_metadata is False and is_survey_metadata)
        or (is_metadata is False and is_survey_metadata is False)
    ), (
        "Please make sure that only one of `is_metadata` and"
        " `is_survey_metadata` is True. Or you can set both to False."
    )

    # Creating the correct upload location
    if is_survey_metadata:
        gcp_storage_bucket_location = (
            f"{data_source}/{ship_name}/{survey_name}/metadata/{file_name}"
        )
    elif is_metadata:
        gcp_storage_bucket_location = (
            f"{data_source}/{ship_name}/{survey_name}/{echosounder}/metadata/"
        )
        # Figure out if its a raw or idx file (belongs in raw folder)
        if file_type.lower() in RAW_DATA_FILE_TYPES:
            gcp_storage_bucket_location = (
                gcp_storage_bucket_location + f"raw/{file_name}.json"
            )
        elif file_type.lower() in CONVERTED_DATA_FILE_TYPES:
            gcp_storage_bucket_location = (
                gcp_storage_bucket_location + f"netcdf/{file_name}.json"
            )
    else:
        # Figure out if its a raw or idx file (belongs in raw folder)
        if file_type.lower() in RAW_DATA_FILE_TYPES:
            gcp_storage_bucket_location = (
                f"{data_source}/{ship_name}/"
                f"{survey_name}/{echosounder}/data/raw/{file_name}"
            )
        elif file_type.lower() in CONVERTED_DATA_FILE_TYPES:
            gcp_storage_bucket_location = (
                f"{data_source}/{ship_name}/"
                f"{survey_name}/{echosounder}/data/netcdf/{file_name}"
            )

    if debug:
        print(
            "PARSED GCP_STORAGE_BUCKET_LOCATION: %s",
            gcp_storage_bucket_location,
        )

    return gcp_storage_bucket_location


def check_if_file_exists_in_gcp(
    bucket: storage.Bucket = None, file_path: str = ""
) -> bool:
    """Checks whether a particular file exists in GCP using the file path
    (blob).

    Args:
        bucket (storage.Bucket, optional): The bucket object used to check for
            the file. Defaults to None.
        file_path (str, optional): The blob file path within the bucket.
            Defaults to "".

    Returns:
        Bool: True if the file already exists, False otherwise.
    """

    return bucket.blob(file_path).exists()


def compare_local_cruise_files_to_cloud(
    local_cruise_file_path: str = "",
    ship_name: str = "",
    survey_name: str = "",
    echosounder: str = "",
    save_to_local_path: bool = False,
    debug: bool = False,
):
    """Compares the locally stored cruise files (per echosounder) to what
    exists on the cloud by number of files, file sizes, and
    checksums. Reports any discrepancies in the console.

    Args:
        local_cruise_file_path (str, optional): The folder path for the locally
            stored cruise data. Defaults to "".
        ship_name (str, optional): The ship name that the cruise falls under.
            Defaults to "".
        survey_name (str, optional): The survey/cruise name. Defaults to "".
        echosounder (str, optional): The specific echosounder you want to
            check. Defaults to "".
        save_to_local_path (bool, optional): If True, saves the detailed
            dataframe to this local path as an excel file. Defaults to False.
        debug (bool, optional): Whether or not to print out debug info.
            Defaults to False.
    """
    print(
        f"COMPARING LOCAL CRUISE FILES TO CLOUD FOR {ship_name}/{survey_name}/"
        f"{echosounder}"
    )

    # Create vars for use later
    _, s3_resource, _ = create_s3_objs()

    print("PARSING LOCAL FILES...")
    # Get all local files paths in cruise directory
    all_raw_file_paths = glob.glob(local_cruise_file_path + "/*.raw")
    all_idx_file_paths = glob.glob(local_cruise_file_path + "/*.idx")
    all_bot_file_paths = glob.glob(local_cruise_file_path + "/*.bot")
    # Check file numbers & types
    num_local_raw_files = len(all_raw_file_paths)
    num_local_idx_files = len(all_idx_file_paths)
    num_local_bot_files = len(all_bot_file_paths)
    num_local_files = (
        num_local_raw_files + num_local_idx_files + num_local_bot_files
    )
    # Get file names along with file paths
    # [(local_file_path, file_name_with_extension), (...)]
    all_raw_file_paths = [
        (file_path, file_path.split(os.path.sep)[-1])
        for file_path in all_raw_file_paths
    ]
    all_idx_file_paths = [
        (file_path, file_path.split(os.path.sep)[-1])
        for file_path in all_idx_file_paths
    ]
    all_bot_file_paths = [
        (file_path, file_path.split(os.path.sep)[-1])
        for file_path in all_bot_file_paths
    ]

    print("PARSING S3 FILES...")
    # Compare number of files in cruise, local vs cloud
    files_in_s3 = get_all_file_names_in_a_surveys_echosounder_folder(
        ship_name=ship_name,
        survey_name=survey_name,
        echosounder=echosounder,
        s3_resource=s3_resource,
        return_full_paths=False,
    )
    num_s3_raw_files = len([x for x in files_in_s3 if x.endswith(".raw")])
    num_s3_idx_files = len([x for x in files_in_s3 if x.endswith(".idx")])
    num_s3_bot_files = len([x for x in files_in_s3 if x.endswith(".bot")])
    num_files_in_s3 = len(files_in_s3)

    # Create a dataframe to keep track of all files and their statuses
    df = pd.DataFrame(
        columns=[
            "local_file_path",
            "local_file_name",
            "file_type",
            "s3_object_key",
            "exists_in_s3",
            "local_file_size",
            "s3_file_size",
            "file_size_match",
            "local_file_checksum",
            "s3_checksum",
            "checksum_match",
        ]
    )

    print("COMPARING FILES...")
    # Go through each local file, and compare file existence, size, checksum
    for local_file_path, file_name in all_raw_file_paths:
        file_json = {
            "local_file_path": local_file_path,
            "local_file_name": file_name,
            "file_type": file_name.split(".")[-1],
            "s3_object_key": None,
            "exists_in_s3": None,
            "local_file_size": None,
            "s3_file_size": None,
            "file_size_match": False,
            "local_file_checksum": None,
            "s3_checksum": None,
            "checksum_match": False,
        }
        # Create s3 object key
        s3_object_key = (
            f"data/raw/{ship_name}/{survey_name}/{echosounder}/{file_name}"
        )
        file_json["s3_object_key"] = s3_object_key

        # Get existence of file in s3
        file_exists_in_s3 = check_if_file_exists_in_s3(
            object_key=s3_object_key,
            s3_resource=s3_resource,
            s3_bucket_name="noaa-wcsd-pds",
        )
        file_json["exists_in_s3"] = file_exists_in_s3

        # Compare existence
        if file_exists_in_s3:
            # Get file size for s3 object key
            s3_file_size = get_file_size_from_s3(
                object_key=s3_object_key, s3_resource=s3_resource
            )
            # Get checksum for object key
            s3_checksum = get_checksum_sha256_from_s3(
                object_key=s3_object_key, s3_resource=s3_resource
            )
            # Get local file size
            local_file_size = get_local_file_size(local_file_path)
            # Get local file checksum
            local_file_checksum = get_local_sha256_checksum(local_file_path)
            file_json["s3_file_size"] = s3_file_size
            file_json["local_file_size"] = local_file_size
            if local_file_size == s3_file_size:
                file_json["file_size_match"] = True

            file_json["local_file_checksum"] = local_file_checksum
            file_json["s3_checksum"] = s3_checksum
            if local_file_checksum == s3_checksum:
                file_json["checksum_match"] = True

        file_df = pd.json_normalize(file_json)
        df = pd.concat([df, file_df], ignore_index=True)
    print("\n\n\n-------------------------")
    print("EXECUTIVE SUMMARY")
    print("-------------------------")
    print("LOCAL | S3")
    print(f"TOTAL FILES: {num_local_files} | {num_files_in_s3}")
    print(f"TOTAL .raw FILES: {num_local_raw_files} | {num_s3_raw_files}")
    print(f"TOTAL .idx FILES: {num_local_idx_files} | {num_s3_idx_files}")
    print(f"TOTAL .bot FILES: {num_local_bot_files} | {num_s3_bot_files}")
    print(f"FILES NOT IN S3: {len(df[df['exists_in_s3'] == False])}")
    print(f"FILE SIZE MISMATCHES: {len(df[df['file_size_match'] == False])}")
    print(f"CHECKSUM MISMATCHES: {len(df[df['checksum_match'] == False])}")
    print("\n-------------------------")
    print("FINDINGS")
    print("-------------------------")

    print(
        f"NUMBER OF FILES BETWEEN LOCAL AND S3 {"DO NOT" if num_files_in_s3 != (num_local_files) else ""} MATCH."
    )

    num_files_needing_reverification = len(
        df[(df[["file_size_match", "checksum_match"]].any(axis=1) & (df["exists_in_s3"]==True))]
    )
    if num_files_needing_reverification > 0:
        print(
            f"NUMBER OF FILES NEEDED FOR RE-VERIFICATION/CHECK: {num_files_needing_reverification}"
        )

    if save_to_local_path:
        df.to_excel(save_to_local_path, index=False)
        print(f"Detailed dataframe saved to {save_to_local_path}")
    else:
        print(
            "NOTE: The detailed Excel report can be saved by setting the "
            "`save_to_local_path` parameter."
        )

    return df

In [78]:
# --- IGNORE ---
ship_name = get_closest_ncei_formatted_ship_name(ship_name)
local_echosounder_directory_path = os.path.normpath(local_echosounder_directory_path)
# --- IGNORE ---

## Step 3: Let's Get The Executive Comparison Summary Of Our Local Files...

In [69]:
df = compare_local_cruise_files_to_cloud(
    local_cruise_file_path=local_echosounder_directory_path,
    ship_name=ship_name,
    survey_name=cruise_name,
    echosounder=echosounder,
    debug=True,
)

COMPARING LOCAL CRUISE FILES TO CLOUD FOR Reuben_Lasker/RL2107/EK80
PARSING LOCAL FILES...
PARSING S3 FILES...
COMPARING FILES...



-------------------------
EXECUTIVE SUMMARY
-------------------------
LOCAL | S3
TOTAL FILES: 7 | 39990
TOTAL .raw FILES: 5 | 19995
TOTAL .idx FILES: 2 | 19995
TOTAL .bot FILES: 0 | 0
FILES NOT IN S3: 1
FILE SIZE MISMATCHES: 1
CHECKSUM MISMATCHES: 5

-------------------------
FINDINGS
-------------------------
NUMBER OF FILES BETWEEN LOCAL AND S3 DO NOT MATCH.
NOTE: The detailed Excel report can be saved by setting the `save_to_local_path` parameter.


In [65]:
# TODO: pip install requirements.txt
df.head()

Unnamed: 0,local_file_path,local_file_name,file_type,s3_object_key,exists_in_s3,local_file_size,s3_file_size,file_size_match,local_file_checksum,s3_checksum,checksum_match
0,..\Reuben_Lasker\RL2107\EK80\2107RL_CW-D202107...,2107RL_CW-D20210712-T201130.raw,raw,data/raw/Reuben_Lasker/RL2107/EK80/2107RL_CW-D...,True,1075268856.0,1075268856.0,True,45a90595ade62e01fe96791690f15e76afdb32dae9d7b5...,,False
1,..\Reuben_Lasker\RL2107\EK80\2107RL_CW-D202110...,2107RL_CW-D20211012-T224001.raw,raw,data/raw/Reuben_Lasker/RL2107/EK80/2107RL_CW-D...,True,24623276.0,24623276.0,True,7d976dce9e6ea1a2c9dd079d32bde1c75f4cd596521781...,,False
2,..\Reuben_Lasker\RL2107\EK80\2107RL_FM-D202108...,2107RL_FM-D20210804-T113532.raw,raw,data/raw/Reuben_Lasker/RL2107/EK80/2107RL_FM-D...,True,1093757948.0,1093757948.0,True,6adc6ecceb50ea79004caecb9a0016644d36d60749ea5e...,,False
3,..\Reuben_Lasker\RL2107\EK80\2107RL_FM-D202108...,2107RL_FM-D20210805-T041347.raw,raw,data/raw/Reuben_Lasker/RL2107/EK80/2107RL_FM-D...,True,1093758544.0,1093758544.0,True,7246570235b60ac6197a2a1ed45b4b032398e6b71a3d85...,,False
4,..\Reuben_Lasker\RL2107\EK80\test_new_file.raw,test_new_file.raw,raw,data/raw/Reuben_Lasker/RL2107/EK80/test_new_fi...,False,,,False,,,False


## Let's See If We Can Upload Any Of The Files To GCP
We can only upload the files that do not exist in NCEI.

In [86]:
# Set-up GCP storage objects
gcp_stor_client, gcp_bucket_name, gcp_bucket = setup_gcp_storage_objs(
    project_id=gcp_project_id, gcp_bucket_name=gcp_bucket_name
)
# Iterate through the dataframe and upload each file one-by-one.
df = df.sort_values(by="local_file_size")
if upload_non_s3_files_to_gcp:
    for index, row in df.iterrows():
        if not row["exists_in_s3"]:
            local_file_path = row["local_file_path"]
            file_name = row["local_file_name"]
            file_type = row["file_type"]

            # Parse the correct GCP storage bucket location
            gcp_storage_bucket_location = (
                parse_correct_gcp_storage_bucket_location(
                    file_name=file_name,
                    file_type=file_type,
                    ship_name=ship_name,
                    survey_name=cruise_name,
                    echosounder=echosounder,
                    data_source="NCEI",
                    is_metadata=False,
                    is_survey_metadata=False,
                    debug=False,
                )
            )
            if not check_if_file_exists_in_gcp(
                bucket=gcp_bucket, file_path=gcp_storage_bucket_location
            ):
                upload_file_to_gcp_bucket(
                    bucket=gcp_bucket,
                    blob_file_path=gcp_storage_bucket_location,
                    local_file_path=local_file_path,
                )
                print(
                    f"UPLOADED {file_name} TO {gcp_storage_bucket_location}."
                )