In [None]:
import os
import boto3
from botocore.exceptions import ClientError
from botocore.client import Config as BotoConfig
from dotenv import load_dotenv

# Load variables from a local .env file (if present) into the process environment.
load_dotenv()

# Connection timeout passed to botocore (botocore interprets it as seconds).
TIMEOUT = 180

# Shared botocore settings used by the S3 client built in copy_s3_objects().
CONFIG = BotoConfig(
    connect_timeout=TIMEOUT,
    tcp_keepalive=True,
    retries={"max_attempts": 5, "mode": "adaptive"},
)


def copy_s3_objects(source_bucket,
                    source_prefix,
                    destination_bucket,
                    destination_prefix,
                    endpoint_url="https://storage.yandexcloud.net"):
    """Copy every ``.txt`` object from one bucket into another.

    The whole source bucket is listed (no server-side prefix filter is
    applied) and each key ending in ``.txt`` is copied server-side with
    ``copy_object``. Keys that start with ``source_prefix`` have that leading
    prefix replaced by ``destination_prefix``; all other keys keep their
    original name in the destination bucket.

    Credentials are read from the ``S3_ID`` / ``S3_SECRET`` environment
    variables.

    Args:
        source_bucket: Name of the bucket to read from.
        source_prefix: Leading key prefix to be replaced in copied keys.
        destination_bucket: Name of the bucket to write to.
        destination_prefix: Prefix that replaces ``source_prefix``.
        endpoint_url: S3-compatible endpoint to talk to.

    Returns:
        None. Progress is reported on stdout.
    """
    s3 = boto3.client('s3',
                      aws_access_key_id=os.environ.get("S3_ID"),
                      aws_secret_access_key=os.environ.get("S3_SECRET"),
                      endpoint_url=endpoint_url,
                      config=CONFIG)

    # A single list_objects_v2 call returns only one page of keys; the
    # paginator follows continuation tokens so large buckets are fully copied.
    paginator = s3.get_paginator('list_objects_v2')

    for page in paginator.paginate(Bucket=source_bucket):
        # Pages without matches carry no 'Contents' key.
        for obj in page.get('Contents', []):
            source_key = obj['Key']

            if not source_key.endswith('.txt'):
                continue

            # Swap the prefix only when it really is a prefix; str.replace
            # would also rewrite a match in the middle of the key.
            if source_key.startswith(source_prefix):
                destination_key = destination_prefix + source_key[len(source_prefix):]
            else:
                destination_key = source_key

            copy_source = {
                'Bucket': source_bucket,
                'Key': source_key,
            }

            s3.copy_object(
                CopySource=copy_source,
                Bucket=destination_bucket,
                Key=destination_key
            )
            print(f"Source {source_key} was successfully copied to {destination_bucket} with prefix {destination_prefix}")

    print("Data copied successfully!")



# Not used further in this cell: copy_s3_objects() reads the same variables itself.
s3_key_id = os.environ.get("S3_ID")
s3_secret = os.environ.get("S3_SECRET")

# Fall back to the default endpoint when the variable is unset or empty;
# passing endpoint_url=None would override the function's own default.
endpoint_url = os.environ.get("S3_ENDPOINT_URL") or "https://storage.yandexcloud.net"

source_bucket = os.environ.get("S3_BUCKET_NAME")
if not source_bucket:
    # Fail early with a clear message instead of a parameter validation
    # error raised deep inside the S3 client.
    raise RuntimeError("S3_BUCKET_NAME is not set; cannot determine the source bucket")
source_prefix = "fraud-data/"

destination_bucket = "otus-task-n2"
# Keep the same key layout in the destination bucket.
destination_prefix = source_prefix

copy_s3_objects(source_bucket,
                source_prefix,
                destination_bucket,
                destination_prefix,
                endpoint_url=endpoint_url
                )

In [2]:
import boto3
import os

from dotenv import load_dotenv

load_dotenv()

# Credentials come from the environment (populated from .env above).
s3_key_id = os.environ.get("S3_ID")
s3_secret = os.environ.get("S3_SECRET")

# Bucket and key prefix of the artifacts to download.
bucket_name = "otus-task-n3"
source_prefix = "artifacts/1/73decc58d8b642b4a7cc70bb3dfaf6f9/artifacts/"
local_folder = "./local_download_folder"

# Create the local directory if it does not exist (no separate exists() check needed).
os.makedirs(local_folder, exist_ok=True)

# Initialize S3 resource
s3_resource = boto3.resource("s3",
                             aws_access_key_id=s3_key_id,
                             aws_secret_access_key=s3_secret,
                             endpoint_url="https://storage.yandexcloud.net")

bucket = s3_resource.Bucket(bucket_name)

# Download all objects with the specified prefix
for obj in bucket.objects.filter(Prefix=source_prefix):
    # Keys ending in "/" are folder placeholders; their local path is a
    # directory, so there is nothing to download for them.
    if obj.key.endswith("/"):
        continue

    target_path = os.path.join(local_folder, os.path.relpath(obj.key, source_prefix))

    # Mirror the key's directory structure locally.
    os.makedirs(os.path.dirname(target_path), exist_ok=True)

    # Download the file
    bucket.download_file(obj.key, target_path)
    print(f"Downloaded {obj.key} to {target_path}")


Downloaded artifacts/1/73decc58d8b642b4a7cc70bb3dfaf6f9/artifacts/run-name/MLmodel to ./local_download_folder\run-name\MLmodel
Downloaded artifacts/1/73decc58d8b642b4a7cc70bb3dfaf6f9/artifacts/run-name/conda.yaml to ./local_download_folder\run-name\conda.yaml
Downloaded artifacts/1/73decc58d8b642b4a7cc70bb3dfaf6f9/artifacts/run-name/python_env.yaml to ./local_download_folder\run-name\python_env.yaml
Downloaded artifacts/1/73decc58d8b642b4a7cc70bb3dfaf6f9/artifacts/run-name/requirements.txt to ./local_download_folder\run-name\requirements.txt
Downloaded artifacts/1/73decc58d8b642b4a7cc70bb3dfaf6f9/artifacts/run-name/sparkml/metadata/._SUCCESS.crc to ./local_download_folder\run-name\sparkml\metadata\._SUCCESS.crc
Downloaded artifacts/1/73decc58d8b642b4a7cc70bb3dfaf6f9/artifacts/run-name/sparkml/metadata/.part-00000.crc to ./local_download_folder\run-name\sparkml\metadata\.part-00000.crc
Downloaded artifacts/1/73decc58d8b642b4a7cc70bb3dfaf6f9/artifacts/run-name/sparkml/metadata/_SUCCESS t

In [None]:
def get_bucket_files_size(bucket_name, prefix, endpoint_url="https://storage.yandexcloud.net"):
    """Return the summed ``Size`` of all objects under ``prefix`` in a bucket.

    Args:
        bucket_name: Name of the bucket to inspect.
        prefix: Key prefix used to filter the listing.
        endpoint_url: S3-compatible endpoint to talk to.

    Returns:
        Sum of the ``Size`` fields of the listed objects; ``0`` when nothing
        matches the prefix.
    """
    s3 = boto3.client('s3', endpoint_url=endpoint_url)
    paginator = s3.get_paginator('list_objects_v2')

    total_size = 0
    for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix):
        # A page with no matching objects has no 'Contents' key at all.
        for obj in page.get('Contents', []):
            total_size += obj['Size']
    return total_size

# Sum the object sizes under `source_prefix`, then report the total converted to GB.
bucket_size = get_bucket_files_size(bucket_name=bucket_name, prefix=source_prefix)
print(f"Size of files in bucket {bucket_name}: {bucket_size / (1024**3)} GB")

In [3]:
import logging
import os
import subprocess

import sys

# List of packages to install
packages = [
    "findspark==2.0.1",
    "fsspec",
    "s3fs"
]

# Run pip through the kernel's own interpreter so the packages land in the
# environment this notebook imports from (a bare `pip` on PATH may belong to
# a different Python). One invocation lets pip resolve all packages together.
subprocess.check_call([sys.executable, "-m", "pip", "install", *packages])

import s3fs

In [10]:
def list_bucket(bucket_name, endpoint_url="https://storage.yandexcloud.net", anon=True):
    """List the entries found at a bucket path.

    Args:
        bucket_name: Bucket name, optionally followed by a key prefix
            (e.g. ``"bucket/folder"``).
        endpoint_url: S3-compatible endpoint to talk to.
        anon: Passed to ``s3fs.S3FileSystem``; the default ``True`` keeps the
            original anonymous access.

    Returns:
        Whatever ``S3FileSystem.ls`` returns for the path; callers can filter
        the result further (for example by file extension).
    """
    fs = s3fs.S3FileSystem(anon=anon, endpoint_url=endpoint_url)
    return fs.ls(bucket_name)

In [14]:
# Fetch the listing for the scripts path of the otus-task-n3 bucket.
bucket_objects = list_bucket("otus-task-n3/scripts")

In [15]:
# Bare expression: the notebook renders the listing fetched in the previous cell.
bucket_objects

['otus-task-n3/scripts/',
 'otus-task-n3/scripts/clean_fraud_data.py',
 'otus-task-n3/scripts/install_packages.py',
 'otus-task-n3/scripts/pyspark_script.py']