In [24]:
import os
import boto3
from botocore.config import Config
from datetime import datetime, timedelta
import git
import json

The dotenv extension is already loaded. To reload it, use:
  %reload_ext dotenv


# Setup S3 client
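The cells below read these environment variables, which should be defined in a `.env` file: `AWS_DEFAULT_REGION`, `S3_ENDPOINT_URL`, `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`, `S3_BUCKET_NAME`, and `FILE_TO_IGNORE` (a JSON-encoded list of file paths to skip).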

In [25]:
# Connection settings for the S3 endpoint, all taken from environment variables.
region_name = os.getenv("AWS_DEFAULT_REGION")
endpoint_url = os.getenv("S3_ENDPOINT_URL")
aws_access_key_id = os.getenv("AWS_ACCESS_KEY_ID")
aws_secret_access_key = os.getenv("AWS_SECRET_ACCESS_KEY")
s3_bucket_name = os.getenv('S3_BUCKET_NAME')

# Fail here with a readable message rather than later inside a boto3 call.
if not s3_bucket_name:
    raise RuntimeError("Environment variable S3_BUCKET_NAME is not set.")

# FILE_TO_IGNORE is parsed as JSON; the cells below test membership with `in`,
# so it is expected to be a list of paths. An unset or empty variable means
# "ignore nothing" instead of crashing json.loads with a TypeError on None.
to_ignore = json.loads(os.getenv("FILE_TO_IGNORE") or "[]")

s3_client = boto3.client(
    "s3",
    region_name=region_name,
    use_ssl=True,
    endpoint_url=endpoint_url,
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key
)

# Set up the Git (Gitea) backup repository and list its files

In [26]:
repo_path = "./backup_repo"

# Open the backup repository. Every later cell depends on `repo`, so stop with
# a clear error right here if the path is missing or is not a Git repository.
try:
    repo = git.Repo(repo_path)
except (git.exc.InvalidGitRepositoryError, git.exc.NoSuchPathError) as err:
    raise RuntimeError(f"{repo_path!r} is not a usable Git repository.") from err

# Not filled by any of the cells visible here; kept so that code elsewhere
# which may reference these names keeps working.
files_to_create = []
files_to_update = []
files_to_delete = []

# Accumulates a human-readable description of every change that gets synced.
commit_message = ""

# Collect every file of the working tree as a path relative to its root.
repo_files = []
for root, dirs, files in os.walk(repo.working_dir):
    # Pruning `dirs` in place stops os.walk from descending into .git
    if '.git' in dirs:
        dirs.remove('.git')
    for file in files:
        rel_path = os.path.relpath(os.path.join(root, file), repo.working_dir)
        # S3 keys always use "/", so normalize the OS separator to make the
        # later key-vs-path comparison work on every platform.
        repo_files.append(rel_path.replace(os.sep, "/"))


In [27]:
# Mirror the S3 bucket into the working tree:
#   * keys missing locally are downloaded (created),
#   * keys whose S3 LastModified is newer than the local mtime are re-downloaded,
#   * local files with no matching key are deleted.
# Every change is recorded and appended to `commit_message`, one per line.

# Create a reusable Paginator
paginator = s3_client.get_paginator('list_objects')

# Create a PageIterator from the Paginator
page_iterator = paginator.paginate(Bucket=s3_bucket_name)

change_notes = []

for page in page_iterator:
    # The listing response has no 'Contents' entry when there are no objects.
    for s3_object in page.get('Contents', []):
        key = s3_object["Key"]
        # Skip "directory" placeholder keys and explicitly ignored files.
        if key.endswith("/") or key in to_ignore:
            continue
        local_file_path = os.path.join(repo.working_dir, key)

        # Tick the key off the list of local files; whatever is left in
        # `repo_files` after the loop no longer exists in the bucket.
        try:
            repo_files.remove(key)
        except ValueError:
            # Not present locally: the file is new in the bucket.
            change_notes.append(f"File {key} was created in the S3 bucket.")
            os.makedirs(os.path.dirname(local_file_path), exist_ok=True)
            s3_client.download_file(s3_bucket_name, key, local_file_path)
            continue

        # File exists locally: compare modification times (not content hashes).
        local_file_modified = os.path.getmtime(local_file_path)
        s3_file_last_modified = s3_object['LastModified'].timestamp()
        if local_file_modified < s3_file_last_modified:
            # The bucket copy is newer than the local one.
            change_notes.append(f"File {key} was modified in the S3 bucket.")
            s3_client.download_file(s3_bucket_name, key, local_file_path)

for file_to_del in repo_files:
    if file_to_del in to_ignore:
        continue
    change_notes.append(f"File {file_to_del} was deleted from the S3 bucket.")
    local_file_path = os.path.join(repo.working_dir, file_to_del)
    # Only the working-tree file is removed here; staging happens at commit time.
    os.remove(local_file_path)

# Append the new notes to whatever message already exists, one note per line.
if change_notes:
    commit_message = "\n".join(([commit_message] if commit_message else []) + change_notes)

In [28]:
# Commit and push only when the sync recorded at least one change.
if commit_message:
    # Stage modifications and deletions of files Git already tracks.
    repo.git.add(update=True)
    # `git add --update` never touches untracked paths, so files that were
    # newly downloaded from the bucket have to be staged explicitly.
    new_files = [path for path in repo.untracked_files if path not in to_ignore]
    if new_files:
        repo.git.add("--", *new_files)
    repo.index.commit(commit_message)
    origin = repo.remote(name='origin')
    origin.push()
    print(commit_message)

File journal/2023/06/2023-06-26.md was modified in the S3 bucket.
