In [4]:
from datetime import datetime
import json
import boto3 
from botocore.client import Config

# Endpoint of the LocalStack instance used as the S3 backend by the client below.
localstack_url = 'http://localhost:4566'

# Sample user records; each row follows the field order in _USER_FIELDS.
_USER_FIELDS = ("name", "age", "email")
data = [
    dict(zip(_USER_FIELDS, row))
    for row in (
        ("Alice", 25, "alice@gmail.com"),
        ("Bob", 30, "bob@gmail.com"),
        ("Cathy", 22, "cathy@gmail.com"),
    )
]


def store_s3_json(s3_client, items: list, bucket_name: str):
    """Serialize ``items`` to JSON and upload them as one object to S3.

    The object key is generated from the current local time and has the form
    ``stripe/YYYY/MM/DD/type=json/users_HHMMSS.json``.

    Args:
        s3_client: Object exposing ``put_object(Bucket=..., Key=..., Body=...)``
            (e.g. a boto3 S3 client).
        items: JSON-serializable list of records.
        bucket_name: Name of the destination bucket.

    Returns:
        The key the object was stored under, or ``None`` when serialization
        or the upload failed (the error is printed, not raised).
    """
    # Take a single timestamp: two separate now() calls can straddle midnight
    # and produce a date prefix that disagrees with the time suffix.
    file_name = datetime.now().strftime("stripe/%Y/%m/%d/type=json/users_%H%M%S.json")
    try:
        json_data = json.dumps(items)
        s3_client.put_object(Bucket=bucket_name, Key=file_name, Body=json_data)
    except Exception as e:
        # Best-effort upload: report the problem and signal failure via None.
        print(f"Error: {e}")
        return None
    return file_name


# Connect to MinIO (alternative S3-compatible backend; currently disabled)
# s3 = boto3.client(
#     's3',
#     endpoint_url='http://localhost:9001',
#     aws_access_key_id='minio',
#     aws_secret_access_key='minio123',
#     config=Config(signature_version='s3v4')
# )

# S3 client pointed at the LocalStack endpoint instead of real AWS.
s3 = boto3.client(
    's3',
    endpoint_url=localstack_url,
    aws_access_key_id='test',  # Use any access key and secret key for LocalStack
    aws_secret_access_key='test',
    config=Config(signature_version='s3v4')
)

# s3.create_bucket(Bucket='mybucket')

# Bucket name (single source of truth for every call below)
bucket_name = 'mybucket'

# Upload the sample records
# NOTE(review): file_name only appears in the message below; store_s3_json
# generates its own timestamped key and reports failures by printing, so this
# message is printed even when the upload did not succeed.
file_name = 'data.json'
store_s3_json(s3, items=data, bucket_name=bucket_name)
print(f'File "{file_name}" uploaded successfully to bucket "{bucket_name}".')


# List objects in the bucket (list_objects_v2 is the current listing API;
# 'Contents' is absent when the bucket is empty, hence the default).
response = s3.list_objects_v2(Bucket=bucket_name)
print(f'Objects in {bucket_name}:')
for obj in response.get('Contents', []):
    print(f'  {obj["Key"]}')

File "data.json" uploaded successfully to bucket "mybucket".
Objects in mybucket:
  stripe/2024/07/15/type=json/users_153357.json
  stripe/2024/07/16/type=json/users_090908.json
  stripe/2024/07/16/type=json/users_090924.json
  stripe/2024/07/16/type=json/users_091804.json


In [6]:
import os
# Set environment variables.
# SPARK_HOME: keep a value that is already configured and only fall back to
# the Homebrew location, so the notebook is not tied to one machine's layout.
os.environ.setdefault("SPARK_HOME", "/opt/homebrew/Cellar/apache-spark/3.5.1/libexec")
# Placeholder credentials, always overwritten so they never come from the host environment.
os.environ["AWS_ACCESS_KEY_ID"] = "dummy"
os.environ["AWS_SECRET_ACCESS_KEY"] = "dummy"

# Verify environment variables
print("SPARK_HOME:", os.environ["SPARK_HOME"])
print("AWS_ACCESS_KEY_ID:", os.environ.get("AWS_ACCESS_KEY_ID"))
print("AWS_SECRET_ACCESS_KEY:", os.environ.get("AWS_SECRET_ACCESS_KEY"))


import hashlib

def pseudonymize_doc_string(doc):
    '''
    Pseudonymisation is a deterministic type of PII-obscuring.
    Its role is to allow identifying users by their hash,
    without revealing the underlying info.

    Args:
        doc: the string to obscure, or None.

    Returns:
        The lowercase hex SHA-256 digest of ``doc`` followed by a constant
        salt, or None when ``doc`` is None, so a missing value stays missing
        instead of raising ``TypeError`` on ``None + str``.
    '''
    if doc is None:
        return None
    # Constant salt: the same input must always map to the same hash.
    # NOTE(review): the salt is hardcoded in the notebook; consider loading it
    # from configuration if the hashes must resist guessing of known emails.
    salt = 'WI@N57%zZrmk#88c'
    salted_string = doc + salt
    return hashlib.sha256(salted_string.encode()).hexdigest()


import findspark

# Run findspark.init before importing pyspark so the imports below resolve
# against the Spark installation in SPARK_HOME.
findspark.init(os.environ["SPARK_HOME"])
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType

# Extra JVM packages: the S3A connector, the AWS SDK and the ClickHouse JDBC driver.
# NOTE(review): confirm these versions match the Hadoop build of the Spark
# installation in SPARK_HOME.
spark_packages = ",".join([
    "org.apache.hadoop:hadoop-aws:3.3.2",
    "com.amazonaws:aws-java-sdk-bundle:1.11.375",
    "ru.yandex.clickhouse:clickhouse-jdbc:0.2.6",
])

# Session settings: S3A talks to the local endpoint over plain HTTP with
# path-style addressing and the credentials taken from the environment.
spark_conf = {
    "spark.jars.packages": spark_packages,
    "spark.hadoop.fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
    "spark.hadoop.fs.s3a.access.key": os.environ["AWS_ACCESS_KEY_ID"],
    "spark.hadoop.fs.s3a.secret.key": os.environ["AWS_SECRET_ACCESS_KEY"],
    "spark.hadoop.fs.s3a.endpoint": "http://localhost:4566",
    "spark.hadoop.fs.s3a.path.style.access": "true",
    "spark.hadoop.fs.s3a.connection.ssl.enabled": "false",
}

# Create SparkSession
builder = SparkSession.builder.appName("S3App")
for key, value in spark_conf.items():
    builder = builder.config(key, value)
spark = builder.getOrCreate()

# Wrap the Python hashing function as a Spark UDF that returns a string column.
pseudonymize_udf = F.udf(pseudonymize_doc_string, StringType())

# Read one uploaded JSON object from S3 (LocalStack) and replace the email
# column with its pseudonymized hash before displaying rows and schema.
source_path = "s3a://mybucket/stripe/2024/07/15/type=json/users_153357.json"
try:
    df = spark.read.json(source_path)
    hashed_email = pseudonymize_udf(F.col("email"))
    df = df.withColumn("email", hashed_email)
    df.show()
    df.printSchema()
except Exception as read_error:
    # Errors are reported, not raised; `df` may then be missing or stale.
    print(f"Error reading data: {read_error}")


SPARK_HOME: /opt/homebrew/Cellar/apache-spark/3.5.1/libexec
AWS_ACCESS_KEY_ID: dummy
AWS_SECRET_ACCESS_KEY: dummy
+---+--------------------+-----+
|age|               email| name|
+---+--------------------+-----+
| 25|0129c7f7c8cd4dda3...|Alice|
| 30|df6b0a875d719135e...|  Bob|
| 22|1676e9f3b5a8004cd...|Cathy|
+---+--------------------+-----+

root
 |-- age: long (nullable = true)
 |-- email: string (nullable = true)
 |-- name: string (nullable = true)



In [3]:

# JDBC connection settings for ClickHouse.
clickhouse_url = "jdbc:clickhouse://localhost:8123/default"
clickhouse_properties = dict(
    driver="ru.yandex.clickhouse.ClickHouseDriver",
    user="clickhouse-user",
    password="secret",
)

# Every option handed to the JDBC writer, gathered in one place.
jdbc_options = {"url": clickhouse_url, "dbtable": "users", **clickhouse_properties}

# Write DataFrame to ClickHouse (appends the rows of `df` to table "users").
# `df` is the DataFrame produced by the Spark read cell.
try:
    writer = df.write.format("jdbc").options(**jdbc_options).mode("append")
    writer.save()
    print("Data written to ClickHouse successfully.")
except Exception as write_error:
    print(f"Error writing data to ClickHouse: {write_error}")

Data written to ClickHouse successfully.


24/07/16 09:10:50 WARN JdbcUtils: Requested isolation level 1, but transactions are unsupported


In [4]:
from pymongo import MongoClient

# Authentication settings
username = 'admin' # The user created in mongo-init.js, or MONGO_INITDB_ROOT_USERNAME
password = 'admin' # The matching password
host = 'localhost' # Or the MongoDB service name from Docker Compose if Python runs in the same Docker environment
port = '27017'
database_name = 'mongo'

# Build the connection string
connection_string = f'mongodb://{username}:{password}@{host}:{port}/{database_name}'
# connection_string = f'mongodb://{username}:{password}@{host}:{port}/{database_name}?replicaSet=rs0'

# Connect to MongoDB
client = MongoClient(connection_string)
db = client[database_name]


# Access the collection
collection = db.mongousers

# Sample data to insert
data = [
    {"_id":1, "email": "user11@example.com", "name": "long le One 1", "age": 25},
    {"_id":2, "email": "user21@example.com", "name": "long le Two 1", "age": 30},
    {"_id":3,"email": "user31@example.com", "name": "long le Three 1", "age": 22}
]

# Upsert each document by its fixed _id. A plain insert_many fails with a
# duplicate-key error as soon as the cell is run a second time, because the
# same _id values already exist; replace_one(..., upsert=True) inserts the
# document when it is new and overwrites it otherwise, so the cell can be re-run.
for doc in data:
    collection.replace_one({"_id": doc["_id"]}, doc, upsert=True)

# Print the IDs that were written
print("Upserted IDs:", [doc["_id"] for doc in data])

# Show the full content of the collection
documents = collection.find({})
for doc in documents:
    print(doc)


{'_id': ObjectId('6694de078283b50447c427d7'), 'email': 'user1@example.com', 'name': 'User One', 'age': 25}
{'_id': ObjectId('6694de078283b50447c427d8'), 'email': 'user2@example.com', 'name': 'User Two', 'age': 30}
{'_id': ObjectId('6694de078283b50447c427d9'), 'email': 'user3@example.com', 'name': 'User Three', 'age': 22}
{'_id': ObjectId('66953c538283b50447c427db'), 'email': 'user11@example.com', 'name': 'User One 1', 'age': 25}
{'_id': ObjectId('66953c538283b50447c427dc'), 'email': 'user21@example.com', 'name': 'User Two 1', 'age': 30}
{'_id': ObjectId('66953c538283b50447c427dd'), 'email': 'user31@example.com', 'name': 'User Three 1', 'age': 22}
{'_id': 1, 'email': 'user11@example.com', 'name': 'User One 1', 'age': 25}
{'_id': 2, 'email': 'user21@example.com', 'name': 'User Two 1', 'age': 30}
{'_id': 3, 'email': 'user31@example.com', 'name': 'User Three 1', 'age': 22}
{'_id': 11, 'email': 'user11@example.com', 'name': 'User One 1', 'age': 25}
{'_id': 22, 'email': 'user21@example.com',