In [1]:
import os
from pyspark.sql import SparkSession

# Resolve the service-account key once so the environment variable and the
# Hadoop connector setting can never point at different files. A
# GOOGLE_APPLICATION_CREDENTIALS value that is already exported takes
# precedence over the local default path.
GCS_KEYFILE = os.environ.setdefault(
    'GOOGLE_APPLICATION_CREDENTIALS',
    '/home/longnguyen/credentials/key.json',
)
if not os.path.isfile(GCS_KEYFILE):
    # Fail here with a clear message instead of on the first GCS access.
    raise FileNotFoundError(f"GCS service-account key not found: {GCS_KEYFILE}")

# Create Spark session with GCS connector
spark = (
    SparkSession.builder
    .appName("Test GCS with Spark")
    .master("spark://node-2:7077")
    # Register the connector classes that handle gs:// paths.
    .config("spark.hadoop.fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
    .config("spark.hadoop.fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")
    # Authenticate with the service-account JSON key resolved above.
    .config("spark.hadoop.google.cloud.auth.service.account.enable", "true")
    .config("spark.hadoop.google.cloud.auth.service.account.json.keyfile", GCS_KEYFILE)
    # Connector jar (and its dependencies) requested as a Maven coordinate.
    .config("spark.jars.packages", "com.google.cloud.bigdataoss:gcs-connector:hadoop3-2.2.11")
    .getOrCreate()
)

# Verify connection
print(f"Spark Version: {spark.version}")
print(f"Master: {spark.sparkContext.master}")
print(f"App Name: {spark.sparkContext.appName}")
print("✓ Spark session with GCS connector created successfully!")

:: loading settings :: url = jar:file:/home/longnguyen/miniconda3/envs/spark_env/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.3.jar!/org/apache/ivy/core/settings/ivysettings.xml
Ivy Default Cache set to: /home/longnguyen/.ivy2.5.2/cache
The jars for the packages stored in: /home/longnguyen/.ivy2.5.2/jars
com.google.cloud.bigdataoss#gcs-connector added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-a105eaaf-57b1-4c79-bc5b-4c9573d9037f;1.0
	confs: [default]
	found com.google.cloud.bigdataoss#gcs-connector;hadoop3-2.2.11 in central
	found com.google.api-client#google-api-client-jackson2;2.0.1 in central
	found com.google.api-client#google-api-client;2.0.1 in central
	found com.google.oauth-client#google-oauth-client;1.34.1 in central
	found com.google.http-client#google-http-client;1.42.3 in central
	found org.apache.httpcomponents#httpclient;4.5.13 in central
	found org.apache.httpcomponents#httpcore;4.4.15 in central
	found commons-logging#commons

Spark Version: 4.1.0
Master: spark://node-2:7077
App Name: Test GCS with Spark
✓ Spark session with GCS connector created successfully!


In [3]:
# Destination settings shared by the cells that write to and read from GCS.
# Point GCS_BUCKET at the bucket you want the test output to land in.

GCS_BUCKET = "cdc-pipeline-data"
# Everything this notebook writes lives under the "spark-test" prefix.
GCS_BASE_PATH = "gs://" + GCS_BUCKET + "/spark-test"

In [2]:
# Build a small in-memory employee sample to use as the test DataFrame.
columns = ["name", "age", "role", "join_date"]
test_data = [
    ("Alice", 34, "Data Engineer", "2024-01-15"),
    ("Bob", 45, "Data Scientist", "2024-02-20"),
    ("Charlie", 29, "Developer", "2024-03-10"),
    ("Diana", 31, "Analyst", "2024-04-05"),
    ("Eve", 38, "Manager", "2024-05-12"),
]
df = spark.createDataFrame(data=test_data, schema=columns)

# Show the rows first, then report how many there are.
print("Test DataFrame created:")
df.show()
record_total = df.count()
print(f"Total records: {record_total}")

Test DataFrame created:


25/12/21 06:01:27 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
25/12/21 06:01:42 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
                                                                                

+-------+---+--------------+----------+
|   name|age|          role| join_date|
+-------+---+--------------+----------+
|  Alice| 34| Data Engineer|2024-01-15|
|    Bob| 45|Data Scientist|2024-02-20|
|Charlie| 29|     Developer|2024-03-10|
|  Diana| 31|       Analyst|2024-04-05|
|    Eve| 38|       Manager|2024-05-12|
+-------+---+--------------+----------+



[Stage 2:>                                                          (0 + 2) / 2]

Total records: 5


                                                                                

In [3]:
print("=== Writing test data to GCS ===\n")

# Write the test DataFrame as CSV with a header row. "overwrite" replaces any
# output already present at the target path, so the cell can be re-run.
csv_path = f"{GCS_BASE_PATH}/csv/employees"
print(f"Writing CSV to: {csv_path}")
(
    df.write
    .mode("overwrite")
    .option("header", "true")
    .csv(csv_path)
)
print("✓ CSV file written successfully\n")


print("=== All files written to GCS successfully! ===")
# Build the hint from GCS_BASE_PATH rather than repeating the "spark-test"
# prefix, so the message always matches where the data was actually written.
print(f"\nCheck your GCS bucket at: {GCS_BASE_PATH}/")

=== Writing test data to GCS ===

Writing CSV to: gs://cdc-pipeline-data/spark-test/csv/employees


NameError: name 'df' is not defined

In [4]:
# Verify the files were created by reading them back
print("=== Verifying files in GCS ===\n")


# Same location the CSV write targets, recomputed from GCS_BASE_PATH.
csv_path = f'{GCS_BASE_PATH}/csv/employees'
# Read CSV
print("Reading CSV from GCS:")
df_csv = spark.read.option("header", "true").csv(csv_path)
df_csv.show()

# Only claim success if rows actually came back; an empty result means the
# write did not produce usable data at this path.
row_count = df_csv.count()
assert row_count > 0, f"No rows read back from {csv_path}"

print("✓ All files verified successfully!")

=== Verifying files in GCS ===

Reading CSV from GCS:


                                                                                

+-------+---+--------------+----------+
|   name|age|          role| join_date|
+-------+---+--------------+----------+
|Charlie| 29|     Developer|2024-03-10|
|  Diana| 31|       Analyst|2024-04-05|
|    Eve| 38|       Manager|2024-05-12|
|  Alice| 34| Data Engineer|2024-01-15|
|    Bob| 45|Data Scientist|2024-02-20|
+-------+---+--------------+----------+

✓ All files verified successfully!


In [5]:
# Clean up (optional)
# Set DELETE_TEST_FILES to True to remove everything under GCS_BASE_PATH
# before the session is shut down. It is off by default so the output can be
# inspected in the bucket.
DELETE_TEST_FILES = False

if DELETE_TEST_FILES:
    hadoop_conf = spark._jsc.hadoopConfiguration()
    base_path = spark._jvm.org.apache.hadoop.fs.Path(GCS_BASE_PATH)
    # Resolve the filesystem from the path itself: FileSystem.get(conf) returns
    # the cluster's default filesystem, which is not necessarily the gs:// one.
    base_path.getFileSystem(hadoop_conf).delete(base_path, True)  # True = recursive
    print(f"✓ Test files deleted from {GCS_BASE_PATH}")

# Stop Spark session
spark.stop()
print("✓ Spark session stopped")

✓ Spark session stopped
