In [None]:
# installing pyarrow and delta-spark (delta-spark pulls in pyspark as a dependency)
# pip install pyarrow
# pip install delta-spark==1.0.0

In [None]:
# import sys
# print(sys.version)
# from pyspark.sql import SparkSession

# spark = SparkSession.builder \
#     .appName("TestApp") \
#     .getOrCreate()

# print(f"spark version : {spark.version}")
# spark.stop()

from pyspark.sql import SparkSession
from delta import configure_spark_with_delta_pip

# Initialize a Spark session with Delta support.
# The builder only carries the Delta SQL extension and catalog settings;
# configure_spark_with_delta_pip() then adds the spark.jars.packages entry for
# the Delta jar that matches the pip-installed delta-spark package, so the
# Python API and the JVM jar cannot drift apart the way a hard-coded Maven
# coordinate can.
builder = (
    SparkSession.builder
    .appName("DeltaExample")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)
spark = configure_spark_with_delta_pip(builder).getOrCreate()


# Two-row sample used only to verify that Delta writes and reads work.
data = [("Alice", 1), ("Bob", 2)]
df = spark.createDataFrame(data, ["name", "id"])

# Test writing to a Delta table.
# overwriteSchema lets this cell be re-run even after another cell has
# overwritten the same path with a different set of columns.
path = "/tmp/delta-table"
(
    df.write.format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save(path)
)

# Test reading from a Delta table
df_read = spark.read.format("delta").load(path)
df_read.show()



In [None]:
# 1. Creating a Sample CSV File

import pandas as pd
import numpy as np

# Fixed seed: the random ages (and every file derived from them in later
# steps) come out identical on each Restart & Run All.
SEED = 42
rng = np.random.default_rng(SEED)

# Column data for the sample table: ten rows with a sequential id, a name,
# and a random age drawn from [18, 40) (upper bound exclusive).
data = {
    "id": np.arange(1, 11),
    "name": ["Alice", "Bob", "Charlie", "David", "Eva", "Fiona", "Gina", "Harry", "Ivy", "John"],
    "age": rng.integers(18, 40, size=10)
}

# Build the pandas DataFrame that will be written to the CSV file
df = pd.DataFrame(data)
print("df to csv")
print(df)

# Saving to CSV (index=False keeps the pandas row index out of the file)
csv_file_path = 'sample_data.csv'
print(csv_file_path)
df.to_csv(csv_file_path, index=False)
print("\n after csv file creation")
print(df)

In [None]:
# 2. Converting CSV to Parquet

# Destination for the Parquet copy of the sample data
parquet_file_path = 'sample_data.parquet'

# Load the CSV written in step 1 back into pandas
df = pd.read_csv(filepath_or_buffer=csv_file_path)

# Write the same rows out as Parquet, leaving the pandas row index out
df.to_parquet(path=parquet_file_path, index=False)
print("after converting csv file to parquet file", df, sep="\n")

In [None]:
# 3. Accessing Parquet Metadata and Viewing Blocks

import pyarrow.parquet as pq

# Open a handle on the Parquet file produced in step 2
parquet_file = pq.ParquetFile(parquet_file_path)

# File-level metadata first, then the schema
print(parquet_file.metadata)
print(parquet_file.schema)

# Walk the row groups in order and print each one as a pandas DataFrame
row_group_count = parquet_file.num_row_groups
for group_index in range(row_group_count):
    group_table = parquet_file.read_row_group(group_index)
    print(f"Row Group {group_index}:")
    print(group_table.to_pandas())


In [None]:
# 4. Converting CSV to Delta-Parquet and Viewing Data

from pyspark.sql import SparkSession
from delta import DeltaTable, configure_spark_with_delta_pip

# Initialize Spark session with Delta support.
# configure_spark_with_delta_pip() adds the Delta jar matching the installed
# delta-spark package, so this cell can start a working session by itself
# instead of relying on one created by an earlier cell. If a session is
# already running, getOrCreate() simply returns it.
builder = (
    SparkSession.builder
    .appName("DeltaExample")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)
spark = configure_spark_with_delta_pip(builder).getOrCreate()

# Load the sample CSV into a Spark DataFrame (first line is the header,
# column types are inferred from the data)
df_spark = spark.read.csv('sample_data.csv', header=True, inferSchema=True)
# Printing a Spark DataFrame shows its column summary, not its rows
print(df_spark)

# Define a path where the Delta table will be stored
path_to_delta_table = "/tmp/delta-table"

# Write DataFrame to Delta format.
# This path may already hold a Delta table with different columns (the
# earlier smoke-test cell writes name/id to the same location). A plain
# overwrite replaces the data but still enforces the existing schema, so
# overwriteSchema is required to replace the schema as well.
(
    df_spark.write.format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save(path_to_delta_table)
)

# Reading from the Delta table
delta_df = spark.read.format("delta").load(path_to_delta_table)
delta_df.show()

# Accessing DeltaTable functions
delta_table = DeltaTable.forPath(spark, path_to_delta_table)
full_history = delta_table.history()  # Provides a DataFrame with the full history of the table
full_history.show()



In [None]:
from pyspark.sql import SparkSession

# Obtain a session configured with the Delta SQL extension and catalog.
# No spark.jars.packages entry is set on this builder; an earlier variant that
# requested io.delta:delta-core_2.12:1.2.1 was disabled.
spark = (
    SparkSession.builder
    .appName("DeltaExample")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .getOrCreate()
)

# Tiny two-row DataFrame used as a quick check that the session works
test_df = spark.createDataFrame([("Alice", 1), ("Bob", 2)], schema=["name", "id"])


# Printing a Spark DataFrame shows its column summary, not its rows
print(test_df)