In [None]:
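# Install the dependencies used by this notebook (run once in a terminal,
# or prefix each command with %pip to install into the kernel's environment):
#   pip install pandas numpy pyarrow
#   pip install pyspark delta-spark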


In [1]:
# 1. Creating a Sample CSV File

import pandas as pd
import numpy as np

# Use a seeded, local random generator so the "age" column (and every output
# derived from it further down the notebook) is the same on each
# Restart Kernel -> Run All, instead of changing on every execution.
rng = np.random.default_rng(42)

# Creating a DataFrame: ten rows with sequential ids, fixed names and a
# random integer age drawn from [18, 40).
data = {
    "id": np.arange(1, 11),
    "name": ["Alice", "Bob", "Charlie", "David", "Eva", "Fiona", "Gina", "Harry", "Ivy", "John"],
    "age": rng.integers(18, 40, size=10)
}

df = pd.DataFrame(data)
print("df to csv")
print(df)

# Saving to CSV (index=False keeps the pandas row index out of the file)
csv_file_path = 'sample_data.csv'
df.to_csv(csv_file_path, index=False)

# Read the file back so this printout shows what was actually written to
# disk rather than repeating the in-memory frame.
print("\n after csv file creation")
print(pd.read_csv(csv_file_path))

df to csv
   id     name  age
0   1    Alice   28
1   2      Bob   33
2   3  Charlie   24
3   4    David   31
4   5      Eva   39
5   6    Fiona   20
6   7     Gina   26
7   8    Harry   22
8   9      Ivy   29
9  10     John   34

 after csv file creation
   id     name  age
0   1    Alice   28
1   2      Bob   33
2   3  Charlie   24
3   4    David   31
4   5      Eva   39
5   6    Fiona   20
6   7     Gina   26
7   8    Harry   22
8   9      Ivy   29
9  10     John   34


In [19]:
# 2. Converting CSV to Parquet

# Destination of the Parquet copy
parquet_file_path = 'sample_data.parquet'

# Load the CSV at csv_file_path into a pandas DataFrame
df = pd.read_csv(csv_file_path)

# Write the frame out in Parquet format; index=False leaves the pandas
# row index out of the file.
df.to_parquet(parquet_file_path, index=False)

# Show the frame that was just converted
print("after converting csv file to parquet file")
print(df)

after converting csv file to parquet file
   id     name  age
0   1    Alice   25
1   2      Bob   34
2   3  Charlie   34
3   4    David   26
4   5      Eva   38
5   6    Fiona   34
6   7     Gina   27
7   8    Harry   24
8   9      Ivy   21
9  10     John   21


In [None]:
# 3. Accessing Parquet Metadata and Viewing Blocks

import pyarrow.parquet as pq

# Open the Parquet file at parquet_file_path for inspection
parquet_file = pq.ParquetFile(parquet_file_path)

# Print the file-level metadata first, then the schema
file_metadata = parquet_file.metadata
print(file_metadata)
file_schema = parquet_file.schema
print(file_schema)

# Visit every row group in order: read it, print a label with its index,
# then print its contents converted to a pandas DataFrame.
for group_index in range(parquet_file.num_row_groups):
    row_group_table = parquet_file.read_row_group(group_index)
    print(f"Row Group {group_index}:")
    print(row_group_table.to_pandas())


In [20]:
# 4. Converting CSV to Delta-Parquet and Viewing Data

from pyspark.sql import SparkSession
# Explicit names instead of `from delta import *`, so it is clear where
# DeltaTable and the session helper come from.
from delta import DeltaTable, configure_spark_with_delta_pip

# Describe a Spark session with the Delta SQL extension and Delta catalog enabled
builder = (
    SparkSession.builder
    .appName("DeltaExample")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

# With a pip-installed delta-spark, the Delta Lake quickstart builds the
# session through configure_spark_with_delta_pip so the matching Delta JARs
# are added to the session's packages; without it the "delta" format may not
# be found at runtime.
# NOTE(review): getOrCreate() returns an already-running session if one
# exists in this kernel, in which case these settings may not be re-applied -
# restart the kernel if Delta classes are reported missing.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

# Ensure the DataFrame from the CSV is loaded into Spark.
# Reuses csv_file_path (defined in the CSV-creation cell) so this cell always
# reads the same file the earlier cells wrote and converted.
df_spark = spark.read.csv(csv_file_path, header=True, inferSchema=True)

# Define a path where the Delta table will be stored
# Make sure to change "/tmp/delta-table" to a suitable path where you have write permissions
path_to_delta_table = "/tmp/delta-table"

# Write DataFrame to Delta format; mode("overwrite") replaces the table
# contents on each run, so re-running this cell does not append duplicates.
df_spark.write.format("delta").mode("overwrite").save(path_to_delta_table)

# Reading from the Delta table
delta_df = spark.read.format("delta").load(path_to_delta_table)
delta_df.show()

# Accessing DeltaTable functions
delta_table = DeltaTable.forPath(spark, path_to_delta_table)
full_history = delta_table.history()  # Provides a DataFrame with the full history of the table
full_history.show()



24/05/08 17:31:04 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  1|  Alice| 25|
|  2|    Bob| 34|
|  3|Charlie| 34|
|  4|  David| 26|
|  5|    Eva| 38|
|  6|  Fiona| 34|
|  7|   Gina| 27|
|  8|  Harry| 24|
|  9|    Ivy| 21|
| 10|   John| 21|
+---+-------+---+

+-------+--------------------+------+--------+---------+--------------------+----+--------+---------+-----------+--------------+-------------+--------------------+------------+--------------------+
|version|           timestamp|userId|userName|operation| operationParameters| job|notebook|clusterId|readVersion|isolationLevel|isBlindAppend|    operationMetrics|userMetadata|          engineInfo|
+-------+--------------------+------+--------+---------+--------------------+----+--------+---------+-----------+--------------+-------------+--------------------+------------+--------------------+
|      0|2024-05-08 17:31:...|  NULL|    NULL|    WRITE|{mode -> Overwrit...|NULL|    NULL|     NULL|       NULL|  Serializable|        false|{numFiles -