# Versioning pandas DataFrames with deltalake

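This notebook runs in the `pandas-deltalake` environment. It writes a pandas DataFrame to a Delta table, appends to it, overwrites it, and then reads the earlier versions back.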

In [24]:
import os

import pandas as pd
from deltalake import DeltaTable
from deltalake.writer import write_deltalake

## Create Delta Lake

In [25]:
# Seed data for the table: one integer column "x" with three rows.
df = pd.DataFrame(
    data=[1, 2, 3],
    columns=["x"],
)

In [26]:
# Show the frame that is about to be written.
df

Unnamed: 0,x
0,1
1,2
2,3


In [27]:
# Make sure the target directory exists; exist_ok=True means an already
# existing directory is not an error.
os.makedirs("tmp/some_delta_lake", exist_ok=True)

In [28]:
# Initial write of `df` to the table path (no `mode` is passed here, unlike the
# later append/overwrite writes). The directory listing further down shows one
# Parquet file plus `_delta_log/00000000000000000000.json` after this step.
write_deltalake("tmp/some_delta_lake", df)

In [29]:
# Open the table that was just written (no version pinned).
dt = DeltaTable("tmp/some_delta_lake")

In [30]:
# Read the table back into pandas; the output should show the same rows as `df`.
dt.to_pandas()

Unnamed: 0,x
0,1
1,2
2,3


In [31]:
# List the files on disk under the table directory (needs the `tree` CLI).
!tree tmp/some_delta_lake

[01;34mtmp/some_delta_lake[0m
├── [00m0-c095d563-6d54-47bb-a8ee-c6d6148e8d38-0.parquet[0m
└── [01;34m_delta_log[0m
    └── [00m00000000000000000000.json[0m

1 directory, 2 files


In [32]:
# Pretty-print commit file 00000000000000000000.json from the table's
# `_delta_log` directory (needs the `jq` CLI).
!jq . tmp/some_delta_lake/_delta_log/00000000000000000000.json

[1;39m{
  [0m[34;1m"commitInfo"[0m[1;39m: [0m[1;39m{
    [0m[34;1m"delta-rs"[0m[1;39m: [0m[0;32m"0.8.0"[0m[1;39m,
    [0m[34;1m"timestamp"[0m[1;39m: [0m[0;39m1683225239524[0m[1;39m
  [1;39m}[0m[1;39m
[1;39m}[0m
[1;39m{
  [0m[34;1m"protocol"[0m[1;39m: [0m[1;39m{
    [0m[34;1m"minReaderVersion"[0m[1;39m: [0m[0;39m1[0m[1;39m,
    [0m[34;1m"minWriterVersion"[0m[1;39m: [0m[0;39m1[0m[1;39m
  [1;39m}[0m[1;39m
[1;39m}[0m
[1;39m{
  [0m[34;1m"metaData"[0m[1;39m: [0m[1;39m{
    [0m[34;1m"id"[0m[1;39m: [0m[0;32m"006bc626-3d8a-4ee1-b141-07e772e360ce"[0m[1;39m,
    [0m[34;1m"name"[0m[1;39m: [0m[1;30mnull[0m[1;39m,
    [0m[34;1m"description"[0m[1;39m: [0m[1;30mnull[0m[1;39m,
    [0m[34;1m"format"[0m[1;39m: [0m[1;39m{
      [0m[34;1m"provider"[0m[1;39m: [0m[0;32m"parquet"[0m[1;39m,
      [0m[34;1m"options"[0m[1;39m: [0m[1;39m{}[0m[1;39m
    [1;39m}[0m[1;39m,
    [0m[34;1m"schemaString"[0m

## Append to Delta Lake

In [33]:
# Rows to append: same single column "x" as the existing table.
df2 = pd.DataFrame(
    data=[8, 9, 10],
    columns=["x"],
)

In [34]:
# Add df2's rows to the existing table by writing with mode="append".
write_deltalake("tmp/some_delta_lake", df2, mode="append")

In [35]:
# Re-list the directory to see what the append added on disk (needs `tree`).
!tree tmp/some_delta_lake

[01;34mtmp/some_delta_lake[0m
├── [00m0-c095d563-6d54-47bb-a8ee-c6d6148e8d38-0.parquet[0m
├── [00m1-75c05f5e-116c-4d87-aebf-1b80ad497d66-0.parquet[0m
└── [01;34m_delta_log[0m
    ├── [00m00000000000000000000.json[0m
    └── [00m00000000000000000001.json[0m

1 directory, 4 files


In [36]:
# Read the current table; the output shows the original rows followed by the
# appended ones.
DeltaTable("tmp/some_delta_lake").to_pandas()

Unnamed: 0,x
0,1
1,2
2,3
3,8
4,9
5,10


## Overwrite Delta Lake

In [37]:
# Replacement data used for the overwrite: same single column "x".
df3 = pd.DataFrame(
    data=[55, 66, 77],
    columns=["x"],
)

In [38]:
# Show the replacement frame before writing it.
df3

Unnamed: 0,x
0,55
1,66
2,77


In [39]:
# Replace the table contents with df3 by writing with mode="overwrite".
write_deltalake("tmp/some_delta_lake", df3, mode="overwrite")

In [40]:
# Re-list the directory (needs `tree`). Per the listing below, the Parquet
# files from the earlier writes are still on disk after the overwrite.
!tree tmp/some_delta_lake

[01;34mtmp/some_delta_lake[0m
├── [00m0-c095d563-6d54-47bb-a8ee-c6d6148e8d38-0.parquet[0m
├── [00m1-75c05f5e-116c-4d87-aebf-1b80ad497d66-0.parquet[0m
├── [00m2-b5214974-a0a4-42cf-84ba-f8d4efcb9131-0.parquet[0m
└── [01;34m_delta_log[0m
    ├── [00m00000000000000000000.json[0m
    ├── [00m00000000000000000001.json[0m
    └── [00m00000000000000000002.json[0m

1 directory, 6 files


In [41]:
# Read the current table; the output now shows only df3's rows.
DeltaTable("tmp/some_delta_lake").to_pandas()

Unnamed: 0,x
0,55
1,66
2,77


In [42]:
# Pretty-print the commit file written by the overwrite (needs `jq`). The
# output shows an `add` entry for the new Parquet file and a `remove` entry
# for an older one.
!jq . tmp/some_delta_lake/_delta_log/00000000000000000002.json

[1;39m{
  [0m[34;1m"add"[0m[1;39m: [0m[1;39m{
    [0m[34;1m"path"[0m[1;39m: [0m[0;32m"2-b5214974-a0a4-42cf-84ba-f8d4efcb9131-0.parquet"[0m[1;39m,
    [0m[34;1m"size"[0m[1;39m: [0m[0;39m1654[0m[1;39m,
    [0m[34;1m"partitionValues"[0m[1;39m: [0m[1;39m{}[0m[1;39m,
    [0m[34;1m"modificationTime"[0m[1;39m: [0m[0;39m1683225508280[0m[1;39m,
    [0m[34;1m"dataChange"[0m[1;39m: [0m[0;39mtrue[0m[1;39m,
    [0m[34;1m"stats"[0m[1;39m: [0m[0;32m"{\"numRecords\": 3, \"minValues\": {\"x\": 55}, \"maxValues\": {\"x\": 77}, \"nullCount\": {\"x\": 0}}"[0m[1;39m,
    [0m[34;1m"tags"[0m[1;39m: [0m[1;30mnull[0m[1;39m
  [1;39m}[0m[1;39m
[1;39m}[0m
[1;39m{
  [0m[34;1m"remove"[0m[1;39m: [0m[1;39m{
    [0m[34;1m"path"[0m[1;39m: [0m[0;32m"0-c095d563-6d54-47bb-a8ee-c6d6148e8d38-0.parquet"[0m[1;39m,
    [0m[34;1m"deletionTimestamp"[0m[1;39m: [0m[0;39m1683225508281[0m[1;39m,
    [0m[34;1m"dataChange"[0m[1;39m: [0m[0;

## Confirm other versions are still accessible

In [43]:
# Pin version=0: the output shows the table as it was first written.
DeltaTable("tmp/some_delta_lake", version=0).to_pandas()

Unnamed: 0,x
0,1
1,2
2,3


In [44]:
# Pin version=1: the output shows the table as it was after the append.
DeltaTable("tmp/some_delta_lake", version=1).to_pandas()

Unnamed: 0,x
0,1
1,2
2,3
3,8
4,9
5,10


In [45]:
# No version given: the output matches the explicit version=2 read in the
# next cell.
DeltaTable("tmp/some_delta_lake").to_pandas()

Unnamed: 0,x
0,55
1,66
2,77


In [46]:
# Pin version=2: the output shows the table as it was after the overwrite.
DeltaTable("tmp/some_delta_lake", version=2).to_pandas()

Unnamed: 0,x
0,55
1,66
2,77


## Schema enforcement prevents bad appends

In [47]:
# Deliberately mismatched data: the column is named "y", whereas the frames
# written to the table so far all use "x".
df4 = pd.DataFrame(
    data=[111, 222],
    columns=["y"],
)

In [48]:
# Try to append a frame whose only column ("y") differs from the column the
# table was written with ("x"). The write is expected to be rejected.
#
# The error is caught and printed rather than left uncaught so that
# "Restart & Run All" keeps going and still reaches the cleanup cell; an
# uncaught exception would stop execution here and leave `tmp/` behind.
#
# NOTE(review): in the recorded output the "Table schema" / "Data Schema"
# labels look swapped relative to what was actually written (table has "x",
# data has "y") -- confirm against the installed deltalake version.
try:
    write_deltalake("tmp/some_delta_lake", df4, mode="append")
except ValueError as err:
    # Same "<ExceptionType>: <message>" shape as the recorded output.
    print(f"{type(err).__name__}: {err}")
else:
    # The demonstration only holds if the append was refused.
    raise AssertionError("append with a mismatched schema unexpectedly succeeded")

ValueError: Schema of data does not match table schema
Table schema:
y: int64
-- schema metadata --
pandas: '{"index_columns": [{"kind": "range", "name": null, "start": 0, "' + 358
Data Schema:
x: int64

## Cleanup

In [None]:
# Delete the scratch `tmp` directory this notebook wrote into (POSIX `rm`).
!rm -rf tmp