<a href="https://colab.research.google.com/github/nuevocs/colab-repo/blob/main/file/data_compression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [66]:
# loading libraries
import time
import lzma
import gzip
import bz2

In [2]:
# declaring a data to compress

data = b'This is Sample DATA' * 99000
print(f"Original data size {len(data)}")

Original data size 1881000


In [72]:
type(data)

bytes

In [3]:
start_time = time.time()

compressed_data_lzma = lzma.compress(data)

end_time = time.time()

In [4]:
print(end_time - start_time)
print(len(compressed_data_lzma))

0.0652768611907959
420


In [10]:
compressed_pct =  len(compressed_data_lzma) / len(data) * 100
print(compressed_pct)

0.022328548644338118


In [16]:
# de-compress
decon = lzma.decompress(compressed_data_lzma)
check_eq = len(data) == len(decon)
print(check_eq)

True


In [41]:
from dataclasses import dataclass

@dataclass
class CompressionFile:
    original_size: int
    compressed_size: int
    compressed_time: float
    compressed_pct: float


In [71]:
def compression_comaprison(
    original_data: bytes,
    compress_algorithm: str
) -> CompressionFile:
    start_time = time.time()

    match compress_algorithm:
        case "lzma":
            compressed_data = lzma.compress(original_data)
            pass
        case "gzip":
            compressed_data = gzip.compress(original_data)
            pass
        case "bz2":
            compressed_data = bz2.compress(original_data)
            pass
        case _:
            raise ValueError
    end_time = time.time()

    compressed = CompressionFile(
        original_size=len(original_data),
        compressed_size=len(compressed_data),
        compressed_time=float(end_time - start_time),
        compressed_pct=len(compressed_data) / len(original_data) * 100
    )
    return compressed


In [70]:
compression_comaprison(data, "lzma")

CompressionFile(original_size=1881000, compressed_size=420, compressed_time=0.1038670539855957, compressed_pct=0.022328548644338118)

In [47]:
def saving_compressed_file(
    original_data: bytes,
    compress_algorithm: str,
    output_path: str
    ) -> None:
    match compress_algorithm:
        case "lzma":
            compressed_data = lzma.compress(original_data)
            pass
        case "gzip":
            compressed_data = gzip.compress(original_data)
            pass
        case "bz2":
            compressed_data = bz2.compress(original_data)
            pass
        case _:
            raise ValueError
    with open(output_path, 'wb') as f:
        f.write(data)

In [48]:
saving_compressed_file(
    data,
    "gzip",
    "file.gzip"
)

In [57]:
import requests
import json

r = requests.get("https://storage.googleapis.com/zenn-user-upload/baf7832b7c3c-20230729.png")
parsed = r.content
with open("test.png", "wb") as f:
    f.write(parsed)

In [55]:
saving_compressed_file(
    parsed,
    "lzma",
    "file.lzma"
)

In [74]:
len(parsed)
print(type(parsed))

<class 'bytes'>


In [73]:
compression_comaprison(parsed, "lzma")

CompressionFile(original_size=10287, compressed_size=9840, compressed_time=0.007701873779296875, compressed_pct=95.65470982793818)

In [76]:
def comparison_compressed(
    sample_a: CompressionFile,
    sample_b: CompressionFile
    ) -> None:
        time_diff = sample_a.compressed_time - sample_b.compressed_time
        data_diff = sample_a.compressed_size - sample_b.compressed_size

        print(f"Compression Time - Sample A - Sample B: {time_diff}")
        print(f"Compression Size - Sample A - Sample B: {data_diff}")


In [81]:
comparison_compressed(
    sample_a=compression_comaprison(data, "lzma"),
    sample_b=compression_comaprison(data, "gzip")
)

Compression Time - Sample A - Sample B: 0.0227968692779541
Compression Size - Sample A - Sample B: -4197
