In [1]:
import pandas as pd
import os
import gzip
import shutil
import time

In [2]:
# Input CSV to benchmark, and the gzip archive derived from it (same path
# with a ".gz" suffix appended).
FILE_NAME = 'D:/read_large_csv/2019-Oct.csv'
COMPRESSED_NAME = FILE_NAME + ".gz"

In [3]:
# Fail fast when the input CSV is missing; otherwise report its size on disk.
if not os.path.isfile(FILE_NAME):
    # Report the resolved path that was actually checked. The working
    # directory is only relevant for relative paths, and abspath() already
    # folds it in for those, so the message is accurate either way.
    raise FileNotFoundError(
        f"{os.path.abspath(FILE_NAME)} does not exist or is not a file"
    )

file_size = os.path.getsize(FILE_NAME) / (1024**2)  # bytes -> MB (1024**2 bytes each)
print(f"File found: {FILE_NAME} ({file_size:.2f} MB)")

File found: D:/read_large_csv/2019-Oct.csv (5406.01 MB)


In [4]:

# Benchmark cell: gzip the CSV, load the compressed copy with pandas, then
# filter rows by price. Load time and filter time are measured separately.
if not os.path.exists(FILE_NAME):
    raise FileNotFoundError(f"File {FILE_NAME} not found!")

# Compress only when the archive is missing or older than the source, so that
# re-running the notebook does not re-gzip the whole CSV every time.
archive_is_fresh = (
    os.path.exists(COMPRESSED_NAME)
    and os.path.getmtime(COMPRESSED_NAME) >= os.path.getmtime(FILE_NAME)
)
if not archive_is_fresh:
    # Write to a temporary name and rename on success, so a failed or
    # interrupted run never leaves a truncated archive under COMPRESSED_NAME.
    partial_name = f"{COMPRESSED_NAME}.part"
    try:
        with open(FILE_NAME, 'rb') as f_in, gzip.open(partial_name, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
        os.replace(partial_name, COMPRESSED_NAME)
    except Exception as e:
        print(f"Compression failed: {e}")
        if os.path.exists(partial_name):
            os.remove(partial_name)
        # Re-raise rather than calling exit(): exit() is an interactive-shell
        # helper that can shut the notebook kernel down and hides the traceback.
        raise

# Time file reading. perf_counter() is used because it is monotonic, which
# makes it the appropriate clock for measuring elapsed intervals.
start_load = time.perf_counter()
try:
    df = pd.read_csv(COMPRESSED_NAME, compression='gzip')
except Exception as e:
    print(f"Reading failed: {e}")
    raise
load_time = time.perf_counter() - start_load

# Validate the schema before starting the timer so that only the filter
# itself is measured.
if 'price' not in df.columns:
    raise KeyError("'price' column missing")

# Time data processing: keep rows whose price is strictly greater than 50.
start_process = time.perf_counter()
filtered_df = df[df['price'] > 50]
process_time = time.perf_counter() - start_process

print(f"\nLoaded in: {load_time:.2f}s | Filtered in: {process_time:.2f}s")
print("Sample data:")
print(filtered_df.head(2))


Loaded in: 264.96s | Filtered in: 9.86s
Sample data:
                event_time event_type  product_id          category_id  \
2  2019-10-01 00:00:01 UTC       view    17200506  2053013559792632471   
3  2019-10-01 00:00:01 UTC       view     1307067  2053013558920217191   

                category_code   brand   price    user_id  \
2  furniture.living_room.sofa     NaN  543.10  519107250   
3          computers.notebook  lenovo  251.74  550050854   

                           user_session  
2  566511c2-e2e3-422b-b695-cf8e6e792ca8  
3  7c90fc70-0e80-4590-96f3-13c02c18c713  
