In [1]:
import pandas as pd
import numpy as np
from ydata_profiling import ProfileReport

# Configuration for sampling
PARQUET_INPUT = "../data/2019-Oct_chunk.parquet"
SAMPLE_OUTPUT = "../data/sample.parquet"
REPORT_OUTPUT = "../data/my_data_report.html"
SAMPLE_SIZE = 3_000_000  # upper bound on the number of rows kept in the sample

# Step 1: Load the optimized data
print("Loading optimized Parquet file...")
df = pd.read_parquet(PARQUET_INPUT)

# Step 2: Random Sampling
# Creating a small, manageable sample for quick visualizations and reports.
# DataFrame.sample draws without replacement by default and raises ValueError
# when n exceeds the number of rows, so cap the request at the dataset size
# (an input smaller than SAMPLE_SIZE is then used in full, in shuffled order).
sample_size = min(SAMPLE_SIZE, len(df))
print(f"Generating a random sample of {sample_size} records...")
# Fixed random_state keeps the sample reproducible across runs.
df_sample = df.sample(n=sample_size, random_state=42)

# Step 3: Persistence
# Save the sample to a separate file for lightweight sharing/testing
df_sample.to_parquet(SAMPLE_OUTPUT)

# Step 4: Automated EDA Report
# Using ydata-profiling to generate an HTML analysis report; minimal=True is
# the library's reduced-computation mode.
print("Generating Data Profiling Report (Minimal Mode)...")
profile = ProfileReport(df_sample, title="E-Commerce Behavioral Report", minimal=True)

# Step 5: Save report to disk
profile.to_file(REPORT_OUTPUT)

print("EDA Notebook execution completed.")

Loading optimized Parquet file...
Generating a random sample of 3000000 records...
Generating Data Profiling Report (Minimal Mode)...


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 9/9 [00:46<00:00,  5.22s/it]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

EDA Notebook execution completed.
