In [None]:
import os
import pprint
import sys

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

In [None]:
# Relative path of the Parquet file that the pandas and PyArrow inspection code reads.
parquet_file_path = "exports/sample/all-tickets.parquet"

--- Debugging Parquet File: all-tickets.parquet ---


In [None]:

# Load the Parquet file into a pandas DataFrame and print a structural summary
# (head, shape, columns, dtypes, null counts), then peek at the nested 'data' column.
# Any failure inside the try block is reported instead of raised.
print("\n--- Pandas DataFrame Inspection ---")
try:
    df = pd.read_parquet(parquet_file_path, engine='pyarrow')
    print("File loaded successfully into Pandas DataFrame.")

    print("\nDataFrame Head (first 5 rows):")
    display(df.head())  # display() gives the rich notebook rendering of the frame

    print("\nDataFrame Shape (rows, columns):", df.shape)
    print("\nDataFrame Columns:", df.columns.tolist())

    print("\nDataFrame Info (data types and non-null counts):")
    df.info()

    print("\nDataFrame Data Types:")
    print(df.dtypes)
    # Notice that the 'data' column (and any other complex columns) will likely be 'object' dtype.
    # This means Pandas is storing generic Python objects (dicts, lists) in those cells.

    print("\nCheck for Missing Values:")
    print(df.isnull().sum())

    # --- Inspecting nested data (the 'data' column) ---
    if 'data' in df.columns and not df.empty:
        print("\n--- Inspecting 'data' column (nested content) ---")
        first_nested_item = df['data'].iloc[0]
        print(f"Type of content in 'data' column (first row): {type(first_nested_item)}")
        print("\nPretty print of first item in 'data' column:")
        pprint.pprint(first_nested_item)

        # You can access nested elements directly if they are dictionaries
        # For example, if your original JSON had {"id": 1, "nested_key": {"sub_key": "value"}}
        # you might access:
        # if isinstance(first_nested_item, dict) and 'your_nested_key' in first_nested_item:
        #     print("\nAccessing a specific nested key:")
        #     pprint.pprint(first_nested_item['your_nested_key'])

        # Sample a second nested item. iloc[4] is the 5th row, so it exists as soon
        # as the frame has at least 5 rows (the previous `> 5` skipped 5-row frames).
        if len(df) >= 5:
            print("\nPretty print of 5th item in 'data' column (if exists):")
            pprint.pprint(df['data'].iloc[4])

    else:
        print("\n'data' column not found or DataFrame is empty. Cannot inspect nested content.")

except Exception as e:
    # Report the failure on stderr and carry on, rather than aborting the cell.
    print(f"\nError loading Parquet into Pandas DataFrame: {e}", file=sys.stderr)
    print("This might indicate file corruption or a specific schema issue.")



--- Pandas DataFrame Inspection ---
File loaded successfully into Pandas DataFrame.

DataFrame Head (first 5 rows):


Unnamed: 0,original_file_path,original_filename,data
0,2025/April/03/024699.json,024699.json,"{'conversations': [], 'export_version': '1.0',..."
1,2025/April/03/024748.json,024748.json,"{'conversations': [], 'export_version': '1.0',..."
2,2025/April/03/024416.json,024416.json,"{'conversations': [], 'export_version': '1.0',..."
3,2025/April/03/024504.json,024504.json,"{'conversations': [], 'export_version': '1.0',..."
4,2025/April/03/024787.json,024787.json,"{'conversations': [], 'export_version': '1.0',..."



DataFrame Shape (rows, columns): (11364, 3)

DataFrame Columns: ['original_file_path', 'original_filename', 'data']

DataFrame Info (data types and non-null counts):
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11364 entries, 0 to 11363
Data columns (total 3 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   original_file_path  11364 non-null  object
 1   original_filename   11364 non-null  object
 2   data                11364 non-null  object
dtypes: object(3)
memory usage: 133.2+ KB

DataFrame Data Types:
original_file_path    object
original_filename     object
data                  object
dtype: object

Check for Missing Values:
original_file_path    0
original_filename     0
data                  0
dtype: int64

--- Inspecting 'data' column (nested content) ---
Type of content in 'data' column (first row): <class 'dict'>

Pretty print of first item in 'data' column:
{'conversations': array([], dtype=object),
 'expo

In [None]:
# Inspect the same Parquet file through PyArrow directly: print the schema, load the
# table, show the first value of the 'data' column, and print file-level metadata.
# Any failure inside the try block is reported instead of raised.
print("\n--- PyArrow Direct Inspection ---")
try:
    # Read the Parquet schema directly
    parquet_schema = pq.read_schema(parquet_file_path)
    print("\nPyArrow Parquet Schema (detailed types including nested):")
    print(parquet_schema)
    # Look for 'struct' (for dictionaries) and 'list' (for arrays) types here.

    # Read the entire Parquet file into an Arrow Table
    arrow_table = pq.read_table(parquet_file_path)
    print(f"\nArrow Table loaded successfully. Rows: {arrow_table.num_rows}, Columns: {arrow_table.num_columns}")
    print("\nArrow Table Column Names:", arrow_table.column_names)

    # Access a specific column in PyArrow
    if 'data' not in arrow_table.column_names:
        print("\n'data' column not found in Arrow Table.")
    elif arrow_table.num_rows == 0:
        # Indexing [0] on an empty column would raise and be misreported below
        # as a malformed file, so handle the empty table explicitly.
        print("\n'data' column is present but the Arrow Table has no rows to inspect.")
    else:
        data_column = arrow_table.column('data')
        print(f"\nArrow Type of 'data' column: {data_column.type}")
        print("First value in 'data' column (as PyArrow Scalar):")
        print(data_column[0])
        # Convert to Python object for easier viewing
        print("\nFirst value in 'data' column (converted to Python object):")
        pprint.pprint(data_column[0].as_py())

    # Inspect Parquet file metadata (e.g., number of row groups, compression).
    # The context manager closes the underlying file handle when the block exits,
    # so the kernel does not keep the file open afterwards.
    with pq.ParquetFile(parquet_file_path) as parquet_file:
        print("\nParquet File Metadata:")
        print(f"Number of Row Groups: {parquet_file.num_row_groups}")
        print(f"File Version: {parquet_file.metadata.format_version}")

        # You can loop through row groups for more detail, e.g.,
        # for i in range(parquet_file.num_row_groups):
        #     row_group_metadata = parquet_file.metadata.row_group(i)
        #     print(f"  Row Group {i}: Num Rows={row_group_metadata.num_rows}, Total Byte Size={row_group_metadata.total_byte_size}")
        #     for col_meta_idx in range(row_group_metadata.num_columns):
        #         col_meta = row_group_metadata.column(col_meta_idx)
        #         print(f"    Column {col_meta.path_in_schema}: Compression={col_meta.compression}")

except Exception as e:
    # Report the failure on stderr and carry on, rather than aborting the cell.
    print(f"\nError during PyArrow direct inspection: {e}", file=sys.stderr)
    print("This could be due to a malformed Parquet file or a schema issue.")


print("\n--- Debugging complete ---")


--- PyArrow Direct Inspection ---

PyArrow Parquet Schema (detailed types including nested):
original_file_path: string
original_filename: string
data: struct<conversations: list<element: struct<attachments: list<element: struct<attachment_url: string, content_type: string, created_at: string, id: int64, name: string, size: int64, updated_at: string>>, bcc_emails: list<element: string>, body: string, body_text: string, cc_emails: list<element: string>, created_at: string, from_email: string, id: int64, incoming: bool, private: bool, source: int64, support_email: string, ticket_id: int64, to_emails: list<element: string>, updated_at: string, user_id: int64>>, export_version: string, exported_at: string, ticket: struct<category: string, cc_emails: list<element: string>, created_at: string, custom_fields: struct<business_impact: null, change_number: null, fedex_return_tracking: null, fedex_tracking: string, impacted_locations: null, jira_url: string, lf_on_behalf_of: int64, location: str