## Libraries and settings

## Section 2: File Formats  (~7 min)

In [None]:
# Libraries
import os
import warnings
import pandas as pd
import numpy as np
import pyarrow

# Suppress all warnings for the remainder of the session
warnings.filterwarnings(action='ignore')

# Report the directory against which relative file paths are resolved
current_dir = os.getcwd()
print(current_dir)

### Task 2.1: Reading Different File Formats

The `part_2/` directory contains data in two formats:
- `apartments_data_winterthur.csv` — apartment rental data in CSV format
- `supermarkets.json` — supermarket locations from OpenStreetMap in JSON format

**Your tasks:**
1. Read the CSV file into a DataFrame using `pd.read_csv()` with appropriate parameters
2. Read the JSON file into a DataFrame using `pd.read_json()`
3. Display the first 3 rows and the shape of each DataFrame

In [None]:
# Task 2.1.1 — Read the CSV file
# Load the apartment rental data; the path is relative to the working directory.
df_apartments = pd.read_csv('apartments_data_winterthur.csv')

# DataFrame.info() writes its summary to stdout and returns None, so it is
# called directly instead of being interpolated into an f-string (which would
# print a stray "Info: None" after the summary).
print('Info:')
df_apartments.info()

# Shape as (rows, columns), as requested by the task
print(f'Shape: {df_apartments.shape}')

# First 3 rows (rendered as the cell output)
df_apartments.head(3)

In [None]:
# Task 2.1.2 — Read the JSON file
# Load the supermarket data; the path is relative to the working directory.
df_supermarkets = pd.read_json('supermarkets.json')

# Shape as (rows, columns), as requested by the task
print(f'Shape: {df_supermarkets.shape}')

# The task asks for the first 3 rows (not the last 3), mirroring the CSV cell
df_supermarkets.head(3)

### Task 2.2: Nested Objects

The `df_supermarkets` DataFrame has a nested object in the `tags` column. Each row contains a dictionary with multiple OSM (OpenStreetMap) attributes like brand, opening hours, address details, etc.

**Your tasks:**
1. Inspect the `tags` column to understand its structure (display one example)
2. Flatten the nested `tags` dictionary into separate columns using `pd.json_normalize()`
3. Combine the flattened columns with the original location columns (`type`, `id`, `lat`, `lon`)
4. Drop the original `tags` column and display the resulting DataFrame
5. Compare the shape before and after flattening

In [None]:
# Task 2.2 Solution — Flattening Nested Objects

# Step 1: Look at a single entry of the nested 'tags' column
print("Step 1: Inspect one example of the nested 'tags' column")

# Fetch the first row's tags value once and reuse it for every report line
first_tags = df_supermarkets['tags'].iloc[0]
first_tags_type = type(first_tags)

print(f"Type of tags[0]: {first_tags_type}")
print("\nExample tags dictionary (first row):")
print(first_tags)
print(f"\nNumber of keys in first row: {len(first_tags)}")



In [None]:
# Step 2 & 3: Flatten the tags column and combine with the original columns

# Build one record per row. Any 'tags' value that is not a dict (e.g. NaN for
# a row without tags) is treated as an empty dict, so that row gets NaN in
# every flattened column instead of making json_normalize raise.
tag_records = [
    tags if isinstance(tags, dict) else {}
    for tags in df_supermarkets['tags']
]

# Expand every tags dictionary into one column per key
tags_normalized = pd.json_normalize(tag_records)

# pd.concat(axis=1) aligns rows on index labels, not on position. Give the
# flattened frame the same index as the source so rows pair up correctly even
# if df_supermarkets does not carry a default 0..n-1 index.
tags_normalized.index = df_supermarkets.index

# Keep only the location columns from the original frame; leaving 'tags' out
# of the selection is what drops the nested column from the result.
df_supermarkets_flattened = pd.concat(
    [df_supermarkets[['type', 'id', 'lat', 'lon']], tags_normalized],
    axis=1,
)

# Compare the shape before and after flattening
print(f"Original shape: {df_supermarkets.shape}")
print(f"Flattened shape: {df_supermarkets_flattened.shape}")



In [None]:
# Step 4: Summarise the flattened DataFrame
# Each print emits a heading and its value on separate lines via sep="\n".
column_names = df_supermarkets_flattened.columns.tolist()
print("Column names:", column_names, sep="\n")

preview = df_supermarkets_flattened.head(3)
print("\nFirst 3 rows of flattened data:", preview, sep="\n")

column_dtypes = df_supermarkets_flattened.dtypes
print("\nData types:", column_dtypes, sep="\n")



### Task 2.3: File Format Conversion

**Your tasks:**
1. Take the apartments DataFrame (from Task 2.1) and write it to a **Parquet** file
2. Read the Parquet file back and verify the data is identical
3. Compare the file sizes of the CSV and Parquet files

In [None]:
# Task 2.3.1 — Persist the apartments data as Parquet
# The target path is named once and reused for both the write and the read.
parquet_path = 'apartments_winterthur.parquet'
df_apartments.to_parquet(parquet_path, index=False)

# Task 2.3.2 — Load the Parquet file again and check the round trip
df_from_parquet = pd.read_parquet(parquet_path)

original_shape = df_apartments.shape
parquet_shape = df_from_parquet.shape
print(f'Original shape:  {original_shape}')
print(f'Parquet shape:   {parquet_shape}')

# DataFrame.equals reports whether the reloaded frame matches the original
round_trip_ok = df_apartments.equals(df_from_parquet)
print(f'DataFrames equal: {round_trip_ok}')

In [None]:
# Task 2.3.3 — Compare file sizes
# Query the on-disk size (in bytes) of the CSV source and the Parquet copy,
# in that order.
csv_size, parquet_size = (
    os.path.getsize(path)
    for path in ('apartments_data_winterthur.csv', 'apartments_winterthur.parquet')
)

print(f'CSV file size:     {csv_size:>10,} bytes')
print(f'Parquet file size:  {parquet_size:>10,} bytes')

# How many times larger the CSV is than the Parquet file
size_ratio = csv_size / parquet_size
print(f'Compression ratio:  {size_ratio:.2f}x')