## Libraries and settings

## Section 2: File Formats

In [1]:
# Libraries
import os
import warnings
import pandas as pd
import numpy as np
import pyarrow

# Ignore warnings
warnings.filterwarnings('ignore')

# Show current working directory
print(os.getcwd())

/workspaces/data_engineer_assessment/part_2


### Task 2.1: Reading Different File Formats

The `part_2/` directory contains data in two formats:
- `apartments_data_winterthur.csv` — apartment rental data in CSV format
- `supermarkets.json` — supermarket locations from OpenStreetMap in JSON format

**Your tasks:**
1. Read the CSV file into a DataFrame
2. Read the JSON file into a DataFrame
3. Display the first 3 rows and the shape of each DataFrame

In [3]:
# Task 2.1.1 — Read the CSV file
# TODO: Read 'apartments_data_winterthur.csv' into a DataFrame
df_apartments = pd.read_csv('apartments_data_winterthur.csv')

# TODO: Print info and first 3 rows
print(f'Info: ')
df_apartments.info()
df_apartments.head(3)


Info: 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120 entries, 0 to 119
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   web-scraper-order      120 non-null    object
 1   web-scraper-start-url  120 non-null    object
 2   rooms_area_price_raw   120 non-null    object
 3   address_raw            120 non-null    object
 4   price_raw              120 non-null    object
 5   description_raw        120 non-null    object
 6   text_raw               120 non-null    object
dtypes: object(7)
memory usage: 6.7+ KB


Unnamed: 0,web-scraper-order,web-scraper-start-url,rooms_area_price_raw,address_raw,price_raw,description_raw,text_raw
0,1693993818-1,https://www.immoscout24.ch/de/wohnung/mieten/o...,"6,5 Zimmer, 143 m², CHF 3017.—","Am Eulachpark 25, 8404 Winterthur, ZH",CHF 3017.—,«Sie suchen die spezielle Maisonettewohnung?»,"6,5 Zimmer, 143 m², CHF 3017.—Am Eulachpark 25..."
1,1693993818-2,https://www.immoscout24.ch/de/wohnung/mieten/o...,"1 Zimmer, 132 m², CHF 3260.—","Katharina Sulzer Platz 2, 8400 Winterthur, ZH",CHF 3260.—,«In Loft-iger Höhe MIETEN OHNE KAUTION»,"1 Zimmer, 132 m², CHF 3260.—Katharina Sulzer P..."
2,1693993818-3,https://www.immoscout24.ch/de/wohnung/mieten/o...,"4,5 Zimmer, 117 m², CHF 3782.—","8400 Winterthur, ZH",CHF 3782.—,"«MÖBLIERT, TEMPORÄR: 4½ ZI-WOHNUNG IN WINTERTH...","4,5 Zimmer, 117 m², CHF 3782.—8400 Winterthur,..."


In [6]:
# Task 2.1.2 — Read the JSON file
# TODO: Read 'supermarkets.json' into a DataFrame
df_supermarkets = pd.read_json('supermarkets.json')

# TODO: Print last 3 rows
df_supermarkets.tail(3)

Unnamed: 0,type,id,lat,lon,tags
3389,node,11107076347,47.466556,9.04825,"{'addr:city': 'Wil SG', 'addr:housenumber': '3..."
3390,node,11107594883,47.322228,8.529748,"{'addr:city': 'Adliswil', 'addr:housenumber': ..."
3391,node,11129298207,47.537518,7.608581,"{'brand': 'Coop', 'brand:wikidata': 'Q432564',..."


### Task 2.2: Nested Objects

The `df_supermarkets` DataFrame has a nested object in the `tags` column. Each row contains a dictionary with multiple OSM (OpenStreetMap) attributes like brand, opening hours, address details, etc.

**Your tasks:**
1. Inspect the `tags` column to understand its structure (display one example)
2. Flatten the nested `tags` dictionary into separate columns
3. Combine the flattened columns with the original location columns (`type`, `id`, `lat`, `lon`)
4. Drop the original `tags` column and display the resulting DataFrame
5. Compare the shape before and after flattening

In [None]:
# Task 2.2 Solution — Flattening Nested Objects

#TODO Step 1: Inspect the tags column structure
df_supermarkets.tags
df_supermarkets.tags


AttributeError: 'Series' object has no attribute 'brand'

In [10]:
# Step 2 & 3: Flatten the tags columns and combine with original columns

#TODO Flatten the tags dictionary into separate columns
tags_normalized = pd.json_normalize(df_supermarkets['tags'])
tags_normalized.head()





Unnamed: 0,brand,brand:wikidata,brand:wikipedia,name,opening_hours,shop,addr:city,addr:housenumber,addr:postcode,addr:street,...,source:addr,opening_date,postid,operator:website,diet:local,payment:account_cards,diet:fish,diet:mediterranean,diet:organic,diet:seafood
0,Spar,Q610492,en:SPAR (retailer),Spar,Mo-Th 08:00-19:00; Fr 08:00-20:00; Sa 08:00-17:00,supermarket,,,,,...,,,,,,,,,,
1,Migros,Q680727,de:Migros,Migros,"Mo-Th 08:00-19:00, Fr 08:00-20:00, Sa 07:30-17...",supermarket,Uznach,25.0,8730.0,Zürcherstrasse,...,,,,,,,,,,
2,Coop,Q432564,,Coop,,supermarket,Uznach,,8730.0,,...,,,,,,,,,,
3,Coop,Q432564,de:Coop (Schweiz),Coop,Mo-Sa 06:00-22:00,supermarket,Zürich,1.0,8001.0,Bahnhofbrücke,...,,,,,,,,,,
4,Migros,Q680727,,Migros,Mo-Sa 08:00-21:00; PH off,supermarket,Zürich,7.0,8004.0,Wengistrasse,...,,,,,,,,,,


In [14]:
#TODO Combine with the original location columns
df_supermarkets_flattened = pd.concat([df_supermarkets, tags_normalized], axis=1)
print(f"Original shape: {df_supermarkets.shape}")
print(f"Flattened shape: {df_supermarkets_flattened.shape}")

Original shape: (3392, 5)
Flattened shape: (3392, 238)


In [16]:
df_supermarkets_flattened.head()

Unnamed: 0,type,id,lat,lon,tags,brand,brand:wikidata,brand:wikipedia,name,opening_hours,...,source:addr,opening_date,postid,operator:website,diet:local,payment:account_cards,diet:fish,diet:mediterranean,diet:organic,diet:seafood
0,node,33126515,47.155616,9.037915,"{'brand': 'Spar', 'brand:wikidata': 'Q610492',...",Spar,Q610492,en:SPAR (retailer),Spar,Mo-Th 08:00-19:00; Fr 08:00-20:00; Sa 08:00-17:00,...,,,,,,,,,,
1,node,36726161,47.226191,8.980329,"{'addr:city': 'Uznach', 'addr:housenumber': '2...",Migros,Q680727,de:Migros,Migros,"Mo-Th 08:00-19:00, Fr 08:00-20:00, Sa 07:30-17...",...,,,,,,,,,,
2,node,39768209,47.225069,8.969981,"{'addr:city': 'Uznach', 'addr:postcode': '8730...",Coop,Q432564,,Coop,,...,,,,,,,,,,
3,node,39947904,47.376732,8.542161,"{'addr:city': 'Zürich', 'addr:country': 'CH', ...",Coop,Q432564,de:Coop (Schweiz),Coop,Mo-Sa 06:00-22:00,...,,,,,,,,,,
4,node,48932835,47.37502,8.522895,"{'addr:city': 'Zürich', 'addr:housenumber': '7...",Migros,Q680727,,Migros,Mo-Sa 08:00-21:00; PH off,...,,,,,,,,,,


In [15]:
#TODO Step 4: Display information about the flattened DataFrame
df_supermarkets_flattened.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3392 entries, 0 to 3391
Columns: 238 entries, type to diet:seafood
dtypes: float64(2), int64(1), object(235)
memory usage: 6.2+ MB


### Task 2.3: File Format Conversion

**Your tasks:**
1. Take the apartments DataFrame (from Task 2.1) and write it to a **Parquet** file
2. Read the Parquet file back and verify the data is identical
3. Compare the file sizes of the CSV and Parquet files

In [17]:
# Task 2.3.1 — Write to Parquet
# TODO: Save df_apartments to 'apartments_winterthur.parquet'
df_apartments.to_parquet('apartments_winterthur.parquet')
# Task 2.3.2 — Read back from Parquet and verify
# TODO: Read the parquet file and compare shape/dtypes with the original
df_from_parquet = pd.read_parquet('apartments_winterthur.parquet')

print(f'Original shape:  {df_apartments.shape}')
print(f'Parquet shape:   {df_from_parquet.shape}')
print(f'DataFrames equal: {df_apartments.equals(df_from_parquet)}')

Original shape:  (120, 7)
Parquet shape:   (120, 7)
DataFrames equal: True


In [18]:
# Task 2.3.3 — Compare file sizes
# TODO: Use os.path.getsize() to compare the CSV and Parquet file sizes
csv_size = os.path.getsize('apartments_data_winterthur.csv')
parquet_size = os.path.getsize('apartments_winterthur.parquet')

print(f'CSV file size:     {csv_size:>10,} bytes')
print(f'Parquet file size:  {parquet_size:>10,} bytes')
print(f'Compression ratio:  {csv_size / parquet_size:.2f}x')

CSV file size:         99,628 bytes
Parquet file size:      49,305 bytes
Compression ratio:  2.02x
