### Task 1
1. Read `apartments_data_wintherthur.csv`
2. Create a DataFrame with the following columns: rooms, area, price, address, plz, city, canton, description.

In [13]:
import pandas as pd
import numpy as np
import re

# Read apartments_data_winterthur.csv
df_winterthur_raw = pd.read_csv('apartments_data_winterthur.csv')
print(f"Raw data shape: {df_winterthur_raw.shape}")
print(f"\nColumns: {df_winterthur_raw.columns.tolist()}")
print(f"\nFirst row of raw data:")
print(df_winterthur_raw.iloc[0])


Raw data shape: (120, 7)

Columns: ['web-scraper-order', 'web-scraper-start-url', 'rooms_area_price_raw', 'address_raw', 'price_raw', 'description_raw', 'text_raw']

First row of raw data:
web-scraper-order                                             1693993818-1
web-scraper-start-url    https://www.immoscout24.ch/de/wohnung/mieten/o...
rooms_area_price_raw                        6,5 Zimmer, 143 m², CHF 3017.—
address_raw                          Am Eulachpark 25, 8404 Winterthur, ZH
price_raw                                                       CHF 3017.—
description_raw              «Sie suchen die spezielle Maisonettewohnung?»
text_raw                 6,5 Zimmer, 143 m², CHF 3017.—Am Eulachpark 25...
Name: 0, dtype: object


#### Helper Functions (provided)

In [None]:
# Helper function to parse rooms, area, and price from the raw column
def parse_rooms_area_price(raw_str):
    """Extract rooms, area, and price from a raw string like '6,5 Zimmer, 143 m², CHF 3017.—'"""
    try:
        # Extract rooms (handles both comma and dot as decimal separator)
        rooms_match = re.search(r'([\d,\.]+)\s+Zimmer', raw_str)
        rooms = float(rooms_match.group(1).replace(',', '.')) if rooms_match else None
        
        # Extract area (in m²)
        area_match = re.search(r'([\d,\.]+)\s+m²', raw_str)
        area = float(area_match.group(1).replace(',', '.')) if area_match else None
        
        # Extract price (CHF amount)
        price_match = re.search(r'CHF\s+([\d,\.]+)', raw_str)
        price = float(price_match.group(1).replace("'", "").replace(',', '.')) if price_match else None
        
        return rooms, area, price
    except:
        return None, None, None


def parse_address(address_raw):
    """Extract address, postal code (plz), city, and canton from address string"""
    try:
        # Canton is always the last two-letter code (after the last comma)
        canton_match = re.search(r',\s+([A-Z]{2})$', address_raw)
        canton = canton_match.group(1) if canton_match else None
        
        # Remove canton from address to work with the rest
        address_without_canton = re.sub(r',\s+[A-Z]{2}$', '', address_raw).strip()
        
        # Postal code is a 4-digit number, followed by city name
        plz_match = re.search(r',\s+(\d{4})\s+(.+)$', address_without_canton)
        if plz_match:
            plz = plz_match.group(1)
            city = plz_match.group(2).strip()
            # Extract the street address (everything before the postal code)
            address = re.sub(r',\s+\d{4}\s+.+$', '', address_without_canton).strip()
        else:
            # Fallback: if no postal code pattern found
            plz = None
            city = address_without_canton
            address = address_raw
        
        return address, plz, city, canton
    except:
        return None, None, None, None

In [15]:
#TODO Create new DataFrame with processed columns

df_room_area_etc = df_winterthur_raw['rooms_area_price_raw'].apply(parse_rooms_area_price)
print(df_room_area_etc.head())
df_address_etc = df_winterthur_raw['address_raw'].apply(parse_address)
df_winterthur = df_room_area_etc.add(df_address_etc)




0    (6.5, 143.0, 3017.0)
1    (1.0, 132.0, 3260.0)
2    (4.5, 117.0, 3782.0)
3     (3.5, 88.0, 2244.0)
4     (3.5, 80.0, 1980.0)
Name: rooms_area_price_raw, dtype: object


In [16]:
print(f"Processed Winterthur data shape: {df_winterthur.shape}")
print(f"\nFirst 3 rows:")
print(df_winterthur.head(3))
print(f"\nData types:")
print(df_winterthur.dtypes)

Processed Winterthur data shape: (120,)

First 3 rows:
0    (6.5, 143.0, 3017.0, Am Eulachpark 25, 8404, W...
1    (1.0, 132.0, 3260.0, Katharina Sulzer Platz 2,...
2    (4.5, 117.0, 3782.0, 8400 Winterthur, ZH, None...
dtype: object

Data types:
object


### Task 2
1. Read `apartments_data_zuerich.csv`
2. Create a DataFrame with the following columns: rooms, area, price, address, plz, city, canton, description.

In [None]:
# Read apartments_data_zuerich.csv
df_zuerich_raw = pd.read_csv('apartments_data_zuerich.csv')
print(f"Raw data shape: {df_zuerich_raw.shape}")
print(f"\nFirst row of raw data:")
print(df_zuerich_raw.iloc[0])

In [None]:
# TODO Apply the same parsing functions to Zurich data
df_zuerich = ...


### Task 3
Compare the Winterthur and Zurich apartment datasets:
1. Compare dataset sizes
2. Compare price statistics using `.describe()`
3. Identify missing values across key columns


In [None]:
# Task 3: Compare the Data
print("COMPARISON: WINTERTHUR vs ZURICH APARTMENTS")

# 1.TODO Compare dataset sizes
print(f"\nDATASET SIZES:")
print(f"   Winterthur: {...} | Zurich: {...} | Total: {...}")

# 2.TODO Compare price statistics
print(f"\nPRICE STATISTICS (CHF):\n")
print(f"Winterthur:\n{...}")
print(f"\n")
print(f"Zurich:\n{...}")

# 3.TODO Compare missing values
print(f"\nMISSING VALUES:\n")
cols = ['rooms', 'area', 'price', 'address', 'plz', 'city', 'canton']
missing = ...
print(missing.to_string())
