### Task 1
1. Read `apartments_data_wintherthur.csv`
2. Create a DataFrame with the following columns: rooms, area, price, address, plz, city, canton, description.

In [1]:
import pandas as pd
import numpy as np
import re

# Read apartments_data_winterthur.csv
df_winterthur_raw = pd.read_csv('apartments_data_winterthur.csv')
print(f"Raw data shape: {df_winterthur_raw.shape}")
print(f"\nColumns: {df_winterthur_raw.columns.tolist()}")
print(f"\nFirst row of raw data:")
print(df_winterthur_raw.iloc[0])


Raw data shape: (120, 7)

Columns: ['web-scraper-order', 'web-scraper-start-url', 'rooms_area_price_raw', 'address_raw', 'price_raw', 'description_raw', 'text_raw']

First row of raw data:
web-scraper-order                                             1693993818-1
web-scraper-start-url    https://www.immoscout24.ch/de/wohnung/mieten/o...
rooms_area_price_raw                        6,5 Zimmer, 143 m², CHF 3017.—
address_raw                          Am Eulachpark 25, 8404 Winterthur, ZH
price_raw                                                       CHF 3017.—
description_raw              «Sie suchen die spezielle Maisonettewohnung?»
text_raw                 6,5 Zimmer, 143 m², CHF 3017.—Am Eulachpark 25...
Name: 0, dtype: object


#### Helper Functions (provided)

In [2]:
# Helper function to parse rooms, area, and price from the raw column
def parse_rooms_area_price(raw_str):
    """Extract rooms, area, and price from a raw string like '6,5 Zimmer, 143 m², CHF 3017.—'"""
    try:
        # Extract rooms (handles both comma and dot as decimal separator)
        rooms_match = re.search(r'([\d,\.]+)\s+Zimmer', raw_str)
        rooms = float(rooms_match.group(1).replace(',', '.')) if rooms_match else None
        
        # Extract area (in m²)
        area_match = re.search(r'([\d,\.]+)\s+m²', raw_str)
        area = float(area_match.group(1).replace(',', '.')) if area_match else None
        
        # Extract price (CHF amount)
        price_match = re.search(r'CHF\s+([\d,\.]+)', raw_str)
        price = float(price_match.group(1).replace("'", "").replace(',', '.')) if price_match else None
        
        return rooms, area, price
    except:
        return None, None, None


def parse_address(address_raw):
    """Extract address, postal code (plz), city, and canton from address string"""
    try:
        # Canton is always the last two-letter code (after the last comma)
        canton_match = re.search(r',\s+([A-Z]{2})$', address_raw)
        canton = canton_match.group(1) if canton_match else None
        
        # Remove canton from address to work with the rest
        address_without_canton = re.sub(r',\s+[A-Z]{2}$', '', address_raw).strip()
        
        # Postal code is a 4-digit number, followed by city name
        plz_match = re.search(r',\s+(\d{4})\s+(.+)$', address_without_canton)
        if plz_match:
            plz = plz_match.group(1)
            city = plz_match.group(2).strip()
            # Extract the street address (everything before the postal code)
            address = re.sub(r',\s+\d{4}\s+.+$', '', address_without_canton).strip()
        else:
            # Fallback: if no postal code pattern found
            plz = None
            city = address_without_canton
            address = address_raw
        
        return address, plz, city, canton
    except:
        return None, None, None, None

In [3]:
df_winterthur_raw.head()

Unnamed: 0,web-scraper-order,web-scraper-start-url,rooms_area_price_raw,address_raw,price_raw,description_raw,text_raw
0,1693993818-1,https://www.immoscout24.ch/de/wohnung/mieten/o...,"6,5 Zimmer, 143 m², CHF 3017.—","Am Eulachpark 25, 8404 Winterthur, ZH",CHF 3017.—,«Sie suchen die spezielle Maisonettewohnung?»,"6,5 Zimmer, 143 m², CHF 3017.—Am Eulachpark 25..."
1,1693993818-2,https://www.immoscout24.ch/de/wohnung/mieten/o...,"1 Zimmer, 132 m², CHF 3260.—","Katharina Sulzer Platz 2, 8400 Winterthur, ZH",CHF 3260.—,«In Loft-iger Höhe MIETEN OHNE KAUTION»,"1 Zimmer, 132 m², CHF 3260.—Katharina Sulzer P..."
2,1693993818-3,https://www.immoscout24.ch/de/wohnung/mieten/o...,"4,5 Zimmer, 117 m², CHF 3782.—","8400 Winterthur, ZH",CHF 3782.—,"«MÖBLIERT, TEMPORÄR: 4½ ZI-WOHNUNG IN WINTERTH...","4,5 Zimmer, 117 m², CHF 3782.—8400 Winterthur,..."
3,1693993818-4,https://www.immoscout24.ch/de/wohnung/mieten/o...,"3,5 Zimmer, 88 m², CHF 2244.—","Untere Briggerstrasse 66, 8406 Winterthur, ZH",CHF 2244.—,«Modernes Leben im EG mit Gartensitzplatz»,"3,5 Zimmer, 88 m², CHF 2244.—Untere Briggerstr..."
4,1693993818-5,https://www.immoscout24.ch/de/wohnung/mieten/o...,"3,5 Zimmer, 80 m², CHF 1980.—","Wülflingerstrasse 25, 8400 Winterthur, ZH",CHF 1980.—,«Schöne 3.5-Zimmerwohnung mit Balkon zu vermie...,"3,5 Zimmer, 80 m², CHF 1980.—Wülflingerstrasse..."


In [6]:
#TODO Create new DataFrame with processed columns
df_winterthur = pd.DataFrame(df_winterthur_raw.rooms_area_price_raw.apply(parse_rooms_area_price), columns=['rooms', 'area', 'price'])

print(f"Processed Winterthur data shape: {df_winterthur.shape}")
print(f"\nFirst 3 rows:")
print(df_winterthur.head(3))
print(f"\nData types:")
print(df_winterthur.dtypes)


Processed Winterthur data shape: (0, 3)

First 3 rows:
Empty DataFrame
Columns: [rooms, area, price]
Index: []

Data types:
rooms    object
area     object
price    object
dtype: object


In [10]:
df_addresses = df_winterthur_raw.address_raw.apply(parse_address)
pd.DataFrame(zip(*df_addresses))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,110,111,112,113,114,115,116,117,118,119
0,Am Eulachpark 25,Katharina Sulzer Platz 2,"8400 Winterthur, ZH",Untere Briggerstrasse 66,Wülflingerstrasse 25,Ida-Sträuli-Strasse 39,Ernst-Jung-Gasse 16B,Hobelwerkweg 39a,Hobelwerkweg 39a,Marktgasse 17,...,Schaffhauserstrasse 6,Wülflingerstrasse 349,Heiniweg 12,Sulzerallee 63,Zürcherstrasse 135,Schiltwiesenweg 24,Zürcherstrasse 58,Maienstrasse 8,Neuwiesenstr. 14,Steiggasse 3
1,8404,8400,,8406,8400,8404,8400,8404,8404,8400,...,8400,8408,8404,8404,8406,8404,8406,8406,8400,8400
2,Winterthur,Winterthur,8400 Winterthur,Winterthur,Winterthur,Winterthur,Winterthur,Winterthur,Winterthur,Winterthur,...,Winterthur,Winterthur,Winterthur,Winterthur,Winterthur,Winterthur,Winterthur,Winterthur,Winterthur,Winterthur
3,ZH,ZH,ZH,ZH,ZH,ZH,ZH,ZH,ZH,ZH,...,ZH,ZH,ZH,ZH,ZH,ZH,ZH,ZH,ZH,ZH


### Task 2
1. Read `apartments_data_zuerich.csv`
2. Create a DataFrame with the following columns: rooms, area, price, address, plz, city, canton, description.

In [None]:
# Read apartments_data_zuerich.csv
df_zuerich_raw = pd.read_csv('apartments_data_zuerich.csv')
print(f"Raw data shape: {df_zuerich_raw.shape}")
print(f"\nFirst row of raw data:")
print(df_zuerich_raw.iloc[0])

In [None]:
# TODO Apply the same parsing functions to Zurich data
df_zuerich = ...


### Task 3
Compare the Winterthur and Zurich apartment datasets:
1. Compare dataset sizes
2. Compare price statistics using `.describe()`
3. Identify missing values across key columns


In [None]:
# Task 3: Compare the Data
print("COMPARISON: WINTERTHUR vs ZURICH APARTMENTS")

# 1.TODO Compare dataset sizes
print(f"\nDATASET SIZES:")
print(f"   Winterthur: {...} | Zurich: {...} | Total: {...}")

# 2.TODO Compare price statistics
print(f"\nPRICE STATISTICS (CHF):\n")
print(f"Winterthur:\n{...}")
print(f"\n")
print(f"Zurich:\n{...}")

# 3.TODO Compare missing values
print(f"\nMISSING VALUES:\n")
cols = ['rooms', 'area', 'price', 'address', 'plz', 'city', 'canton']
missing = ...
print(missing.to_string())
