### Task 1
1. Read `apartments_data_winterthur.csv`
2. Create a DataFrame with the following columns: rooms, area, price, address, plz, city, canton, description. The raw CSV columns are: rooms_area_price_raw, address_raw, price_raw, description_raw, text_raw.


In [None]:
import pandas as pd
import numpy as np
import re

# Load the raw Winterthur listings from the CSV file.
winterthur_csv = 'apartments_data_winterthur.csv'
df_winterthur_raw = pd.read_csv(winterthur_csv)

# Quick look at the untouched input: dimensions, column names, one sample record.
winterthur_raw_shape = df_winterthur_raw.shape
winterthur_raw_columns = df_winterthur_raw.columns.tolist()
print(f"Raw data shape: {winterthur_raw_shape}")
print(f"\nColumns: {winterthur_raw_columns}")
print("\nFirst row of raw data:")
print(df_winterthur_raw.iloc[0])


In [None]:
# Helper function to parse rooms, area, and price from the raw column
def _to_float(match):
    """Convert the first capture group of a regex match to a float, or None.

    Thousands apostrophes (ASCII ' and typographic U+2019) are removed, a
    decimal comma is turned into a dot, and a dangling trailing dot (as left
    over from '3017.—') is dropped before conversion.
    """
    if match is None:
        return None
    text = (
        match.group(1)
        .replace("'", "")
        .replace("\u2019", "")
        .replace(',', '.')
        .rstrip('.')
    )
    try:
        return float(text)
    except ValueError:
        # The captured text was only punctuation or otherwise not a number.
        return None


def parse_rooms_area_price(raw_str):
    """Extract rooms, area, and price from a raw listing string.

    Example input: '6,5 Zimmer, 143 m², CHF 3017.—'

    Args:
        raw_str: Raw text holding the room count, living area and rent.
            Non-string values (for example NaN coming from pandas) are
            treated as "nothing to parse".

    Returns:
        A tuple ``(rooms, area, price)`` of floats. Each element is ``None``
        when the corresponding part is missing or cannot be converted; the
        fields are parsed independently, so one bad field does not discard
        the others.
    """
    if not isinstance(raw_str, str):
        return None, None, None

    # Rooms: number (comma or dot as decimal separator) in front of 'Zimmer'.
    rooms = _to_float(re.search(r'([\d,\.]+)\s+Zimmer', raw_str))

    # Area: number in front of the 'm²' unit.
    area = _to_float(re.search(r'([\d,\.]+)\s+m²', raw_str))

    # Price: the amount after 'CHF'. The character class includes the
    # apostrophes used as thousands separators (e.g. "2'450"); without them
    # the match would stop at the first apostrophe and yield only "2".
    price = _to_float(re.search(r"CHF\s+([\d'\u2019,\.]+)", raw_str))

    return rooms, area, price


def parse_address(address_raw):
    """Split a raw address string into street address, postal code, city and canton.

    The expected layout is '<street>, <plz> <city>, <canton>', where the
    street part is optional (e.g. '8400 Winterthur, ZH').

    Args:
        address_raw: Raw address text. Non-string values (for example NaN
            coming from pandas) are treated as "nothing to parse".

    Returns:
        A tuple ``(address, plz, city, canton)``:

        * ``canton`` is the trailing two-letter upper-case code after the
          last comma, or ``None`` if there is none.
        * ``plz`` is the 4-digit postal code as a string and ``city`` the
          text following it.
        * ``address`` is the text in front of the postal code, or ``None``
          when the listing has no street part.
        * If no postal code is found, ``plz`` is ``None``, ``city`` is the
          input without the canton and ``address`` is the whole (stripped)
          input.
        * All four values are ``None`` for non-string input.
    """
    if not isinstance(address_raw, str):
        return None, None, None, None

    # Surrounding whitespace would otherwise defeat the '$'-anchored patterns.
    text = address_raw.strip()

    # Canton: two upper-case letters after the last comma.
    canton_match = re.search(r',\s*([A-Z]{2})$', text)
    if canton_match:
        canton = canton_match.group(1)
        remainder = text[:canton_match.start()].strip()
    else:
        canton = None
        remainder = text

    # Postal code: 4 digits followed by the city name. It may follow a comma
    # (street present) or sit at the very start (no street given).
    plz_match = re.search(r'(?:^|,\s*)(\d{4})\s+(.+)$', remainder)
    if plz_match is None:
        # Fallback: no postal code pattern found.
        return text, None, remainder, canton

    plz = plz_match.group(1)
    city = plz_match.group(2).strip()
    # Everything in front of the postal code is the street address; an empty
    # remainder means the listing did not include one.
    address = remainder[:plz_match.start()].strip() or None

    return address, plz, city, canton


# Assemble the processed Winterthur table column by column from the raw data.
df_winterthur = pd.DataFrame()

# Each parser returns one tuple per listing; zip(*...) transposes those
# tuples into one sequence per output column.
winterthur_measures = df_winterthur_raw['rooms_area_price_raw'].apply(parse_rooms_area_price)
rooms_values, area_values, price_values = zip(*winterthur_measures)
df_winterthur['rooms'] = rooms_values
df_winterthur['area'] = area_values
df_winterthur['price'] = price_values

winterthur_locations = df_winterthur_raw['address_raw'].apply(parse_address)
address_values, plz_values, city_values, canton_values = zip(*winterthur_locations)
df_winterthur['address'] = address_values
df_winterthur['plz'] = plz_values
df_winterthur['city'] = city_values
df_winterthur['canton'] = canton_values

# The description is taken over unchanged from the raw column.
df_winterthur['description'] = df_winterthur_raw['description_raw']

# Summary of the result: shape, a short preview and the column dtypes.
winterthur_shape = df_winterthur.shape
print(f"Processed Winterthur data shape: {winterthur_shape}")
print("\nFirst 3 rows:")
print(df_winterthur.head(3))
print("\nData types:")
print(df_winterthur.dtypes)


### Task 2
1. Read `apartments_data_zuerich.csv`
2. Create a DataFrame with the following columns: rooms, area, price, address, plz, city, canton, description. The raw CSV columns are: rooms_area_price_raw, address_raw, price_raw, description_raw, text_raw.

In [None]:
# Load the raw Zurich listings from the CSV file.
zuerich_csv = 'apartments_data_zuerich.csv'
df_zuerich_raw = pd.read_csv(zuerich_csv)

# Quick look at the untouched input: dimensions and one sample record.
zuerich_raw_shape = df_zuerich_raw.shape
print(f"Raw data shape: {zuerich_raw_shape}")
print("\nFirst row of raw data:")
print(df_zuerich_raw.iloc[0])


In [None]:
# Build the processed Zurich table with the same parsers used for Winterthur.
df_zuerich = pd.DataFrame()

# Each parser returns one tuple per listing; zip(*...) transposes those
# tuples into one sequence per output column.
zuerich_measures = df_zuerich_raw['rooms_area_price_raw'].apply(parse_rooms_area_price)
rooms_values, area_values, price_values = zip(*zuerich_measures)
df_zuerich['rooms'] = rooms_values
df_zuerich['area'] = area_values
df_zuerich['price'] = price_values

zuerich_locations = df_zuerich_raw['address_raw'].apply(parse_address)
address_values, plz_values, city_values, canton_values = zip(*zuerich_locations)
df_zuerich['address'] = address_values
df_zuerich['plz'] = plz_values
df_zuerich['city'] = city_values
df_zuerich['canton'] = canton_values

# The description is taken over unchanged from the raw column.
df_zuerich['description'] = df_zuerich_raw['description_raw']

# Carry the raw columns along next to the parsed ones, in this order.
kept_raw_columns = (
    'rooms_area_price_raw',
    'address_raw',
    'price_raw',
    'description_raw',
    'text_raw',
)
for raw_column in kept_raw_columns:
    df_zuerich[raw_column] = df_zuerich_raw[raw_column]

# Summary of the result: shape, a short preview and the column dtypes.
zuerich_shape = df_zuerich.shape
print(f"Processed Zurich data shape: {zuerich_shape}")
print("\nFirst 3 rows:")
print(df_zuerich.head(3))
print("\nData types:")
print(df_zuerich.dtypes)


### Task 3
Compare the Data

In [None]:
# Task 3: Compare the Data

# Both processed tables with the labels used in the report, in print order.
city_frames = (("WINTERTHUR", df_winterthur), ("ZURICH", df_zuerich))


def _print_stats(column, stat_names):
    """Print the named Series statistics of `column` for each city table.

    Every entry of `stat_names` is the printed label; its lower-case form is
    the Series method that gets called (e.g. "Mean" -> Series.mean()).
    """
    for city_label, frame in city_frames:
        print(f"\n   {city_label}:")
        for stat_name in stat_names:
            stat_label = stat_name + ":"
            stat_value = getattr(frame[column], stat_name.lower())()
            # Label padded to 8 characters, value right-aligned in 10.
            print(f"   {stat_label:<8}{stat_value:>10,.2f}")


banner = "=" * 80
print(banner)
print("COMPARISON: WINTERTHUR vs ZURICH APARTMENTS")
print(banner)

# Overall statistics
n_winterthur = len(df_winterthur)
n_zuerich = len(df_zuerich)
print("\n1. DATASET SIZES:")
for size_label, n_rows in (
    ("Winterthur:", n_winterthur),
    ("Zurich:", n_zuerich),
    ("Total:", n_winterthur + n_zuerich),
):
    print(f"   {size_label:<12}{n_rows} apartments")

# Price statistics
print("\n2. PRICE STATISTICS (CHF):")
_print_stats('price', ("Mean", "Median", "Min", "Max", "Std"))

# Area statistics
print("\n3. AREA STATISTICS (m²):")
_print_stats('area', ("Mean", "Median", "Min", "Max"))

# Rooms statistics
print("\n4. ROOMS STATISTICS:")
_print_stats('rooms', ("Mean", "Median"))

# Price per square meter, added as a new column to both tables
for _, frame in city_frames:
    frame['price_per_m2'] = frame['price'] / frame['area']

print("\n5. PRICE PER M² (CHF/m²):")
_print_stats('price_per_m2', ("Mean", "Median"))

# City distribution in Zurich dataset (ten most frequent city values)
print("\n6. CITY DISTRIBUTION IN ZURICH DATASET:")
city_counts = df_zuerich['city'].value_counts().head(10)
for city_name, n_listings in city_counts.items():
    share = n_listings / n_zuerich * 100
    print(f"   {city_name:20s}: {n_listings:3d} ({share:5.1f}%)")

# Missing values per parsed column
report_columns = ['rooms', 'area', 'price', 'address', 'plz', 'city', 'canton']
print("\n7. MISSING VALUES:")
for city_label, frame in city_frames:
    print(f"\n   {city_label}:")
    print(frame[report_columns].isna().sum())
