In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import requests
from datetime import datetime, timedelta
import time

# Initialize Spark for Fabric environment
import os

from pyspark.sql import SparkSession

# Reuse the session the host (e.g. Fabric) already started; only build a new
# one when the notebook is run standalone and no session is active.
spark = SparkSession.getActiveSession()
if spark is None:
    spark = SparkSession.builder.appName("WeatherCollection").getOrCreate()


# API key for OpenWeatherMap. Prefer the OPENWEATHER_API_KEY environment
# variable so the secret does not have to live in the notebook.
# NOTE(security): the literal fallback below is kept only so existing runs do
# not break. It is already exposed in source control and should be treated as
# compromised: rotate the key, set OPENWEATHER_API_KEY, then delete the fallback.
API_KEY = os.environ.get("OPENWEATHER_API_KEY", "95ad33cbee34b534d98e1eb9b590b9ef")
SILVER_SCHEMA = "silver"
TABLE_NAME = "weather"
DELTA_TABLE_PATH = f"Tables/{TABLE_NAME}"

StatementMeta(, c43030d5-d044-403b-b647-e532c9c9f8d9, 3, Finished, Available, Finished)

In [2]:
# (city name, ISO country code) pairs to collect weather for.
# Order is preserved from the original list.
european_cities = [
    ("London", "GB"),
    ("Paris", "FR"),
    ("Berlin", "DE"),
    ("Madrid", "ES"),
    ("Rome", "IT"),
    ("Amsterdam", "NL"),
    ("Vienna", "AT"),
    ("Brussels", "BE"),
    ("Prague", "CZ"),
    ("Warsaw", "PL"),
    ("Budapest", "HU"),
    ("Stockholm", "SE"),
    ("Copenhagen", "DK"),
    ("Oslo", "NO"),
    ("Helsinki", "FI"),
    ("Dublin", "IE"),
    ("Lisbon", "PT"),
    ("Athens", "GR"),
    ("Zurich", "CH"),
    ("Ljubljana", "SI"),
]

StatementMeta(, c43030d5-d044-403b-b647-e532c9c9f8d9, 4, Finished, Available, Finished)

In [3]:
def collect_today_weather_data(cities, api_key, timeout=10, request_delay=0.1):
    """
    Collect the current weather observation for each city, stamped with today's date.

    Args:
        cities: iterable of (city_name, country_code) pairs.
        api_key: OpenWeatherMap API key, sent as the `appid` query parameter.
        timeout: seconds to wait for each HTTP request before giving up.
        request_delay: seconds to pause after each request (rate limiting).

    Returns:
        A list of dicts, one per city that answered with HTTP 200. Cities that
        fail (non-200 status, network error, unexpected payload) are reported
        on stdout and skipped, so the list may be shorter than `cities`.
    """
    weather_data = []

    # Read the clock once so `date` and `created_at` can never disagree when
    # the run happens to straddle midnight.
    now = datetime.now()
    today = now.strftime('%Y-%m-%d')
    current_timestamp = now.strftime('%Y-%m-%d %H:%M:%S')

    print(f"🌤️ Collecting data for {today}")

    # HTTPS keeps the API key out of clear text on the wire.
    url = "https://api.openweathermap.org/data/2.5/weather"

    for city, country_code in cities:
        print(f"   📍 {city}, {country_code}")

        # Let `requests` URL-encode the values, and qualify the city with its
        # country code so the lookup is not by bare city name alone.
        params = {
            'q': f"{city},{country_code}",
            'appid': api_key,
            'units': 'metric',
        }

        try:
            response = requests.get(url, params=params, timeout=timeout)
            if response.status_code == 200:
                data = response.json()
                main_block = data['main']
                conditions = data['weather'][0]

                weather_data.append({
                    'country_code': country_code,
                    'city_name': city,
                    'date': today,
                    'temperature_avg': float(main_block['temp']),
                    'temperature_max': float(main_block['temp_max']),
                    'temperature_min': float(main_block['temp_min']),
                    'humidity': int(main_block['humidity']),
                    'pressure': int(main_block['pressure']),
                    'weather_condition': conditions['main'],
                    'weather_description': conditions['description'],
                    # Wind and cloud sections are treated as optional; default to 0.
                    'wind_speed': float(data.get('wind', {}).get('speed', 0)),
                    'cloudiness': int(data.get('clouds', {}).get('all', 0)),
                    'created_at': current_timestamp
                })
            else:
                # Previously a non-200 answer was dropped without any trace.
                print(f"   ⚠️ HTTP {response.status_code} for {city}, skipping")

        except Exception as e:
            # Best-effort collection: one bad city must not abort the batch.
            print(f"   ❌ Error for {city}: {e}")
        finally:
            # Rate limiting for free API; applied after failures as well.
            time.sleep(request_delay)

    print(f"✅ Collected {len(weather_data)} records")
    return weather_data

StatementMeta(, c43030d5-d044-403b-b647-e532c9c9f8d9, 5, Finished, Available, Finished)

In [4]:
def load_existing_data(spark, table_name):
    """
    Load the existing delta table, or return None when it does not exist yet.

    Args:
        spark: active SparkSession.
        table_name: name of the table to read.

    Returns:
        The table as a DataFrame, or None if the catalog has no such table.

    Raises:
        Whatever Spark raises if the table exists but cannot be read. This is
        deliberate: treating a failed read as "no table" would make the caller
        take the first-run path and overwrite the stored history with only
        the new batch.
    """
    if not spark.catalog.tableExists(table_name):
        print(f"📂 No existing delta table found: {table_name}")
        print("📂 Creating new delta table.")
        return None

    existing_df = spark.read.format("delta").table(table_name)

    # One aggregation instead of a count() followed by a separate min/max job.
    stats = existing_df.selectExpr(
        "count(*) AS record_count",
        "min(`date`) AS min_date",
        "max(`date`) AS max_date",
    ).collect()[0]

    print(f"📂 Existing delta table found: {stats['record_count']} records")
    print(f"📅 Current period: {stats['min_date']} to {stats['max_date']}")
    return existing_df

StatementMeta(, c43030d5-d044-403b-b647-e532c9c9f8d9, 6, Finished, Available, Finished)

In [5]:
def check_today_already_exists(existing_df, today_date):
    """
    Report whether the existing data already holds rows for `today_date`.

    Returns False when there is no existing DataFrame at all; otherwise True
    if at least one row has `date` equal to `today_date`.
    """
    if existing_df is not None:
        rows_for_today = existing_df.filter(col("date") == today_date)
        return rows_for_today.count() > 0

    return False

StatementMeta(, c43030d5-d044-403b-b647-e532c9c9f8d9, 7, Finished, Available, Finished)

In [6]:
def create_weather_schema():
    """
    Build the Spark schema for raw weather records.

    Every field is nullable. `date` and `created_at` are declared as strings
    at this stage.
    """
    # (column name, Spark type class) in the exact column order of the schema.
    field_specs = [
        ("country_code", StringType),
        ("city_name", StringType),
        ("date", StringType),
        ("temperature_avg", DoubleType),
        ("temperature_max", DoubleType),
        ("temperature_min", DoubleType),
        ("humidity", IntegerType),
        ("pressure", IntegerType),
        ("weather_condition", StringType),
        ("weather_description", StringType),
        ("wind_speed", DoubleType),
        ("cloudiness", IntegerType),
        ("created_at", StringType),
    ]
    return StructType(
        [StructField(name, spark_type(), True) for name, spark_type in field_specs]
    )


StatementMeta(, c43030d5-d044-403b-b647-e532c9c9f8d9, 8, Finished, Available, Finished)

In [7]:
def add_derived_columns(df):
    """
    Enrich a weather DataFrame with calendar and temperature features.

    The string `date` column is parsed in place with the `yyyy-MM-dd` pattern,
    then these columns are appended in order: `year`, `month`, `day_of_week`,
    `week_of_year`, `season`, `temperature_category`, `is_weekend`.
    """
    # Unresolved column references; each is bound when the withColumn runs,
    # so those used after the first step see the parsed date.
    date_col = col("date")
    month_of_date = month(date_col)
    avg_temp = col("temperature_avg")

    season_expr = (
        when(month_of_date.isin([12, 1, 2]), "Winter")
        .when(month_of_date.isin([3, 4, 5]), "Spring")
        .when(month_of_date.isin([6, 7, 8]), "Summer")
        .otherwise("Autumn")
    )

    # Upper bounds are exclusive; anything not below 30 is labelled "Hot".
    temperature_expr = (
        when(avg_temp < 0, "Freezing")
        .when(avg_temp < 10, "Cold")
        .when(avg_temp < 20, "Mild")
        .when(avg_temp < 30, "Warm")
        .otherwise("Hot")
    )

    # Weekend means day_of_week is 1 or 7.
    weekend_expr = when(col("day_of_week").isin([1, 7]), True).otherwise(False)

    enriched = df.withColumn("date", to_date(date_col, "yyyy-MM-dd"))
    enriched = enriched.withColumn("year", year(date_col))
    enriched = enriched.withColumn("month", month_of_date)
    enriched = enriched.withColumn("day_of_week", dayofweek(date_col))
    enriched = enriched.withColumn("week_of_year", weekofyear(date_col))
    enriched = enriched.withColumn("season", season_expr)
    enriched = enriched.withColumn("temperature_category", temperature_expr)
    enriched = enriched.withColumn("is_weekend", weekend_expr)
    return enriched

StatementMeta(, c43030d5-d044-403b-b647-e532c9c9f8d9, 9, Finished, Available, Finished)

In [8]:
def merge_and_save_data(spark, existing_df, new_df, table_name):
    """
    Merge existing data with new data and save the result to the delta table.

    Args:
        spark: active SparkSession (kept for interface compatibility).
        existing_df: current table contents, or None on the first run.
        new_df: freshly collected raw weather rows.
        table_name: table to overwrite with the combined result.

    Returns:
        The combined DataFrame ordered by `date` and `city_name`.

    Existing rows whose `date` appears in the new batch are dropped before
    the union, so re-running for the same day replaces that day's rows
    instead of duplicating them. The whole table is then overwritten with
    the combined result.
    """
    # Add derived columns to new data
    new_df_enhanced = add_derived_columns(new_df)

    if existing_df is None:
        # First run - the new batch is the whole table.
        print("📝 First execution - creating new delta table")
        final_df = new_df_enhanced
    else:
        print("🔄 Merging with existing delta table")

        # Replace exactly the dates carried by the batch rather than a fresh
        # datetime.now(): if collection started before midnight and we got
        # here after it, a new clock reading would miss the batch's date and
        # leave that day duplicated.
        batch_dates = [
            row["date"]
            for row in new_df_enhanced.select("date").distinct().collect()
            # A NULL in the IN-list would make the predicate NULL for every
            # row and silently drop all history.
            if row["date"] is not None
        ]

        if batch_dates:
            existing_df_filtered = existing_df.filter(~col("date").isin(batch_dates))
        else:
            existing_df_filtered = existing_df

        # Match columns by name: a positional union would silently put values
        # in the wrong columns if the stored column order ever differs.
        final_df = existing_df_filtered.unionByName(new_df_enhanced)

    # Single write path for both the first run and the merge case.
    final_df.write \
            .format("delta") \
            .mode("overwrite") \
            .option("mergeSchema", "true") \
            .saveAsTable(table_name)

    # Sort by date and city for final result
    return final_df.orderBy("date", "city_name")

StatementMeta(, c43030d5-d044-403b-b647-e532c9c9f8d9, 10, Finished, Available, Finished)

In [9]:
def show_statistics(df):
    """
    Print headline statistics for the dataset and a summary of the last 7 days.

    Reports the total row count, the min/max `date`, and the number of
    distinct cities and dates. If any rows fall within the last seven days,
    also shows per-city average temperature and row count (first 5 cities,
    ordered by name).
    """
    row_total = df.count()
    distinct_date_total = df.select("date").distinct().count()
    distinct_city_total = df.select("city_name").distinct().count()

    date_bounds = df.select(
        min(col("date")).alias("min_date"),
        max(col("date")).alias("max_date"),
    ).collect()[0]
    first_date = date_bounds['min_date']
    last_date = date_bounds['max_date']

    report_lines = (
        "=" * 60,
        "✅ SUCCESS! Data updated",
        f"📊 Total records: {row_total}",
        f"📅 Complete period: {first_date} to {last_date}",
        f"🏙️ Cities: {distinct_city_total}",
        f"📈 Unique dates: {distinct_date_total}",
    )
    for line in report_lines:
        print(line)

    # Recent week summary: rows dated on or after (now - 7 days).
    week_ago = datetime.now() - timedelta(days=7)
    cutoff = week_ago.strftime('%Y-%m-%d')
    last_week_df = df.filter(col("date") >= cutoff)

    if not last_week_df.count() > 0:
        return

    print("\n📋 Last week summary:")
    per_city = last_week_df.groupBy("city_name").agg(
        avg("temperature_avg").alias("avg_temp"),
        count("date").alias("day_count"),
    )
    per_city.orderBy("city_name").show(5)


StatementMeta(, c43030d5-d044-403b-b647-e532c9c9f8d9, 11, Finished, Available, Finished)

In [10]:
def main():
    """
    Run one daily collection cycle.

    Loads the existing table, announces whether today's rows will be
    overwritten, collects fresh observations, merges and saves them, then
    prints statistics and basic data quality checks. If nothing was collected
    it prints a message and leaves the table untouched.
    """
    print("🚀 Starting incremental weather data collection")
    print("=" * 60)

    run_date = datetime.now().strftime('%Y-%m-%d')

    # 1. Load existing data
    existing_df = load_existing_data(spark, TABLE_NAME)

    # 2. Tell the operator when this run replaces rows already stored for today
    if check_today_already_exists(existing_df, run_date):
        print(f"⚠️ Data for {run_date} already exists. Overwriting...")

    # 3. Collect today's data
    new_data = collect_today_weather_data(european_cities, API_KEY)

    if not new_data:
        print("❌ No data collected today.")
        return

    # Create DataFrame with the explicit schema
    new_df = spark.createDataFrame(new_data, create_weather_schema())

    # 4. Merge and save to delta table
    final_df = merge_and_save_data(spark, existing_df, new_df, TABLE_NAME)

    # 5. Show statistics
    show_statistics(final_df)
    for line in (
        "🎯 Delta table created successfully!",
        f"📊 Table: {TABLE_NAME}",
        "💾 Format: Delta Lake",
        "🗂️ Schema: Managed table in lakehouse",
        "\n🔍 Data Quality Checks:",
    ):
        print(line)

    # Count of NULLs per key measurement column.
    checked_columns = ["temperature_avg", "humidity", "pressure"]
    null_counts = [
        count(when(col(name).isNull(), name)).alias(name)
        for name in checked_columns
    ]
    final_df.select(null_counts).show()

    # Temperature range check
    final_df.select(
        min("temperature_avg").alias("min_temp"),
        max("temperature_avg").alias("max_temp"),
        avg("temperature_avg").alias("avg_temp"),
    ).show()

StatementMeta(, c43030d5-d044-403b-b647-e532c9c9f8d9, 12, Finished, Available, Finished)

In [11]:
try:
    main()
except Exception as e:
    print(f"❌ Error in main execution: {e}")
    # Re-raise after logging so the cell fails and keeps its traceback.
    # Swallowing the error here would let the notebook finish as if it had
    # succeeded even though nothing was written.
    raise

StatementMeta(, c43030d5-d044-403b-b647-e532c9c9f8d9, 13, Finished, Available, Finished)

🚀 Starting incremental weather data collection
📂 Existing delta table found: 20 records
📅 Current period: 2025-05-28 to 2025-05-28
🌤️ Collecting data for 2025-05-29
   📍 London, GB
   📍 Paris, FR
   📍 Berlin, DE
   📍 Madrid, ES
   📍 Rome, IT
   📍 Amsterdam, NL
   📍 Vienna, AT
   📍 Brussels, BE
   📍 Prague, CZ
   📍 Warsaw, PL
   📍 Budapest, HU
   📍 Stockholm, SE
   📍 Copenhagen, DK
   📍 Oslo, NO
   📍 Helsinki, FI
   📍 Dublin, IE
   📍 Lisbon, PT
   📍 Athens, GR
   📍 Zurich, CH
   📍 Ljubljana, SI
✅ Collected 20 records
🔄 Merging with existing delta table
✅ SUCCESS! Data updated
📊 Total records: 40
📅 Complete period: 2025-05-28 to 2025-05-29
🏙️ Cities: 20
📈 Unique dates: 2

📋 Last week summary:
+---------+------------------+---------+
|city_name|          avg_temp|day_count|
+---------+------------------+---------+
|Amsterdam|14.450000000000001|        2|
|   Athens|21.229999999999997|        2|
|   Berlin|             15.72|        2|
| Brussels|            16.165|        2|
| Budapest|  