# Data Acquisition

### Import Libraries


In [1]:
!pip install geemap google-cloud-storage rasterio matplotlib > /dev/null 2>&1

In [2]:
import ee
import geemap
import os
import pandas as pd
from google.colab import drive
import rasterio
import matplotlib.pyplot as plt


drive.mount('/content/drive')
work_dir = '/content/drive/MyDrive/UHI-Detection-Analysis/data/raw/'
os.makedirs(work_dir, exist_ok=True)


Mounted at /content/drive


In [3]:
ee.Authenticate()
ee.Initialize(project='manifest-pride-258211')

*** Earth Engine *** Share your feedback by taking our Annual Developer Satisfaction Survey: https://google.qualtrics.com/jfe/form/SV_7TDKVSyKvBdmMqW?ref=4i2o6


### Determining Hotest Summer Days

*Using MODIS hottest cloud-free summer days through 10 year*

In [71]:
def get_modis_top_n_hottest_days(start_year=2014, end_year=2024, n_days=3):
    """
    Finds the top N hottest cloud-free summer days for Hamburg using MODIS LST data.
    Returns a DataFrame where each year maps to a list of dictionaries.
    """
    hamburg = ee.Geometry.Point(9.9937, 53.5511).buffer(5000)
    results = {}

    for year in range(start_year, end_year + 1):
        try:
            modis_collection = ee.ImageCollection('MODIS/061/MOD11A1') \
                .filterBounds(hamburg) \
                .filterDate(f'{year}-05-15', f'{year}-09-15')

            def compute_lst(img):
                mean_lst = img.reduceRegion(reducer=ee.Reducer.mean(), geometry=hamburg, scale=1000).get('LST_Day_1km')
                return ee.Feature(None, {'lst': mean_lst, 'date': img.date().format('YYYY-MM-dd')})

            lst_features = modis_collection.map(compute_lst).filter(ee.Filter.notNull(['lst']))

            if lst_features.size().getInfo() == 0:
                continue

            # Instead of .first(), use .limit(n) to get the top N days
            hottest_list = lst_features.sort('lst', False).limit(n_days).getInfo()['features']

            top_days = []
            for feature in hottest_list:
                props = feature['properties']
                if props['lst'] is not None:
                    celsius = props['lst'] * 0.02 - 273.15
                    top_days.append({
                        'date': props['date'],
                        'lst_celsius': round(celsius, 2)
                    })

            results[year] = {'top_days': top_days}
            print(f"Found top {len(top_days)} days for {year}.")

        except Exception as e:
            print(f"Error processing {year}: {str(e)}")
            continue

    return pd.DataFrame.from_dict(results, orient='index')

print("Extracting MODIS data...")
df_top_days = get_modis_top_n_hottest_days()

Extracting MODIS data...
Found top 7 days for 2014.
Found top 7 days for 2015.
Found top 7 days for 2016.
Found top 7 days for 2017.
Found top 7 days for 2018.
Found top 7 days for 2019.
Found top 7 days for 2020.
Found top 7 days for 2021.
Found top 7 days for 2022.
Found top 7 days for 2023.
Found top 7 days for 2024.


In [72]:
for year, row in df_top_days.iterrows():
    print(f"--- Year: {year} ---")

    # Get the list of candidate days for the current year
    top_days_list = row['top_days']

    # Check if the list is not empty
    if not top_days_list:
        print("  No candidate days found.")
        continue

    # Print each candidate day
    for i, day_info in enumerate(top_days_list):
        date = day_info['date']
        temp = day_info['lst_celsius']
        print(f"  {i+1}. Hottest: Date: {date}, Temp: {temp}°C")
    print("-" * 20)

--- Year: 2014 ---
  1. Hottest: Date: 2014-07-04, Temp: 34.83°C
  2. Hottest: Date: 2014-07-19, Temp: 34.75°C
  3. Hottest: Date: 2014-07-20, Temp: 34.27°C
  4. Hottest: Date: 2014-07-18, Temp: 33.79°C
  5. Hottest: Date: 2014-06-07, Temp: 33.3°C
  6. Hottest: Date: 2014-07-11, Temp: 33.06°C
  7. Hottest: Date: 2014-07-29, Temp: 32.76°C
--------------------
--- Year: 2015 ---
  1. Hottest: Date: 2015-07-05, Temp: 38.17°C
  2. Hottest: Date: 2015-07-02, Temp: 34.78°C
  3. Hottest: Date: 2015-06-12, Temp: 34.33°C
  4. Hottest: Date: 2015-07-04, Temp: 34.18°C
  5. Hottest: Date: 2015-07-07, Temp: 32.98°C
  6. Hottest: Date: 2015-07-01, Temp: 32.9°C
  7. Hottest: Date: 2015-06-05, Temp: 31.52°C
--------------------
--- Year: 2016 ---
  1. Hottest: Date: 2016-06-05, Temp: 35.11°C
  2. Hottest: Date: 2016-06-23, Temp: 34.02°C
  3. Hottest: Date: 2016-07-20, Temp: 32.88°C
  4. Hottest: Date: 2016-06-04, Temp: 32.35°C
  5. Hottest: Date: 2016-06-06, Temp: 32.33°C
  6. Hottest: Date: 2016-07-2

### Extracting Landsat-8 Images Based on Hottest Days

*Landsat 8 images based on hottest days and calculating LTS*

In [87]:
# Hamburg coordinates, define bounding box for missing tiles issue
hamburg = ee.Geometry.Rectangle([
    9.7,  # Min Longitude
    53.4, # Min Latitude
    10.3, # Max Longitude
    53.7  # Max Latitude
])

def get_robust_landsat_data(target_date_str, max_cloud=20, search_radius_days=15):
    """
    Finds a cloud-free Landsat 8 image for a specific target date.
    Returns the image, the found date, and a status ('SUCCESS', 'FALLBACK', 'FAILURE').
    """
    target_date = ee.Date(target_date_str) # Takes a string date instead of a year
    image_collection = ee.ImageCollection("LANDSAT/LC08/C02/T1_L2").filterBounds(hamburg)

    # --- Strategy A: Iterative search for a low-cloud image ---
    for day_offset in range(search_radius_days + 1):
        start_date = target_date.advance(-day_offset, 'day')
        end_date = target_date.advance(day_offset, 'day').advance(1, 'day')

        landsat_collection = image_collection \
            .filterDate(start_date, end_date) \
            .filter(ee.Filter.lt('CLOUD_COVER', max_cloud)) \
            .sort('CLOUD_COVER')

        if landsat_collection.size().getInfo() > 0:
            found_date_ee = ee.Date(landsat_collection.first().get('system:time_start'))
            found_date_str = found_date_ee.format('YYYY-MM-dd').getInfo()
            image = landsat_collection.mosaic().clip(hamburg)

            def mask_clouds(img):
                qa = img.select('QA_PIXEL')
                cloud_bit_mask = 1 << 3; cloud_shadow_bit_mask = 1 << 4
                mask = qa.bitwiseAnd(cloud_bit_mask).eq(0).And(qa.bitwiseAnd(cloud_shadow_bit_mask).eq(0))
                return img.updateMask(mask)

            # Return with 'SUCCESS' status
            return mask_clouds(image), found_date_str, 'SUCCESS'

    # --- Strategy B: Fallback if no low-cloud image was found ---
    start_date = target_date.advance(-search_radius_days, 'day')
    end_date = target_date.advance(search_radius_days, 'day').advance(1, 'day')

    fallback_collection = image_collection.filterDate(start_date, end_date).sort('CLOUD_COVER')

    if fallback_collection.size().getInfo() > 0:
        best_image = fallback_collection.first()
        cloud_cover_val = best_image.get('CLOUD_COVER').getInfo()

        # Fallback Threshold: Reject images that are excessively cloudy
        if cloud_cover_val > 60: # You can adjust this threshold
             return None, None, 'FAILURE'

        found_date_ee = ee.Date(best_image.get('system:time_start'))
        found_date_str = found_date_ee.format('YYYY-MM-dd').getInfo()
        image = fallback_collection.mosaic().clip(hamburg)

        def mask_clouds(img):
            qa = img.select('QA_PIXEL')
            cloud_bit_mask = 1 << 3; cloud_shadow_bit_mask = 1 << 4
            mask = qa.bitwiseAnd(cloud_bit_mask).eq(0).And(qa.bitwiseAnd(cloud_shadow_bit_mask).eq(0))
            return img.updateMask(mask)

        # Return with 'FALLBACK' status
        return mask_clouds(image), found_date_str, 'FALLBACK'

    # If even the fallback fails, return 'FAILURE'
    return None, None, 'FAILURE'


# Add this helper function to your code
def get_coverage_percentage(image, geometry):
    """Calculates the percentage of valid (unmasked) pixels within a geometry."""
    # Create an image where valid pixels are 1, masked pixels are 0
    valid_pixels = image.select(0).unmask(0).gt(0)

    # Calculate the mean of this binary image within the geometry
    # The mean of a 0/1 image is the percentage of 1s.
    coverage_stats = valid_pixels.reduceRegion(
        reducer=ee.Reducer.mean(),
        geometry=geometry,
        scale=30, # Landsat scale
        maxPixels=1e9
    )

    # The result is a fraction (0 to 1), so multiply by 100
    return ee.Number(coverage_stats.values().get(0)).multiply(100)

# LST calculation function
def calculate_lst(image):
    # Check if input is a valid ee.Image
    if image is None:
        return None

    lst = image.expression(
        '(TIRS1 * 0.00341802 + 149.0) - 273.15',  # Convert Kelvin to Celsius
        {'TIRS1': image.select('ST_B10')}
    ).rename('LST')
    return image.addBands(lst)

In [89]:
years = df_top_days.index.tolist()
lst_images = {}
final_dates = {}

# Process each year one by one
for year in years:
    print(f"\n--- Processing Year: {year} ---")

    # Get the list of hottest candidate days for that year
    top_days_for_year = df_top_days.loc[year, 'top_days']

    successful_candidates = [] # A list to store all successful (non-fallback) results

    # Loop through all candidates for the year to evaluate them
    for i, day_info in enumerate(top_days_for_year):
        target_date = day_info['date']
        print(f"-> Evaluating Candidate #{i+1} | Target Date: {target_date}")

        # Call our robust function to get an image
        landsat_image, found_date, status = get_robust_landsat_data(target_date)

        # We are only interested in high-quality 'SUCCESS' images
        if status == 'SUCCESS':
            # Calculate the percentage of valid pixels for the successful image
            coverage = get_coverage_percentage(landsat_image, hamburg).getInfo()
            print(f"  > Found a SUCCESS candidate. Valid Pixel Coverage: {coverage:.2f}%")

            # Store the result along with its coverage score
            successful_candidates.append({
                'image': landsat_image,
                'found_date': found_date,
                'target_date': target_date,
                'coverage': coverage
            })
        else:
            print(f"  > Candidate resulted in '{status}'. Skipping.")

    # After checking all candidates for the year, decide which one is the best
    if successful_candidates:
        # Sort the successful candidates by their coverage score in descending order
        best_candidate = sorted(successful_candidates, key=lambda x: x['coverage'], reverse=True)[0]

        print(f"BEST OPTION for {year}: Target Date {best_candidate['target_date']} -> Found Image on {best_candidate['found_date']} with {best_candidate['coverage']:.2f}% coverage.")

        # Calculate LST for the best image
        lst_images[year] = calculate_lst(best_candidate['image'])
        final_dates[year] = {
            'target': best_candidate['target_date'],
            'found': best_candidate['found_date'],
            'coverage': best_candidate['coverage']
        }
    else:
        # If no candidates resulted in a 'SUCCESS'
        print(f"FAILURE: Could not find any high-quality image for year {year} from the top candidate days.")


--- Processing Year: 2014 ---
-> Evaluating Candidate #1 | Target Date: 2014-07-04
  > Found a SUCCESS candidate. Valid Pixel Coverage: 99.82%
-> Evaluating Candidate #2 | Target Date: 2014-07-19
  > Found a SUCCESS candidate. Valid Pixel Coverage: 99.82%
-> Evaluating Candidate #3 | Target Date: 2014-07-20
  > Found a SUCCESS candidate. Valid Pixel Coverage: 99.82%
BEST OPTION for 2014: Target Date 2014-07-04 -> Found Image on 2014-07-10 with 99.82% coverage.

--- Processing Year: 2015 ---
-> Evaluating Candidate #1 | Target Date: 2015-07-05
  > Candidate resulted in 'FALLBACK'. Skipping.
-> Evaluating Candidate #2 | Target Date: 2015-07-02
  > Candidate resulted in 'FALLBACK'. Skipping.
-> Evaluating Candidate #3 | Target Date: 2015-06-12
  > Found a SUCCESS candidate. Valid Pixel Coverage: 99.73%
BEST OPTION for 2015: Target Date 2015-06-12 -> Found Image on 2015-06-11 with 99.73% coverage.

--- Processing Year: 2016 ---
-> Evaluating Candidate #1 | Target Date: 2016-06-05
  > Foun



  > Found a SUCCESS candidate. Valid Pixel Coverage: 99.43%
-> Evaluating Candidate #3 | Target Date: 2023-06-12
  > Found a SUCCESS candidate. Valid Pixel Coverage: 99.91%
BEST OPTION for 2023: Target Date 2023-06-12 -> Found Image on 2023-06-08 with 99.91% coverage.

--- Processing Year: 2024 ---
-> Evaluating Candidate #1 | Target Date: 2024-06-25
  > Found a SUCCESS candidate. Valid Pixel Coverage: 93.64%
-> Evaluating Candidate #2 | Target Date: 2024-07-20
  > Found a SUCCESS candidate. Valid Pixel Coverage: 14.79%
-> Evaluating Candidate #3 | Target Date: 2024-06-26
  > Found a SUCCESS candidate. Valid Pixel Coverage: 93.64%
BEST OPTION for 2024: Target Date 2024-06-25 -> Found Image on 2024-06-26 with 93.64% coverage.


#### Visualize 10 years LTS data for control

In [90]:
Map = geemap.Map(center=[53.55, 9.99], zoom=12)

# Visualization parameters (for single band)
vis_params = {
    'min': 20,  # Min LST (°C)
    'max': 40,  # Max LST (°C)
    'palette': ['blue', 'green', 'yellow', 'red']
}


# Add each year's LST image to the map
for year, lst_image in lst_images.items():
    try:
        lst_single_band = lst_image.select('LST')
        Map.addLayer(lst_single_band, vis_params, f'LST {year}')
    except Exception as e:
        print(f"{year} error: {str(e)}")

# Add layer control panel
Map.addLayerControl()

map_dir = '/content/drive/MyDrive/UHI-Detection-Analysis/outputs/'
output_path = map_dir + 'LST_map_10years.html'
Map.to_html(output_path)

Problem in 2017 with 67% coverange and 2024 with 93% coverage few missing pixels

*Adding Time Slider to 10 years LTS*


### Extracting Sentinel-2 Images Based on Hottest Days

In [None]:
# Hamburg coordinates
hamburg = ee.Geometry.Point(9.99, 53.55)

def get_sentinel2_data(year, max_cloud=30):
    """
    Finds the most suitable Sentinel-2 image for the given year based on the hottest day from df_hottest.
    - Searches ±1 day first, then ±10 days.
    - Picks the least cloudy image if multiple are available.
    - If no image under max_cloud is found, picks the best available candidate (highest cloud mask applied later).
    """
    target_date = ee.Date(df_hottest.loc[year, 'date'])

    #sentinel-2 is not available for 2014 that is why landsat 8 will be used for ndvi calculation
    if year >= 2017:
      collection_id = "COPERNICUS/S2_SR"
    else:
      collection_id = "COPERNICUS/S2"


    image_collection = ee.ImageCollection(collection_id).filterBounds(hamburg)

    # 1. Search on the exact date (+/- 1 day)
    s2 = image_collection \
        .filterDate(target_date.advance(-1, 'day'), target_date.advance(1, 'day')) \
        .filter(ee.Filter.lt('CLOUDY_PIXEL_PERCENTAGE', max_cloud))

    # 2. If not found, expand to ±10 days
    if s2.size().getInfo() == 0:
        print(f"No Sentinel-2 image found for {year} within ±1 day and <{max_cloud}% clouds. Expanding window...")
        s2 = image_collection \
            .filterDate(target_date.advance(-10, 'day'), target_date.advance(10, 'day')) \
            .filter(ee.Filter.lt('CLOUDY_PIXEL_PERCENTAGE', max_cloud)) \
            .sort('CLOUDY_PIXEL_PERCENTAGE')

    # 3. If still not found, pick best available image (even high cloud)
    if s2.size().getInfo() == 0:
        print(f"CRITICAL WARNING for {year}: No low-cloud Sentinel-2 image found. Picking best candidate.")
        s2 = image_collection \
            .filterDate(target_date.advance(-20, 'day'), target_date.advance(20, 'day')) \
            .sort('CLOUDY_PIXEL_PERCENTAGE')

    image = ee.Image(s2.first())

    # Verify existence
    if image.getInfo() is None:
        print(f"WARNING: No Sentinel-2 image found for {year} within any search window.")
        return None

    # Apply cloud mask if cloud percentage > max_cloud
    if image.get('CLOUDY_PIXEL_PERCENTAGE').getInfo() >= max_cloud:
        print(f"The best available Sentinel-2 image for {year} has high cloud cover ({image.get('CLOUDY_PIXEL_PERCENTAGE').getInfo():.2f}%). Applying SCL cloud mask.")

        def mask_clouds(img):
            scl = img.select('SCL')
            # Keep classes: 4=Vegetation, 5=Bare soil, 6=Water, 7=Low vegetation, 8=High vegetation, 11=Snow/Ice
            mask = scl.eq(4).Or(scl.eq(5)).Or(scl.eq(6)).Or(scl.eq(7)).Or(scl.eq(8)).Or(scl.eq(11))
            return img.updateMask(mask)

        image = mask_clouds(image)

    return image

In [None]:
# --- Retrieve Sentinel-2 images ---
s2_images = {}

for year in df_hottest.index:
    print(f"\n--- Processing Sentinel-2 image for year {year} ---")
    sentinel_image = get_sentinel2_data(year)

    if sentinel_image:
        s2_images[year] = sentinel_image
        print(f"Sentinel-2 image successfully retrieved for {year}.")
    else:
        print(f"No Sentinel-2 image available for {year}.")



--- Processing Sentinel-2 image for year 2014 ---
No Sentinel-2 image found for 2014 within ±1 day and <30% clouds. Expanding window...
No Sentinel-2 image available for 2014.

--- Processing Sentinel-2 image for year 2015 ---
No Sentinel-2 image found for 2015 within ±1 day and <30% clouds. Expanding window...
The best available Sentinel-2 image for 2015 has high cloud cover (48.14%). Applying SCL cloud mask.
Sentinel-2 image successfully retrieved for 2015.

--- Processing Sentinel-2 image for year 2016 ---
Sentinel-2 image successfully retrieved for 2016.

--- Processing Sentinel-2 image for year 2017 ---
No Sentinel-2 image found for 2017 within ±1 day and <30% clouds. Expanding window...
Sentinel-2 image successfully retrieved for 2017.

--- Processing Sentinel-2 image for year 2018 ---
Sentinel-2 image successfully retrieved for 2018.

--- Processing Sentinel-2 image for year 2019 ---
Sentinel-2 image successfully retrieved for 2019.

--- Processing Sentinel-2 image for year 202

#### Visualize Sentinel-2 Data

In [None]:
Map = geemap.Map(center=[53.55, 9.99], zoom=12)

# Visualization parameters for Sentinel-2 RGB bands
vis_params_s2 = {
    'bands': ['B4', 'B3', 'B2'],  # Red, Green, Blue
    'min': 0,
    'max': 3000,
    'gamma': 1.4
}

# Add each year's Sentinel-2 image to the map
for year, s2_image in s2_images.items():
    try:
        Map.addLayer(s2_image, vis_params_s2, f'Sentinel-2 {year}')
    except Exception as e:
        print(f"Error for {year}: {str(e)}")

# Add layer control panel
Map.addLayerControl()

# Save the map as an HTML file
map_dir = '/content/drive/MyDrive/UHI-Detection-Analysis/outputs/'
output_path = map_dir + 'Sentinel2_map_10years.html'
Map.to_html(output_path)


Error for 2015: Image.select: Band pattern 'SCL' did not match any bands. Available bands: [B1, B2, B3, B4, B5, B6, B7, B8, B8A, B9, B10, B11, B12, QA10, QA20, QA60]


Missing tiles 2017, 2018, 2020,2021 and 2024
Error for 2015

### Export Images as GeoTIF

In [None]:
# GeoTIFF olarak export etme fonksiyonu (GÜNCEL)
def export_to_drive(image, name, folder):
    task = ee.batch.Export.image.toDrive(
        image=image,
        description=name,
        folder=folder.replace('/content/drive/MyDrive/', ''),  # GEE için göreli yol
        fileNamePrefix=name,
        scale=30,
        region=hamburg.buffer(20000).bounds(),
        fileFormat='GeoTIFF'
    )
    task.start()
    return task

work_dir = 'raw'

# Drive'a kaydet (work_dir kullanarak)
export_to_drive(lst_images[2015], 'LST_2015_Hamburg_DEMO', work_dir)

# Task'lerin tamamlanmasını bekle
import time
while True:
    tasks = ee.batch.Task.list()
    if all(task.status()['state'] in ('COMPLETED', 'FAILED') for task in tasks):
        break
    time.sleep(10)
print(f"Export işlemleri tamamlandı!")

Export işlemleri tamamlandı!


In [None]:
# Function to export an image to Google Drive as GeoTIFF
def export_to_drive(image, name, folder):
    task = ee.batch.Export.image.toDrive(
        image=image,
        description=name,
        folder=folder.replace('/content/drive/MyDrive/', ''),  # Relative path for GEE
        fileNamePrefix=name,
        scale=30,
        region=hamburg.buffer(5000).bounds(),
        fileFormat='GeoTIFF'
    )
    task.start()
    return task

base_dir = 'raw'

export_tasks = []

# Export each year's LST image to Google Drive → landsat_8
for year, lst_image in lst_images.items():
    try:
        task = export_to_drive(lst_image.select('LST'), f'LST_{year}_Hamburg', base_dir)
        export_tasks.append(task)
        print(f"Export task started for {year} (LST).")
    except Exception as e:
        print(f"Error exporting {year} (LST): {str(e)}")

# Export each year's Sentinel-2 image to Google Drive → sentinel_2
for year, s2_image in s2_images.items():
    try:
        task = export_to_drive(s2_image.select(['B4', 'B3', 'B2','B8', 'B11']), f'Sentinel2_{year}_Hamburg', base_dir)
        export_tasks.append(task)
        print(f"Export task started for Sentinel-2 image of {year}.")
    except Exception as e:
        print(f"Error exporting Sentinel-2 image for {year}: {str(e)}")

# Wait for all export tasks to complete
import time
while True:
    tasks = ee.batch.Task.list()
    if all(task.status()['state'] in ('COMPLETED', 'FAILED') for task in tasks):
        break
    time.sleep(10)

print("All export tasks completed!")


Export task started for 2014 (LST).
Export task started for 2015 (LST).
Export task started for 2016 (LST).
Export task started for 2017 (LST).
Export task started for 2018 (LST).
Export task started for 2019 (LST).
Export task started for 2020 (LST).
Export task started for 2021 (LST).
Export task started for 2022 (LST).
Export task started for 2023 (LST).
Export task started for 2024 (LST).
Export task started for Sentinel-2 image of 2015.
Export task started for Sentinel-2 image of 2016.
Export task started for Sentinel-2 image of 2017.
Export task started for Sentinel-2 image of 2018.
Export task started for Sentinel-2 image of 2019.
Export task started for Sentinel-2 image of 2020.
Export task started for Sentinel-2 image of 2021.
Export task started for Sentinel-2 image of 2022.
Export task started for Sentinel-2 image of 2023.
Export task started for Sentinel-2 image of 2024.
All export tasks completed!


region = hamburg.buffer(20000).bounds()
Sentinel 2 should be 10m as scale
export location of sentinel 2 is wrong

#### Clean the metadata.widgets Data in the Notebook

In [None]:
!pip install nbstripout > /dev/null 2>&1

In [None]:
!nbstripout /content/drive/MyDrive/GitHub_Repos/urban-heat-island/notebooks/01_data_acquisition.ipynb


### Summary and Next Steps

In this notebook, **data acquisition** was prepared to determine the **hottest day** and to download satellite imagery from **Landsat 8 for LST** and **Sentinel-2**.  
These datasets were saved in the `data/raw/` directory.

In the next notebook, **`03_data_processing.ipynb`**, we will combine *LST* with **spectral indices** (e.g., NDVI) to create a **multi-channel tensor**, which will serve as the final input for the **U-Net model**.
