In [1]:
import pandas as pd
import numpy as np
import h5py
from scipy.stats import mode



This script looks at min and max of light values within each composite

In [4]:
# load clean gdp data
gdp = pd.read_csv("data/gdp_ukraine_clean.csv")
ukraine = gdp[gdp["region"] != "Kyiv_Oblast_City"]
nearnad_snow_cov_max, nearnad_snow_cov_min = 0, 1000000
nearnad_snow_free_max, nearnad_snow_free_min = 0, 1000000
offnad_snow_cov_max, offnad_snow_cov_min = 0, 1000000
offnad_snow_free_max, offnad_snow_free_min = 0, 1000000
allangle_snow_cov_max, allangle_snow_cov_min = 0, 1000000
allangle_snow_free_max, allangle_snow_free_min = 0, 1000000

for i in range(len(gdp)):

    # get year, region, and gdp
    year = gdp["year"][i]
    region = gdp["region"][i]
    gdp_value = gdp["real_gdp"][i]

    # get the file name
    file_name = f"{year}_{region}.h5"

    # load the image
    file_path = f"data/annual_region_images/{file_name}"
    
    with h5py.File(file_path, 'r') as annual_region:
        nearnad_snow_cov = annual_region["NearNadir_Composite_Snow_Covered"][:]
        nearnad_snow_free = annual_region["NearNadir_Composite_Snow_Free"][:]
        offnad_snow_cov = annual_region["OffNadir_Composite_Snow_Covered"][:]
        offnad_snow_free = annual_region["OffNadir_Composite_Snow_Free"][:]
        allangle_snow_cov = annual_region["AllAngle_Composite_Snow_Covered"][:]
        allangle_snow_free = annual_region["AllAngle_Composite_Snow_Free"][:]

        # check what the max and min values are within each category
        if nearnad_snow_cov.max() > nearnad_snow_cov_max:
            nearnad_snow_cov_max = nearnad_snow_cov.max()
        if nearnad_snow_cov.min() < nearnad_snow_cov_min:
            nearnad_snow_cov_min = nearnad_snow_cov.min()
        if nearnad_snow_free.max() > nearnad_snow_free_max:
            nearnad_snow_free_max = nearnad_snow_free.max()
        if nearnad_snow_free.min() < nearnad_snow_free_min:
            nearnad_snow_free_min = nearnad_snow_free.min()
        if offnad_snow_cov.max() > offnad_snow_cov_max:
            offnad_snow_cov_max = offnad_snow_cov.max()
        if offnad_snow_cov.min() < offnad_snow_cov_min:
            offnad_snow_cov_min = offnad_snow_cov.min()
        if offnad_snow_free.max() > offnad_snow_free_max:
            offnad_snow_free_max = offnad_snow_free.max()
        if offnad_snow_free.min() < offnad_snow_free_min:
            offnad_snow_free_min = offnad_snow_free.min()
        if allangle_snow_cov.max() > allangle_snow_cov_max:
            allangle_snow_cov_max = allangle_snow_cov.max()
        if allangle_snow_cov.min() < allangle_snow_cov_min: 
            allangle_snow_cov_min = allangle_snow_cov.min()
        if allangle_snow_free.max() > allangle_snow_free_max:
            allangle_snow_free_max = allangle_snow_free.max()
        if allangle_snow_free.min() < allangle_snow_free_min:
            allangle_snow_free_min = allangle_snow_free.min()

print(f"NearNadir Composite Snow Covered Max: {nearnad_snow_cov_max}")
print(f"NearNadir Composite Snow Covered Min: {nearnad_snow_cov_min}")
print(f"NearNadir Composite Snow Free Max: {nearnad_snow_free_max}")
print(f"NearNadir Composite Snow Free Min: {nearnad_snow_free_min}")
print(f"OffNadir Composite Snow Covered Max: {offnad_snow_cov_max}")
print(f"OffNadir Composite Snow Covered Min: {offnad_snow_cov_min}")
print(f"OffNadir Composite Snow Free Max: {offnad_snow_free_max}")
print(f"OffNadir Composite Snow Free Min: {offnad_snow_free_min}")
print(f"AllAngle Composite Snow Covered Max: {allangle_snow_cov_max}")
print(f"AllAngle Composite Snow Covered Min: {allangle_snow_cov_min}")
print(f"AllAngle Composite Snow Free Max: {allangle_snow_free_max}")
print(f"AllAngle Composite Snow Free Min: {allangle_snow_free_min}")

# take the log of all the max values
# nearnad_snow_cov_max = np.log(nearnad_snow_cov_max)
# nearnad_snow_free_max = np.log(nearnad_snow_free_max)
# offnad_snow_cov_max = np.log(offnad_snow_cov_max)
# offnad_snow_free_max = np.log(offnad_snow_free_max)
# allangle_snow_cov_max = np.log(allangle_snow_cov_max)
# allangle_snow_free_max = np.log(allangle_snow_free_max)


NearNadir Composite Snow Covered Max: 30000.0
NearNadir Composite Snow Covered Min: 0.0
NearNadir Composite Snow Free Max: 30000.0
NearNadir Composite Snow Free Min: 0.0
OffNadir Composite Snow Covered Max: 30000.0
OffNadir Composite Snow Covered Min: 0.0
OffNadir Composite Snow Free Max: 25135.0
OffNadir Composite Snow Free Min: 0.0
AllAngle Composite Snow Covered Max: 30000.0
AllAngle Composite Snow Covered Min: 0.0
AllAngle Composite Snow Free Max: 23922.0
AllAngle Composite Snow Free Min: 0.0


This script looks calculates the idr and log bins for each composite 

In [2]:
def get_bins(composite_all):

    # exclude zeros
    composite_all = composite_all[composite_all > 0]

    # print the mean, median, and sd for that composite
    print(f"Mean: {composite_all.mean()}")
    print(f"Median: {np.median(composite_all)}")
    print(f"Standard Deviation: {composite_all.std()}")

    # calculate the interdecile range
    p10 = np.percentile(composite_all, 10)
    p90 = np.percentile(composite_all, 90)
    interdecile_range = p90 - p10

    # calculate the bin width, round to a whole number up
    bin_width = np.ceil(interdecile_range / 8)

    # get the bins
    bins_idr = [0.1 + p10 + i * bin_width for i in range(8)]
    bins_idr = [composite_all.min() - 0.1] + bins_idr + [max(p90, p10 + 8 *bin_width), composite_all.max() + 0.1]

    # turn all values into log
    composite_all_log = np.log(composite_all)

    # get the bins of equal width
    bins_log = np.linspace(composite_all_log.min(), composite_all_log.max(), 11, endpoint=True)

    return bins_idr, bins_log
    

In [3]:
ukraine = pd.read_csv("data/gdp_ukraine_clean.csv")
ukraine = ukraine[ukraine["region"] != "Kyiv_Oblast_City"]
poland = pd.read_csv("data/gdp_poland_clean.csv")
# composite_names = ["NearNadir_Composite_Snow_Covered", "NearNadir_Composite_Snow_Free", "OffNadir_Composite_Snow_Covered", "OffNadir_Composite_Snow_Free", "AllAngle_Composite_Snow_Covered", "AllAngle_Composite_Snow_Free"]
composite_names = ["AllAngle_Composite_Snow_Free"]
for country in ["ukr", "pol"]:

    if country == "ukr":
        country_data = ukraine
    elif country == "pol":
        country_data = poland

    for composite_name in composite_names:

        composite_all = np.array([])
        composite_all_hq = np.array([])

        for i in range(len(country_data)):

            # get year, region, and gdp
            year = country_data["year"][i]
            region = country_data["region"][i]
            gdp_value = country_data["real_gdp"][i]

            # get the file names
            file_name = f"{year}_{region}.h5"
            file_name_hq = f"{year}_{region}_hq.h5"

            # load the images
            file_path = f"data/annual_region_images/{file_name}"
            file_path_hq = f"data/annual_region_images/{file_name_hq}"
            
            with h5py.File(file_path, 'r') as annual_region:
                composite = annual_region[composite_name][:].flatten()

            composite_all = np.append(composite_all, composite)

            with h5py.File(file_path_hq, 'r') as annual_region_hq:
                composite_hq = annual_region_hq[composite_name][:].flatten()

            composite_all_hq = np.append(composite_all_hq, composite_hq)

            if i % 50 == 0:
                print(f"{i} out of {len(country_data)} done for {composite_name}.")

        # get the bins for that composite
        bins_idr, bins_log = get_bins(composite_all)
        bins_idr_hq, bins_log_hq = get_bins(composite_all_hq)

        print(f"Composite: {composite_name}")
        print(f"Country: {country}")
        print(f"Bins IDR: {bins_idr}")
        print(f"Bins Log: {bins_log}")
        print(f"Bins IDR HQ: {bins_idr_hq}")
        print(f"Bins Log HQ: {bins_log_hq}")

        # save the bins
        with open(f"data/bins/{country}_{composite_name}_bins_idr.npy", "wb") as f:
            np.save(f, bins_idr)
        
        with open(f"data/bins/{country}_{composite_name}_bins_log.npy", "wb") as f:
            np.save(f, bins_log)

        with open(f"data/bins/{country}_{composite_name}_bins_idr_hq.npy", "wb") as f:
            np.save(f, bins_idr_hq)

        with open(f"data/bins/{country}_{composite_name}_bins_log_hq.npy", "wb") as f:
            np.save(f, bins_log_hq)

0 out of 300 done for AllAngle_Composite_Snow_Free.
50 out of 300 done for AllAngle_Composite_Snow_Free.
100 out of 300 done for AllAngle_Composite_Snow_Free.
150 out of 300 done for AllAngle_Composite_Snow_Free.
200 out of 300 done for AllAngle_Composite_Snow_Free.
250 out of 300 done for AllAngle_Composite_Snow_Free.
Mean: 18.845038136743707
Median: 6.0
Standard Deviation: 116.80296334098945
Mean: 19.11430801710409
Median: 6.0
Standard Deviation: 118.11575240570737
Composite: AllAngle_Composite_Snow_Free
Country: ukr
Bins IDR: [4.9, 5.1, 9.1, 13.1, 17.1, 21.1, 25.1, 29.1, 33.1, 37.0, 23922.1]
Bins Log: [ 1.60943791  2.4567495   3.30406109  4.15137268  4.99868427  5.84599586
  6.69330745  7.54061905  8.38793064  9.23524223 10.08255382]
Bins IDR HQ: [4.9, 5.1, 9.1, 13.1, 17.1, 21.1, 25.1, 29.1, 33.1, 37.0, 23922.1]
Bins Log HQ: [ 1.60943791  2.4567495   3.30406109  4.15137268  4.99868427  5.84599586
  6.69330745  7.54061905  8.38793064  9.23524223 10.08255382]
0 out of 192 done for All

This script computes tabular data for each composite

In [9]:
def get_tabular_data(composite_name, log_bins, idr_bins, prefix, annual_region, data_dict):

    image = annual_region[composite_name][:].flatten()

    # count the number of zeros, exclude them
    num_zeros = np.sum(image == 0)
    image = image[image != 0]

    # take the sum, mean, median, standard deviation, mode of non-zero values
    sum = np.sum(image)
    mean = np.mean(image)
    median = np.median(image)
    mode_res = mode(image)
    sd = np.std(image)

    # get the counts for the idr bins
    idr_counts, idr_bin_edges = np.histogram(image, bins=idr_bins)

    # take the log and get the counts for the log bins
    log_image = np.log(image)
    log_counts, log_bin_edges = np.histogram(log_image, bins=log_bins)

    # create a dictionary to store the data
    data = data_dict.copy()
    for i in range(1, 11):
        data[f"{prefix}_idr_{i}"] = idr_counts[i - 1]
        data[f"{prefix}_log_{i}"] = log_counts[i - 1]

    data[f"{prefix}_num_zeros"] = num_zeros
    data[f"{prefix}_sum"] = sum
    data[f"{prefix}_mean"] = mean
    data[f"{prefix}_median"] = median
    data[f"{prefix}_sd"] = sd
    # data[f"{prefix}_mode"] = mode_res.mode[0]

    return data

Create Kyiv_Oblast_City composites

In [7]:
composite_names = ["NearNadir_Composite_Snow_Covered", "NearNadir_Composite_Snow_Free", "OffNadir_Composite_Snow_Covered", "OffNadir_Composite_Snow_Free", "AllAngle_Composite_Snow_Covered", "AllAngle_Composite_Snow_Free"]

for year in range(2012, 2024):
    file_name_city, file_name_oblast, file_name_both = f"{year}_Kyiv.h5", f"{year}_Kyiv_Oblast.h5", f"{year}_Kyiv_Oblast_City.h5"
    file_name_city_hq, file_name_oblast_hq, file_name_both_hq = f"{year}_Kyiv_hq.h5", f"{year}_Kyiv_Oblast_hq.h5", f"{year}_Kyiv_Oblast_City_hq.h5"
    file_path_city, file_path_oblast, file_path_both = f"data/annual_region_images/{file_name_city}", f"data/annual_region_images/{file_name_oblast}", f"data/annual_region_images/{file_name_both}"
    file_path_city_hq, file_path_oblast_hq, file_path_both_hq = f"data/annual_region_images/{file_name_city_hq}", f"data/annual_region_images/{file_name_oblast_hq}", f"data/annual_region_images/{file_name_both_hq}"
    
    with h5py.File(file_path_city, 'r') as annual_region_city, h5py.File(file_path_oblast, 'r') as annual_region_oblast, h5py.File(file_path_both, 'w') as annual_region_both:
        

        for composite in composite_names:
            composite_city = annual_region_city[composite][:]
            composite_oblast = annual_region_oblast[composite][:]
            composite_both = composite_city + composite_oblast
            annual_region_both.create_dataset(composite, data=composite_both)

    with h5py.File(file_path_city_hq, 'r') as annual_region_city_hq, h5py.File(file_path_oblast_hq, 'r') as annual_region_oblast_hq, h5py.File(file_path_both_hq, 'w') as annual_region_both_hq:

        for composite in composite_names:
            composite_city_hq = annual_region_city_hq[composite][:]
            composite_oblast_hq = annual_region_oblast_hq[composite][:]
            composite_both_hq = composite_city_hq + composite_oblast_hq
            annual_region_both_hq.create_dataset(composite, data=composite_both_hq)
    

In [10]:
ukraine = pd.read_csv("data/gdp_ukraine_clean.csv")
poland = pd.read_csv("data/gdp_poland_clean.csv")

composite_names = ["NearNadir_Composite_Snow_Covered", "NearNadir_Composite_Snow_Free", "OffNadir_Composite_Snow_Covered", "OffNadir_Composite_Snow_Free", "AllAngle_Composite_Snow_Covered", "AllAngle_Composite_Snow_Free"]
composite_prefixes = ["nearnad_snow_cov", "nearnad_snow_free", "offnad_snow_cov", "offnad_snow_free", "allangle_snow_cov", "allangle_snow_free"]
composite_prefixes_hq = ["nearnad_snow_cov_hq", "nearnad_snow_free_hq", "offnad_snow_cov_hq", "offnad_snow_free_hq", "allangle_snow_cov_hq", "allangle_snow_free_hq"]
bin_types = ["idr", "log"]
other_data = ["num_zeros", "sum", "mean", "median", "sd"]

# get all column names
column_names = []
for prefix in composite_prefixes + composite_prefixes_hq:
    for bin_type in bin_types:
        for i in range(1, 11):
            column_names.append(f"{prefix}_{bin_type}_{i}")
    for other in other_data:
        column_names.append(f"{prefix}_{other}")


for country in ["ukr", "pol"]:

    if country == "ukr":
        country_data = ukraine
    elif country == "pol":
        country_data = poland

    # initialise a pandas dataframe to store the data
    df = pd.DataFrame(columns=["year", "region", "real_gdp"] + column_names)

    for i in range(len(country_data)):

        # get year, region, and gdp
        year = country_data["year"][i]
        region = country_data["region"][i]
        gdp_value = country_data["real_gdp"][i]
        data_dict = {"year": year, "region": region, "real_gdp": gdp_value}

        # get the file names
        file_name = f"{year}_{region}.h5"
        file_name_hq = f"{year}_{region}_hq.h5"

        # load the images
        file_path = f"data/annual_region_images/{file_name}"
        file_path_hq = f"data/annual_region_images/{file_name_hq}"
        
        # get the bins and data for all quality images
        with h5py.File(file_path, 'r') as annual_region:
            for composite_name, prefix in zip(composite_names, composite_prefixes):
                with open(f"data/bins/{country}_{composite_name}_bins_idr.npy", "rb") as f:
                    idr_bins = np.load(f)
                with open(f"data/bins/{country}_{composite_name}_bins_log.npy", "rb") as f:
                    log_bins = np.load(f)
                data_dict = get_tabular_data(composite_name, log_bins, idr_bins, prefix, annual_region, data_dict)

        # get the bins and data for high quality images
        with h5py.File(file_path_hq, 'r') as annual_region_hq:
            for composite_name, prefix in zip(composite_names, composite_prefixes_hq):
                with open(f"data/bins/{country}_{composite_name}_bins_idr_hq.npy", "rb") as f:
                    idr_bins = np.load(f)
                with open(f"data/bins/{country}_{composite_name}_bins_log_hq.npy", "rb") as f:
                    log_bins = np.load(f)
                data_dict = get_tabular_data(composite_name, log_bins, idr_bins, prefix, annual_region_hq, data_dict)

            # add the data to the dataframe
            new_row = pd.DataFrame([data_dict])
            df = pd.concat([df, new_row], ignore_index=True)

    # save the dataframe
    if country == "ukr":
        df.to_csv("data/tabular_data_ukraine.csv", index=False)
        print("Data saved for Ukraine.")
    elif country == "pol":
        df.to_csv("data/tabular_data_poland.csv", index=False)
        print("Data saved for Poland.")


  df = pd.concat([df, new_row], ignore_index=True)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / r

Data saved for Ukraine.


  df = pd.concat([df, new_row], ignore_index=True)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / r

Data saved for Poland.
