In [6]:
import h5py
import rasterio
from rasterio.mask import mask
from rasterio.transform import from_origin
import numpy as np
import matplotlib.pyplot as plt
import geopandas as gpd 
import pandas as pd
from dnb_annual import *
from variables import years, composites, region_map, region_names

In [3]:
# One-off script: run once to generate the regional images for each year (kept commented out afterwards)
# country_polygons = gpd.read_file("geoBoundaries-UKR-ADM1.geojson")

# for year in years:
#     dnb = dnb_annual(year, composites, country_polygons)
#     dnb.load_all_data()
#     dnb.save_rasters()
#     dnb.load_rasters()
#     dnb.build_regional_images()
#     dnb.add_padding()
#     dnb.save_regional_images()

In [30]:
# This cell cleans the GDP data (the inflation clean-up below is kept commented out)

# Inflation data
# inflation = pd.read_excel("data/isc_reg.xls", skiprows=2, header=1)
# inflation = inflation.drop(columns=inflation.columns[0])
# inflation = inflation.rename(columns={inflation.columns[-1]: "region"})
# inflation = inflation[~inflation["region"].isin(["Ukraine", "oblasts"])]
# inflation = inflation.dropna()
# inflation["region"] = inflation["region"].map(region_map)
# inflation.columns = inflation.columns.astype(str)
# inflation = inflation.melt(id_vars="region", var_name="year", value_name="inflation")
# inflation.to_csv("data/inflation.csv", index=False)

# GDP data
# Builds data/clean_gdp.csv with the columns region, year, real_gdp
# (one row per region and year from the base year onwards).

BASE_YEAR = "2012"  # kept as a string: the year labels are strings after the melt below
# Labels in the region column that do not name a single region
# (presumably the national total and a section heading -- confirm against the workbook).
NON_REGION_LABELS = ["Ukraine", "oblasts"]
EXCLUDED_REGIONS = ["Sevastopol", "Autonomous Republic of Crimea"]


def _clean_region_column(table):
    """Drop non-region and incomplete rows, then translate names via region_map.

    Parameters
    ----------
    table : pandas.DataFrame
        Must contain a "region" column holding the names used in the workbook.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is left untouched). Rows with any missing
        value are removed; names that region_map does not contain are
        labelled "Sevastopol".
    """
    table = table[~table["region"].isin(NON_REGION_LABELS)].dropna().copy()
    table["region"] = table["region"].map(region_map).fillna("Sevastopol")
    return table


# Read the workbook once; both tables below are sliced from this frame.
gdp_raw = pd.read_excel("data/ukr_reg_gdp.xls", skiprows=3, header=1)

# After dropping the first column, keep columns 18-35 (the 18 columns that are
# treated as years below) plus the last column, which holds the region name.
gdp = gdp_raw.drop(columns=gdp_raw.columns[0]).iloc[:, np.r_[18:36, -1]]
gdp = gdp.rename(columns={gdp.columns[-1]: "region"})
gdp = _clean_region_column(gdp)

# Turn the column labels into strings and cut the 18 value columns down to
# their first four characters, which are used as the year.
gdp.columns = gdp.columns.astype(str)
gdp = gdp.rename(columns={label: label[:4] for label in gdp.columns[:18]})
gdp = gdp.melt(id_vars="region", var_name="year", value_name="real_gdp_change")

# include only years from 2012 inclusive, exclude Sevastopol and the Autonomous Republic of Crimea
gdp = gdp[gdp["year"].astype(int) >= int(BASE_YEAR)]
gdp = gdp[~gdp["region"].isin(EXCLUDED_REGIONS)]

# Order rows as region -> year so that each region's rows are consecutive and
# start with the base year, then reset the index.
gdp = gdp.sort_values(by=["region", "year"]).reset_index(drop=True)

# Dividing by 100 turns the values into multiplicative factors
# (presumably they are percentages of the previous year's level -- confirm).
growth_factor = pd.to_numeric(gdp["real_gdp_change"]) / 100

# calculate the real gdp index: 100 in the base year, afterwards the previous
# year's value times the current factor. Seeding the base-year slot with 100
# and taking a cumulative product per region reproduces that chain. Because
# dropna() ran on the wide table, every remaining region has a row for every
# year, so each group starts at the base year.
is_base_year = gdp["year"] == BASE_YEAR
gdp["real_gdp"] = (
    growth_factor.mask(is_base_year, 100.0).groupby(gdp["region"]).cumprod()
)

# delete the real_gdp_change column
gdp = gdp.drop(columns="real_gdp_change")

# get the nominal gdp
# NOTE(review): column 9 of the raw sheet is presumably the base-year (2012)
# nominal GDP -- confirm against the workbook layout.
gdp_nominal = gdp_raw.iloc[:, np.r_[9, -1]].copy()
gdp_nominal.columns = ["gdp_nominal", "region"]
gdp_nominal = _clean_region_column(gdp_nominal)

# merge nominal gdp to real gdp by region (inner join: regions without a
# nominal value are dropped)
gdp = gdp.merge(gdp_nominal, on="region")

# multiply the real gdp index by the nominal gdp; the index is based at 100,
# so the result is 100 times the nominal value scaled by cumulative growth
gdp["real_gdp"] = gdp["real_gdp"] * gdp["gdp_nominal"]

# drop the nominal gdp column
gdp = gdp.drop(columns="gdp_nominal")

# for the region column, change all spaces to _
gdp["region"] = gdp["region"].str.replace(" ", "_")

# save the data
gdp.to_csv("data/clean_gdp.csv", index=False)


### Neural Network

In [17]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Flatten
from keras.optimizers import Adam
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler


# # Loading the MNIST dataset
# from keras.datasets import mnist
# (train_images, train_labels), (test_images, test_labels) = mnist.load_data()

In [18]:
# load clean gdp data
gdp = pd.read_csv("data/clean_gdp.csv")

# (rows, cols) every regional image must have; X is pre-allocated with it
image_shape = (765, 1076)

# One image per row of the GDP table, and the matching GDP value as target
X = np.zeros((len(gdp), *image_shape))
y = gdp["real_gdp"].to_numpy(dtype=float, copy=True)

# load the snow covered and snow free images and store their sum in X
for i, row in enumerate(gdp.itertuples(index=False)):

    # images are stored as <year>_<region>.h5
    file_path = f"data/annual_region_images/{row.year}_{row.region}.h5"

    with h5py.File(file_path, 'r') as annual_region:
        snow_covered = annual_region["AllAngle_Composite_Snow_Covered"][:]
        snow_free = annual_region["AllAngle_Composite_Snow_Free"][:]

    # Upcast before adding: the stored dtype is not visible here, and a
    # fixed-width integer type would wrap around on large sums.
    combined = snow_covered.astype(X.dtype) + snow_free

    # Fail loudly instead of letting NumPy broadcast a wrongly shaped image
    if combined.shape != image_shape:
        raise ValueError(
            f"{file_path}: expected image shape {image_shape}, got {combined.shape}"
        )

    X[i] = combined

print(X.shape)
print(y.shape)

# Normalise the images by the global maximum (in place, to avoid holding a
# second full-size copy of X in memory)
# NOTE(review): this maximum and the GDP mean/std below are computed over all
# samples, including those that a later cell assigns to the test split.
maximum = X.max()
X /= maximum

# standardise gdp values
y = (y - y.mean()) / y.std()

print(y.mean())
print(maximum)

(250, 765, 1076)
(250,)
2.842170943040401e-17
53922.0


In [19]:
# Randomly assign 80% of the samples to training and the rest to testing.
# X is the images, y is the gdp
n_samples = len(gdp)
train_size = int(0.8 * n_samples)
test_size = n_samples - train_size

# Seeded generator: without a fixed seed the split (and therefore every
# metric reported further down) changes on each kernel restart.
split_rng = np.random.default_rng(42)

# draw train_size distinct row numbers; all remaining rows form the test set
train_indices = split_rng.choice(n_samples, size=train_size, replace=False)
test_indices = np.setdiff1d(np.arange(n_samples), train_indices)

# get the train data
X_train = X[train_indices]
y_train = y[train_indices]

# get the test data
X_test = X[test_indices]
y_test = y[test_indices]


In [20]:
# Small fully connected regressor: the 2D image is flattened, passed through
# one hidden layer of 64 ReLU units and reduced to a single output value.
model = Sequential([
    Flatten(input_shape=(765, 1076)),
    Dense(64, activation='relu'),
    Dense(1),
])

# Train on mean squared error; additionally report the mean absolute error.
model.compile(
    optimizer=Adam(),
    loss='mean_squared_error',
    metrics=['mae'],
)


In [21]:
# Train for 10 epochs with batches of 32 samples; 20% of the training arrays
# are set aside by Keras for validation.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=10,
    validation_split=0.2,
)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x20a8006eca0>

In [22]:
# Score the trained model on the held-out test data. The model was compiled
# with one metric, so the result unpacks into the loss and the MAE.
test_loss, test_mae = model.evaluate(x=X_test, y=y_test)

print('Test MAE:', test_mae, sep=' ')  # mean absolute error
print('Test Loss:', test_loss, sep=' ')

Test MAE: 0.1212017610669136
Test Loss: 0.038211364299058914


In [23]:
# Display the standardised GDP targets of the held-out test samples
y_test

array([-0.35328459, -0.35934255, -0.35308317, -0.67422427, -0.67006618,
       -0.65813516, -0.66242736,  1.66620088,  1.36667144,  1.34279448,
        0.21447579, -0.40856682, -0.37869668, -0.31488347,  0.45270047,
        0.40657239, -0.5671497 , -0.55565071, -0.44506385, -0.42874331,
       -0.48154707, -0.50134819, -0.44644959, -0.46444809,  4.02686291,
        4.22617341,  0.23412913,  0.2694953 , -0.32585884, -0.6289909 ,
        0.17647786, -0.40386827, -0.32935416, -0.33432836,  0.27269501,
        0.31760439, -0.07728294, -0.10081219, -0.53458293, -0.53685923,
       -0.50489344, -0.46038454, -0.4519557 , -0.60057928, -0.576305  ,
       -0.59695028, -0.27076813, -0.56921405,  0.04834337, -0.46934308])