# Exploratory Analysis of NUTS 2 GDP Values

In [58]:
# Import libraries
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt

In [10]:
# Load the cleaned NUTS GDP table, then show its dimensions and first ten rows.
gdp_csv_path = "Data/gdp_data/nuts_gdp_cleaned.csv"
data = pd.read_csv(gdp_csv_path)
print(data.shape)
print(data.head(10))

(4179, 5)
  region  year    value code  country
0     BE  2008  33100.0   BE  Belgium
1    BE1  2008  61100.0   BE  Belgium
2   BE10  2008  61100.0   BE  Belgium
3    BE2  2008  33100.0   BE  Belgium
4   BE21  2008  39200.0   BE  Belgium
5   BE22  2008  27500.0   BE  Belgium
6   BE23  2008  29400.0   BE  Belgium
7   BE24  2008  34500.0   BE  Belgium
8   BE25  2008  31100.0   BE  Belgium
9    BE3  2008  24400.0   BE  Belgium


In [16]:
# Number of rows per (country, year) group. Every row of `data` is counted, so
# the figures cover whichever NUTS levels the table holds for a country.
rows_per_country_year = data.groupby(['country', 'year'])['region'].count()
for label, statistic in (
    ("Average regions per country: ", rows_per_country_year.mean()),
    ("Min regions per country: ", rows_per_country_year.min()),
    ("Max regions per country: ", rows_per_country_year.max()),
):
    print(label, statistic)

Average regions per country:  13.39423076923077
Min regions per country:  3
Max regions per country:  55


## Benchmark Prediction for GDP Values

In [21]:
# Rows whose region identifier is exactly two characters long (e.g. "BE")
# are kept as the country-level rows.
is_two_char_region = data['region'].str.len() == 2
country_values = data.loc[is_two_char_region]
country_values.head()

Unnamed: 0,region,year,value,code,country
0,BE,2008,33100.0,BE,Belgium
15,BG,2008,4900.0,BG,Bulgaria
24,CZ,2008,15500.0,CZ,Czechia
34,DK,2008,44000.0,DK,Denmark
41,DE,2008,31700.0,DE,Germany


In [24]:
# Open list of NUTS 2 regions.
# The encoding is passed explicitly: without it open() falls back to the
# platform's locale encoding, which is not UTF-8 on every system.
with open("Data/geo_data/NUTS_RG_01M_2016_4326_LEVL_2.geojson", encoding="utf-8") as geojson_file:
    nuts2_poly = json.load(geojson_file)


In [27]:
# Make list of all downloaded regions, which will be predicted.
# Iterate over the features themselves rather than a fixed count (332 at
# download time), so a different file neither raises IndexError nor gets
# silently truncated.
nuts2_regions = [
    feature['properties']['NUTS_ID'] for feature in nuts2_poly['features']
]

# Check
nuts2_regions[:5]

['CH02', 'CH03', 'AL02', 'AL03', 'BG33']

In [29]:
# Keep only the rows of `data` whose region is one of the downloaded NUTS 2 regions.
in_downloaded_regions = data['region'].isin(nuts2_regions)
nuts2_predictions = data.loc[in_downloaded_regions]
print(nuts2_predictions.shape)
print(nuts2_predictions.head())

(2827, 5)
  region  year    value code  country
2   BE10  2008  61100.0   BE  Belgium
4   BE21  2008  39200.0   BE  Belgium
5   BE22  2008  27500.0   BE  Belgium
6   BE23  2008  29400.0   BE  Belgium
7   BE24  2008  34500.0   BE  Belgium


## Predict values with the country-level value for the same year

In [34]:
# Merge with total values per country.
# Columns are renamed explicitly before the merge instead of relabelling the
# result by position, so the output names do not depend on merge suffixes or
# column order.
country_totals = (
    country_values
    .loc[:, ['region', 'year', 'value']]
    .rename(columns={'region': 'code', 'value': 'country_value'})
)
country_predictions = (
    nuts2_predictions
    .rename(columns={'region': 'nuts2', 'value': 'nuts_value'})
    # Inner join: regions without a country-level row for that year are dropped.
    # validate raises if a (code, year) pair occurs more than once in the
    # country totals, which would otherwise silently duplicate region rows.
    .merge(country_totals, on=['code', 'year'], how='inner', validate='many_to_one')
)
country_predictions.head()

Unnamed: 0,nuts2,year,nuts_value,code,country,country_value
0,BE10,2008,61100.0,BE,Belgium,33100.0
1,BE21,2008,39200.0,BE,Belgium,33100.0
2,BE22,2008,27500.0,BE,Belgium,33100.0
3,BE23,2008,29400.0,BE,Belgium,33100.0
4,BE24,2008,34500.0,BE,Belgium,33100.0


In [35]:
# MAE and MSE when the country-level value is used as the prediction for
# each NUTS 2 region.
country_value_error = country_predictions['country_value'] - country_predictions['nuts_value']
mae_country_value = country_value_error.abs().mean()
mse_country_value = (country_value_error ** 2).mean()
print('MAE: ', mae_country_value)
print('MSE: ', mse_country_value)

MAE:  5889.246551114255
MSE:  137490948.00141492


## Predict values with the overall mean across all regions and years

In [37]:
# MAE and MSE when every row is predicted with the single mean taken over
# all `nuts_value` entries.
overall_mean = country_predictions['nuts_value'].mean()
total_mean_error = overall_mean - country_predictions['nuts_value']
mae_total_value = total_mean_error.abs().mean()
mse_total_value = (total_mean_error ** 2).mean()
print('MAE: ', mae_total_value)
print('MSE: ', mse_total_value)

MAE:  11795.816854729313
MSE:  292515102.45398825


## Predict values by the mean of regions per country and year

In [50]:
# Mean `nuts_value` for every (code, year) combination, as a flat table.
by_country_year = country_predictions.groupby(['code', 'year'])
mean_predictions = by_country_year['nuts_value'].mean().reset_index()
mean_predictions.head()

Unnamed: 0,code,year,nuts_value
0,AL,2008,2933.333333
1,AL,2009,2933.333333
2,AL,2010,3066.666667
3,AL,2011,3166.666667
4,AL,2012,3233.333333


In [51]:
# Attach to every region row the mean `nuts_value` of its (code, year) group
# as `mean_value`. The means are derived directly from `country_predictions`
# rather than by merging `mean_predictions` with itself, so re-running this
# cell gives the same frame instead of failing on a column-count mismatch.
country_year_mean = (
    country_predictions
    .groupby(['code', 'year'])['nuts_value']
    .transform('mean')
)
mean_predictions = country_predictions.assign(mean_value=country_year_mean)
mean_predictions.head()

Unnamed: 0,nuts2,year,nuts_value,code,country,country_value,mean_value
0,BE10,2008,61100.0,BE,Belgium,33100.0,31881.818182
1,BE21,2008,39200.0,BE,Belgium,33100.0,31881.818182
2,BE22,2008,27500.0,BE,Belgium,33100.0,31881.818182
3,BE23,2008,29400.0,BE,Belgium,33100.0,31881.818182
4,BE24,2008,34500.0,BE,Belgium,33100.0,31881.818182


In [53]:
# MAE and MSE when each region is predicted with the mean of its
# (code, year) group.
group_mean_error = mean_predictions['nuts_value'] - mean_predictions['mean_value']
mae_mean_value = group_mean_error.abs().mean()
mse_mean_value = (group_mean_error ** 2).mean()
print('MAE: ', mae_mean_value)
print('MSE: ', mse_mean_value)

MAE:  5280.572333692404
MSE:  127639832.55490972
