# Exploratory Analysis of NUTS 2 GDP Values

In [27]:
# Import libraries
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt

In [28]:
data = pd.read_csv("/Users/maxbehrens/Documents/Msc/Thesis/Data/gdp_data/nuts_gdp_cleaned.csv")
print(data.shape)
print(data.head(10))

(4179, 5)
  region  year    value code  country
0     BE  2008  33100.0   BE  Belgium
1    BE1  2008  61100.0   BE  Belgium
2   BE10  2008  61100.0   BE  Belgium
3    BE2  2008  33100.0   BE  Belgium
4   BE21  2008  39200.0   BE  Belgium
5   BE22  2008  27500.0   BE  Belgium
6   BE23  2008  29400.0   BE  Belgium
7   BE24  2008  34500.0   BE  Belgium
8   BE25  2008  31100.0   BE  Belgium
9    BE3  2008  24400.0   BE  Belgium


In [29]:
# Import GDP PPP data
ppp_data = pd.read_excel("/Users/maxbehrens/Documents/Msc/Thesis/Data/gdp_data/gdp_ppp.xls",na_values=':')
print(ppp_data.head())

geo   2006   2007   2008   2009   2010   2011   2012   2013   2014   2015  \
0  BE10  228.0  219.0  213.0  219.0  220.0  217.0  216.0  212.0  205.0  204.0   
1  BE21  140.0  138.0  137.0  137.0  141.0  140.0  142.0  141.0  141.0  140.0   
2  BE22   97.0   96.0   96.0   94.0   97.0   98.0   99.0   98.0   99.0   97.0   
3  BE23  105.0  104.0  102.0  107.0  108.0  108.0  109.0  109.0  109.0  109.0   
4  BE24  124.0  122.0  120.0  126.0  127.0  125.0  130.0  130.0  128.0  127.0   

    2016   2017  
0  201.0  196.0  
1  140.0  140.0  
2   96.0   96.0  
3  109.0  108.0  
4  128.0  125.0  


In [30]:
# Reshape to long format
ppp_long = pd.melt(ppp_data, id_vars=['geo'],value_vars=['2008','2009','2010','2011','2012','2013','2014','2015','2016','2017'])
ppp_long = ppp_long.dropna()
ppp_long.head()

Unnamed: 0,geo,variable,value
0,BE10,2008,213.0
1,BE21,2008,137.0
2,BE22,2008,96.0
3,BE23,2008,102.0
4,BE24,2008,120.0


In [31]:
# Average regions per country
print("Average regions per country: ",data.groupby(['country','year']).region.count().reset_index().region.mean())
print("Min regions per country: ",data.groupby(['country','year']).region.count().reset_index().region.min())
print("Max regions per country: ",data.groupby(['country','year']).region.count().reset_index().region.max())

Average regions per country:  13.39423076923077
Min regions per country:  3
Max regions per country:  55


## Benchmark Prediction for GDP Values

In [32]:
country_values = data[data['region'].str.len() == 2]
country_values.head()

Unnamed: 0,region,year,value,code,country
0,BE,2008,33100.0,BE,Belgium
15,BG,2008,4900.0,BG,Bulgaria
24,CZ,2008,15500.0,CZ,Czechia
34,DK,2008,44000.0,DK,Denmark
41,DE,2008,31700.0,DE,Germany


In [33]:
# Open list of NUTS 2 regions
with open("/Users/maxbehrens/Documents/Msc/Thesis/Data/geo_data_2016/NUTS_RG_01M_2016_4326_LEVL_2.geojson") as f:
    nuts2_2016 = json.load(f)

In [34]:
# Make list of all downloaded regions, which will be predicted
nuts2_regions = []
for i in range(len(nuts2_2016['features'])): 
    nuts2_regions.append(nuts2_2016['features'][i]['properties']['NUTS_ID'])

# Check
print(len(nuts2_regions))

332


In [35]:
# Print number of GDP values
nuts2_predictions = data
print(data.shape)

(4179, 5)


## Predict values with total value of country and year

In [36]:
# Merge with total values per country
country_predictions = nuts2_predictions.merge(country_values.loc[:,['region','year','value']], left_on=['code','year'], right_on=['region','year']).drop(columns = ['region_y'])
country_predictions.columns = ['nuts2','year','nuts_value','code','country','country_value']
print(country_predictions.head())
print(country_predictions.shape)

nuts2  year  nuts_value code  country  country_value
0    BE  2008     33100.0   BE  Belgium        33100.0
1   BE1  2008     61100.0   BE  Belgium        33100.0
2  BE10  2008     61100.0   BE  Belgium        33100.0
3   BE2  2008     33100.0   BE  Belgium        33100.0
4  BE21  2008     39200.0   BE  Belgium        33100.0
(4179, 6)


In [37]:
# Filter to regions which were used for CNN training as well
banned_countries = ["SE","FI","NO","BE"]
country_predictions = country_predictions.loc[(~country_predictions['code'].isin(banned_countries))&(country_predictions['year']>2013)]
country_predictions = country_predictions.loc[country_predictions['nuts2'].isin(nuts2_regions)]

# Print shape
print(country_predictions.shape)
country_predictions.head()

(1128, 6)


Unnamed: 0,nuts2,year,nuts_value,code,country,country_value
2351,BG31,2014,3800.0,BG,Bulgaria,5900.0
2352,BG32,2014,4300.0,BG,Bulgaria,5900.0
2353,BG33,2014,5000.0,BG,Bulgaria,5900.0
2354,BG34,2014,5000.0,BG,Bulgaria,5900.0
2356,BG41,2014,9500.0,BG,Bulgaria,5900.0


In [38]:
# Some basic stats
print(len(country_predictions['nuts2'].unique()))
print(len(country_predictions['code'].unique()))

292
30


## Benchmarking Results

### Predict Country value

In [39]:
# Compute MAE and MSE for predictions
### THIS IS USED IN THESIS ###
mae_country_value = np.mean(abs(country_predictions['country_value']-country_predictions['nuts_value']))
mse_country_value = np.mean((country_predictions['country_value']-country_predictions['nuts_value'])**2)
print('MAE: ', mae_country_value)
print('MSE: ', mse_country_value)

MAE:  5835.815602836879
MSE:  160428421.9858156


### Predict values with total mean

In [40]:
# Compute MAE and MSE for predictions
mae_total_value = np.mean(abs(np.mean(country_predictions['nuts_value'])-country_predictions['nuts_value']))
mse_total_value = np.mean((np.mean(country_predictions['nuts_value'])-country_predictions['nuts_value'])**2)
print('MAE: ', mae_total_value)
print('MSE: ', mse_total_value)

MAE:  12063.273351441076
MSE:  324876751.9239475


## Predict values by the mean of regions per country and year

In [41]:
# Average GDP per country from the values of regions
mean_predictions = country_predictions.groupby(['code','year'])['nuts_value'].mean().reset_index()
mean_predictions.head()

Unnamed: 0,code,year,nuts_value
0,AL,2014,3400.0
1,AL,2015,3500.0
2,AL,2016,3633.333333
3,AL,2017,3933.333333
4,AT,2014,38166.666667


In [42]:
# Merge with region and year
mean_predictions = country_predictions.merge(mean_predictions, left_on=['code','year'], right_on=['code','year'])
mean_predictions.columns = ['nuts2','year','nuts_value','code','country','country_value','mean_value']
mean_predictions.head()

Unnamed: 0,nuts2,year,nuts_value,code,country,country_value,mean_value
0,BG31,2014,3800.0,BG,Bulgaria,5900.0,5266.666667
1,BG32,2014,4300.0,BG,Bulgaria,5900.0,5266.666667
2,BG33,2014,5000.0,BG,Bulgaria,5900.0,5266.666667
3,BG34,2014,5000.0,BG,Bulgaria,5900.0,5266.666667
4,BG41,2014,9500.0,BG,Bulgaria,5900.0,5266.666667


In [43]:
# Compute MAE and MSE for predictions
mae_mean_value = np.mean(abs(mean_predictions['nuts_value']-mean_predictions['mean_value']))
mse_mean_value = np.mean((mean_predictions['nuts_value']-mean_predictions['mean_value'])**2)
print('MAE: ', mae_mean_value)
print('MSE: ', mse_mean_value)

MAE:  5312.380800070733
MSE:  155685089.24860018


## Add Relative GDP to table

In [44]:
# GDP NUTS Value - GDP Country Value
mean_predictions['nuts_diff'] = mean_predictions['nuts_value']-mean_predictions['country_value']
mean_predictions.head()

Unnamed: 0,nuts2,year,nuts_value,code,country,country_value,mean_value,nuts_diff
0,BG31,2014,3800.0,BG,Bulgaria,5900.0,5266.666667,-2100.0
1,BG32,2014,4300.0,BG,Bulgaria,5900.0,5266.666667,-1600.0
2,BG33,2014,5000.0,BG,Bulgaria,5900.0,5266.666667,-900.0
3,BG34,2014,5000.0,BG,Bulgaria,5900.0,5266.666667,-900.0
4,BG41,2014,9500.0,BG,Bulgaria,5900.0,5266.666667,3600.0


## Add another column with ppp value

### NOT USED IN THE THESIS

In [45]:
print(ppp_long.shape)
print(mean_predictions.shape)

(2827, 3)
(1128, 8)


In [46]:
# Convert to numeric values
ppp_long['variable'] = pd.to_numeric(ppp_long['variable'])

In [47]:
# Merge with other GDP values
enhanced_gdp = mean_predictions.merge(ppp_long, how='left', left_on=['nuts2','year'], right_on=['geo','variable'])
print(enhanced_gdp.head)
print(enhanced_gdp.shape)

<bound method NDFrame.head of      nuts2  year  nuts_value code   country  country_value   mean_value  \
0     BG31  2014      3800.0   BG  Bulgaria         5900.0  5266.666667   
1     BG32  2014      4300.0   BG  Bulgaria         5900.0  5266.666667   
2     BG33  2014      5000.0   BG  Bulgaria         5900.0  5266.666667   
3     BG34  2014      5000.0   BG  Bulgaria         5900.0  5266.666667   
4     BG41  2014      9500.0   BG  Bulgaria         5900.0  5266.666667   
...    ...   ...         ...  ...       ...            ...          ...   
1123  TRB1  2017      5600.0   TR    Turkey         9400.0  7665.384615   
1124  TRB2  2017      3700.0   TR    Turkey         9400.0  7665.384615   
1125  TRC1  2017      6200.0   TR    Turkey         9400.0  7665.384615   
1126  TRC2  2017      3900.0   TR    Turkey         9400.0  7665.384615   
1127  TRC3  2017      4500.0   TR    Turkey         9400.0  7665.384615   

      nuts_diff   geo  variable  value  
0       -2100.0  BG31      2

In [48]:
# Reshape and rename to clean format
enhanced_gdp = enhanced_gdp.drop(columns = ['geo','variable'])
enhanced_gdp.columns = ['nuts2', 'year', 'nuts_value', 'code', 'country', 'country_value',
       'mean_value','nuts_diff','ppp']
print(enhanced_gdp.head())

nuts2  year  nuts_value code   country  country_value   mean_value  \
0  BG31  2014      3800.0   BG  Bulgaria         5900.0  5266.666667   
1  BG32  2014      4300.0   BG  Bulgaria         5900.0  5266.666667   
2  BG33  2014      5000.0   BG  Bulgaria         5900.0  5266.666667   
3  BG34  2014      5000.0   BG  Bulgaria         5900.0  5266.666667   
4  BG41  2014      9500.0   BG  Bulgaria         5900.0  5266.666667   

   nuts_diff   ppp  
0    -2100.0  30.0  
1    -1600.0  34.0  
2     -900.0  40.0  
3     -900.0  40.0  
4     3600.0  75.0  


In [49]:
# Save
enhanced_gdp.to_csv("/Users/maxbehrens/Documents/Msc/Thesis/Data/gdp_data/enhanced_gdp_data.csv", index=False)