https://machinelearningmastery.com/develop-first-xgboost-model-python-scikit-learn/

### Load packages

In [1]:
from math import sqrt
from numpy import concatenate
from matplotlib import pyplot
from pandas import read_csv
from pandas import DataFrame
from pandas import concat
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [42]:
from pandas import set_option
set_option('display.max_columns', None)

### Define functions

In [2]:
# convert series to supervised learning
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
	n_vars = 1 if type(data) is list else data.shape[1]
	df = DataFrame(data)
	cols, names = list(), list()
	# input sequence (t-n, ... t-1)
	for i in range(n_in, 0, -1):
		cols.append(df.shift(i))
		names += [('var%d(t-%d)' % (j+1, i)) for j in range(n_vars)]
	# forecast sequence (t, t+1, ... t+n)
	for i in range(0, n_out):
		cols.append(df.shift(-i))
		if i == 0:
			names += [('var%d(t)' % (j+1)) for j in range(n_vars)]
		else:
			names += [('var%d(t+%d)' % (j+1, i)) for j in range(n_vars)]
	# put it all together
	agg = concat(cols, axis=1)
	agg.columns = names
	# drop rows with NaN values
	if dropnan:
		agg.dropna(inplace=True)
	return agg

### Load dataset

In [31]:
# Load dataset
dataset_all = read_csv('../data/df_clean.csv', header=0)

In [32]:
# Subset to one country
dataset = dataset_all[dataset_all.Country == 'Japan'].sort_values('Year').reset_index().iloc[:,3:]

In [45]:
dataset.shape

(23, 21)

In [35]:
values = dataset.values

In [10]:
# ensure all data is float
values = values.astype('float32')
# normalize features
scaler = MinMaxScaler(feature_range=(0, 1))
scaled = scaler.fit_transform(values)

In [38]:
# specify the number of lag years
n_years_lag = 1
n_features = scaled.shape[1]-1
# frame as supervised learning
reframed = series_to_supervised(values, n_years_lag, 1)
print(reframed.shape)

(22, 42)


In [44]:
reframed

Unnamed: 0,var1(t-1),var2(t-1),var3(t-1),var4(t-1),var5(t-1),var6(t-1),var7(t-1),var8(t-1),var9(t-1),var10(t-1),var11(t-1),var12(t-1),var13(t-1),var14(t-1),var15(t-1),var16(t-1),var17(t-1),var18(t-1),var19(t-1),var20(t-1),var21(t-1),var1(t),var2(t),var3(t),var4(t),var5(t),var6(t),var7(t),var8(t),var9(t),var10(t),var11(t),var12(t),var13(t),var14(t),var15(t),var16(t),var17(t),var18(t),var19(t),var20(t),var21(t)
1,13.590583,1082.6554,1331.1099,2180.5838,2356.9458,564.8382,443.9216,2600.5047,5883.0587,0.0,247.7112,0.0,199.8423,0.0,0.0,123537000.0,1967.420993,1.54,65.56374,23.181523,249500.0,12.783167,1088.2812,1361.4418,2313.0267,2344.4087,608.9425,483.4684,2609.8994,5888.392,0.0,242.3806,0.0,206.8938,0.0,0.0,123921000.0,2051.952488,1.53,64.449997,23.693268,249426.0
2,12.783167,1088.2812,1361.4418,2313.0267,2344.4087,608.9425,483.4684,2609.8994,5888.392,0.0,242.3806,0.0,206.8938,0.0,0.0,123921000.0,2051.952488,1.53,64.449997,23.693268,249426.0,12.48425,1098.1753,1396.1105,2435.1576,2182.1631,630.343,502.827,2575.3073,5949.2685,0.0,238.8828,0.0,210.5758,0.0,0.0,124229000.0,2166.038128,1.502,65.152792,24.68432,249352.0
3,12.48425,1098.1753,1396.1105,2435.1576,2182.1631,630.343,502.827,2575.3073,5949.2685,0.0,238.8828,0.0,210.5758,0.0,0.0,124229000.0,2166.038128,1.502,65.152792,24.68432,249352.0,12.055167,1089.8927,1416.2305,2669.3172,2012.0671,653.2915,504.7714,2568.934,6054.6914,0.0,246.484,0.0,217.1407,0.0,0.0,124536000.0,2294.106358,1.458,61.273173,27.2723,249278.0
4,12.055167,1089.8927,1416.2305,2669.3172,2012.0671,653.2915,504.7714,2568.934,6054.6914,0.0,246.484,0.0,217.1407,0.0,0.0,124536000.0,2294.106358,1.458,61.273173,27.2723,249278.0,13.22975,1140.8671,1493.425,2795.5539,1813.8055,688.6269,541.0427,2573.8024,6143.4741,0.0,252.0788,0.0,224.9406,0.0,0.0,124961000.0,2481.28292,1.5,64.390485,27.616489,249204.0
5,13.22975,1140.8671,1493.425,2795.5539,1813.8055,688.6269,541.0427,2573.8024,6143.4741,0.0,252.0788,0.0,224.9406,0.0,0.0,124961000.0,2481.28292,1.5,64.390485,27.616489,249204.0,12.475417,1152.8119,1537.4246,3124.2287,1791.3193,743.8814,562.8012,2587.9991,6206.6392,0.0,264.2714,0.0,243.7702,0.0,0.0,125439000.0,2655.748233,1.422,61.452946,29.130731,249130.0
6,12.475417,1152.8119,1537.4246,3124.2287,1791.3193,743.8814,562.8012,2587.9991,6206.6392,0.0,264.2714,0.0,243.7702,0.0,0.0,125439000.0,2655.748233,1.422,61.452946,29.130731,249130.0,12.082833,1167.7661,1578.3839,3116.8525,1771.5196,782.3093,581.4807,2697.0634,6395.4411,0.0,274.0647,0.0,265.9245,0.0,0.0,125757000.0,2839.747068,1.425,61.199215,29.654576,249056.0
7,12.082833,1167.7661,1578.3839,3116.8525,1771.5196,782.3093,581.4807,2697.0634,6395.4411,0.0,274.0647,0.0,265.9245,0.0,0.0,125757000.0,2839.747068,1.425,61.199215,29.654576,249056.0,12.752417,1160.3747,1592.5605,3161.5961,1687.1812,825.2656,600.1348,2639.2896,6581.5856,0.0,289.0099,0.0,284.8526,0.0,0.0,126057000.0,3034.897078,1.388,59.517549,30.628368,248982.0
8,12.752417,1160.3747,1592.5605,3161.5961,1687.1812,825.2656,600.1348,2639.2896,6581.5856,0.0,289.0099,0.0,284.8526,0.0,0.0,126057000.0,3034.897078,1.388,59.517549,30.628368,248982.0,13.24475,1126.3275,1492.8235,3266.7809,1682.8814,840.6924,622.6385,2633.7315,6648.3163,0.0,296.5964,0.0,294.0935,0.0,0.0,126400000.0,3155.3099,1.384,58.33315,31.732799,248908.0
9,13.24475,1126.3275,1492.8235,3266.7809,1682.8814,840.6924,622.6385,2633.7315,6648.3163,0.0,296.5964,0.0,294.0935,0.0,0.0,126400000.0,3155.3099,1.384,58.33315,31.732799,248908.0,13.080083,1165.9829,1567.9684,3185.4882,1740.3991,905.4439,661.9925,2603.0604,6662.15,0.0,278.5529,0.0,303.8107,0.0,0.0,126631000.0,3385.848708,1.342,61.123206,29.647275,248834.0
10,13.080083,1165.9829,1567.9684,3185.4882,1740.3991,905.4439,661.9925,2603.0604,6662.15,0.0,278.5529,0.0,303.8107,0.0,0.0,126631000.0,3385.848708,1.342,61.123206,29.647275,248834.0,12.833833,1181.6503,1638.902,3383.6875,1768.2282,942.1554,682.3017,2651.1982,6892.9756,0.0,284.81,0.0,315.6212,0.0,0.0,126843000.0,3582.186705,1.359,61.277631,29.597589,248760.0


In [48]:
# split into train and test sets
values = reframed.values
n_years = 21
train = values[:-n_years_test, :]
test = values[-n_years_test:, :]

In [29]:
len(train)

21

In [25]:
# split into input and outputs
train_X, train_y = train[:, 1:], train[:, 0]
test_X, test_y = test[:, 1:], test[:, 0]
print(train_X.shape, len(train_X), train_y.shape)

(21, 41) 21 (21,)


In [50]:
train_X.shape[0]

21

In [49]:
# reshape input to be 3D [samples, timesteps, features]
train_X = train_X.reshape((train_X.shape[0], n_years, n_features))
test_X = test_X.reshape((test_X.shape[0], n_years, n_features))
print(train_X.shape, train_y.shape, test_X.shape, test_y.shape)

ValueError: cannot reshape array of size 861 into shape (21,21,20)

In [None]:
# design network
model = Sequential()
model.add(LSTM(50, input_shape=(train_X.shape[1], train_X.shape[2])))
model.add(Dense(1))
model.compile(loss='mae', optimizer='adam')
# fit network
history = model.fit(train_X, train_y, epochs=50, batch_size=72, validation_data=(test_X, test_y), verbose=2, shuffle=False)
# plot history
pyplot.plot(history.history['loss'], label='train')
pyplot.plot(history.history['val_loss'], label='test')
pyplot.legend()
pyplot.show()

In [None]:
# make a prediction
yhat = model.predict(test_X)
test_X = test_X.reshape((test_X.shape[0], n_hours*n_features))
# invert scaling for forecast
inv_yhat = concatenate((yhat, test_X[:, -7:]), axis=1)
inv_yhat = scaler.inverse_transform(inv_yhat)
inv_yhat = inv_yhat[:,0]
# invert scaling for actual
test_y = test_y.reshape((len(test_y), 1))
inv_y = concatenate((test_y, test_X[:, -7:]), axis=1)
inv_y = scaler.inverse_transform(inv_y)
inv_y = inv_y[:,0]
# calculate RMSE
rmse = sqrt(mean_squared_error(inv_y, inv_yhat))
print('Test RMSE: %.3f' % rmse)