In [28]:
# preliminaries

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from numpy import loadtxt
from numpy import nan
from numpy import isnan
from numpy import count_nonzero
from numpy import unique
from numpy import array
from numpy import nanmedian
from numpy import save
from sktime.classification.interval_based import TimeSeriesForestClassifier

import numpy as np
import pandas as pd

# load data and fix issues w original

weather = pd.read_csv('madweather.csv')
del weather['Unnamed: 0']
weather.head()
#weather.dtypes

# change data from object to datetime
weather['DATE'] = pd.to_datetime(weather.DATE, format = '%Y/%m')
data = weather.drop(['DATE'], axis = 1)
data.index = pd.DatetimeIndex(weather.DATE).to_period('m')


data = data.fillna(method="ffill")
data = data.iloc[0:,0:3]
data

Unnamed: 0_level_0,PRCP,SNOW,TAVG
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1900-01,0.69,4.5,23.7
1900-02,1.26,9.1,13.0
1900-03,1.33,12.5,25.1
1900-04,1.31,5.0,48.0
1900-05,1.87,0.0,60.5
...,...,...,...
2019-09,5.19,0.0,65.4
2019-10,5.98,5.6,48.3
2019-11,3.17,9.2,29.4
2019-12,1.75,1.2,29.0


In [25]:
values = data.values
# get list of unique chunk identifiers
chunk_ids = unique(values[:,1])

chunks = dict()


# split the dataset by 'chunkID', return a list of chunks
def to_chunks(values, chunk_ix=0):
	chunks = list()
	# get the unique chunk ids
	chunk_ids = unique(values[:, chunk_ix])
	# group rows by chunk id
	for chunk_id in chunk_ids:
		selection = values[:, chunk_ix] == chunk_id
		chunks.append(values[selection, :])
	return chunks
 
# return a list of relative forecast lead times
def get_lead_times():
	return [1, 2, 3, 4, 5, 10, 17, 24, 48, 72]
 
# interpolate series of hours (in place) in 24 hour time
def interpolate_hours(hours):
	# find the first hour
	ix = -1
	for i in range(len(hours)):
		if not isnan(hours[i]):
			ix = i
			break
	# fill-forward
	hour = hours[ix]
	for i in range(ix+1, len(hours)):
		# increment hour
		hour += 1
		# check for a fill
		if isnan(hours[i]):
			hours[i] = hour % 24
	# fill-backward
	hour = hours[ix]
	for i in range(ix-1, -1, -1):
		# decrement hour
		hour -= 1
		# check for a fill
		if isnan(hours[i]):
			hours[i] = hour % 24
 
# return true if the array has any non-nan values
def has_data(data):
	return count_nonzero(isnan(data)) < len(data)
 
# impute missing data
def impute_missing(train_chunks, rows, hours, series, col_ix):
	# impute missing using the median value for hour in all series
	imputed = list()
	for i in range(len(series)):
		if isnan(series[i]):
			# collect all rows across all chunks for the hour
			all_rows = list()
			for rows in train_chunks:
				[all_rows.append(row) for row in rows[rows[:,2]==hours[i]]]
			# calculate the central tendency for target
			all_rows = array(all_rows)
			# fill with median value
			value = nanmedian(all_rows[:, col_ix])
			if isnan(value):
				value = 0.0
			imputed.append(value)
		else:
			imputed.append(series[i])
	return imputed
 
# layout a variable with breaks in the data for missing positions
def variable_to_series(chunk_train, col_ix, n_steps=5*24):
	# lay out whole series
	data = [nan for _ in range(n_steps)]
	# mark all available data
	for i in range(len(chunk_train)):
		# get position in chunk
		position = int(chunk_train[i, 1] - 1)
		# store data
		data[position] = chunk_train[i, col_ix]
	return data
 
# created input/output patterns from a sequence
def supervised_for_lead_time(series, n_lag, lead_time):
	samples = list()
	# enumerate observations and create input/output patterns
	for i in range(n_lag, len(series)):
		end_ix = i + (lead_time - 1)
		# check if can create a pattern
		if end_ix >= len(series):
			break
		# retrieve input and output
		start_ix = i - n_lag
		row = series[start_ix:i] + [series[end_ix]]
		samples.append(row)
	return samples
 
# create supervised learning data for each lead time for this target
def target_to_supervised(chunks, rows, hours, col_ix, n_lag):
	train_lead_times = list()
	# get series
	series = variable_to_series(rows, col_ix)
	if not has_data(series):
		return None, [nan for _ in range(n_lag)]
	# impute
	imputed = impute_missing(chunks, rows, hours, series, col_ix)
	# prepare test sample for chunk-variable
	test_sample = array(imputed[-n_lag:])
	# enumerate lead times
	lead_times = get_lead_times()
	for lead_time in lead_times:
		# make input/output data from series
		train_samples = supervised_for_lead_time(imputed, n_lag, lead_time)
		train_lead_times.append(train_samples)
	return train_lead_times, test_sample
 
# prepare training [var][lead time][sample] and test [chunk][var][sample]
def data_prep(chunks, n_lag, n_vars=39):
	lead_times = get_lead_times()
	train_data = [[list() for _ in range(len(lead_times))] for _ in range(n_vars)]
	test_data = [[list() for _ in range(n_vars)] for _ in range(len(chunks))]
	# enumerate targets for chunk
	for var in range(n_vars):
		# convert target number into column number
		col_ix = 3 + var
		# enumerate chunks to forecast
		for c_id in range(len(chunks)):
			rows = chunks[c_id]
			# prepare sequence of hours for the chunk
			hours = variable_to_series(rows, 2)
			# interpolate hours
			interpolate_hours(hours)
			# check for no data
			if not has_data(rows[:, col_ix]):
				continue
			# convert series into training data for each lead time
			train, test_sample = target_to_supervised(chunks, rows, hours, col_ix, n_lag)
			# store test sample for this var-chunk
			test_data[c_id][var] = test_sample
			if train is not None:
				# store samples per lead time
				for lead_time in range(len(lead_times)):
					# add all rows to the existing list of rows
					train_data[var][lead_time].extend(train[lead_time])
		# convert all rows for each var-lead time to a numpy array
		for lead_time in range(len(lead_times)):
			train_data[var][lead_time] = array(train_data[var][lead_time])
	return array(train_data), array(test_data)



In [26]:
data.to_numpy
data.iloc[:int(data.shape[0]*0.7)].values

array([[ 0.69,  4.5 , 23.7 ],
       [ 1.26,  9.1 , 13.  ],
       [ 1.33, 12.5 , 25.1 ],
       ...,
       [ 0.  ,  0.1 , 23.6 ],
       [ 0.  ,  0.  , 23.6 ],
       [ 0.  ,  0.  , 23.6 ]])

In [31]:
# split into test and train
from sklearn.model_selection import train_test_split
train = data.iloc[:int(data.shape[0]*0.7)].values
test = data.iloc[int(data.shape[0]*0.7):].values

# group data by chunks
values = data.values
chunks = to_chunks(values)

# group data by chunks
train_chunks = to_chunks(train)
test_chunks = to_chunks(test)

# convert training data into supervised learning data
n_lag = 12
train_data, test_data = data_prep(train_chunks, n_lag)
print(train_data.shape, test_data.shape)
# save train and test sets to file
save('weatherPrediction/supervised_train.npy', train_data)
save('weatherPrediction/supervised_test.npy', test_data)

IndexError: index 3 is out of bounds for axis 1 with size 3

Unnamed: 0_level_0,PRCP,SNOW,TAVG,TMAX,TMIN
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1900-01,0.69,4.5,23.7,30.8,16.6
1900-02,1.26,9.1,13.0,21.5,4.5
1900-03,1.33,12.5,25.1,32.3,17.9
1900-04,1.31,5.0,48.0,57.8,38.3
1900-05,1.87,0.0,60.5,70.9,50.2
