In [10]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score as r2
%matplotlib qt

def sortTuples(tup):
    '''Sorts a list of tuples in place, ascending by each tuple's second element, and returns that same list'''
    def secondElement(pair):
        return pair[1]

    tup.sort(key = secondElement)
    return tup

def createBarChart(xList, yList, title):
    '''Draws a horizontal bar chart in a new figure: one bar per label in xList
    (placed along the y axis), with bar length given by the matching value in
    yList (measured along the x axis, labelled "Count")'''
    _, axes = plt.subplots()
    axes.set_title(title)
    axes.set_xlabel("Count")
    axes.grid(axis = 'y', zorder = 0) #horizontal grid lines, drawn underneath the bars
    axes.barh(xList, yList, zorder = 3) #higher zorder keeps the bars on top of the grid lines
    plt.xticks(fontsize = 10)
    plt.show(block = False) #non-blocking so the caller keeps running

def createLineGraph(xList, yList, title, xTitle):
    '''Plots yList against xList as a line with circular markers in a new figure.
    The x axis is labelled with xTitle, the y axis is always labelled "r^2 value",
    and the figure is shown without blocking.'''
    _, axes = plt.subplots()
    axes.set_title(title, fontsize=20)
    axes.set_xlabel(xTitle, fontsize=16)
    axes.set_ylabel("r^2 value", fontsize=16)
    axes.plot(xList, yList, marker = 'o')
    axes.grid()
    plt.show(block = False)

def analyticSolution(X, y):
    '''Returns the analytic least-squares solution w to X*w=y,
    computed from the normal equations as pinv(X^T X) . (X^T y)'''
    gramPseudoInverse = np.linalg.pinv(np.dot(X.T, X))
    projectedTargets = np.dot(X.T, y)
    return np.dot(gramPseudoInverse, projectedTargets)

def findBestFeatures(X, Y, frequencyDict, featureList, numBest):
    '''Sequential Forward Selection of Best Features

    For every fold of a shuffled 5-fold split, greedily picks up to numBest
    features: at each step every column that has not been picked yet in this
    fold is tried together with the columns picked so far, a least-squares model
    is fit on the training rows, and the column giving the highest r^2 on the
    held-out rows is kept. Each pick increments that feature's count in
    frequencyDict.

    Parameters:
        X             -- 2D numpy array whose columns are ordered like featureList
        Y             -- numpy array of targets, row-aligned with X
        frequencyDict -- dict of feature name -> count; must already hold a key for
                         every name in featureList and is updated in place
        featureList   -- sequence of feature names, one per column of X
        numBest       -- number of features to pick per fold (capped at the number
                         of available features)

    Side effect: shows a bar chart of the resulting counts.

    Returns a list of (feature name, count) tuples ordered from the most to the
    least frequently picked feature.
    '''
    kFold = KFold(n_splits = 5, shuffle = True)
    for trainingIndicies, testingIndicies in kFold.split(X):
        X_train, X_test = X[trainingIndicies], X[testingIndicies]
        Y_train, Y_test = Y[trainingIndicies], Y[testingIndicies]
        goodCols = [] #column indices picked so far in this fold
        for _ in range(numBest):
            tupList = []
            for i in range( len(featureList) ):
                #a column that is already selected must not be offered again,
                #otherwise it could be picked (and counted) twice in the same fold
                if i in goodCols:
                    continue
                #finding r2 value after addition of a feature
                candidateCols = goodCols + [i]
                weightsAnalytic = analyticSolution(X_train[:, candidateCols], Y_train)
                r2_value = r2(Y_test, X_test[:, candidateCols].dot( weightsAnalytic ))
                tupList.append( (i, r2_value) )

            if not tupList:
                break #every feature is already selected; nothing left to add

            #ascending sort, so the last entry is the candidate with the highest r2 value
            bestCol = sortTuples(tupList)[-1][0]
            frequencyDict[featureList[bestCol]] += 1 #increase count since a next best feature was found
            goodCols.append(bestCol)

    tups = sortTuples(list(frequencyDict.items())) #lowest to greatest count

    createBarChart([x[0] for x in tups], [x[1] for x in tups], "Frequency of Best Features")

    return tups[::-1] #greatest to lowest count

def SFS(X_reg, Y_reg, frequencyDict, indexDict, regFeatureList, numBest = 4, nSplits = 5):
    '''Sequential Forward Selection

    Ranks the features with findBestFeatures, then for k = 1 .. number of ranked
    features fits a least-squares model on the k top-ranked columns and reports
    the r^2 averaged over a shuffled K-fold split. Prints one line per k, prints
    the final ranking and shows a line graph of average r^2 against k.

    Parameters:
        X_reg          -- 2D numpy array of inputs
        Y_reg          -- numpy array of targets, row-aligned with X_reg
        frequencyDict  -- dict of feature name -> count, passed on to findBestFeatures
                          (which updates it in place)
        indexDict      -- dict of feature name -> column index in X_reg
        regFeatureList -- sequence of feature names, passed on to findBestFeatures
        numBest        -- how many features findBestFeatures picks per fold (default 4)
        nSplits        -- number of folds used here to score each feature count
                          (default 5); the splitting done during ranking is decided
                          inside findBestFeatures

    Returns None.
    '''
    #gets a list of best features as a tuple with (feature name, occurence)
    bestFeats = findBestFeatures(X_reg, Y_reg, frequencyDict, regFeatureList, numBest)

    #column numbers of the features, in ranking order; looked up once instead of per feature count
    rankedCols = [indexDict[name] for name, _ in bestFeats]

    #finding avg r^2 value on set for diff feature size
    averageVals = []
    xList = list(range(1, len(bestFeats) + 1))
    kFold = KFold(n_splits = nSplits, shuffle = True)
    for numFeats in xList:
        #getting the best dataset, using numFeats best features
        X_best_reg = X_reg[:, rankedCols[:numFeats]]

        foldScores = []
        for trainingIndicies, testingIndicies in kFold.split(X_best_reg):
            X_train, X_test = X_best_reg[trainingIndicies], X_best_reg[testingIndicies]
            Y_train, Y_test = Y_reg[trainingIndicies], Y_reg[testingIndicies]
            weightsAnalytic = analyticSolution(X_train, Y_train)
            foldScores.append( r2(Y_test, X_test.dot( weightsAnalytic )) )

        #average over the folds that were actually run, not a hard-coded fold count
        avgr2 = sum(foldScores) / len(foldScores)
        print("The average r2 value is:", round(avgr2, 4), "Using", numFeats, "best feature(s)")
        averageVals.append(avgr2)

    for rank, (name, _) in enumerate(bestFeats, start = 1):
        print(str(rank) + ":", name)

    createLineGraph(xList, averageVals, "Features Used vs R^2 Value", "Feature Count")


In [12]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """Frame a time series as a table of lagged and forecast columns.

    Parameters:
        data    -- anything pd.DataFrame() accepts: a 1-D sequence (one variable),
                   a list of rows, a 2-D array or a DataFrame (one column per variable)
        n_in    -- number of lag steps to include as columns (t-n_in, ..., t-1)
        n_out   -- number of forecast steps to include as columns (t, t+1, ..., t+n_out-1)
        dropnan -- when True, drop the rows that contain NaN after shifting

    Returns a DataFrame whose columns are named 'var<j>(t-<i>)', 'var<j>(t)' and
    'var<j>(t+<i>)', where j is the 1-based position of the variable in `data`.
    """
    df = pd.DataFrame(data)
    # Take the variable count from the frame itself so that lists of rows,
    # 1-D arrays and Series are handled the same way as 2-D arrays.
    n_vars = df.shape[1]
    cols, names = [], []
    # input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
        names += ['var%d(t-%d)' % (j + 1, i) for j in range(n_vars)]
    # forecast sequence (t, t+1, ... t+n)
    for i in range(n_out):
        cols.append(df.shift(-i))
        if i == 0:
            names += ['var%d(t)' % (j + 1) for j in range(n_vars)]
        else:
            names += ['var%d(t+%d)' % (j + 1, i) for j in range(n_vars)]
    # put it all together
    agg = pd.concat(cols, axis=1)
    agg.columns = names
    # drop rows with NaN values
    if dropnan:
        agg = agg.dropna()
    return agg

# configuration for this cell
LAG_STEPS = 24      # how many rows back the input features are taken from
RANDOM_SEED = 42    # seeds NumPy's global generator, which KFold(shuffle=True) draws from when no random_state is given

# reading data and transforming it to have a time lag
df = pd.read_csv("Train and Test Data.csv")
df.drop(columns=["DateTime"], inplace=True)
features = list(df.columns)
data = df.values
transformedDf = series_to_supervised(data, LAG_STEPS, 1)

# specifying columns we want: every variable at t-LAG_STEPS as inputs, plus the
# first variable at time t as the value to predict
cols = np.array(transformedDf.columns)
desiredCols = cols[:len(features)]                              # the t-LAG_STEPS block
desiredCols = np.append(desiredCols, cols[-1*len(features)])    # first column of the (t) block
removeCols = np.array([x for x in cols if x not in desiredCols])
transformedDf.drop(columns=removeCols, inplace=True)

# scaling dataset and preparing it for linear regression
# NOTE(review): the scaler is fit on every row before any cross-validation split,
# so held-out folds influence the scaling -- consider fitting it per fold.
scaler = MinMaxScaler()
data = transformedDf.values
data = scaler.fit_transform(data)
bias = np.ones((data.shape[0], 1))      # adding in a bias
data = np.hstack((bias, data))

y = data[:, [-1]]                       # last column (the target) as an (n, 1) copy
X = np.delete(data, -1, axis=1)

# prepping lists for SFS function
allFeatures = np.array(df.columns) + [" (t-%d)" % LAG_STEPS]   # renaming from var_i to actual feature names
allFeatures = np.insert(allFeatures, 0, ["Bias"], axis=0)      # adding bias to beginning of features
freqencyDict = {feat: 0 for feat in allFeatures}               # selection count per feature, filled in by SFS
indexDict = {feat: i for i, feat in enumerate(allFeatures)}    # feature name -> column index in X

np.random.seed(RANDOM_SEED)             # make the shuffled folds repeatable on re-run
SFS(X, y, freqencyDict, indexDict, allFeatures)

The average r2 value is: 0.6836 Using 1 best feature(s)
The average r2 value is: 0.7058 Using 2 best feature(s)
The average r2 value is: 0.7133 Using 3 best feature(s)
The average r2 value is: 0.7185 Using 4 best feature(s)
The average r2 value is: 0.7304 Using 5 best feature(s)
The average r2 value is: 0.7333 Using 6 best feature(s)
The average r2 value is: 0.7338 Using 7 best feature(s)
The average r2 value is: 0.7343 Using 8 best feature(s)
The average r2 value is: 0.7349 Using 9 best feature(s)
The average r2 value is: 0.735 Using 10 best feature(s)
The average r2 value is: 0.7361 Using 11 best feature(s)
The average r2 value is: 0.7369 Using 12 best feature(s)
The average r2 value is: 0.7372 Using 13 best feature(s)
The average r2 value is: 0.7402 Using 14 best feature(s)
The average r2 value is: 0.741 Using 15 best feature(s)
The average r2 value is: 0.7446 Using 16 best feature(s)
The average r2 value is: 0.7447 Using 17 best feature(s)
The average r2 value is: 0.7487 Using 18 b