In [1]:
import numpy as np 
import matplotlib.pyplot as plt 
from sklearn.preprocessing import PolynomialFeatures
from sklearn import linear_model
#from sklearn.model_selection import training_test_split
#from sklean
import os 
import pandas as pd

In [2]:
basepath='./training_set/DesignMatrices/'
fnames = [x for x in sorted(os.listdir(basepath)) if x[0]!='.']
#X, supply, demand = None, None, None
Xs, supplies, demands = [], [], []
for fname in fnames: 
    table = pd.read_csv(basepath+fname)
    supplies.append(table['Supply'].values)
    demands.append(table['Demand'].values)
    Xs.append(table.drop(['Supply', 'Demand'], axis=1).values)
    
supply = np.concatenate(supplies)
demand = np.concatenate(demands)
X = np.concatenate(Xs)

regions = np.unique(X[:,0])
timeslots = np.unique(X[:,1])

supply.shape, demand.shape, X.shape 

((142880,), (142880,), (142880, 5))

In [6]:
def SplitByRegion(X, supply, demand, splitfrac):
    Xtrain, supplyTrain, demandTrain = [], [], []
    Xval, supplyVal, demandVal = [], [], []
    
    for r in regions: 
        Xr, supplyr, demandr = X[X[:, 0]==r], supply[X[:, 0]==r], demand[X[:, 0]==r]
        
        randidx = np.random.choice(Xr.shape[0], Xr.shape[0], replace=False)
        splitidx = int(splitfrac * Xr.shape[0])
        
        Xtrain.append(Xr[randidx[:splitidx]])
        supplyTrain.append(supplyr[randidx[:splitidx]])
        demandTrain.append(demandr[randidx[:splitidx]])
        
        Xval.append(Xr[randidx[splitidx:]])
        supplyVal.append(supplyr[randidx[splitidx:]])
        demandVal.append(demandr[randidx[splitidx:]])
        
    Xtrain, supplyTrain, demandTrain = np.concatenate(Xtrain), np.concatenate(supplyTrain), np.concatenate(demandTrain)
    Xval, supplyVal, demandVal = np.concatenate(Xval), np.concatenate(supplyVal), np.concatenate(demandVal)
    
    return Xtrain, supplyTrain, demandTrain, Xval, supplyVal, demandVal

In [7]:
Xtrain, supplyTrain, demandTrain, Xval, supplyVal, demandVal = SplitByRegion(X, supply, demand, .7)

In [9]:
print 'Xtrain = {}, demandTrain = {}, supplyTrain = {}'.format(Xtrain.shape, demandTrain.shape, supplyTrain.shape)
print 'Xval = {}, demandVal = {}, supplyVal = {}'.format(Xval.shape, demandVal.shape, supplyVal.shape)

Xtrain = (99981, 5), demandTrain = (99981,), supplyTrain = (99981,)
Xval = (42899, 5), demandVal = (42899,), supplyVal = (42899,)


In [10]:
def getMAE(truths, preds): 
    return (np.mean(np.abs(truths-preds)))

In [11]:
for d in xrange(1,7):
    #print 'training for '.format(d)
    Xnew = PolynomialFeatures(degree=d).fit_transform(Xtrain)
    model = linear_model.LinearRegression()
    model = model.fit(Xnew, demandTrain-supplyTrain)
    
    preds=model.predict(PolynomialFeatures(degree=d).fit_transform(Xval))
    print 'degree = {}\tMAE = {}'.format(d, getMAE(demandVal-supplyVal, preds))

degree = 1	MAE = 12.4183165364
degree = 2	MAE = 12.3706463812
degree = 3	MAE = 12.6758991648
degree = 4	MAE = 12.9602037108
degree = 5	MAE = 13.1210397741
degree = 6	MAE = 15.24857564
