In [12]:
import time
import glob
import numpy as np
import pandas as pd
import gplearn as gpl        
from gplearn.genetic import SymbolicRegressor

---
## Get data
### Ground Truth

In [13]:
data = []
for player in glob.glob('../LSTM-Neural-Network-for-Time-Series-Prediction/data/*.csv'):
    tmp = pd.read_csv(player)
    data.append(tmp)
data = pd.concat(data)

In [14]:
data.head()

Unnamed: 0.1,Unnamed: 0,Game_Id,Team,Player,Player_Id,Date,Goal,First_Assist,Second_Assist,Total_Points
0,412,2013_20014,VAN,RYAN KESLER,8470616,2013-10-03 00:00:00,0.0,0.0,0.0,0.0
1,1024,2013_20030,VAN,RYAN KESLER,8470616,2013-10-05 00:00:00,1.0,0.0,0.0,1.0
2,1174,2013_20034,VAN,RYAN KESLER,8470616,2013-10-06 00:00:00,0.0,0.0,0.0,0.0
3,1520,2013_20043,VAN,RYAN KESLER,8470616,2013-10-08 00:00:00,0.0,0.0,0.0,0.0
4,2016,2013_20056,VAN,RYAN KESLER,8470616,2013-10-10 00:00:00,0.0,0.0,0.0,0.0


In [15]:
years = ['20132014', '20142015', '20152016', '20162017', '20172018', '20182019']
start_date = [pd.datetime(2013, 10, 1), 
              pd.datetime(2014, 10, 1), 
              pd.datetime(2015, 10, 1), 
              pd.datetime(2016, 10, 1),
              pd.datetime(2017, 10, 1),
              pd.datetime(2018, 10, 1)]
end_date = [pd.datetime(2014, 4, 13), 
            pd.datetime(2015, 4, 11), 
            pd.datetime(2016, 4, 10), 
            pd.datetime(2017, 4, 9),
            pd.datetime(2018, 4, 8),
            pd.datetime(2019, 4, 6)]

yearly_data = {}
i = 0
for y in years:
    print(y)
    print('Points...')
    data['Date'] = pd.to_datetime(data['Date'])
    mask = (data['Date'] > start_date[i]) & (data['Date'] <= end_date[i])
    data_tmp = data.loc[mask]
    yearly_data[y] = data_tmp
    
    i+=1

20132014
Points...
20142015
Points...
20152016
Points...
20162017
Points...
20172018
Points...
20182019
Points...


In [18]:
data_1718 = yearly_data['20172018']
mackinnon = data_1718[data_1718['Player'] == 'NATHAN MACKINNON']
mackinnon.head()

Unnamed: 0.1,Unnamed: 0,Game_Id,Team,Player,Player_Id,Date,Goal,First_Assist,Second_Assist,Total_Points
300,162164,2017_20007,COL,NATHAN MACKINNON,8477492,2017-10-05,0.0,1.0,0.0,1.0
301,162622,2017_20019,COL,NATHAN MACKINNON,8477492,2017-10-07,0.0,0.0,0.0,0.0
302,163118,2017_20032,COL,NATHAN MACKINNON,8477492,2017-10-09,0.0,0.0,1.0,1.0
303,163729,2017_20048,COL,NATHAN MACKINNON,8477492,2017-10-11,0.0,2.0,0.0,2.0
304,164191,2017_20060,COL,NATHAN MACKINNON,8477492,2017-10-13,0.0,0.0,0.0,0.0


---
## Symbolic Regressor

In [19]:
class WindowSlider(object):
    
    def __init__(self, window_size = 5):        
        '''
        Window Slider object
        ====================
        w: window_size - number of time steps to look back
        o: offset between last reading and temperature
        r: response_size - number of time steps to predict
        l: maximum length to slide - (#observation - w)
        p: final predictors - (#predictors * w)
        '''
        self.w = window_size
        self.o = 0
        self.r = 1       
        self.l = 0
        self.p = 0
        self.names = []
        
    def re_init(self, arr):
        '''
        Helper function to initializate to 0 a vector
        '''
        arr = np.cumsum(arr)
        return arr - arr[0]
                

    def collect_windows(self, X, window_size=5, offset=0, previous_y=False):
        '''
        Input: X is the input matrix, each column is a variable
        Returns: diferent mappings window-output
        '''
        cols = len(list(X)) - 1
        N = len(X)
        
        self.o = offset
        self.w = window_size
        self.l = N - (self.w + self.r) + 1
        if not previous_y: self.p = cols * (self.w)
        if previous_y: self.p = (cols + 1) * (self.w)
        
        # Create the names of the variables in the window
        # Check first if we need to create that for the response itself
        if previous_y: x = cp.deepcopy(X)
        if not previous_y: x = X.drop(X.columns[-1], axis=1)  
        
        for j, col in enumerate(list(x)):        
                
            for i in range(self.w):
                
                name = col + ('(%d)' % (i+1))
                self.names.append(name)
        
        # Incorporate the timestamps where we want to predict
        for k in range(self.r):
            
            name = '∆t' + ('(%d)' % (self.w + k + 1))
            self.names.append(name)
            
        self.names.append('Y')
                
        df = pd.DataFrame(np.zeros(shape=(self.l, (self.p + self.r + 1))), 
                          columns=self.names)
        
        # Populate by rows in the new dataframe
        for i in range(self.l):
            
            slices = np.array([])
            
            # Flatten the lags of predictors
            for p in range(x.shape[1]):
            
                line = X.values[i:self.w + i, p]
                # Reinitialization at every window for ∆T
                if p == 0: line = self.re_init(line)
                    
                # Concatenate the lines in one slice    
                slices = np.concatenate((slices, line)) 
 
            # Incorporate the timestamps where we want to predict
            line = np.array([self.re_init(X.values[i:i+self.w+self.r, 0])[-1]])
            y = np.array(X.values[self.w + i + self.r - 1, -1]).reshape(1,)
            slices = np.concatenate((slices, line, y))
            
            # Incorporate the slice to the cake (df)
            df.iloc[i,:] = slices
            
        return df

In [40]:
w = 5
train_constructor = WindowSlider()
train_windows = train_constructor.collect_windows(mackinnon[['Total_Points']], 
                                                  previous_y=False)

test_constructor = WindowSlider()
test_windows = test_constructor.collect_windows(testset.iloc[:,1:],
                                                previous_y=False)

train_constructor_y_inc = WindowSlider()
train_windows_y_inc = train_constructor_y_inc.collect_windows(trainset.iloc[:,1:], 
                                                  previous_y=True)

test_constructor_y_inc = WindowSlider()
test_windows_y_inc = test_constructor_y_inc.collect_windows(testset.iloc[:,1:],
                                                previous_y=True)

train_windows.head(3)

NameError: name 'testset' is not defined

In [5]:
#######################
# CREATION OF THE MODEL
#######################

# It is possible to create custom operations to be considered in the tree
def _xexp( x ):
    a = np.exp(x); 
    a[ np.abs(a) > 1e+9 ] = 1e+9
    return a    

xexp = gpl.functions.make_function( function = _xexp, name='xexp', arity=1 )
#function_set = ['add', 'sub', 'mul', 'div', 'sin', 'log'] #, xexp]
function_set = ['add', 'sub', 'mul', 'div']

if 'model' in locals(): del model
model = SymbolicRegressor(population_size = 3000, tournament_size=5,
                          generations = 25, stopping_criteria=0.1,
                          function_set = function_set, metric='rmse',
                          p_crossover=0.65, p_subtree_mutation=0.15,
                          p_hoist_mutation=0.05, p_point_mutation=0.1,
                          verbose = 1, random_state = None, n_jobs = -1)

###########################################################
# TRAIN THE NETWORK AND PREDICT - Without previous values y
###########################################################

# Train
t0 = time.time()
model.fit(train_windows.values[:,:-1], train_windows.values[:,-1])
tF = time.time()

# Predict
sr_y_fit = model.predict(train_windows.values[:,:-1]).reshape(-1,1)
sr_y_pred = model.predict(test_windows.values[:,:-1]).reshape(-1,1)

# Calculating Errors
sr_residuals = sr_y_pred - testset.iloc[5:,-1].values.reshape(-1,1)
sr_rmse = np.sqrt(np.sum(np.power(sr_residuals,2)) / len(sr_residuals))
print('RMSE = %f' % sr_rmse)
print('Time to train %.2f' % (tF - t0))
print(model._program)

NameError: name 'train_windows' is not defined