In [1]:
import pandas as pd
import numpy as np
import os 

In [2]:
def read_data(path=None, file=None):
    
    if path and not file:
        files_list = os.listdir(path)
        li = []    
        
        for file in files_list:
            filepath = path + '/' + file
            df = pd.read_csv(filepath, index_col=0)
            li.append(df)
        
        dataframe = pd.concat(li, axis=0, ignore_index=True)
    
    elif file and not path:
        dataframe = pd.read_csv(file, index_col=0)
        
    elif file and path:
        filepath = path + '/' + file
        dataframe = pd.read_csv(filepath, index_col=0)
        
    else:
        dataframe = None
        
    return dataframe

In [3]:
def get_columns(dataframe, num_cols=10):
    
    last_row = { idx:val for idx,val in enumerate(dataframe.tail(1).values[0]) if idx % 2 == 0 }
    
    top_n_dist = [ str(k) for k, v in sorted(last_row.items(), key=lambda item: item[1], reverse=True)[:num_cols] ]
    
    top_n_pit = [ str(int(k)+1) for k in top_n_dist ]
    
    columns = sorted(top_n_dist + top_n_pit)
    
    #trimmed_dataframe = dataframe[columns]
    
    #trimmed_dataframe[top_n_pit] = trimmed_dataframe[top_n_pit]/60
    
    return dataframe[columns]

In [4]:
df_2013 = read_data(file='./data/Data2013.csv')
df_2014 = read_data(file='./data/Data2014.csv')
df_2015 = read_data(file='./data/Data2015.csv')
df_2016 = read_data(file='./data/Data2016.csv')
df_2017 = read_data(file='./data/Data2017.csv')
df_2018 = read_data(file='./data/Data2018.csv')
df_2019 = read_data(file='./data/Data2019.csv')

In [5]:
df_2016.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,56,57,58,59,60,61,62,63,64,65
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,1.0,1.009058,1.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0
2,1.002255,2.0,0.0,2.0,1.032321,2.0,1.021018,2.0,0.0,2.0,...,1.017214,2.0,0.0,2.0,0.0,2.0,1.010559,2.0,1.019581,2.0
3,1.024851,3.0,1.016334,3.0,1.055583,3.0,1.042884,3.0,1.004935,3.0,...,1.039221,3.0,1.006999,3.0,0.0,3.0,1.033337,3.0,1.041954,3.0
4,1.047448,4.0,1.03865,4.0,1.078846,4.0,1.06475,4.0,1.026733,4.0,...,1.061227,4.0,1.028633,4.0,1.016377,4.0,1.056114,4.0,1.064327,4.0


In [6]:
df_top_2013 = get_columns(df_2013, 10)
df_top_2014 = get_columns(df_2014, 10)
df_top_2015 = get_columns(df_2015, 10)
df_top_2016 = get_columns(df_2016, 10)
df_top_2017 = get_columns(df_2017, 10)
df_top_2018 = get_columns(df_2018, 10)
df_top_2019 = get_columns(df_2019, 10)

In [7]:
df_top_2018.head()

Unnamed: 0,0,1,16,17,2,26,27,3,34,35,4,40,41,5,58,59,6,62,63,7
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.023845,1.0,1.001899,1.0,1.021091,0.0,1.0,1.0,0.0,1.0,1.018493,0.0,1.0,1.0,0.0,1.0,1.013808,0.0,1.0,1.0
2,1.04769,2.0,1.024721,2.0,1.044842,1.000867,2.0,2.0,0.0,2.0,1.042346,0.0,2.0,2.0,0.0,2.0,1.03741,0.0,2.0,2.0
3,1.071536,3.0,1.047544,3.0,1.068593,1.023739,3.0,3.0,0.0,3.0,1.066199,0.0,3.0,3.0,0.0,3.0,1.061013,0.0,3.0,3.0
4,1.095381,4.0,1.070367,4.0,1.092345,1.046612,4.0,4.0,1.020701,4.0,1.090051,1.014323,4.0,4.0,0.0,4.0,1.084616,0.0,4.0,4.0


In [8]:
from sklearn import preprocessing

def create_data(dataframe, rank_step=15, frame_step=1, frame_size=20):
    
    start_row = 0 
    end_row = start_row + frame_size 
    limit = end_row + max(rank_step, frame_size)
    label_row = end_row + rank_step
    length_df = len(dataframe)
    
    dist_columns = [ str(idx) for idx in dataframe.columns if int(idx) % 2 == 0 ]
    index_max_dist = dataframe[dist_columns].idxmax(1)
        
    input_data = []
    labels = []
        
    print(length_df)
    while limit < length_df:
        
        #print(limit)
        
        input_data.append(dataframe.iloc[start_row:end_row,:].values)
        label_row_df = dataframe.iloc[[label_row],:].copy()
        
        # Set values for label row
        label_row_df.loc[:,label_row_df.columns] = 0
        label_row_df.loc[:,index_max_dist[label_row]] = 1
        
        labels.append([ label_row_df.values for i in range(0,frame_size) ])
        
        # increment
        
        start_row = start_row + frame_step 
        end_row = start_row + frame_size 
        label_row = end_row + rank_step
         
        limit = end_row + max(rank_step, frame_size)
        
    #print(length_df, start_row, end_row, label_row, limit)
    #return input_data, labels
    return np.array(input_data), np.array(labels)
        

In [9]:
def create_data2(dataframe, steps=5, frame_size=20, rank_step=10):
    
    start_row = 0
    end_row = steps * frame_size
    limit = end_row + max(rank_step, frame_size)
    label_row = end_row + rank_step
    length_df = len(dataframe)
    
    dist_columns = [ str(idx) for idx in dataframe.columns if int(idx) % 2 == 0 ]
    index_max_dist = dataframe[dist_columns].idxmax(1)

    input_data = []
    labels = []
            
    
    while limit < length_df:
        
        rows = [ i for i in range(start_row, end_row, steps) ]
        
        
        input_data.append(dataframe.iloc[rows,:].values)
        label_row_df = dataframe.iloc[[label_row],:].copy()
        
        # Set values for label row
        label_row_df.loc[:,label_row_df.columns] = 0
        label_row_df.loc[:,index_max_dist[label_row]] = 1
        
        labels.append([ label_row_df.values for i in range(0,frame_size) ])
        
        start_row = start_row + steps 
        end_row = start_row + steps * frame_size 
        label_row = end_row + rank_step
        
        limit = end_row + max(rank_step, frame_size)
        
        
    return np.array(input_data), np.array(labels)            

In [10]:
x_2013, y_2013 = create_data2(df_top_2013, steps=2, rank_step=5, frame_size=25)
x_2014, y_2014 = create_data2(df_top_2014, steps=2, rank_step=5, frame_size=25)
x_2015, y_2015 = create_data2(df_top_2015, steps=2, rank_step=5, frame_size=25)
x_2016, y_2016 = create_data2(df_top_2016, steps=2, rank_step=5, frame_size=25)
x_2017, y_2017 = create_data2(df_top_2017, steps=2, rank_step=5, frame_size=25)
x_2018, y_2018 = create_data2(df_top_2018, steps=2, rank_step=5, frame_size=25)
x_2019, y_2019 = create_data2(df_top_2019, steps=2, rank_step=5, frame_size=25)
                             
                             
#x_2013, y_2013 = create_data2(df_top_2013, skip_steps=1, rank_step=5, frame_step=1, frame_size=50)
#x_2014, y_2014 = create_data(df_top_2014, skip_steps=1, rank_step=5, frame_step=1, frame_size=50)
#x_2015, y_2015 = create_data(df_top_2015, skip_steps=1, rank_step=5, frame_step=1, frame_size=50)
#x_2016, y_2016 = create_data(df_top_2016, skip_steps=1, rank_step=5, frame_step=1, frame_size=50)
#x_2017, y_2017 = create_data(df_top_2017, skip_steps=1, rank_step=5, frame_step=1, frame_size=50)
#x_2018, y_2018 = create_data(df_top_2018, skip_steps=1, rank_step=5, frame_step=1, frame_size=50)
#x_2019, y_2019 = create_data(df_top_2019, skip_steps=1, rank_step=5, frame_step=1, frame_size=50)

In [11]:
x_2013.shape

(4765, 25, 20)

In [12]:
y_2013.shape

(4765, 25, 1, 20)

In [13]:
y_r_2013 = y_2013.reshape(y_2013.shape[0],y_2013.shape[1],y_2013.shape[3])
y_r_2014 = y_2014.reshape(y_2014.shape[0],y_2014.shape[1],y_2014.shape[3])
y_r_2015 = y_2015.reshape(y_2015.shape[0],y_2015.shape[1],y_2015.shape[3])
y_r_2016 = y_2016.reshape(y_2016.shape[0],y_2016.shape[1],y_2016.shape[3])
y_r_2017 = y_2017.reshape(y_2017.shape[0],y_2017.shape[1],y_2017.shape[3])
y_r_2018 = y_2018.reshape(y_2018.shape[0],y_2018.shape[1],y_2018.shape[3])
y_r_2019 = y_2019.reshape(y_2019.shape[0],y_2019.shape[1],y_2019.shape[3])

In [14]:
y_r_2013.shape

(4765, 25, 20)

In [15]:
#X = np.vstack((x_2013,x_2014,x_2015,x_2016,x_2017,x_2018,x_2019))

X_train = np.vstack((x_2013,x_2014,x_2015,x_2016,x_2017))

In [16]:
X_train.shape

(26530, 25, 20)

In [17]:
#Y = np.vstack((y_r_2013,y_r_2014,y_r_2015,y_r_2016,y_r_2017,y_r_2018,y_r_2019))

Y_train = np.vstack((y_r_2013,y_r_2014,y_r_2015,y_r_2016,y_r_2017))

In [18]:
Y_train.shape

(26530, 25, 20)

In [19]:
#X_train = X[:65000]
#Y_train = Y[:65000]
#X_val = X[65000:70000]
#Y_val = Y[65000:70000]
#X_test = X[70000:]
#Y_test = Y[70000:]

X_val = x_2018
Y_val = y_r_2018

In [20]:
frame_size = 25
n_features = 20

In [21]:
from random import randint
from numpy import array
from numpy import argmax
from numpy import array_equal
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense, Flatten
from keras.layers import TimeDistributed
from keras.layers import RepeatVector
from attention_decoder import AttentionDecoder
import tensorflow as tf

# define model
model = Sequential()
model.add(LSTM(128, input_shape=(frame_size, n_features), return_sequences=True))
model.add(AttentionDecoder(128, n_features))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])









In [22]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 25, 128)           76288     
_________________________________________________________________
AttentionDecoder (AttentionD (None, 25, 20)            161316    
Total params: 237,604
Trainable params: 237,604
Non-trainable params: 0
_________________________________________________________________


In [23]:
model.fit(X_train,Y_train, epochs=1, verbose=2, validation_data=(X_val, Y_val))

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where

Train on 26530 samples, validate on 5354 samples
Epoch 1/1
 - 117s - loss: 1.0419 - acc: 0.6315 - val_loss: 3.4998 - val_acc: 0.2183


<keras.callbacks.History at 0x25d15c42688>

In [28]:
input_data = x_2018[0]

In [34]:
t = []
t.append(input_data)
t = np.array(t)
t.shape

(1, 25, 20)

In [35]:
print(model.predict(t))

[[[2.11805761e-01 3.34203616e-03 1.39602199e-02 3.12782191e-02
   1.41248917e-02 6.17674366e-02 1.43873066e-01 9.87847615e-03
   9.65031907e-02 1.94437767e-03 2.08247807e-02 1.04153037e-01
   1.69235673e-02 9.72325634e-03 9.07741636e-02 5.46579901e-03
   5.45376837e-02 1.80970551e-03 1.03306003e-01 4.00431734e-03]
  [3.92280370e-01 6.16596721e-04 9.70080588e-03 1.52273998e-02
   1.59673635e-02 6.70711100e-02 1.52900666e-01 1.85933302e-03
   1.09029025e-01 4.64972894e-04 7.59244058e-03 9.60928351e-02
   4.21683583e-03 3.81655362e-03 5.68795800e-02 6.83697348e-04
   2.87678335e-02 3.54687712e-04 3.59351747e-02 5.42757276e-04]
  [3.70904833e-01 2.82048422e-04 8.38206708e-03 1.01123741e-02
   2.16674544e-02 6.23045191e-02 1.98644131e-01 5.63372159e-04
   1.50856614e-01 2.14926578e-04 4.13220190e-03 8.05593207e-02
   1.01466384e-03 1.91655068e-03 5.12283742e-02 2.71031924e-04
   2.59203408e-02 1.68202023e-04 1.06562199e-02 2.00758659e-04]
  [2.96175748e-01 1.90289167e-04 7.31021073e-03 6.59