Based on: https://stackoverflow.com/questions/59381695/lstm-in-pytorch-how-to-add-change-sequence-length-dimension

In [1]:
from fastai.tabular import *
import pandas as pd
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm import tqdm
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, Dataset, DataLoader
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline

# Data preparation

## Get the data

In [2]:
class CryptoDataset:
    """Sliding-window train/validation/test loaders built from a crypto price CSV.

    The CSV is read with pandas, its date column is expanded with fastai's
    ``add_datepart`` and the regression target is the ``close`` column shifted
    ``predict_delta`` rows into the future. Rows are split in file order into
    train / validation / test parts, the inputs are expanded with degree-2
    polynomial features and standardised (both fitted on the training part
    only), and each part is exposed as a ``DataLoader`` yielding
    ``(sequence, label)`` tensor pairs.

    NOTE: ``input_data()`` and ``target()`` are not pandas methods; they must be
    monkey-patched onto ``pd.DataFrame`` before this class is instantiated.

    Attributes set by ``__init__``:
        df: the full preprocessed frame (including the target column).
        train_df / valid_df / test_df: row slices of ``df``.
        train_data / valid_data / test_data: transformed input features.
        train_loader / val_loader / test_loader: the resulting ``DataLoader``s.
    """
    TARGET_COLUMN_NAME = 'close'
    DATE_COLUMN_NAME = 'date'
    ELAPSED = 'Elapsed'
    # Date-part columns produced by ``add_datepart`` that are discarded again.
    DROP_COLUMNS = ['Second', 'Month', 'Year',
                    'Is_quarter_end', 'Is_quarter_start',
                    'Is_year_end', 'Is_year_start',
                    'Is_month_end', 'Is_month_start',
                    'Week', 'Dayofyear', ELAPSED]
    True_target_column_name = TARGET_COLUMN_NAME + '_future'

    def __init__(self, csv_file="BTC-ETH-filtered_with_indicators.csv", predict_delta=4, sequence_size=10, batch_size=8):
        """
        Args:
            csv_file (string): Path to the csv file with the crypto stats table.
            predict_delta (int): How many rows ahead the target ``close`` value lies.
            sequence_size (int): Number of consecutive rows in one input sequence.
            batch_size (int): Batch size used by all three data loaders.

        Raises:
            ValueError: if a split holds fewer than ``sequence_size`` rows.
        """
        from datetime import datetime

        self.sequence_size = sequence_size
        # Fix: read the file the caller asked for (the path used to be hard-coded).
        self.df = pd.read_csv(csv_file,
                              # read dates as dates (the column holds unix timestamps)
                              parse_dates=[self.DATE_COLUMN_NAME],
                              date_parser=lambda x: datetime.fromtimestamp(int(x)))
        add_datepart(self.df, self.DATE_COLUMN_NAME, time=True)
        self.df = self.df.astype(float).drop(columns=self.DROP_COLUMNS)

        # Building the target: the close price ``predict_delta`` rows later.
        # NOTE(review): the last ``predict_delta`` rows get a NaN target and are not
        # removed here; depending on the row count they may reach the test split.
        self.df[self.True_target_column_name] = self.df[self.TARGET_COLUMN_NAME].shift(-predict_delta)

        train_limit, valid_limit, test_limit = self._split_limits(len(self.df), sequence_size)
        self.train_df = self.df[:train_limit]
        self.valid_df = self.df[train_limit:valid_limit]
        self.test_df = self.df[valid_limit:test_limit]

        # Fit the preprocessing on the training inputs only, so no statistics
        # leak from the validation/test parts.
        train_inputs = self.train_df.input_data()
        preprocessing_pipeline = Pipeline([("poli-feature", PolynomialFeatures(degree=2)),
                                           ("normalizer", StandardScaler())
                                          ]).fit(train_inputs, self.train_df.target())
        poly = preprocessing_pipeline["poli-feature"]
        # scikit-learn renamed ``get_feature_names`` to ``get_feature_names_out``
        # and later removed the old name; support both.
        if hasattr(poly, "get_feature_names_out"):
            columns = list(poly.get_feature_names_out(train_inputs.columns))
        else:
            columns = poly.get_feature_names(train_inputs.columns)

        def transform(frame):
            # '1' is the constant bias column added by PolynomialFeatures.
            transformed = preprocessing_pipeline.transform(frame.input_data())
            return pd.DataFrame(transformed, columns=columns).drop(columns=['1'])

        self.train_data = transform(self.train_df)
        self.valid_data = transform(self.valid_df)
        self.test_data = transform(self.test_df)

        def make_loader(data, frame, shuffle=False):
            tensors = self._create_inout_sequences(input_data=data,
                                                   labels=frame.target(),
                                                   seq_len=sequence_size)
            return DataLoader(dataset=TensorDataset(*tensors),
                              batch_size=batch_size, shuffle=shuffle)

        # Only the training loader is shuffled.
        self.train_loader = make_loader(self.train_data, self.train_df, shuffle=True)
        self.val_loader = make_loader(self.valid_data, self.valid_df)
        self.test_loader = make_loader(self.test_data, self.test_df)

    @staticmethod
    def _split_limits(row_count, sequence_size):
        """Return the ``(train_end, valid_end, test_end)`` row limits.

        Raw cut points sit at 75%, 90% and 100% of ``row_count``; the train and
        validation lengths are rounded down to multiples of ``sequence_size``
        and the test part ends at the largest multiple of ``sequence_size``
        not exceeding ``row_count``.
        """
        raw_train, raw_valid, _ = tuple(int(x * row_count) for x in (0.75, 0.9, 1.0))
        train_remainder = raw_train % sequence_size
        train_end = raw_train - train_remainder
        valid_end = raw_valid - train_remainder - (raw_valid - raw_train) % sequence_size
        test_end = row_count - row_count % sequence_size
        return train_end, valid_end, test_end

    @staticmethod
    def _create_inout_sequences(input_data, labels, seq_len):
        """Turn a feature frame into overlapping windows plus one label per window.

        Window ``i`` holds rows ``i .. i + seq_len - 1`` of ``input_data``; its
        label is the ``labels`` row at the window's last position.

        Returns:
            ``(out_data, out_labels)`` float32 tensors of shape
            ``(windows, seq_len, features)`` and ``(windows, label_columns)``.

        Raises:
            ValueError: if ``input_data`` has fewer than ``seq_len`` rows.
        """
        window_count = len(input_data) - seq_len + 1
        if window_count < 1:
            raise ValueError("need at least seq_len={} rows to build a sequence, got {}"
                             .format(seq_len, len(input_data)))
        out_labels = torch.tensor(labels.iloc[seq_len - 1:].values.astype(np.float32))
        print(input_data.shape)
        # Slice the underlying array once instead of building a DataFrame per window.
        values = input_data.values
        out_sequences = np.stack([values[start:start + seq_len] for start in range(window_count)])
        out_data = torch.tensor(out_sequences.reshape(-1, seq_len, input_data.shape[-1]).astype(np.float32))
        print(input_data.shape, out_data.shape, out_labels.shape)
        return out_data, out_labels

In [3]:
def input_data(self: pd.DataFrame):
    """Return a copy of the frame without the future-target column."""
    future_column = CryptoDataset.True_target_column_name
    return self.drop(labels=[future_column], axis="columns")


def target(self: pd.DataFrame):
    """Return the future-target column as a single-column DataFrame."""
    future_column = CryptoDataset.True_target_column_name
    return self.loc[:, [future_column]]


# Expose both helpers as DataFrame methods (monkey-patching pandas).
pd.DataFrame.target = target
pd.DataFrame.input_data = input_data

### dataset build

In [None]:
# First build: one-step-ahead target, windows of 4 rows, one window per batch.
BATCH_SIZE, SEQUENCE_SIZE = 1, 4
dataset = CryptoDataset(sequence_size=SEQUENCE_SIZE, batch_size=BATCH_SIZE, predict_delta=1)

# Study

In [None]:
# Display settings for the exploratory cells below.
from IPython.display import display
# Show up to 500 DataFrame columns.
pd.set_option('display.max_columns', 500)
# Print NumPy arrays in full instead of summarising them.
# NOTE(review): `np` and `sys` are not imported explicitly in this file; presumably
# they arrive through the `fastai.tabular` star import — confirm.
np.set_printoptions(threshold=sys.maxsize)

In [None]:
# Fit a random forest on the training split to rank the raw input features.
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor()
# `target()` returns a single-column DataFrame; flatten it to 1-D because
# scikit-learn expects a 1-D `y` for single-output regression (a column vector
# triggers a DataConversionWarning).
rf.fit(dataset.train_df.input_data(), dataset.train_df.target().values.ravel())
print(rf.feature_importances_)

In [None]:
# List every (feature, importance) pair, then bar-plot the features whose
# importance is strictly positive.
all_features = dataset.train_df.input_data().columns
importances = rf.feature_importances_
print("all columns\n", *[str(pair) + '\n ' for pair in zip(all_features, importances)])

plot_params = 'parameters'
plot_importances = 'importances'
nonzero = importances > 0.
df = pd.DataFrame({
    plot_params: all_features[nonzero].tolist(),
    plot_importances: importances[nonzero],
})
ax = df.plot.bar(x=plot_params, y=plot_importances, rot=0)

In [4]:
# Columns kept as model inputs from here on.
_SELECTED_INPUT_COLUMNS = ('open', 'high', 'low', 'close', 'others_cr')


def input_data(self: pd.DataFrame):
    """Return only the selected input columns, in a fixed order."""
    return self.loc[:, list(_SELECTED_INPUT_COLUMNS)]


# Replace the DataFrame method so later dataset builds use the reduced feature set.
pd.DataFrame.input_data = input_data

In [5]:
# Rebuild the dataset with the reduced feature set: windows of 24 rows, 3 per batch.
BATCH_SIZE, SEQUENCE_SIZE = 3, 24
dataset = CryptoDataset(sequence_size=SEQUENCE_SIZE, batch_size=BATCH_SIZE, predict_delta=1)

(1032, 20)
(1032, 20) torch.Size([1009, 24, 20]) torch.Size([1009, 1])
(192, 20)
(192, 20) torch.Size([169, 24, 20]) torch.Size([169, 1])
(168, 20)
(168, 20) torch.Size([145, 24, 20]) torch.Size([145, 1])


# Train

In [6]:
# torch.cuda.is_available() checks and returns a Boolean True if a GPU is available, else it'll return False
# If we have a GPU available, we'll set our device to GPU. We'll use this device variable later in our code.
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
DEVICE

device(type='cuda')

## Model

In [25]:
class SentimentNet(nn.Module):
    """Sequence regressor: Linear -> ReLU -> LSTM -> small fully connected head.

    Each time step's feature vector is projected to ``2 * features`` values,
    passed through a ReLU, and fed to a batch-first LSTM with ``features``
    hidden units. The LSTM output at the last time step goes through
    ``Linear -> Dropout -> Tanh -> Linear`` to produce ``output_size`` values.

    Args:
        features: number of input features per time step.
        output_size: number of values predicted per sequence.
        sequence_size: accepted for backward compatibility; currently unused.
        n_layers: number of stacked LSTM layers.
        drop_prob: dropout probability for the head and, when ``n_layers > 1``,
            between LSTM layers.
        device: stored as ``self.device``; the forward pass itself follows the
            device of its input.
    """

    def __init__(self, features, output_size=1,
                 sequence_size=10,
                 n_layers=1,
                 drop_prob=0.5, device=torch.device("cpu")):
        super().__init__()

        embedding_dim = features * 2
        hidden_dim = features

        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        self.device = device

        # NOTE: the attribute keeps its original (misspelled) name on purpose —
        # renaming it would change the state_dict keys of saved checkpoints.
        self.liniar = nn.Linear(features, embedding_dim)

        # Inter-layer dropout only exists for stacked LSTMs; passing a non-zero
        # value with a single layer has no effect and makes PyTorch warn.
        self.lstm = nn.LSTM(input_size=embedding_dim,
                            hidden_size=hidden_dim,
                            num_layers=n_layers,
                            dropout=drop_prob if n_layers > 1 else 0.0,
                            batch_first=True)
        # If a BatchNorm1d is ever added to this head, batches of size 1 must be
        # skipped during training.
        self.fc = nn.Sequential(nn.Linear(hidden_dim, hidden_dim // 2),
                                nn.Dropout(drop_prob),
                                nn.Tanh(),
                                nn.Linear(hidden_dim // 2, output_size))

    def forward(self, x):
        """Map ``x`` of shape (batch, sequence, features) to (batch, output_size)."""
        # Zero initial hidden/cell state, created directly with the input's
        # device and dtype so the module works wherever it has been moved.
        h0 = x.new_zeros(self.n_layers, x.size(0), self.hidden_dim)
        c0 = x.new_zeros(self.n_layers, x.size(0), self.hidden_dim)

        out = torch.relu(self.liniar(x))
        out, _ = self.lstm(out, (h0, c0))
        # Keep only the LSTM output of the last time step.
        return self.fc(out[:, -1, :])

In [None]:
# class CustomLoss(nn.L1Loss):
#     def __init__(self, size_average=None, reduce=None, reduction='mean'):
#         super(CustomLoss, self).__init__(size_average, reduce, reduction)

#     def forward(self, input, target):
#         return 10.0 * F.l1_loss(input, target, reduction=self.reduction)
        

In [40]:
# Model, loss, optimiser and learning-rate schedule for the training cells below.
features_count = dataset.train_data.shape[1]

model = SentimentNet(features_count, device=DEVICE, n_layers=3, sequence_size=SEQUENCE_SIZE, drop_prob=0.25)
model.to(DEVICE)

lr = 0.001
# criterion = CustomLoss()
criterion = nn.L1Loss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

epochs = 200
counter = 0
print_every = 5
clip = 5
# Best validation loss seen so far; `np.inf` rather than the `np.Inf` alias,
# which NumPy 2.0 removed.
valid_loss_min = np.inf
# The schedule spans exactly epochs * len(train_loader) optimiser steps.
lr_scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=lr,
                                                   steps_per_epoch=len(dataset.train_loader),
                                                   epochs=epochs)

In [41]:
# IPython magic: select matplotlib's 'notebook' backend for this session.
%matplotlib notebook
import matplotlib.pyplot as plt
# Per-epoch average losses; eval_and_save() appends to these and plots them.
train_losses, valid_losses = [], []

In [42]:
# Put the model in training mode and create the axes for the loss curves.
model.train()
fig, ax = plt.subplots()
ax.set(xlabel='epochs', ylabel='losses')

def train():
    """Run one optimisation pass over ``dataset.train_loader``.

    Uses the module-level ``model``, ``criterion``, ``optimizer``,
    ``lr_scheduler`` and ``DEVICE``. The scheduler is stepped once per batch.
    """
    model.train()

    # The batch variables are deliberately not called `train`, so they do not
    # shadow this function's own name.
    for batch_inputs, batch_targets in dataset.train_loader:
        # Inputs do not need gradients; only the model parameters are optimised.
        batch_inputs = batch_inputs.to(DEVICE)
        batch_targets = batch_targets.to(DEVICE)

        # Clear gradients w.r.t. parameters
        optimizer.zero_grad()
        # Forward pass to get the predictions
        outputs = model(batch_inputs)

        # Regression loss between predictions and targets
        loss = criterion(outputs, batch_targets)

        # Getting gradients w.r.t. parameters
        loss.backward()
        # Gradient clipping is currently disabled:
#         nn.utils.clip_grad_norm_(model.parameters(), clip)
        # Updating parameters, then advancing the learning-rate schedule
        optimizer.step()
        lr_scheduler.step()

def _mean_batch_loss(loader):
    """Return the mean of the per-batch ``criterion`` values over ``loader``."""
    batch_losses = []
    # No gradients are needed for evaluation; skipping them saves memory and time.
    with torch.no_grad():
        for inputs, targets in loader:
            inputs = inputs.to(DEVICE)
            targets = targets.to(DEVICE)
            batch_losses.append(criterion(model(inputs), targets).item())
    return np.mean(batch_losses)


def eval_and_save(on_epoch=0):
    """Evaluate on the train and validation loaders, checkpoint, and redraw.

    Appends the average losses to ``train_losses`` / ``valid_losses``, prints a
    progress line for epoch ``on_epoch``, saves the model's ``state_dict`` to
    ``./state_dict9.pt`` whenever the validation loss beats ``valid_loss_min``,
    and redraws both loss curves on ``ax``. The model is left in eval mode.

    Args:
        on_epoch: epoch number shown in the progress line.
    """
    global valid_loss_min
    model.eval()

    # evaluate first the full train set
    avg_epoch_train_loss = _mean_batch_loss(dataset.train_loader)
    train_losses.append(avg_epoch_train_loss)

    # eval the valid set
    avg_epoch_val_loss = _mean_batch_loss(dataset.val_loader)
    valid_losses.append(avg_epoch_val_loss)

    # Report the epoch passed in rather than relying on a global loop variable.
    print("Epoch: {}...".format(on_epoch),
          "Loss: {:.6f}...".format(avg_epoch_train_loss),
          "Val Loss: {:.6f}".format(avg_epoch_val_loss))
    if avg_epoch_val_loss < valid_loss_min:
        torch.save(model.state_dict(), './state_dict9.pt')
        print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(valid_loss_min,avg_epoch_val_loss))
        valid_loss_min = avg_epoch_val_loss

    # Clear the loss axes explicitly (not whichever axes is current) and
    # restore the labels that clearing removes.
    ax.cla()
    ax.set_xlabel('epochs')
    ax.set_ylabel('losses')
    ax.plot(train_losses, 'b', label='train_loss')
    ax.plot(valid_losses, 'r', label='valid_loss')
    ax.legend()
    fig.canvas.draw()
            
        
# Main loop: for each epoch, one training pass followed by evaluation,
# checkpointing and a redraw of the loss curves.
for epoch in range(epochs):
    train()
    eval_and_save(epoch)
        

<IPython.core.display.Javascript object>

Epoch: 0... Loss: 0.141337... Val Loss: 0.153164
Validation loss decreased (inf --> 0.153164).  Saving model ...
Epoch: 1... Loss: 0.016707... Val Loss: 0.034829
Validation loss decreased (0.153164 --> 0.034829).  Saving model ...
Epoch: 2... Loss: 0.020208... Val Loss: 0.032579
Validation loss decreased (0.034829 --> 0.032579).  Saving model ...
Epoch: 3... Loss: 0.013087... Val Loss: 0.021742
Validation loss decreased (0.032579 --> 0.021742).  Saving model ...
Epoch: 4... Loss: 0.016224... Val Loss: 0.022998
Epoch: 5... Loss: 0.008137... Val Loss: 0.014605
Validation loss decreased (0.021742 --> 0.014605).  Saving model ...
Epoch: 6... Loss: 0.013385... Val Loss: 0.018049
Epoch: 7... Loss: 0.006061... Val Loss: 0.008580
Validation loss decreased (0.014605 --> 0.008580).  Saving model ...
Epoch: 8... Loss: 0.007970... Val Loss: 0.010219
Epoch: 9... Loss: 0.008867... Val Loss: 0.007435
Validation loss decreased (0.008580 --> 0.007435).  Saving model ...
Epoch: 10... Loss: 0.004137... V

Epoch: 131... Loss: 0.000234... Val Loss: 0.000205
Epoch: 132... Loss: 0.000178... Val Loss: 0.000156
Epoch: 133... Loss: 0.000084... Val Loss: 0.000072
Validation loss decreased (0.000078 --> 0.000072).  Saving model ...
Epoch: 134... Loss: 0.000073... Val Loss: 0.000082
Epoch: 135... Loss: 0.000210... Val Loss: 0.000178
Epoch: 136... Loss: 0.000110... Val Loss: 0.000084
Epoch: 137... Loss: 0.000189... Val Loss: 0.000159
Epoch: 138... Loss: 0.000355... Val Loss: 0.000313
Epoch: 139... Loss: 0.000162... Val Loss: 0.000172
Epoch: 140... Loss: 0.000187... Val Loss: 0.000138
Epoch: 141... Loss: 0.000192... Val Loss: 0.000221
Epoch: 142... Loss: 0.000256... Val Loss: 0.000240
Epoch: 143... Loss: 0.000102... Val Loss: 0.000117
Epoch: 144... Loss: 0.000143... Val Loss: 0.000207
Epoch: 145... Loss: 0.000099... Val Loss: 0.000106
Epoch: 146... Loss: 0.000175... Val Loss: 0.000196
Epoch: 147... Loss: 0.000141... Val Loss: 0.000085
Epoch: 148... Loss: 0.000065... Val Loss: 0.000072
Validation lo

# Test

## Visualize

In [43]:
import matplotlib.pyplot as plt

class Visualizer:
    """Accumulates (timestamp, prediction, target) rows per colour and plots them.

    Rows are stored in ``date_pred_targ_dict``, one DataFrame per colour key;
    the first column holds the timestamps and the last two hold predictions
    and targets.
    """
    colors=['blue','black', 'red']

    def __init__(self):
        self.date_pred_targ_dict: dict = dict()

    def add(self, timestamp, pred, targ, color='red'):
        """Append one batch of rows under ``color``.

        ``timestamp``, ``pred`` and ``targ`` are each wrapped in a DataFrame and
        joined column-wise, then appended below the rows already stored.
        """
        new_rows = pd.concat([pd.DataFrame(timestamp),
                              pd.DataFrame(pred),
                              pd.DataFrame(targ)], axis=1)
        previous = self.date_pred_targ_dict.get(color)
        # Store the first batch directly instead of concatenating onto an empty frame.
        if previous is None:
            self.date_pred_targ_dict[color] = new_rows
        else:
            self.date_pred_targ_dict[color] = pd.concat([previous, new_rows])

    def plot(self, colors=None):
        """Plot predictions (in their own colour) against targets (green).

        Args:
            colors: colour keys to draw; defaults to ``self.colors``. Keys with
                no stored rows are skipped.
        """
        for color in (self.colors if colors is None else colors):
            # Read from this instance (not a global) and skip colours that were
            # never added, which would otherwise fail on column indexing.
            color_df = self.date_pred_targ_dict.get(color)
            if color_df is None or color_df.empty:
                continue
            timestamps = color_df.iloc[:, 0]
            # Values are scaled by 10 for display, as in the standalone plot cells.
            for column, line_color in ((-2, color), (-1, 'g')):
                values = color_df.iloc[:, column] * 10
                plt.scatter(timestamps, values, color=line_color)
                plt.plot(timestamps, values, color=line_color)
        plt.show()

## Eval

In [44]:
# Loading the best model (map_location lets a checkpoint saved on another
# device be loaded onto the current one)
model.load_state_dict(torch.load('./state_dict9.pt', map_location=DEVICE))
visualizer = Visualizer()

mse_losses = []
mae_losses = []
kldiv_losses = []
totalIndexes = 0
custom_batch = BATCH_SIZE
# index_of_elapsed = dataset.train_data.columns.tolist().index(dataset.ELAPSED)

model.eval()

# Loss modules are stateless, so build them once instead of once per batch.
mse_criterion = nn.MSELoss()
mae_criterion = nn.L1Loss()
# NOTE(review): KLDivLoss expects log-probabilities as input and probabilities as
# target; these are raw regression values, so this figure is not a meaningful
# divergence. Kept only so the printed report is unchanged.
kldiv_criterion = nn.KLDivLoss()

# #TRAIN (disabled; stale — it still uses an older `model(inputs, h)` signature)
# for inputs, labels in tqdm(dataset.train_loader):
#     if inputs.shape[0] != custom_batch:
#         continue
#       totalIndexes += 1
#     inputs, labels = inputs.to(DEVICE), labels.to(DEVICE)
#     output, h = model(inputs, h)
#     visualizer.add(np.arange(len(inputs)), output, labels[:,-1], color='black')

# Inference only: no autograd graph is needed.
with torch.no_grad():
    #VALIDATE
    for val_in, val_out in dataset.val_loader:
        val_in = val_in.to(DEVICE)
        val_out = val_out.to(DEVICE)
        outputs = model(val_in)

        # Tensors must be detached and on the CPU before conversion to NumPy.
        visualizer.add(np.arange(len(val_out)) + totalIndexes,
                       outputs.detach().cpu().numpy(),
                       val_out.detach().cpu().numpy(),
                       color='blue')
        totalIndexes += len(val_out)

    #TEST
    for test_in, test_out in dataset.test_loader:
        test_in = test_in.to(DEVICE)
        test_out = test_out.to(DEVICE)
        outputs = model(test_in)

        visualizer.add(np.arange(len(test_out)) + totalIndexes,
                       outputs.detach().cpu().numpy(),
                       test_out.detach().cpu().numpy())
        totalIndexes += len(test_out)

        mse_losses.append(mse_criterion(outputs, test_out).item())
        mae_losses.append(mae_criterion(outputs, test_out).item())
        kldiv_losses.append(kldiv_criterion(outputs, test_out).item())


# Each figure is the mean of the per-batch losses over the test loader.
print("MSE loss: {:.8f}".format(np.mean(mse_losses)))
print("MAE loss: {:.8f}".format(np.mean(mae_losses)))
print("KLDiv loss: {:.8f}".format(np.mean(kldiv_losses)))

MSE loss: 0.00000000
MAE loss: 0.00004850
KLDiv loss: -0.07914409


### BS10 maxLR==lr 2lstms
lr=0.0003

In [None]:
# Inspect the rows stored under the 'blue' key (the validation batches added by
# the evaluation cell).
visualizer.date_pred_targ_dict['blue']

In [45]:
# Validation split: predictions in blue, targets in green (both scaled by 10).
plt.figure()
color = 'blue'
color_df = visualizer.date_pred_targ_dict.get(color, pd.DataFrame())
timestamps = color_df.iloc[:, 0]
for column, line_color in ((-2, color), (-1, 'g')):
    scaled = color_df.iloc[:, column] * 10
    plt.scatter(timestamps, scaled, color=line_color)
    plt.plot(timestamps, scaled, color=line_color)
plt.show()

<IPython.core.display.Javascript object>

In [46]:
# Test split: predictions in red, targets in green (both scaled by 10).
plt.figure()

color = 'red'
color_df = visualizer.date_pred_targ_dict.get(color, pd.DataFrame())
timestamps = color_df.iloc[:, 0]
for column, line_color in ((-2, color), (-1, 'g')):
    scaled = color_df.iloc[:, column] * 10
    plt.scatter(timestamps, scaled, color=line_color)
    plt.plot(timestamps, scaled, color=line_color)
plt.show()

<IPython.core.display.Javascript object>

# TODO: try estimating the (min / max) over the next 4 timestamps