In [4]:
import torch
import pandas as pd
import numpy as np
import sys 
from tqdm import tqdm

from sklearn.preprocessing import MinMaxScaler

sys.path.append("./")
from prediction.api import *
from prediction.data_wrapper import get_loaders
from pickle import dump

In [5]:
# Run configuration. Every entry is written onto the shared `args` object
# (not defined in this notebook; presumably provided by the star import from
# prediction.api -- TODO confirm) in the order listed here.
_overrides = {
    # data
    "data": './data/processed/processed_cleaned_df.csv',
    # model dimensions
    "input_size": [33],
    "hidden_size": 300,
    "number_of_layers": 10,
    "output_size": [1],
    # split / windowing
    "test_portion": 0.2,
    "train_window": 12,
    # optimisation
    "batch_size": 64,
    "learning_rate": 0.001,
    "weight_decay": 0.0001,
    "gpu": 0,
    "epochs": 100,
    "log_freq": 30,
    "debug": False,
    "seed": 1,
    "samples": 20,
    "dropout": 0.3,
}
for _key, _value in _overrides.items():
    setattr(args, _key, _value)

In [6]:
# Load the processed table and drop the first CSV column
# (presumably a saved pandas index -- TODO confirm).
df = pd.read_csv(args.data).iloc[:,1:]
# The feature matrix used downstream is df.to_numpy()[:, 1:], i.e. the frame
# WITHOUT its leading 'GPs' column. The target position therefore has to be
# computed relative to the remaining columns; using df.columns.get_loc('0301')
# directly would select the column one to the right of '0301' in `features`.
label_index = df.columns[1:].get_loc('0301')
print(df.head())

GPs  month      07    0304  050103  131002     2101   010305       01  \
0  A81001      1  7271.0  4071.0  1476.0   185.0  4964.00  13032.0  69503.0   
1  A81001      2  8527.0  4071.0  1427.0   191.0  3841.00  12740.0  78753.0   
2  A81001      3  6402.0  3928.0  1278.0   198.0  4188.25  11574.0  65473.0   
3  A81001      4  7295.0  4836.0  1537.0   231.0  4535.50  13019.0  74436.0   
4  A81001      5  6671.0  5153.0  1436.0   280.0  4882.75  11824.0  65084.0   

       05  ...  030401       13           12     0106     1302       10  \
0  9767.0  ...  4310.0  59472.0   886.000000  19797.0  49930.0  15657.0   
1  9767.0  ...  4064.0  59472.0   994.666667  24304.0  49930.0  12903.0   
2  9767.0  ...  3928.0  59472.0  1103.333333  20416.0  44955.0  13236.0   
3  9767.0  ...  4836.0  58722.0  1212.000000  24713.0  45005.0  14082.0   
4  9767.0  ...  5152.0  57972.0   817.000000  20309.0  42955.0  12972.0   

    130201  050108    0603   010604  
0  41880.0   998.0  4034.0  10939.0  
1  4

In [7]:
# Materialise the frame as a single NumPy array, then separate the first
# column (GP identifiers) from the remaining feature columns.
data = df.to_numpy()
gps, features = data[:, 0], data[:, 1:]

In [8]:
# Sanity check: both arrays should report the same number of rows.
print(*(arr.shape for arr in (features, gps)))

(398518, 33) (398518,)


In [9]:
# Hold out the trailing `test_portion` of the rows (split by row position,
# no shuffling).
total_size = features.shape[0]
train_size = int(total_size * (1 - args.test_portion))

train_data, test_data = features[:train_size], features[train_size:]
train_gps, test_gps = gps[:train_size], gps[train_size:]

# Distinct GP identifiers present in each split.
train_unique_gps, test_unique_gps = np.unique(train_gps), np.unique(test_gps)

# Fit the [-1, 1] min-max scaler on the training rows only, then apply the
# same transform to both splits and convert the results to float32 tensors.
scaler = MinMaxScaler(feature_range=(-1, 1)).fit(train_data)
train_data_normalized = torch.from_numpy(scaler.transform(train_data)).float()
test_data_normalized = torch.from_numpy(scaler.transform(test_data)).float()

In [8]:
def create_inout_sequences(input_data,_gps,_unique_gps,_label_index, tw):
    """Build sliding-window (sequence, next-step label) pairs, one GP at a time.

    For every identifier in ``_unique_gps`` the rows of ``input_data`` that
    belong to it (rows where ``_gps`` equals the identifier, in their original
    order) are selected, and every run of ``tw`` consecutive rows becomes one
    input sequence. The label is the value in column ``_label_index`` of the
    row that immediately follows the window. Windows never mix rows of
    different identifiers.

    Args:
        input_data: 2-D array-like or torch tensor, one row per observation.
        _gps: 1-D array-like with one identifier per row of ``input_data``.
        _unique_gps: iterable of the identifiers to generate windows for.
        _label_index: column of ``input_data`` used as the prediction target.
        tw: window length (number of rows per input sequence).

    Returns:
        ``(X, Y)``: two lists of float32 tensors of equal length. Each ``X[k]``
        has shape ``(tw, n_columns)`` and each ``Y[k]`` has shape ``(1,)``.
        An identifier with ``tw`` rows or fewer contributes no windows.
    """
    gp_ids = np.asarray(_gps)
    is_tensor = torch.is_tensor(input_data)
    rows = input_data if is_tensor else np.asarray(input_data)

    X = []
    Y = []
    for gp in tqdm(_unique_gps):
        # Positions of this identifier's rows. The windows below are cut from
        # these rows only; slicing `input_data` directly would always read the
        # first rows of the whole array regardless of the identifier.
        row_idx = np.flatnonzero(gp_ids == gp)
        if is_tensor:
            gp_rows = rows[torch.from_numpy(row_idx).long()]
        else:
            gp_rows = rows[row_idx]

        for j in range(len(row_idx) - tw):
            seq = np.array(gp_rows[j:j+tw], dtype=np.float32)
            label = np.array(gp_rows[j+tw:j+tw+1, _label_index], dtype=np.float32)
            X.append(torch.from_numpy(seq).float())
            Y.append(torch.from_numpy(label).float())

    return X, Y
# Window the training rows and the held-out rows with identical settings.
X_train, Y_train = create_inout_sequences(
    train_data_normalized,
    train_gps,
    train_unique_gps,
    label_index,
    args.train_window,
)
X_test, Y_test = create_inout_sequences(
    test_data_normalized,
    test_gps,
    test_unique_gps,
    label_index,
    args.train_window,
)


100%|██████████| 5497/5497 [00:36<00:00, 149.47it/s]
100%|██████████| 1375/1375 [00:03<00:00, 381.20it/s]


In [9]:
# Collapse each list of per-window tensors into one batched tensor
# (inputs first, then targets), and report the resulting shapes.
X_train, X_test = torch.stack(X_train), torch.stack(X_test)
Y_train, Y_test = torch.stack(Y_train), torch.stack(Y_test)
print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)

torch.Size([252850, 12, 33]) torch.Size([252850, 1])
torch.Size([63204, 12, 33]) torch.Size([63204, 1])


In [10]:
def get_loaders(X, Y, batch_size=None, shuffle=True):
    """Wrap paired input/target tensors in a DataLoader.

    Note: this definition shadows the ``get_loaders`` imported from
    ``prediction.data_wrapper`` at the top of the notebook.

    Args:
        X: tensor of inputs; its first dimension indexes the samples.
        Y: tensor of targets with the same first dimension as ``X``.
        batch_size: samples per batch. ``None`` (the default) falls back to
            ``args.batch_size``, which is the previous hard-coded behaviour.
        shuffle: whether batches are drawn in random order. Defaults to
            ``True`` (previous behaviour); pass ``False`` when the sample
            order must be preserved, e.g. for ordered evaluation.

    Returns:
        A ``torch.utils.data.DataLoader`` over ``TensorDataset(X, Y)``.
    """
    if batch_size is None:
        batch_size = args.batch_size
    dataset = torch.utils.data.TensorDataset(X, Y)
    return torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

In [11]:
# One loader per split, built with the same helper and settings.
train_loader, test_loader = (
    get_loaders(inputs, targets)
    for inputs, targets in ((X_train, Y_train), (X_test, Y_test))
)

In [3]:
# NOTE(review): get_model is not defined in this notebook; presumably it is
# provided by `from prediction.api import *` -- confirm. This call rebinds
# `scaler`, replacing the MinMaxScaler fitted in the train/test split cell
# with whatever get_model returns as its second value.
model, scaler = get_model()