## Pytorch Encoder for cudf vs Pandas

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import cudf as gd

## Reading and Modifying Data

We are using the Electrolysia Time-Series Electricity Consumption Dataset from Kaggle, you can find more details about it [on the Kaggle dataset page](https://www.kaggle.com/utathya/electricity-consumption).

Just for kicks, we can take this opportunity to compare loading times on CPU with `pandas` and GPU with `cuDF`

In [1]:
path = 'data/electricity_consumption.csv'

### On CPU

In [2]:
%%time
# pandas reading time
df = pd.read_csv(path)

CPU times: user 38.5 ms, sys: 7.79 ms, total: 46.3 ms
Wall time: 44.7 ms


In [3]:
%%time
from sklearn.preprocessing import MinMaxScaler

#pandas Transform
minmax = MinMaxScaler().fit(df.iloc[:, 7].values.reshape((-1,1)).astype('float32'))
df_time_series = minmax.transform(df.iloc[:, 7].values.reshape((-1,1)).astype('float32')).reshape((-1))
df_time_series = pd.DataFrame(df_time_series)
df_time_series.head()

CPU times: user 238 ms, sys: 32.3 ms, total: 270 ms
Wall time: 269 ms


In [4]:
time_original = pd.to_datetime(df.iloc[:, 0]).tolist()

### On GPU

In [5]:
%%time
#cudf reading
cudf_data = gd.read_csv(path)

CPU times: user 5 µs, sys: 1 µs, total: 6 µs
Wall time: 11.9 µs


In [6]:
%%time
#cudf transform
minmax = MinMaxScaler().fit(cudf_data.iloc[:, 7].to_array().reshape((-1,1)).astype('float32'))
cudf_time_series = minmax.transform(cudf_data.iloc[:, 7].to_array().reshape((-1,1)).astype('float32')).reshape((-1))
cudf_time_series = gd.from_pandas(pd.DataFrame(cudf_time_series))
cudf_time_series.head()

CPU times: user 119 ms, sys: 12.7 ms, total: 132 ms
Wall time: 130 ms


### Pytorch Encoder

Now what we want to do is to create a representation of data  or an encoding of data (for ex: a intermediate layer in resnet) . So, we will use a simple MLP to do that. 


In [7]:
## Building a Pytorch MLP model to get an intermediate representation of Data

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

ModuleNotFoundError: No module named 'torch'

In [9]:
class Encoder(nn.Module):
    def __init__(self, input_size, hidden_size, dimension= 32):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.first_layer_encoder = nn.Linear(input_size, hidden_size)
        self.second_layer_encoder = nn.Linear(hidden_size, dimension)
        self.first_layer_decoder = nn.Linear(dimension, hidden_size)
        self.second_layer_decoder = nn.Linear(hidden_size, input_size)

    def forward(self, input):
        output = nn.functional.relu(self.first_layer_encoder(input))
        output = nn.functional.relu(self.second_layer_encoder(output))
        decode = nn.functional.relu(self.first_layer_decoder(output))
        decode = nn.functional.sigmoid(self.second_layer_decoder(decode))
        return decode, output

In [10]:
num_epochs = 100
batch_size = 32
input_size = 1
learning_rate = 0.01

model = Encoder(input_size, hidden_size=32, dimension=32)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-5)

In [11]:
%%time
from torch.autograd import Variable

# pandas run time
X = Variable(torch.from_numpy(df_time_series.values).float(), requires_grad=False)
for epoch in range(num_epochs):
    output, _ = model(X)
    loss = criterion(output, X)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    print('epoch [{}/{}], loss:{:.4f}'.format(epoch+1, num_epochs, loss.data))




epoch [1/100], loss:0.1840
epoch [2/100], loss:0.1629
epoch [3/100], loss:0.1463
epoch [4/100], loss:0.1271
epoch [5/100], loss:0.1055
epoch [6/100], loss:0.0814
epoch [7/100], loss:0.0570
epoch [8/100], loss:0.0353
epoch [9/100], loss:0.0196
epoch [10/100], loss:0.0116
epoch [11/100], loss:0.0099
epoch [12/100], loss:0.0111
epoch [13/100], loss:0.0130
epoch [14/100], loss:0.0146
epoch [15/100], loss:0.0157
epoch [16/100], loss:0.0165
epoch [17/100], loss:0.0170
epoch [18/100], loss:0.0173
epoch [19/100], loss:0.0175
epoch [20/100], loss:0.0177
epoch [21/100], loss:0.0177
epoch [22/100], loss:0.0177
epoch [23/100], loss:0.0176
epoch [24/100], loss:0.0175
epoch [25/100], loss:0.0172
epoch [26/100], loss:0.0166
epoch [27/100], loss:0.0155
epoch [28/100], loss:0.0134
epoch [29/100], loss:0.0100
epoch [30/100], loss:0.0091
epoch [31/100], loss:0.0098
epoch [32/100], loss:0.0075
epoch [33/100], loss:0.0080
epoch [34/100], loss:0.0083
epoch [35/100], loss:0.0074
epoch [36/100], loss:0.0058
e

In [16]:
model = Encoder(input_size, hidden_size=32, dimension=32)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-5)

In [17]:
%%time
from torch.autograd import Variable

#cudf run time
X = Variable(torch.from_numpy(cudf_time_series.as_matrix()).float(), requires_grad=False)
for epoch in range(num_epochs):
    output, _ = model(X)
    loss = criterion(output, X)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    print('epoch [{}/{}], loss:{:.4f}'.format(epoch+1, num_epochs, loss.data))

epoch [1/100], loss:0.1607
epoch [2/100], loss:0.1320
epoch [3/100], loss:0.1022
epoch [4/100], loss:0.0713
epoch [5/100], loss:0.0428
epoch [6/100], loss:0.0216
epoch [7/100], loss:0.0101
epoch [8/100], loss:0.0071
epoch [9/100], loss:0.0085
epoch [10/100], loss:0.0108
epoch [11/100], loss:0.0128
epoch [12/100], loss:0.0143
epoch [13/100], loss:0.0152
epoch [14/100], loss:0.0159
epoch [15/100], loss:0.0163
epoch [16/100], loss:0.0165
epoch [17/100], loss:0.0167
epoch [18/100], loss:0.0167
epoch [19/100], loss:0.0166
epoch [20/100], loss:0.0164
epoch [21/100], loss:0.0161
epoch [22/100], loss:0.0155
epoch [23/100], loss:0.0146
epoch [24/100], loss:0.0130
epoch [25/100], loss:0.0106
epoch [26/100], loss:0.0071
epoch [27/100], loss:0.0048
epoch [28/100], loss:0.0106
epoch [29/100], loss:0.0064
epoch [30/100], loss:0.0040
epoch [31/100], loss:0.0045
epoch [32/100], loss:0.0053
epoch [33/100], loss:0.0054
epoch [34/100], loss:0.0047
epoch [35/100], loss:0.0034
epoch [36/100], loss:0.0022
e

In [18]:
out, encoding = model(X)
encoding.shape
torch.save(encoding, 'electricity_encoding.pt')

In [19]:
encoding = torch.load('electricity_encoding.pt')