# Import Files
This notebook tests the PyTorch data loader built with torchvision's `DatasetFolder` class on the separated CSV files. The data loader will be used in the NN notebooks later.

In [1]:
import os.path as op

import numpy as np
import pandas as pd

import torch

from torchvision.datasets import DatasetFolder

In [2]:
# Root directory handed to DatasetFolder. The csv files sit in a 'class_0'
# subdirectory below this path, so 'class_0' is deliberately not part of it.
root = 'ashrae-energy-prediction/dataloader_train/csv'

In [3]:
# Build the dataset: every '.csv' file found below `root` becomes one sample,
# loaded with pd.read_csv. No sample or target transform is applied yet.
train_dataset = DatasetFolder(
    root,
    loader=pd.read_csv,
    extensions='.csv',
    transform=None,
    target_transform=None,
)

print(train_dataset)

Dataset DatasetFolder
    Number of datapoints: 2380
    Root location: ashrae-energy-prediction/dataloader_train/csv


In [4]:
# Indexing the dataset returns a tuple with two entries:
# - first, a dataframe (the target is one of its columns) that you process
#   and then use inside your mini-batch loop;
# - second, a dummy target derived from the 'class_0' directory. That
#   mechanism exists for classification problems; here the value is always
#   zero and is not used for anything.

data = train_dataset[0]
df, junk = data

print("data type =", type(data))
print("df type =", type(df))
display(df)

print("junk =", junk)

data type = <class 'tuple'>
df type = <class 'pandas.core.frame.DataFrame'>


Unnamed: 0,square_feet,year_built,building_median,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,...,wind_speed_mean_lag3,building_id,site_id,primary_use,hour,week,day_of_week,is_holiday,meter,meter_reading
0,50623,1976.0,4.332031,24.4,0.0,10.0,0.0,1019.400024,0.0,0.0,...,0.500000,105,1,0,0,53,4,1,0,3.190624
1,50623,1976.0,4.332031,3.8,0.0,2.4,0.0,1020.900024,240.0,3.1,...,3.099609,105,1,0,1,53,4,0,0,3.841753
2,50623,1976.0,4.332031,3.7,0.0,2.4,0.0,1021.599976,230.0,2.6,...,2.849609,105,1,0,2,53,4,0,0,3.841753
3,50623,1976.0,4.332031,2.6,0.0,1.9,0.0,1021.900024,0.0,0.0,...,1.900391,105,1,0,3,53,4,0,0,3.830967
4,50623,1976.0,4.332031,2.0,0.0,1.2,0.0,1022.299988,170.0,1.5,...,1.366211,105,1,0,4,53,4,0,0,3.841753
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8779,50623,1976.0,4.332031,8.5,9.0,6.6,0.0,1028.099976,220.0,3.1,...,3.099609,105,1,0,19,52,5,0,0,3.870544
8780,50623,1976.0,4.332031,8.1,9.0,6.5,0.0,1027.500000,220.0,3.6,...,3.099609,105,1,0,20,52,5,0,0,3.840462
8781,50623,1976.0,4.332031,7.2,9.0,6.1,0.0,1026.900024,220.0,4.1,...,3.599609,105,1,0,21,52,5,0,0,3.847485
8782,50623,1976.0,4.332031,6.9,9.0,5.8,0.0,1026.199951,220.0,4.6,...,4.101562,105,1,0,22,52,5,0,0,3.838107


junk = 0


___

In [16]:
# to be able to wrap the dataset into a dataloader, we convert the dataframe to an array
def df_to_array(df):
    """Return the numeric columns of ``df`` as a NumPy array.

    Used as the ``transform`` of the dataset so that each loaded csv can be
    collated into a tensor by the DataLoader.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame loaded from one csv file.

    Returns
    -------
    numpy.ndarray
        2-D array of shape (n_rows, n_numeric_columns). Columns keep their
        original order; non-numeric columns (strings, booleans, categoricals,
        ...) are dropped.
    """
    # select_dtypes keeps the np.number-based filter but, unlike calling
    # np.issubdtype on each column dtype, does not raise on pandas extension
    # dtypes.
    return df.select_dtypes(include=np.number).values

In [17]:
# Wrap the dataset in a DataLoader to make mini-batch training easier.
# A batch size above 1 needs all the time series to have the same length,
# so it stays at 1 for now.
batch_size = 1

train_dataset = DatasetFolder(
    root,
    loader=pd.read_csv,
    extensions='.csv',
    transform=df_to_array,
    target_transform=None,
)
train_loader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=False,
)

In [18]:
# train_loader is now an iterable batch loader; pull the first batch to look at a single example

stuff = next(iter(train_loader))


In [19]:
# `stuff` is a two-element list: the batch of samples followed by the batch of
# dummy targets. In the sample tensor the last column is the meter_reading
# target; the columns before it are the other numeric columns of the csv
# (ending with is_holiday and meter).
stuff

[tensor([[[5.0623e+04, 1.9760e+03, 4.3320e+00,  ..., 1.0000e+00,
           0.0000e+00, 3.1906e+00],
          [5.0623e+04, 1.9760e+03, 4.3320e+00,  ..., 0.0000e+00,
           0.0000e+00, 3.8418e+00],
          [5.0623e+04, 1.9760e+03, 4.3320e+00,  ..., 0.0000e+00,
           0.0000e+00, 3.8418e+00],
          ...,
          [5.0623e+04, 1.9760e+03, 4.3320e+00,  ..., 0.0000e+00,
           0.0000e+00, 3.8475e+00],
          [5.0623e+04, 1.9760e+03, 4.3320e+00,  ..., 0.0000e+00,
           0.0000e+00, 3.8381e+00],
          [5.0623e+04, 1.9760e+03, 4.3320e+00,  ..., 0.0000e+00,
           0.0000e+00, 3.8132e+00]]], dtype=torch.float64), tensor([0])]