# pytorch dataloaders

This notebook is used to develop the pytorch dataset and correspond dataloaders.

pytorch dataloaders use a well-defined pytorch dataset to handle the process of generating training/testing/validation sets. The pytorch dataset is just a class that contains two methods, `__len__()` and `__getitem__`. The `len` method just returns the size of the dataset and the `getitem` method returns a single sample and its corresponding label. 

In [64]:
%matplotlib widget
import glob
import os

import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
import numpy as np
import pandas as pd
from pyts.image import RecurrencePlot
import torch
from torch.utils.data import Dataset
from torchvision.transforms import ToTensor

import sys
sys.path.append('/Users/ndmiles/ClassWork/FallQuarter2021/aos_c204/aos_c204_final_project')

from utils import generate_data_chunks

In [65]:
class ICMEDataset(Dataset):
    def __init__(self, datalabels, datadir, transform=None):
        pass

In [66]:
chunk_flist = glob.glob('../data/sta_chunks/sta*txt')

In [67]:
test_chunk = pd.read_csv(chunk_flist[102], header=0, index_col=0, parse_dates=True)

In [68]:
X = test_chunk['BTOTAL'].values

In [69]:
rp = RecurrencePlot(threshold='distance', percentage=10)

In [70]:
test_chunk.resample('10min').mean().dropna()

Unnamed: 0_level_0,BTOTAL,BX(RTN),BY(RTN),BZ(RTN),VP_RTN,TEMPERATURE,BETA
EPOCH,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2011-06-27 00:00:00,3.693066,-0.640593,-0.840517,1.508870,401.468621,58968.160871,2.155385
2011-06-27 00:20:00,3.689943,-0.190876,-1.496312,1.130264,398.885227,58317.818613,2.201446
2011-06-27 00:40:00,3.733945,0.258847,-1.967292,1.192816,395.456471,58939.000048,2.229310
2011-06-27 01:00:00,3.772438,0.615153,-2.585726,1.145958,392.124860,60154.966967,2.287068
2011-06-27 01:20:00,3.912162,0.319812,-2.360731,1.715302,390.926780,58896.865100,2.169453
...,...,...,...,...,...,...,...
2011-06-30 22:40:00,3.928866,-1.274164,-3.131051,-1.410548,391.750182,56838.053978,1.100172
2011-06-30 23:00:00,3.986032,-1.680883,-3.132007,-1.510975,389.671564,56927.447143,1.006858
2011-06-30 23:20:00,4.018877,-1.553664,-3.183556,-1.496514,390.220132,54099.911628,0.947468
2011-06-30 23:40:00,3.984881,-1.464581,-3.133854,-1.597873,389.802187,54351.572736,0.969492


In [168]:
interp_df = test_chunk.interpolate(method='linear', axis=0)

In [83]:
y = test_chunk.rolling('120min', center=True).mean().dropna()['BTOTAL']
y_resampled = test_chunk.resample('20min').mean().dropna()['BTOTAL']
x = np.arange(0, len(y))
x = test_chunk.index
x_resample = np.arange(0, len(y_resampled))
fig, ax = plt.subplots(nrows=1, ncols=1)
ax.plot(x, test_chunk['BTOTAL'])
ax.plot(interp_df.index, interp_df['BTOTAL'],ls='--')
# ax.plot(x, y)
# ax.plot(y_resampled.index, y_resampled,c='k',ls='--')

# test_chunk.resample('20min').mean().plot(y='BETA')

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

[<matplotlib.lines.Line2D at 0x7ff69c2436d0>]

In [84]:
smoothed = interp_df.rolling('90min').mean().dropna()
resampled = smoothed.resample('20min').mean().dropna()

In [85]:
offset = smoothed.index[0] - resampled.index[0]

In [86]:
resampled.index += offset

In [87]:
smoothed.index.to_series().diff(1).value_counts().index[0]

Timedelta('0 days 00:20:00')

In [97]:
fig, ax = plt.subplots(nrows=1, ncols=1)
ax.plot(smoothed.index, smoothed['BZ(RTN)'])
# ax.plot(resampled.index, resampled['BTOTAL'])

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

[<matplotlib.lines.Line2D at 0x7ff69d2670d0>]

In [151]:
imgs = generate_data_chunks.visualize_chunk_img(resampled, cols=resampled.columns, threshold='distance', percentage=10)

In [152]:
len(imgs.keys())

7

In [238]:
fig.suptitle?

[0;31mSignature:[0m [0mfig[0m[0;34m.[0m[0msuptitle[0m[0;34m([0m[0mt[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Add a centered title to the figure.

Parameters
----------
t : str
    The title text.

x : float, default 0.5
    The x location of the text in figure coordinates.

y : float, default 0.98
    The y location of the text in figure coordinates.

horizontalalignment, ha : {'center', 'left', right'}, default: 'center'
    The horizontal alignment of the text relative to (*x*, *y*).

verticalalignment, va : {'top', 'center', 'bottom', 'baseline'}, default: 'top'
    The vertical alignment of the text relative to (*x*, *y*).

fontsize, size : default: :rc:`figure.titlesize`
    The font size of the text. See `.Text.set_size` for possible
    values.

fontweight, weight : default: :rc:`figure.titleweight`
    The font weight of the text. See `.Text.set_weight` for possible
    values.

Returns
-------
text
    T

In [237]:
interp_df.columns()

TypeError: 'Index' object is not callable

In [153]:
def plot_rp(imgs):
    values = imgs.values()
    keys = imgs.keys()
    fig = plt.figure(figsize=(7, 4))
    gs = GridSpec(nrows=2, ncols=4, hspace=0.25, wspace=0.25)
    axes = [fig.add_subplot(gs[0, i]) for i in range(4)]
    axes += [fig.add_subplot(gs[1, i]) for i in range(3)]
    for ax, val, key in zip(axes, values, keys):
        ax.imshow(val[0], cmap='binary', origin='lower', aspect='equal')
#         ax.tick_params(axis='both', which='both', bottom=False, left=False,labelbottom=False, labelleft=False)
        ax.set_title(key)
        ax.grid(False)

In [154]:
plot_rp(imgs)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [155]:
fig, ax = plt.subplots(nrows=1, ncols=1)
ax.plot(smoothed.index, smoothed['BETA'])

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

[<matplotlib.lines.Line2D at 0x7ff458b43dd0>]

In [156]:
diff1 = interp_df['BETA'].iloc[0] - interp_df['BETA']

In [157]:
np.percentile(diff1.values, q=[10])

array([-1.02137145])

In [163]:
len(np.where(diff1.values < -1.02137145)[0])

29

In [164]:
len(imgs['BETA'][0][:,0])

289

In [165]:
imgs['BETA'][0][:,0].sum()

160

In [167]:
289 - 160

129

In [222]:
img_data = list(imgs.values())

In [227]:
img_data = np.concatenate(np.array(img_data))

In [235]:
(img_data[1] == imgs['BTOTAL']).all()

True

### Storing the image data

In [213]:
import h5py

In [214]:
f = h5py.File('test.hdf5', mode='w')

In [215]:
f.keys()

<KeysViewHDF5 []>

In [211]:
f.keys()

<KeysViewHDF5 ['sta']>

In [216]:
grp = f.create_group('sta')

In [217]:
subgrp = grp.create_group('chunk1')

In [218]:
for key, value in imgs.items():
    dst = subgrp.create_dataset(key, dtype=value.dtype, data=value, shape=value.shape)

In [219]:
grp.keys()

<KeysViewHDF5 ['chunk1']>

In [220]:
subgrp.keys()

<KeysViewHDF5 ['BETA', 'BTOTAL', 'BX(RTN)', 'BY(RTN)', 'BZ(RTN)', 'TEMPERATURE', 'VP_RTN']>

In [189]:
interp_df.index[0].strftime('%Y-%m-%d %H:%M:%S')

'2011-06-27 00:00:00'

In [190]:
meta_data = {}
meta_data['start_time'] = interp_df.index[0].strftime('%Y-%m-%d %H:%M:%S')
meta_data['stop_time'] = interp_df.index[-1].strftime('%Y-%m-%d %H:%M:%S')

In [191]:
for key in meta_data.keys():
    dst.attrs[key] = meta_data[key]

In [193]:
dst.value = imgs['BETA']

AttributeError: can't set attribute

In [195]:
del grp['chunk1']

In [212]:
f.close()