## Online Retail ##

taken from `https://archive.ics.uci.edu/dataset/352/online+retail`

In [None]:
#import from online repos

#pip install ucimlrepo

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from ucimlrepo import fetch_ucirepo  
 
# Fetch dataset 352 ("Online Retail") through the ucimlrepo client.
online_retail = fetch_ucirepo(id=352)

# Print each descriptive section of the fetched object under its banner line.
for banner, payload in (('METADATA', online_retail.metadata),
                        ("VARIABLES", online_retail.variables)):
    print(banner)
    print(payload)


In [None]:
# Dataset exploration: list every (section, entry) pair of the fetched object,
# followed by the entry's shape whenever it exposes one.

for section in online_retail.keys():
    group = online_retail[section]
    for name in group.keys():
        entry = group[name]
        # Entries without a `.shape` attribute are listed by name only.
        shape_info = (entry.shape,) if hasattr(entry, 'shape') else ()
        print(section, name, *shape_info)

In [None]:
# Keep the relevant features and aggregate them to daily totals per item.
#
# The three columns are taken as an explicit copy: the earlier version assigned
# into a column selection of the fetched frame, which pandas flags with
# SettingWithCopyWarning. Working on a private copy also means re-running this
# cell always starts from the untouched fetched data.
sales = online_retail['data']['features'][['Description', 'InvoiceDate', 'Quantity']].copy()

# Distinct descriptions as a sorted list of strings; str() lets non-string
# entries (e.g. a missing description) be ordered together with the rest.
items = sorted(str(item) for item in sales['Description'].unique())

# Reduce each timestamp to its calendar day so all sales of a day aggregate together.
sales['InvoiceDate'] = pd.to_datetime(sales['InvoiceDate']).dt.date
start, end = min(sales['InvoiceDate']), max(sales['InvoiceDate'])

# Total quantity per (Description, InvoiceDate). The result keeps the name `df`
# because the following cells index it by item.
df = sales.groupby(['Description', 'InvoiceDate'])['Quantity'].sum()

start, end

In [None]:
# Fill the missing days with zeros.

# Only the descriptions sorted before this marker are kept.
# NOTE(review): presumably everything from the marker onwards is a lower-case
# bookkeeping note rather than a product -- confirm.
# The slice gets its own name instead of overwriting `items`: the earlier
# rebinding removed the marker from `items`, so running the cell a second time
# failed with ValueError inside `items.index`.
cutoff_item = 'add stock to allocate online orders'
items_kept = items[:items.index(cutoff_item)]

# Every series is laid out on the same daily calendar; build it once, not per item.
calendar = pd.date_range(start, end)

items_new = []
data = {}

for item in items_kept:
    daily = df[item]  # quantities of this item, indexed by invoice day
    # Skip items that have a negative total on any day.
    if (daily < 0).any():
        continue
    ts = daily.reindex(calendar, fill_value=0)
    data[item] = {
        'Values' : ts.values,
        'Date' : ts.index
    }
    items_new.append(item)

len(items_new)

In [None]:
# Quick visual check: plot the first stored series, then show its raw values.

item = list(data)[0]
ts = data[item]
dates, values = ts['Date'], ts['Values']
plt.plot(dates, values)
values

In [None]:
# Save the series in another format (.json): one record per kept item.

# NOTE(review): GluonTS spells the static-category field 'feat_static_cat';
# confirm that the shorter 'feat_stat_cat' key is what the consumer expects.
records = {'target': [], 'start': [], 'feat_stat_cat': []}
for position, name in enumerate(items_new):
    series = data[name]
    records['target'].append(list(series['Values']))
    records['start'].append(str(series['Date'][0]))
    records['feat_stat_cat'].append([position])
df = pd.DataFrame(records)

path = "/Users/stefano.damato/switchdrive/Private/PhD/data/OnlineRetail/"
json_file = path + 'data.json'
df.to_json(json_file)

# Read the file back to display what was written.
pd.read_json(json_file)

In [None]:
# Save a .csv too: one row per kept item, values stored as int64.

value_rows = [list(data[name]['Values']) for name in items_new]
value_matrix = np.array(value_rows, dtype=np.int64)
csv_file = path + 'data.csv'
pd.DataFrame(value_matrix).to_csv(csv_file, index=False)

# Read the file back to display what was written.
pd.read_csv(csv_file)

## Auto ##

taken from `https://github.com/canerturkmen/gluon-ts/tree/intermittent-datasets/datasets/intermittent_auto`

In [None]:
#get .json from url

import pandas as pd
import numpy as np

# Raw-file locations of the two splits; pandas reads them straight from the URL.
train_url = 'https://raw.githubusercontent.com/canerturkmen/gluon-ts/intermittent-datasets/datasets/intermittent_auto/train/data.json'
test_url = 'https://raw.githubusercontent.com/canerturkmen/gluon-ts/intermittent-datasets/datasets/intermittent_auto/test/data.json'

# Train split first, then test split.
train = pd.read_json(train_url)
test = pd.read_json(test_url)

In [None]:
# Spot-check one series: print its length in each split, then show both records.

i = 1234
train_len = len(train.target[i])
test_len = len(test.target[i])
print(train_len, test_len)
(train.iloc[i], test.iloc[i])

In [None]:
# Save both splits as JSON files under `path`.

path = "/Users/stefano.damato/switchdrive/Private/PhD/data/Auto/"
train_file, test_file = path + "train.json", path + "test.json"

train.to_json(train_file)
test.to_json(test_file)

# Read the test split back to display what was written.
pd.read_json(test_file)

In [None]:
# Collect the test series into a .csv too: one row per series.
#
# The matrix is built directly from the targets, which fixes two problems of
# the earlier pre-allocated np.empty((len(test), 24)) buffer:
#   * the buffer was float64, so the int64 cast applied to every row was lost
#     and the file stored the values as floats;
#   * the series length was hard-coded to 24.
# This is the same construction the RAF section uses for its .csv.
# Series of unequal length still raise an error rather than being padded.
data = np.array(list(test.target), dtype=np.int64)

pd.DataFrame(data).to_csv(path + 'data.csv', index=False)

# Read the file back to display what was written.
pd.read_csv(path + 'data.csv')

## RAF ##

taken from `https://github.com/canerturkmen/gluon-ts/tree/intermittent-datasets/datasets/intermittent_raf`

In [None]:
# Load both RAF splits from the JSON files under `path`, then display the test split.
path = "/Users/stefano.damato/switchdrive/Private/PhD/data/RAF/"
train_file, test_file = path + 'train.json', path + 'test.json'

train = pd.read_json(train_file)
test = pd.read_json(test_file)
test

In [None]:
# Spot-check one series: print its length in each split, then show both records.
i = 1234
train_len = len(train.target[i])
test_len = len(test.target[i])
print(train_len, test_len)
(train.iloc[i], test.iloc[i])

In [None]:
# Write the test series as an int64 table (one row per series), then read it back.
target_rows = list(test.target)
target_matrix = np.array(target_rows, dtype=np.int64)
csv_file = path + 'data.csv'
pd.DataFrame(target_matrix).to_csv(csv_file, index=False)

pd.read_csv(csv_file)

## carparts ##

In [None]:
#import the data from the original .csv

import pandas as pd
import datetime

path = "/Users/stefano.damato/switchdrive/Private/PhD/data/carparts/"

# Read the ';'-separated source with its first column as the index, transpose
# it, and write the result without the index.
source_table = pd.read_csv(path + 'carparts.csv', sep=';', index_col=0)
transposed = source_table.T
transposed.to_csv(path + 'data.csv', index=False)

# Read the file back to display what was written.
pd.read_csv(path + 'data.csv')

In [None]:
# Save the .json: the test split holds every row of data.csv in full, the
# train split holds the same rows without their last 6 values.
# NOTE(review): GluonTS spells the static-category field 'feat_static_cat';
# confirm that the shorter 'feat_stat_cat' key is what the consumer expects.

data = pd.read_csv(path + 'data.csv')

start = datetime.date(1998, 1, 1)

n_series = len(data)
start_label = str(start)  # the same start date is recorded for every series
full_targets = [list(data.iloc[row]) for row in range(n_series)]

df_test = pd.DataFrame({
    'start' : [start_label] * n_series,
    'target' : full_targets,
    'feat_stat_cat' : [[row] for row in range(n_series)]
})

df_train = pd.DataFrame({
    'start' : [start_label] * n_series,
    'target' : [series[:-6] for series in full_targets],
    'feat_stat_cat' : [[row] for row in range(n_series)]
})

df_test.to_json(path + 'test.json')
df_train.to_json(path  + 'train.json')

# Read the test split back to display what was written.
pd.read_json(path  + 'test.json')

## Syph ##

In [None]:
#import the data from the original .csv

import pandas as pd
import datetime

path = "/Users/stefano.damato/switchdrive/Private/PhD/data/Syph/"

# Read the ';'-separated source with its first column as the index, transpose
# it, and write the result without the index.
source_table = pd.read_csv(path + 'syph.csv', sep=';', index_col=0)
transposed = source_table.T
transposed.to_csv(path + 'data.csv', index=False)

# Read the file back to display what was written.
pd.read_csv(path + 'data.csv')

In [None]:
# Save the .json: the test split holds every row of data.csv in full, the
# train split holds the same rows without their last 12 values.
# NOTE(review): GluonTS spells the static-category field 'feat_static_cat';
# confirm that the shorter 'feat_stat_cat' key is what the consumer expects.

# `datetime` stays bound to the module. This cell used to run
# `from datetime import datetime`, which rebound the name to the class; after
# that, re-running the carparts cell (`datetime.date(1998, 1, 1)`) raised
# TypeError.
import datetime

data = pd.read_csv(path + 'data.csv')

# str(start) is unchanged by the import fix: '2007-01-01 00:00:00'.
start = datetime.datetime(2007, 1, 1)

n_series = len(data)
start_label = str(start)  # the same start date is recorded for every series
full_targets = [list(data.iloc[row]) for row in range(n_series)]

df_test = pd.DataFrame({
    'start' : [start_label] * n_series,
    'target' : full_targets,
    'feat_stat_cat' : [[row] for row in range(n_series)]
})

df_train = pd.DataFrame({
    'start' : [start_label] * n_series,
    'target' : [series[:-12] for series in full_targets],
    'feat_stat_cat' : [[row] for row in range(n_series)]
})

df_test.to_json(path + 'test.json')
df_train.to_json(path  + 'train.json')

df_test

In [None]:
# Development leftover: IPython `?` help lookup (not plain Python syntax).
# NOTE(review): `gluonts` is not imported anywhere in the visible notebook, so
# on a fresh kernel this lookup presumably finds nothing -- import the package
# first or drop the cell before sharing.
?gluonts.dataset.Dataset