## Dataset preparation & pre-processing

### Exploratory Data Analysis (EDA)

In [1]:
import os
import sys
import glob

import numpy as np
import pandas as pd

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [3]:
from scipy.fftpack import fft
from scipy.signal import welch

In [4]:
from detect_peaks import detect_peaks as dpk

In [5]:
%%javascript
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

<IPython.core.display.Javascript object>

In [6]:
# Directory holding the recorded .csv files. Built with os.path.join so the
# notebook is not tied to the Windows path separator; the trailing '' keeps the
# final separator that the later `data_path + file_name` concatenations rely on.
data_path = os.path.join(os.getcwd(), 'data', '2018-09-20', '')

In [7]:
data_path

'C:\\Users\\Martino\\Jupyter notebooks\\coffee_machine\\data\\2018-09-20\\'

#### EDA singolo file

In [None]:
# importa un singolo file in un Dataframe di pandas
file_name = "2_coffee-pouring-normal-20180920--15_24_10.csv"
df = pd.read_csv(data_path + file_name, delimiter=',', header=None)

In [None]:
# rinomina le feature del df
df.rename(columns={0:'date_time', 1:'x', 2:'y', 3:'z', 4:'i'}, inplace=True)

In [None]:
# trasforma il tipo della feature "date_time" nel formato date_time
df['date_time'] = pd.to_datetime(df['date_time'])

In [None]:
df.shape

In [None]:
df.head()

In [None]:
df.tail()

Plottiamo le time series delle singole feature

In [None]:
plt.figure(figsize=(16,6))
sns.lineplot(x='date_time', y='x', data=df[0:2500], color='r')

In [None]:
plt.figure(figsize=(16,6))
sns.lineplot(x='date_time', y='y', data=df[0:2500], color='b')

In [None]:
plt.figure(figsize=(16,6))
sns.lineplot(x='date_time', y='z', data=df[0:2500], color='g')

In [None]:
plt.figure(figsize=(16,6))
sns.lineplot(x='date_time', y='i', data=df[0:2500], color='orange')

In [None]:
fig, ax = plt.subplots(figsize=(16,6))

sns.lineplot(data=df['x'][0:2500], color='r', alpha=0.3)
sns.lineplot(data=df['y'][0:2500], color='g', alpha= 0.3)
sns.lineplot(data=df['z'][0:2500], color='b', alpha=0.3)

#### EDA tutti i file

In [None]:
"""importa tutti i file dei dati .csv, li etichetta
e li concatena in un unico dataframe"""

# Import every .csv data file, label it and concatenate everything into a
# single DataFrame.
all_files = glob.glob(os.path.join(data_path, "*.csv"))

f_list = []

for f in all_files:
    # skip the first samples to remove the initial "noise"
    df = pd.read_csv(f, skiprows=1024, delimiter=',', header=None)
    df.rename(columns={0:'date_time', 1:'x', 2:'y', 3:'z', 4:'i'}, inplace=True)
    df['date_time'] = pd.to_datetime(df['date_time'])
    # the label of each sub-dataframe is the first character of the file name;
    # os.path.basename extracts it whatever the path separator is
    df['label'] = os.path.basename(f)[0]
    f_list.append(df)

# concatenate the individual DataFrames
dframe = pd.concat(f_list)

In [None]:
dframe.head()

In [None]:
dframe.tail()

In [None]:
dframe.label.value_counts()

In [None]:
# Bar chart of the number of samples per class label.
# Series.value_counts replaces the deprecated top-level pd.value_counts.
count_classes = dframe['label'].value_counts(sort=True).sort_index()
count_classes.plot(kind='bar')
plt.title("Classes distribution")
plt.xlabel("Class")
plt.ylabel("Frequency")
# class names listed in label order ('1' to '7')
labels = ['empty', 'normal', 'hard', 'moving', 'stand_by', 'steam', 'normal-moving']

In [None]:
dframe['m'] = np.sqrt(dframe['x']**2 + dframe['y']**2 + dframe['z']**2)

In [None]:
dframe['m'].describe()

In [None]:
dframe.head()

In [None]:
plt.figure(figsize=(16,10))

for c in range(1, 8):
    plt.plot(dframe[dframe['label'] == str(c)]['m'][0:7000], linewidth=.5, alpha=0.70)

plt.legend(labels)

### Pre-processing

#### Pre-processing FFT singolo file

In [None]:
file_name = '2_coffee-pouring-normal-20180920--15_24_10.csv'
df = pd.read_csv(data_path + file_name, delimiter=',', header=None)

In [None]:
df.head()

In [None]:
df.tail()

In [None]:
df.rename(columns={0:'date_time', 1:'x', 2:'y', 3:'z', 4:'i'}, inplace=True)

In [None]:
df['date_time'] = pd.to_datetime(df['date_time'])

In [None]:
df.head()

In [None]:
df.tail()

In [None]:
SAMPLES = 128

In [None]:
df = df[0:SAMPLES]

In [None]:
#df.reset_index(drop=True, inplace=True)

In [None]:
#df = df[500:2000]

In [None]:
len(df)

In [None]:
df.head()

In [None]:
end_time_idx = len(df) - 1

In [None]:
df['date_time'][end_time_idx] - df['date_time'][0]

In [None]:
delta_time_sec = (df['date_time'][end_time_idx] - df['date_time'][0]).total_seconds()

In [None]:
delta_time_sec

In [None]:
N = len(df)

In [None]:
T = delta_time_sec/N
T

In [None]:
round(1/T, 2)

In [None]:
f_values = np.linspace(0.0, 1.0/(2.0*T), N//2)

In [None]:
#f_values

In [None]:
f_values.size

In [None]:
fft_values_ = fft(df['x'])

In [None]:
fft_values = 2.0/N * np.abs(fft_values_[0:N//2])

In [None]:
fft_values.size

In [None]:
plt.figure(figsize=(8,4))
plt.plot(f_values, fft_values, linestyle='-', color='blue')
plt.xlabel('Frequency [Hz]', fontsize=16)
plt.ylabel('Amplitude', fontsize=16)
plt.title("Frequency domain of the signal", fontsize=16)
plt.show()

In [None]:
# NB: la scala delle ascisse non corrisponde a quella reale,
# l'indice dei picchi è invece corretto e ci consente di ottenere
# il valore dei picchi
ind = dpk(fft_values, mph=750, mpd=10, show=True)

In [None]:
# funzione di Duarte editata per ottenere la giusta scala in ascissa (completare)
#ind = detect_peaks_edit(f_values, fft_values, N, T, mph=0, mpd=20, show=True)

In [None]:
f_values[ind]

Definiamo una funzione che racchiuda gli step precedenti

In [8]:
"""applica la FFT al segnale campionato e catturato in un file .csv
dopo averlo importato in un DataFrame di pandas;
chiede di specificare la variabile da processare [axis],
il colore del plot ed eventualmente la label"""
def preprocess_signal_FFT(file_name, axis, start_idx, end_idx, color, label=""):
    """Plot the single-sided FFT amplitude spectrum of one recorded variable.

    The .csv file is read from ``<cwd>/data/2018-09-20`` into a pandas
    DataFrame, rows ``start_idx:end_idx`` are kept and the FFT amplitude of
    column `axis` is plotted against frequency (the first bin is left out of
    the plot).

    Parameters
    ----------
    file_name : str
        Name of the .csv file; its columns are read as date_time, x, y, z, i.
    axis : str
        Column to process: 'x', 'y', 'z', 'i' or 'm' (the magnitude
        sqrt(x**2 + y**2 + z**2) computed here).
    start_idx, end_idx : int
        Bounds of the row slice ``df[start_idx:end_idx]``.
    color : str
        Matplotlib color of the curve.
    label : str, optional
        Text appended to the plot title.

    Raises
    ------
    ValueError
        If the selected window holds fewer than two samples or its first and
        last timestamps do not span a positive time.
    """
    # load the data into a pandas DataFrame
    file_path = os.path.join(os.getcwd(), 'data', '2018-09-20', file_name)
    df = pd.read_csv(file_path, delimiter=',', header=None)
    df.rename(columns={0:'date_time', 1:'x', 2:'y', 3:'z', 4:'i'}, inplace=True)
    df['date_time'] = pd.to_datetime(df['date_time'])
    df['m'] = np.sqrt(df['x']**2 + df['y']**2 + df['z']**2)
    df = df[start_idx:end_idx]

    N = len(df)
    if N < 2:
        raise ValueError("the selected window must contain at least two samples")

    # Positional access: after slicing, the index labels start at start_idx,
    # so label-based lookups of 0 / len(df)-1 fail for a non-zero start_idx.
    timestamps = df['date_time']
    delta_time_sec = (timestamps.iloc[-1] - timestamps.iloc[0]).total_seconds()
    if delta_time_sec <= 0:
        raise ValueError("the selected window does not span a positive time")

    # mean sampling period estimated from the time span of the window
    # NOTE(review): N samples span N-1 periods, so delta/(N-1) may be the
    # intended estimate - confirm before changing the frequency scale.
    T = delta_time_sec / N

    # apply the FFT to the signal and keep the first half of the spectrum
    f_values = np.linspace(0.0, 1.0/(2.0*T), N//2)
    fft_values_ = fft(df[axis])
    fft_values = 2.0/N * np.abs(fft_values_[0:N//2])

    # plot the processed signal (a single figure: plt.subplots creates it)
    fig, ax = plt.subplots(figsize=(10,6))
    ax.plot(f_values[1:], fft_values[1:], linestyle='-', color=color)
    plt.xlabel('Frequency [Hz]', fontsize=16)
    plt.ylabel('Amplitude', fontsize=16)
    plt.title("Frequency domain of the signal " + axis + " " + label, fontsize=16)
    plt.show()

In [None]:
file_name = "3_coffee-pouring-hard-20180920--15_33_50.csv"

In [None]:
preprocess_signal_FFT(file_name, 'x', start_idx=0, end_idx=128, color='red')

#### Pre-processing FFT tutti i file

In [None]:
# keep only the .csv data files: the loops over this list use the first
# character of each name as the class key, so other directory entries would
# break them
files_list = [name for name in os.listdir(data_path) if name.lower().endswith('.csv')]
files_list

In [None]:
colors = {'1':'red', '2':'blue', '3':'green', '4':'orange', '5':'purple', '6':'cyan', '7':'pink'}
labels = {'1':'(empty)', '2':'(normal pouring)', '3':'(hard pouring)', '4':'(moving)', '5':'(stand-by)', 
          '6':'(steam)', '7':'(normal-moving)'}

In [None]:
for f in files_list:
    file_name = f
    c = file_name[0]
    preprocess_signal_FFT(file_name, start_idx=0, end_idx=128, axis='x', color=colors[c], label=labels[c])

#### Pre-processing PSD singolo file

In [None]:
file_name = '2_coffee-pouring-normal-20180920--15_25_34.csv'

In [None]:
SAMPLES = 128

In [None]:
df = pd.read_csv(data_path + file_name, delimiter=',', header=None)
df.rename(columns={0:'date_time', 1:'x', 2:'y', 3:'z', 4:'i'}, inplace=True)
df['date_time'] = pd.to_datetime(df['date_time'])
df = df[0:SAMPLES]
start_time = 0
end_time = len(df) - 1
delta_time_sec = (df['date_time'][end_time] - df['date_time'][start_time]).total_seconds()
N = len(df)
T = delta_time_sec / N
f = 1 / T

In [None]:
f_values, psd_values = welch(df['x'], fs=f, nperseg=128)

In [None]:
len(f_values)

In [None]:
len(psd_values)

In [None]:
plt.figure()
fig, ax = plt.subplots(figsize=(10,6))
ax.plot(f_values, psd_values, linestyle='-', color='blue')
plt.xlabel('Frequency [Hz]')
plt.ylabel('PSD [V**2 / Hz]')
plt.title("PSD of the signal " + "x" + " " + "(normal)", fontsize=16)
plt.show()

Definiamo una funzione che racchiuda gli step precedenti

In [9]:
"""calcola la Power Spectral Density PSD del segnale catturato in un file .csv
dopo averlo importato in un DataFrame di pandas;
chiede di specificare la variabile da processare [axis], la finestra temporale
il colore del plot ed eventualmente la label"""
def preprocess_signal_PSD(file_name, axis, start_idx, end_idx, color, label=""):
    """Plot the Power Spectral Density (Welch estimate) of one recorded variable.

    The .csv file is read from ``<cwd>/data/2018-09-20`` into a pandas
    DataFrame, rows ``start_idx:end_idx`` are kept and the PSD of column
    `axis` is estimated with ``scipy.signal.welch`` using a single segment as
    long as the selected window.

    Parameters
    ----------
    file_name : str
        Name of the .csv file; its columns are read as date_time, x, y, z, i.
    axis : str
        Column to process: 'x', 'y', 'z', 'i' or 'm' (the magnitude
        sqrt(x**2 + y**2 + z**2) computed here).
    start_idx, end_idx : int
        Bounds of the row slice ``df[start_idx:end_idx]``.
    color : str
        Matplotlib color of the curve.
    label : str, optional
        Text appended to the plot title.

    Raises
    ------
    ValueError
        If the selected window holds fewer than two samples or its first and
        last timestamps do not span a positive time.
    """
    # load the data into a pandas DataFrame
    file_path = os.path.join(os.getcwd(), 'data', '2018-09-20', file_name)
    df = pd.read_csv(file_path, delimiter=',', header=None)
    df.rename(columns={0:'date_time', 1:'x', 2:'y', 3:'z', 4:'i'}, inplace=True)
    df['date_time'] = pd.to_datetime(df['date_time'])
    df['m'] = np.sqrt(df['x']**2 + df['y']**2 + df['z']**2)
    df = df[start_idx:end_idx]

    N = len(df)
    if N < 2:
        raise ValueError("the selected window must contain at least two samples")

    # Positional access: after slicing, the index labels start at start_idx,
    # so label-based lookups of 0 / len(df)-1 fail for a non-zero start_idx.
    timestamps = df['date_time']
    delta_time_sec = (timestamps.iloc[-1] - timestamps.iloc[0]).total_seconds()
    if delta_time_sec <= 0:
        raise ValueError("the selected window does not span a positive time")

    # mean sampling period and sampling frequency estimated from the window
    # NOTE(review): N samples span N-1 periods, so delta/(N-1) may be the
    # intended estimate - confirm before changing the frequency scale.
    T = delta_time_sec / N
    f = 1 / T

    # The segment length is the number of rows actually selected, which can be
    # smaller than end_idx - start_idx when the file is short.
    f_values, psd_values = welch(df[axis], fs=f, nperseg=N)

    # plot the processed signal (a single figure: plt.subplots creates it)
    fig, ax = plt.subplots(figsize=(10,6))
    ax.plot(f_values, psd_values, linestyle='-', color=color)
    plt.xlabel('Frequency [Hz]')
    plt.ylabel('PSD [V**2 / Hz]')
    plt.title("PSD of the signal " + axis + " " + label, fontsize=16)
    plt.show()

Utilizziamo il metodo di Marcos Duarte per calcolare l'indice dei picchi nel segnale pre-processato

In [None]:
# %load ./../functions/detect_peaks.py
"""Detect peaks in data based on their amplitude and other features."""

from __future__ import division, print_function
import numpy as np

__author__ = "Marcos Duarte, https://github.com/demotu/BMC"
__version__ = "1.0.5"
__license__ = "MIT"


def detect_peaks_edit(x_values, x, N, T, mph=None, mpd=1, threshold=0, edge='rising',
                 kpsh=False, valley=False, show=False, ax=None):

    """Detect peaks in data based on their amplitude and other features.

    Variant of ``detect_peaks`` whose optional plot is drawn against the
    abscissa `x_values` instead of the sample number. The detection itself
    only depends on `x`.

    Parameters
    ----------
    x_values : 1D array_like or None
        abscissa of `x` (same length), only used for the plot. If None and
        both `N` and `T` are given, ``np.linspace(0, 1/(2*T), N//2)`` is used;
        otherwise the data are plotted against their index.
    x : 1D array_like
        data.
    N : int or None
        number of samples of the original signal; only used to rebuild the
        abscissa when `x_values` is None.
    T : float or None
        sampling period of the original signal; only used to rebuild the
        abscissa when `x_values` is None.
    mph : {None, number}, optional (default = None)
        detect peaks that are greater than minimum peak height (if parameter
        `valley` is False) or peaks that are smaller than maximum peak height
         (if parameter `valley` is True).
    mpd : positive integer, optional (default = 1)
        detect peaks that are at least separated by minimum peak distance (in
        number of data).
    threshold : positive number, optional (default = 0)
        detect peaks (valleys) that are greater (smaller) than `threshold`
        in relation to their immediate neighbors.
    edge : {None, 'rising', 'falling', 'both'}, optional (default = 'rising')
        for a flat peak, keep only the rising edge ('rising'), only the
        falling edge ('falling'), both edges ('both'), or don't detect a
        flat peak (None).
    kpsh : bool, optional (default = False)
        keep peaks with same height even if they are closer than `mpd`.
    valley : bool, optional (default = False)
        if True (1), detect valleys (local minima) instead of peaks.
    show : bool, optional (default = False)
        if True (1), plot data in matplotlib figure.
    ax : a matplotlib.axes.Axes instance, optional (default = None).

    Returns
    -------
    ind : 1D array_like
        indices of the peaks in `x`.

    Notes
    -----
    The detection of valleys instead of peaks is performed internally by simply
    negating the data.

    The function can handle NaN's

    See this IPython Notebook [1]_.

    References
    ----------
    .. [1] http://nbviewer.ipython.org/github/demotu/BMC/blob/master/notebooks/DetectPeaks.ipynb

    Examples
    --------
    >>> x = [0, 1, 0, 2, 0, 3, 0, 2, 0, 1, 0]
    >>> # set minimum peak distance = 2
    >>> detect_peaks_edit(None, x, None, None, mpd=2)
    array([1, 5, 9])

    >>> x = [0, 1, 1, 0, 1, 1, 0]
    >>> # detect both edges
    >>> detect_peaks_edit(None, x, None, None, edge='both')
    array([1, 2, 4, 5])

    >>> x = [-2, 1, -2, 2, 1, 1, 3, 0]
    >>> # set threshold = 2
    >>> detect_peaks_edit(None, x, None, None, threshold=2)
    array([1, 6])

    Version history
    ---------------
    '1.0.5':
        The sign of `mph` is inverted if parameter `valley` is True

    """

    x = np.atleast_1d(x).astype('float64')
    if x.size < 3:
        return np.array([], dtype=int)
    if valley:
        x = -x
        if mph is not None:
            mph = -mph
    # find indices of all peaks
    dx = x[1:] - x[:-1]
    # handle NaN's
    indnan = np.where(np.isnan(x))[0]
    if indnan.size:
        x[indnan] = np.inf
        dx[np.where(np.isnan(dx))[0]] = np.inf
    ine, ire, ife = np.array([[], [], []], dtype=int)
    if not edge:
        ine = np.where((np.hstack((dx, 0)) < 0) & (np.hstack((0, dx)) > 0))[0]
    else:
        if edge.lower() in ['rising', 'both']:
            ire = np.where((np.hstack((dx, 0)) <= 0) & (np.hstack((0, dx)) > 0))[0]
        if edge.lower() in ['falling', 'both']:
            ife = np.where((np.hstack((dx, 0)) < 0) & (np.hstack((0, dx)) >= 0))[0]
    ind = np.unique(np.hstack((ine, ire, ife)))
    # handle NaN's
    if ind.size and indnan.size:
        # NaN's and values close to NaN's cannot be peaks
        # (np.isin is the replacement for the deprecated np.in1d)
        ind = ind[np.isin(ind, np.unique(np.hstack((indnan, indnan-1, indnan+1))), invert=True)]
    # first and last values of x cannot be peaks
    if ind.size and ind[0] == 0:
        ind = ind[1:]
    if ind.size and ind[-1] == x.size-1:
        ind = ind[:-1]
    # remove peaks < minimum peak height
    if ind.size and mph is not None:
        ind = ind[x[ind] >= mph]
    # remove peaks - neighbors < threshold
    if ind.size and threshold > 0:
        dx = np.min(np.vstack([x[ind]-x[ind-1], x[ind]-x[ind+1]]), axis=0)
        ind = np.delete(ind, np.where(dx < threshold)[0])
    # detect small peaks closer than minimum peak distance
    if ind.size and mpd > 1:
        ind = ind[np.argsort(x[ind])][::-1]  # sort ind by peak height
        idel = np.zeros(ind.size, dtype=bool)
        for i in range(ind.size):
            if not idel[i]:
                # keep peaks with the same height if kpsh is True
                idel = idel | (ind >= ind[i] - mpd) & (ind <= ind[i] + mpd) \
                    & (x[ind[i]] > x[ind] if kpsh else True)
                idel[i] = False  # Keep current peak
        # remove the small peaks and sort back the indices by their occurrence
        ind = np.sort(ind[~idel])

    if show:
        # restore the data as they were received before plotting them
        if indnan.size:
            x[indnan] = np.nan
        if valley:
            x = -x
            if mph is not None:
                mph = -mph
        if x_values is None and N is not None and T:
            # rebuild the abscissa from the sampling parameters
            x_values = np.linspace(0.0, 1.0/(2.0*T), N//2)
        _plot(x, mph, mpd, threshold, edge, valley, ax, ind, x_values)

    return ind


def _plot(x, mph, mpd, threshold, edge, valley, ax, ind, x_values=None):
    """Plot results of the detect_peaks_edit function, see its help.

    `x_values` is the abscissa of `x`; when it is None the data are plotted
    against their index. Curve, peak markers and axis limits all use the same
    abscissa so that the markers sit on the curve.
    """
    try:
        import matplotlib.pyplot as plt
    except ImportError:
        print('matplotlib is not available.')
    else:
        if ax is None:
            _, ax = plt.subplots(1, 1, figsize=(8, 4))

        if x_values is None:
            x_values = np.arange(x.size)
            xlabel = 'Data #'
        else:
            x_values = np.asarray(x_values, dtype='float64')
            xlabel = 'Frequency [Hz]'

        ax.plot(x_values, x, 'b', lw=1)
        if ind.size:
            label = 'valley' if valley else 'peak'
            label = label + 's' if ind.size > 1 else label
            ax.plot(x_values[ind], x[ind], '+', mfc=None, mec='r', mew=2, ms=8,
                    label='%d %s' % (ind.size, label))
            ax.legend(loc='best', framealpha=.5, numpoints=1)
        # 2% horizontal margin, expressed in the units of the abscissa
        x_min, x_max = x_values.min(), x_values.max()
        x_span = x_max - x_min if x_max > x_min else 1
        ax.set_xlim(x_min - 0.02*x_span, x_max + 0.02*x_span)
        ymin, ymax = x[np.isfinite(x)].min(), x[np.isfinite(x)].max()
        yrange = ymax - ymin if ymax > ymin else 1
        ax.set_ylim(ymin - 0.1*yrange, ymax + 0.1*yrange)
        ax.set_xlabel(xlabel, fontsize=14)
        ax.set_ylabel('Amplitude', fontsize=14)
        mode = 'Valley detection' if valley else 'Peak detection'
        ax.set_title("%s (mph=%s, mpd=%d, threshold=%s, edge='%s')"
                     % (mode, str(mph), mpd, str(threshold), edge))
        # plt.grid()
        plt.show()

In [None]:
dpk(psd_values, mph=100000, mpd=10, threshold=0, show=True)

In [None]:
# keep only the .csv data files: the loop over this list uses the first
# character of each name as the key of `colors` and `labels`, so other
# directory entries would break it
files_list = [name for name in os.listdir(data_path) if name.lower().endswith('.csv')]
colors = {'1':'red', '2':'blue', '3':'green', '4':'orange', '5':'purple', '6':'cyan', '7':'pink'}
labels = {'1':'(empty)', '2':'(normal pouring)', '3':'(hard pouring)', '4':'(moving)', '5':'(stand-by)', 
          '6':'(steam)', '7':'(normal-moving)'}

In [None]:
for f in files_list:
    file_name = f
    c = file_name[0]
    preprocess_signal_PSD(file_name, start_idx=0, end_idx=128, axis='x', color=colors[c], label=labels[c])

#### Pre-processing autocorrelation index singolo file

In [None]:
file_name = '2_coffee-pouring-normal-20180920--15_25_34.csv'

In [None]:
df = pd.read_csv(data_path + file_name, delimiter=',', header=None)
df.rename(columns={0:'date_time', 1:'x', 2:'y', 3:'z', 4:'i'}, inplace=True)
df['date_time'] = pd.to_datetime(df['date_time'])
df['m'] = np.sqrt(df['x']**2 + df['y']**2 + df['z']**2)
start_time = 0
end_time = len(df) - 1
delta_time_sec = (df['date_time'][end_time] - df['date_time'][start_time]).total_seconds()
N = len(df)
T = delta_time_sec / N
f = 1 / T

In [None]:
result = np.correlate(df['x'][0:128], df['x'][0:128], mode='full')

In [None]:
#result

In [None]:
result.size

In [None]:
autocorr_values = result[result.size//2:]
#autocorr_values

In [None]:
len(result[len(result)//2:])

In [None]:
# time-delay axis: the multiples of the sampling period T from 0 to N inclusive
t_values = T * np.arange(N + 1)
#t_values

In [None]:
plt.figure(figsize=(12,6))
plt.plot(t_values[0:128], autocorr_values[0:128], linestyle='-', color='blue')
plt.xlabel('time delay [s]')
plt.ylabel('Autocorrelation amplitude')
plt.show()

In [10]:
"""calcola l'indice di autocorrelazione del segnale catturato in un file .csv
dopo averlo importato in un DataFrame di pandas;
chiede di specificare la variabile da processare [axis], la finestra temporale,
il colore del plot ed eventualmente la label"""
def preprocess_signal_corr(file_name, axis, start_idx, end_idx, color, label=""):
    """Plot the autocorrelation of one recorded variable over a row window.

    The .csv file is read from ``<cwd>/data/2018-09-20`` into a pandas
    DataFrame, rows ``start_idx:end_idx`` are kept and the non-negative-lag
    half of the full autocorrelation of column `axis` is plotted against the
    time delay.

    Parameters
    ----------
    file_name : str
        Name of the .csv file; its columns are read as date_time, x, y, z, i.
    axis : str
        Column to process: 'x', 'y', 'z', 'i' or 'm' (the magnitude
        sqrt(x**2 + y**2 + z**2) computed here).
    start_idx, end_idx : int
        Bounds of the row slice ``df[start_idx:end_idx]``.
    color : str
        Matplotlib color of the curve.
    label : str, optional
        Text appended to the plot title.

    Raises
    ------
    ValueError
        If the selected window holds fewer than two samples or its first and
        last timestamps do not span a positive time.
    """
    # load the data into a pandas DataFrame
    file_path = os.path.join(os.getcwd(), 'data', '2018-09-20', file_name)
    df = pd.read_csv(file_path, delimiter=',', header=None)
    df.rename(columns={0:'date_time', 1:'x', 2:'y', 3:'z', 4:'i'}, inplace=True)
    df['date_time'] = pd.to_datetime(df['date_time'])
    df['m'] = np.sqrt(df['x']**2 + df['y']**2 + df['z']**2)
    df = df[start_idx:end_idx]

    N = len(df)
    if N < 2:
        raise ValueError("the selected window must contain at least two samples")

    # Positional access, so that a file shorter than end_idx does not raise a
    # KeyError on the label end_idx - 1.
    timestamps = df['date_time']
    delta_time_sec = (timestamps.iloc[-1] - timestamps.iloc[0]).total_seconds()
    if delta_time_sec <= 0:
        raise ValueError("the selected window does not span a positive time")

    # mean sampling period estimated from the time span of the window
    # NOTE(review): N samples span N-1 periods, so delta/(N-1) may be the
    # intended estimate - confirm before changing the time scale.
    T = delta_time_sec / N

    # autocorrelation of the requested column (not always 'x'); the second
    # half of the 'full' output holds the lags 0 .. N-1
    signal = df[axis].values
    result = np.correlate(signal, signal, mode='full')
    autocorr_values = result[result.size//2:]
    t_values = T * np.arange(N)

    # Plot the processed signal. Both arrays already cover exactly the
    # selected window, so they are plotted whole instead of being sliced
    # again with start_idx:end_idx.
    plt.figure(figsize=(12,6))
    plt.plot(t_values, autocorr_values, linestyle='-', color=color)
    plt.xlabel('time delay [s]')
    plt.ylabel('Autocorrelation amplitude')
    plt.title("Autocorrelation of the signal " + axis + " " + label, fontsize=16)
    plt.show()

In [None]:
preprocess_signal_corr(file_name, axis='x', start_idx=0, end_idx=128, color='b', label='(normal)')

In [None]:
# keep only the .csv data files: the loops over this list use the first
# character of each name as the class key, so other directory entries would
# break them
files_list = [name for name in os.listdir(data_path) if name.lower().endswith('.csv')]

In [None]:
files_list

In [None]:
colors = {'1':'red', '2':'blue', '3':'green', '4':'orange', '5':'purple', '6':'cyan', '7':'pink'}
labels = {'1':'(empty)', '2':'(normal pouring)', '3':'(hard pouring)', '4':'(moving)', '5':'(stand-by)', 
          '6':'(steam)', '7':'(normal-moving)'}

In [None]:
for f in files_list:
    file_name = f
    c = file_name[0]
    preprocess_signal_corr(file_name, axis='x', start_idx=0, end_idx=128, color=colors[c], label=labels[c])

### Data preparation for modelling

In [None]:
data_path

In [None]:
files_list

In [None]:
for f in files_list:
    file_name = f
    c = file_name[0]
    df = pd.read_csv(data_path + f, delimiter=',', header=None)
    print("Il file {0} contiene {1} sample".format(f, len(df)))

In [None]:
#data_path = os.getcwd() + '\\data\\2018-09-20\\'

In [11]:
"""importa un file .csv in un df pandas, rioganizza i nomi delle colonne
e il formato dei dati, lo divide in due df separati secondo lo split_ratio,
e li salva in due file .csv, uno per il training e l'altro per il testing"""

def prepare_data(file_name, end_index, split_ratio):
    """Split one recording into a training and a testing .csv file.

    The file is read from ``<cwd>/data/2018-09-20``, its columns are renamed
    to date_time, x, y, z, i, the timestamps are parsed and only the first
    `end_index` rows are kept. The first `split_ratio` share of the kept rows
    is written to ``<cwd>/data/INPUT_TRAIN/train_<label>.csv`` and the rest to
    ``<cwd>/data/INPUT_TEST/test_<label>.csv``.

    Parameters
    ----------
    file_name : str
        Name of the source .csv file; its first character ('1' to '7') is the
        class key that selects the label used in the output names.
    end_index : int
        Maximum number of rows to keep from the source file.
    split_ratio : float
        Share of the kept rows that goes to the training file.

    Raises
    ------
    KeyError
        If the first character of `file_name` is not a known class key.

    Notes
    -----
    The output names depend only on the class label, so two source files of
    the same class write to the same output files.
    """
    data_path = os.path.join(os.getcwd(), "data")

    labels = {'1':'(empty)', '2':'(normal pouring)', '3':'(hard pouring)', '4':'(moving)', '5':'(stand-by)', 
          '6':'(steam)', '7':'(normal-moving)'}
    label = file_name[0]

    df = pd.read_csv(os.path.join(data_path, "2018-09-20", file_name), sep=',', header=None)
    df.rename(columns={0:'date_time', 1:'x', 2:'y', 3:'z', 4:'i'}, inplace=True)
    df['date_time'] = pd.to_datetime(df['date_time'])
    df = df[0:end_index]

    # Split on the rows actually kept: a file shorter than end_index would
    # otherwise be split at the wrong place (possibly leaving the test set empty).
    split_index = int(len(df) * split_ratio)

    train_dir = os.path.join(data_path, "INPUT_TRAIN")
    test_dir = os.path.join(data_path, "INPUT_TEST")
    for out_dir in (train_dir, test_dir):
        # to_csv does not create missing directories
        os.makedirs(out_dir, exist_ok=True)

    df_train = df[0:split_index]
    df_train.to_csv(os.path.join(train_dir, "train_" + labels[label] + ".csv"), sep=",",
                    index=False, encoding='utf-8')
    df_test = df[split_index:]
    df_test.to_csv(os.path.join(test_dir, "test_" + labels[label] + ".csv"), sep=",",
                   index=False, encoding='utf-8')

In [None]:
# test con un singolo file
file_name = "2_coffee-pouring-normal-20180920--15_24_10.csv"
prepare_data(file_name, end_index=10230, split_ratio=0.7)

In [None]:
# per tutti i file della cartella
for f in files_list:
    
    prepare_data(f, end_index=10230, split_ratio=0.7)

In [12]:
"""importa un file .csv in un df pandas, rioganizza i nomi delle colonne
e il formato dei dati, lo divide in due df separati secondo lo split_ratio,
e per ogni variabile genera due file .csv, uno per il training e l'altro per il testing"""

def prepare_data_split_variables(file_name, end_index, split_ratio, var_to_store):
    """Split one variable of a recording into a training and a testing .csv file.

    The file is read from ``<cwd>/data/2018-09-20``, its columns are renamed
    to date_time, x, y, z, i, the timestamps are parsed and only the first
    `end_index` rows are kept. The date_time and `var_to_store` columns of the
    first `split_ratio` share of the kept rows are written to
    ``<cwd>/data/INPUT_TRAIN/train_<label>_<var_to_store>_.csv`` and those of
    the remaining rows to ``<cwd>/data/INPUT_TEST/test_<label>_<var_to_store>_.csv``.

    Parameters
    ----------
    file_name : str
        Name of the source .csv file; its first character ('1' to '7') is the
        class key that selects the label used in the output names.
    end_index : int
        Maximum number of rows to keep from the source file.
    split_ratio : float
        Share of the kept rows that goes to the training file.
    var_to_store : str
        Name of the column to store next to date_time ('x', 'y', 'z' or 'i').

    Raises
    ------
    KeyError
        If the first character of `file_name` is not a known class key.

    Notes
    -----
    The output names depend only on the class label and the variable, so two
    source files of the same class write to the same output files.
    """
    data_path = os.path.join(os.getcwd(), "data")

    labels = {'1':'(empty)', '2':'(normal pouring)', '3':'(hard pouring)', '4':'(moving)', '5':'(stand-by)', 
          '6':'(steam)', '7':'(normal-moving)'}
    label = file_name[0]
    cols = {0:'date_time', 1:'x', 2:'y', 3:'z', 4:'i'}

    df = pd.read_csv(os.path.join(data_path, "2018-09-20", file_name), sep=',', header=None)
    df.rename(columns=cols, inplace=True)
    df['date_time'] = pd.to_datetime(df['date_time'])
    df = df[0:end_index]

    # Split on the rows actually kept: a file shorter than end_index would
    # otherwise be split at the wrong place (possibly leaving the test set empty).
    split_index = int(len(df) * split_ratio)

    train_dir = os.path.join(data_path, "INPUT_TRAIN")
    test_dir = os.path.join(data_path, "INPUT_TEST")
    for out_dir in (train_dir, test_dir):
        # to_csv does not create missing directories
        os.makedirs(out_dir, exist_ok=True)

    stored_columns = [cols[0], var_to_store]
    suffix = labels[label] + "_" + var_to_store + "_.csv"

    df_train = df[0:split_index]
    df_train.to_csv(os.path.join(train_dir, "train_" + suffix), sep=",", index=False,
                    columns=stored_columns, encoding='utf-8')
    df_test = df[split_index:]
    df_test.to_csv(os.path.join(test_dir, "test_" + suffix), sep=",", index=False,
                   columns=stored_columns, encoding='utf-8')

In [None]:
file_name = "2_coffee-pouring-normal-20180920--15_24_10.csv"

In [None]:
# test con un singolo file
prepare_data_split_variables(file_name, end_index=10230, split_ratio=0.7, var_to_store='x')

In [None]:
# per tutti i file della cartella
cols={0:'date_time', 1:'x', 2:'y', 3:'z', 4:'i'}

for f in files_list:
    
    for i in range(1,4):
        
        prepare_data_split_variables(f, end_index=10230, split_ratio=0.7, var_to_store=cols[i])

In [13]:
os.getcwd()

'C:\\Users\\Martino\\Jupyter notebooks\\coffee_machine'

In [14]:
data_path

'C:\\Users\\Martino\\Jupyter notebooks\\coffee_machine\\data\\2018-09-20\\'

In [100]:
FILE_NAME = "train_(normal pouring)_x_.csv"

In [101]:
# `names` must be list-like (a bare string is not accepted by current pandas);
# with a single name for the two-column file, the first column (date_time)
# becomes the index
data = pd.read_csv(os.path.join(os.getcwd(), "data", "INPUT_TRAIN", FILE_NAME),
                   sep=',', names=['x'], header=0)

In [102]:
data.head()

Unnamed: 0,x
2018-09-20 15:24:10.452,-7464
2018-09-20 15:24:10.456,3976
2018-09-20 15:24:10.458,-584
2018-09-20 15:24:10.460,-584
2018-09-20 15:24:10.462,-584


In [105]:
data.reset_index(drop=True, inplace=True)

In [107]:
data.tail()

Unnamed: 0,x
7156,-4936
7157,-4936
7158,1212
7159,1212
7160,-144


In [108]:
lngth = len(data) // 128 * 128

In [109]:
data = data[0:lngth]

In [110]:
data.size

7040

In [111]:
data1 = data[0:128]

In [112]:
np.shape(data1)

(128, 1)

In [116]:
type(data1.values)

numpy.ndarray

In [117]:
data1_re = np.reshape(data1.values, (1, -1))

In [118]:
data1_re.shape

(1, 128)

In [32]:
len(np.array_split(data, len(data)//128))

In [119]:
split_data = np.array_split(data, len(data)//128)

In [122]:
len(split_data)

55

In [138]:
# Stack the chunks of `split_data` as the rows of a single DataFrame, one
# flattened chunk per row. Building it in one call replaces the per-row
# DataFrame.append loop: DataFrame.append was removed in pandas 2.0 and copied
# the whole frame at every iteration.
input_signals = pd.DataFrame(
    [np.reshape(chunk.values, -1).tolist() for chunk in split_data]
)

In [125]:
len(split_data)

55

In [50]:
len(data1)

55

In [127]:
input_signals

In [48]:
type(data1)

list

In [97]:
input_signals = pd.DataFrame()

In [140]:
input_signals.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,118,119,120,121,122,123,124,125,126,127
0,-7464,3976,-584,-584,-584,552,552,-7728,-7728,4628,...,492,492,-7572,-7572,5016,5016,-920,-920,-920,424
1,424,-7448,-7448,5244,5244,-1604,-1604,-1604,-168,-168,...,4352,4352,-896,-896,868,868,-7300,-7300,-7300,4468
2,-1316,-1316,572,572,-7288,-7288,4792,4792,-1560,-1560,...,-6668,2612,2612,2612,-264,-264,1112,1112,-7296,-7296
3,3480,3480,3480,-504,-504,856,856,-7220,-7220,4140,...,-40,672,672,672,-6240,-6240,2868,2868,-912,-912
4,1012,1012,-6492,-6492,-6492,3432,3432,-1392,-1392,948,...,1652,1652,-276,-276,556,556,-5496,-5496,2196,2196


In [141]:
input_signals.tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,118,119,120,121,122,123,124,125,126,127
50,-1136,-1136,-1648,-1648,-2692,-2692,2000,2000,2000,-852,...,2516,2516,2516,-1288,-1288,-944,-3252,-3252,2348,2348
51,2348,-1160,-1160,-1692,-1692,-2572,-2572,2004,2004,2004,...,72,72,-4288,-4288,2880,2880,-1608,-1608,-532,-532
52,-3900,-3900,-3900,2624,2624,-1220,-1220,-1000,-1000,-3228,...,3016,-1624,-1624,452,452,452,-4932,-4932,2916,2916
53,-1608,-1608,84,84,-4632,-4632,-4632,2944,2944,-1696,...,684,-5332,2788,2788,-1400,-1400,588,588,588,-4976
54,-4976,2872,2872,-1616,-1616,356,356,356,-4844,-4844,...,692,692,-5332,-5332,-5332,2660,2660,-1304,-1304,632
