# Construcción del conjunto de datos

## Introducción

Tras el análisis previo realizado en el notebook *ble-analysis-filter-clean_v3*, llega el momento de preparar un conjunto con los datos más relevantes de cara a entrenar un algoritmo de Machine Learning capaz de utilizar dichos datos para predecir el número de personas que hay dentro de la biblioteca.

Algunos de los datos de interés son:

- El intervalo de tiempo (Timestamp)
- Número de dispositivos totales
- Número de dispositivos por Raspberry
- Número de dispositivos por pareja de Raspberry
- Número de dispositivos dentro de unos rangos de mensajes por Raspberry
- Número de dispositivos existentes en el intervalo de tiempo anterior
- Número de dispositivos existentes en los dos intervalos de tiempo anteriores

In [1]:
#from google.colab import drive
#drive.mount('/content/drive')

In [2]:
import os
import pathlib
import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np
import matplotlib.pyplot as plt

In [8]:
def transformTimestampPersonCount(personCount):
    """Add a usable ``Timestamp`` column to the people-counter data.

    The ``Fecha`` (date) and ``Hora`` (time) text columns are joined with a
    space and parsed day-first into datetimes.

    Args:
        personCount: DataFrame with string columns ``Fecha`` and ``Hora``.

    Returns:
        The same DataFrame object (modified in place) with the new
        ``Timestamp`` column.
    """
    # Vectorised concatenation: no per-row label lookups, so it also works
    # when the frame does not have a default 0..n-1 index.
    combined = personCount['Fecha'] + ' ' + personCount['Hora']
    # Parsing the raw array yields a DatetimeIndex, which is assigned
    # positionally instead of being re-aligned on the frame's index.
    personCount["Timestamp"] = pd.to_datetime(combined.to_numpy(), dayfirst=True)

    return personCount

def parseData(time, data):
    """Expand grouped BLE data so that every slot of ``time`` has a row.

    Slots missing from ``data`` are filled as follows:

    * first slot: a row of NaN;
    * next slot also missing, or last slot: copy of the previous output row;
    * otherwise: element-wise mean of the previous output row and the next
      row of ``data``, truncated to int (a NaN row if that cannot be done).

    Args:
        time: sequence of timestamps (e.g. a Series) defining the slots.
        data: DataFrame indexed by timestamp.

    Returns:
        DataFrame with the columns of ``data``, indexed by ``Timestamp``.
    """
    columns = data.columns.values
    stamps = list(time)

    if not stamps:
        emptyFrame = pd.DataFrame(columns=columns)
        emptyFrame.index.name = "Timestamp"
        return emptyFrame

    lastPosition = len(stamps) - 1
    # Rows are collected and concatenated once at the end rather than
    # growing a DataFrame on every iteration.
    frames = []
    for i, stamp in enumerate(stamps):
        current = data.loc[data.index == stamp]
        if len(current) != 0:
            df = pd.DataFrame(current.values, columns=columns)
        elif i == 0:
            df = pd.DataFrame([np.full(len(columns), np.nan)], columns=columns)
        else:
            # Every earlier slot contributed a frame, so the last one holds
            # the (possibly already filled) values of the previous slot.
            previous = frames[-1]
            previousValues = previous.loc[:, previous.columns != "Timestamp"].values
            nextMissing = i == lastPosition or len(data.loc[data.index == stamps[i + 1]]) == 0
            if nextMissing:
                # Forward fill from the output built so far. Reading the raw
                # input here would drop the row whenever the previous slot
                # was itself missing.
                df = pd.DataFrame(previousValues, columns=columns)
            else:
                dataPre = previousValues[0]
                dataPost = data.loc[data.index == stamps[i + 1]].values[0]
                array = np.array([dataPre, dataPost], dtype=object)
                try:
                    row = np.mean(array, axis=0).astype(int)
                except (TypeError, ValueError, OverflowError):
                    # e.g. the previous row is NaN and cannot become an int
                    row = np.full(len(columns), np.nan)
                df = pd.DataFrame([row], columns=columns)
        df["Timestamp"] = stamp
        frames.append(df)

    fullData = pd.concat(frames)
    fullData.set_index("Timestamp", inplace=True)

    return fullData

def parsePersonCount(time, personCount, initHour='8:00:00', endHour='21:00:00'):
    """Align the people-counter estimate with the 5-minute slots of ``time``.

    The counter readings are resampled to 5-minute bins (last reading per
    bin), restricted to ``initHour``..``endHour`` on the date of the first
    slot, forward filled, and clamped so negative estimates become 0.
    Slots without any usable reading keep the value 0. A gap at the very
    start of the window stays 0 rather than borrowing the last value of the
    day.

    Args:
        time: timestamps of the slots (Series or index of datetimes).
        personCount: DataFrame with a datetime ``Timestamp`` column and the
            counter column ``Estimación nº Personas``.
        initHour: start of the window, ``H:MM:SS``.
        endHour: end of the window, ``H:MM:SS``.

    Returns:
        DataFrame with columns ``Timestamp`` and ``personCount`` (float),
        one row per slot.
    """
    timestamps = pd.DatetimeIndex(pd.to_datetime(time))
    fullPersonCountInterval = pd.DataFrame({'Timestamp': timestamps, 'personCount': np.zeros(len(timestamps))})
    if len(timestamps) == 0:
        return fullPersonCountInterval

    date = timestamps[0].strftime('%Y-%m-%d')
    windowStart = date + " " + initHour
    windowEnd = date + " " + endHour

    # '5min' is the supported spelling of the 5-minute frequency; the 'T'
    # alias is deprecated in recent pandas releases.
    personCountInterval = personCount.groupby(pd.Grouper(key='Timestamp', freq='5min'))["Estimación nº Personas"].last()
    personCountInterval = personCountInterval.loc[windowStart:windowEnd]

    # Empty bins take the previous reading; negative estimates become 0.
    personCountInterval = personCountInterval.ffill().clip(lower=0)

    # Label-based alignment: slots outside the window (or before the first
    # reading) come back as NaN and are mapped to 0.
    aligned = personCountInterval.reindex(timestamps)
    fullPersonCountInterval["personCount"] = np.trunc(aligned.fillna(0).to_numpy(dtype=float))

    return fullPersonCountInterval

def generateTimeSeriesByHour(data, initHour='7:00:00', endHour='21:55:00'):
    """Build the 5-minute timestamp grid for the day covered by ``data``.

    Args:
        data: DataFrame with a datetime ``Timestamp`` column; the date is
            taken from its first row.
        initHour: first slot of the day, ``H:MM:SS``.
        endHour: last slot of the day, ``H:MM:SS``.

    Returns:
        Series of timestamps from ``initHour`` to ``endHour`` (inclusive)
        spaced 5 minutes apart.
    """
    # Positional access: the first row, whatever the frame's index labels.
    date = data["Timestamp"].iloc[0].date()
    start = str(date) + " " + initHour
    end = str(date) + " " + endHour
    # '5min' replaces the deprecated 'T' frequency alias.
    timeSeries = pd.Series(pd.date_range(start, end, freq='5min'))

    return timeSeries

def readDataFromDirectory(dataPath, fullPersonCountPath):
    """Load the BLE receiver files and the people-counter files.

    Files of both folders are processed in sorted name order and paired by
    position (n-th BLE file with n-th counter file).

    Args:
        dataPath: folder with the ``;``-separated BLE CSV files. Each needs
            a ``Timestamp int.`` column, which is parsed day-first and
            renamed to ``Timestamp``.
        fullPersonCountPath: folder with the ``;``-separated people-counter
            CSV files.

    Returns:
        Tuple ``(dataArray, fullPersonCountIntervalArray, dates)``: the list
        of BLE frames, the list of aligned person-count frames, and the date
        of the first row of each BLE file.
    """
    dataArray = []
    fullPersonCountIntervalArray = []
    dates = []

    # Directory listings come back in arbitrary order; sorting both makes
    # the positional pairing below deterministic.
    # NOTE(review): assumes file names sort the same way (e.g. by date) in
    # both folders — confirm against the actual naming scheme.
    contentPersonCountDirectory = sorted(os.listdir(fullPersonCountPath))
    for file in sorted(pathlib.Path(dataPath).iterdir()):
        data = pd.read_csv(file, sep=';')
        data["Timestamp int."] = pd.to_datetime(data["Timestamp int."], dayfirst=True)
        data = data.rename(columns={"Timestamp int.": "Timestamp"})
        dataArray.append(data)
        dates.append(data["Timestamp"].dt.date.iloc[0])

    for position, fileName in enumerate(contentPersonCountDirectory):
        # os.path.join works with or without a trailing separator.
        personCount = pd.read_csv(os.path.join(fullPersonCountPath, fileName), sep=';')
        personCount = transformTimestampPersonCount(personCount)
        time = generateTimeSeriesByHour(dataArray[position])
        fullPersonCount = parsePersonCount(time, personCount)
        fullPersonCountIntervalArray.append(fullPersonCount)

    return dataArray, fullPersonCountIntervalArray, dates

def parseDataByRaspberry(data):
    """Split the data set into one subset per Raspberry.

    Returns a 5-tuple with the rows whose ``Raspberry`` column equals
    'Raspberry A', 'Raspberry B', 'Raspberry C', 'Raspberry D' and
    'Raspberry E', in that order.
    """
    receivers = ('Raspberry A', 'Raspberry B', 'Raspberry C', 'Raspberry D', 'Raspberry E')
    return tuple(data.loc[data['Raspberry'] == receiver] for receiver in receivers)

def parseDataByRaspberryTime(data):
    """Return, per Raspberry, the number of unique values per Timestamp.

    The data is first split with ``parseDataByRaspberry``; each subset is
    then grouped by ``Timestamp`` and reduced with ``nunique``. The result
    is a 5-tuple in Raspberry order A, B, C, D, E.
    """
    return tuple(
        subset.groupby('Timestamp').nunique()
        for subset in parseDataByRaspberry(data)
    )

def getTotalDevicesByRaspberry(data, timeSeries):
    """Count the unique devices seen by each Raspberry in every time slot.

    Args:
        data: BLE DataFrame with ``Timestamp``, ``Raspberry`` and ``MAC``.
        timeSeries: timestamps of the slots.

    Returns:
        5-tuple of float arrays (Raspberry A..E), one value per slot: the
        number of unique MACs, 0 when that Raspberry saw nothing in a slot
        that exists in ``data``, and NaN when the slot is absent from
        ``data`` altogether.
    """
    slots = pd.DatetimeIndex(timeSeries)
    # Slots for which the capture has at least one row (second resolution).
    observedStamps = pd.to_datetime(data["Timestamp"].dt.strftime('%Y-%m-%d %H:%M:%S').unique())
    observed = slots.isin(observedStamps)

    totals = []
    for perRaspberry in parseDataByRaspberryTime(data):
        # reindex gives 0 for slots this Raspberry did not report, instead
        # of relying on a caught lookup failure.
        counts = perRaspberry["MAC"].reindex(slots, fill_value=0)
        totals.append(np.where(observed, counts.to_numpy(dtype=float), np.nan))

    return tuple(totals)

def getTotalDevicesByPairRaspberries(data, timeSeries):
    """Count devices captured in the same slot by several Raspberries.

    Each device (MAC) of a slot falls into at most one category, checked in
    this order: C+D+E, then C+E, then D+E, then B+E.

    Args:
        data: BLE DataFrame with ``Timestamp``, ``Raspberry`` and ``MAC``.
        timeSeries: timestamps of the slots.

    Returns:
        4-tuple of float arrays ``(DE, CE, CDE, BE)``, one value per slot;
        NaN for slots that do not appear in ``data``.
    """
    slots = pd.DatetimeIndex(timeSeries)
    # Slots for which the capture has at least one row (second resolution).
    observedStamps = pd.to_datetime(data["Timestamp"].dt.strftime('%Y-%m-%d %H:%M:%S').unique())
    observed = slots.isin(observedStamps)

    # One boolean column per Raspberry of interest, then collapse to one row
    # per (Timestamp, MAC): True when that Raspberry saw the device.
    sightings = data[["Timestamp", "MAC"]].assign(
        b=data["Raspberry"] == "Raspberry B",
        c=data["Raspberry"] == "Raspberry C",
        d=data["Raspberry"] == "Raspberry D",
        e=data["Raspberry"] == "Raspberry E",
    )
    seen = sightings.groupby(["Timestamp", "MAC"])[["b", "c", "d", "e"]].any()
    inB, inC, inD, inE = seen["b"], seen["c"], seen["d"], seen["e"]

    # Mutually exclusive categories, in return order.
    categories = (
        inD & inE & ~inC,           # D-E only
        inC & inE & ~inD,           # C-E only
        inC & inD & inE,            # C-D-E
        inB & inE & ~inC & ~inD,    # B-E, not already counted above
    )

    totals = []
    for category in categories:
        perSlot = category.groupby(level="Timestamp").sum().reindex(slots, fill_value=0)
        totals.append(np.where(observed, perSlot.to_numpy(dtype=float), np.nan))

    return tuple(totals)

def getTotalDeviceByMessageNumber(data, timeSeries):
    """Count devices per Raspberry and slot, bucketed by message volume.

    For every Raspberry (A..E) the messages of each device within a slot
    are summed (column ``Nº Mensajes``) and the device is placed in one of
    three buckets: at most 10, more than 10 and at most 30, more than 30.

    Args:
        data: BLE DataFrame with ``Timestamp``, ``Raspberry``, ``MAC`` and
            ``Nº Mensajes``.
        timeSeries: timestamps of the slots.

    Returns:
        15-tuple of float arrays ordered A_10, A_1030, A_30, B_10, ...,
        E_30, one value per slot; NaN for slots that do not appear in
        ``data``.
    """
    slots = pd.DatetimeIndex(timeSeries)
    # Slots for which the capture has at least one row (second resolution).
    observedStamps = pd.to_datetime(data["Timestamp"].dt.strftime('%Y-%m-%d %H:%M:%S').unique())
    observed = slots.isin(observedStamps)

    totals = []
    for raspberry in ('Raspberry A', 'Raspberry B', 'Raspberry C', 'Raspberry D', 'Raspberry E'):
        subset = data.loc[data['Raspberry'] == raspberry]
        # Only the message counter is summed; other columns are irrelevant.
        messages = subset.groupby(["Timestamp", "MAC"])["Nº Mensajes"].sum()
        buckets = (
            messages <= 10,
            (messages > 10) & (messages <= 30),
            messages > 30,
        )
        for bucket in buckets:
            # Devices in the bucket per slot; 0 where this Raspberry has no
            # rows for an otherwise observed slot.
            perSlot = bucket.groupby(level="Timestamp").sum().reindex(slots, fill_value=0)
            totals.append(np.where(observed, perSlot.to_numpy(dtype=float), np.nan))

    return tuple(totals)

def getTotalDevicesInPreviousInterval(data, timeSeries):
    """Count devices present both in a slot and in the slot before it.

    Args:
        data: BLE DataFrame with ``Timestamp`` and ``MAC``.
        timeSeries: timestamps of the slots.

    Returns:
        Float array with one value per slot: the number of unique MACs of
        the slot that were also seen in the previous slot, 0 for the first
        slot, and NaN for slots that do not appear in ``data``.
    """
    slotIndex = pd.DatetimeIndex(timeSeries)
    slots = list(slotIndex)
    # Slots for which the capture has at least one row (second resolution).
    observedStamps = pd.to_datetime(data["Timestamp"].dt.strftime('%Y-%m-%d %H:%M:%S').unique())
    observed = slotIndex.isin(observedStamps)

    # Set of MACs per timestamp; rows without a MAC can never match.
    macsBySlot = {}
    for stamp, macs in data.dropna(subset=["MAC"]).groupby("Timestamp")["MAC"]:
        macsBySlot[stamp] = set(macs)
    noDevices = frozenset()

    totalMACPreviousInterval = np.full(len(slots), np.nan)
    for i, slot in enumerate(slots):
        if not observed[i]:
            continue
        if i == 0:
            # There is no earlier slot to compare with.
            totalMACPreviousInterval[i] = 0
            continue
        currentMacs = macsBySlot.get(slot, noDevices)
        previousMacs = macsBySlot.get(slots[i - 1], noDevices)
        totalMACPreviousInterval[i] = len(currentMacs & previousMacs)

    return totalMACPreviousInterval

def getTotalDevicesInTwoPreviousIntervals(data, timeSeries):
    """Count devices present in a slot and in both slots before it.

    Args:
        data: BLE DataFrame with ``Timestamp`` and ``MAC``.
        timeSeries: timestamps of the slots.

    Returns:
        Float array with one value per slot: the number of unique MACs of
        the slot that were also seen in each of the two previous slots, 0
        for the first two slots, and NaN for slots that do not appear in
        ``data``.
    """
    slotIndex = pd.DatetimeIndex(timeSeries)
    slots = list(slotIndex)
    # Slots for which the capture has at least one row (second resolution).
    observedStamps = pd.to_datetime(data["Timestamp"].dt.strftime('%Y-%m-%d %H:%M:%S').unique())
    observed = slotIndex.isin(observedStamps)

    # Set of MACs per timestamp; rows without a MAC can never match.
    macsBySlot = {}
    for stamp, macs in data.dropna(subset=["MAC"]).groupby("Timestamp")["MAC"]:
        macsBySlot[stamp] = set(macs)
    noDevices = frozenset()

    totalMACTwoPreviousInterval = np.full(len(slots), np.nan)
    for i, slot in enumerate(slots):
        if not observed[i]:
            continue
        if i < 2:
            # Fewer than two earlier slots exist.
            totalMACTwoPreviousInterval[i] = 0
            continue
        currentMacs = macsBySlot.get(slot, noDevices)
        previousMacs = macsBySlot.get(slots[i - 1], noDevices)
        beforePreviousMacs = macsBySlot.get(slots[i - 2], noDevices)
        totalMACTwoPreviousInterval[i] = len(currentMacs & previousMacs & beforePreviousMacs)

    return totalMACTwoPreviousInterval

def savePlotColumns(dataArray, columnsName, path="figures/", path2="figuresDate/"):
    """Save one comparison plot per feature column of a day.

    Every column from position 2 onwards is drawn (blue) against column 1
    (red) over the timestamps of column 0. Each figure is written twice:
    as ``<column>_<date>.jpg`` under ``path`` and as
    ``<date>_<column>.jpg`` under ``path2``.

    Args:
        dataArray: sequence of equally long columns; position 0 holds the
            timestamps, position 1 the reference series.
        columnsName: names matching the positions of ``dataArray``.
        path: prefix (folder ending in ``/``) for files named by column.
        path2: prefix (folder ending in ``/``) for files named by date.
    """
    if len(dataArray) <= 2:
        # Nothing to plot: there are no feature columns.
        return

    # Create the target folders up front so savefig does not fail on a
    # fresh checkout.
    for prefix in (path, path2):
        folder = os.path.dirname(prefix)
        if folder:
            os.makedirs(folder, exist_ok=True)

    # Identical for every figure of the day, so computed once.
    timestamp = pd.to_datetime(dataArray[0])
    date = timestamp[0].date().strftime('%Y-%m-%d')

    for i in range(2, len(dataArray)):
        name = columnsName[i]+"_"+date
        nameDate = date+"_"+columnsName[i]
        plt.figure(figsize=(10,6))
        plt.plot(timestamp, dataArray[1], label=columnsName[1], color="red")
        plt.plot(timestamp, dataArray[i], label=columnsName[i], color="blue")
        plt.xlabel("Timestamp")
        plt.ylabel("Devices")
        plt.legend()
        plt.title(name)
        plt.savefig(path+name+'.jpg')
        # Same figure, re-titled for the date-first copy.
        plt.title(nameDate)
        plt.savefig(path2+nameDate+'.jpg')
        plt.clf()
        plt.close()

def getTrainingDataset(dataArray, fullPersonCountIntervalArray, previous=True):
    """Build the feature table used to train the Machine Learning model.

    For every day the 5-minute slot grid is generated and one column per
    feature is computed. The day's columns are plotted with
    ``savePlotColumns`` and appended to the result.

    Args:
        dataArray: list of BLE DataFrames, one per day.
        fullPersonCountIntervalArray: list of frames with a ``personCount``
            column, aligned by position with ``dataArray``.
        previous: when True, also add the column
            ``N MAC DOS INTERVALOS ANTERIORES``.

    Returns:
        DataFrame with one row per slot and day, with a fresh 0..n-1 index.
    """
    columns = ["Timestamp", "Person Count", "N MAC TOTAL", "N MAC RA", "N MAC RB", "N MAC RC", "N MAC RD", "N MAC RE", "N MAC RDE", "N MAC RCE", "N MAC RCDE", "N MAC RBE", "N MAC MEN RA 10", "N MAC MEN RA 10-30", "N MAC MEN RA 30", "N MAC MEN RB 10", "N MAC MEN RB 10-30", "N MAC MEN RB 30", "N MAC MEN RC 10", "N MAC MEN RC 10-30", "N MAC MEN RC 30", "N MAC MEN RD 10", "N MAC MEN RD 10-30", "N MAC MEN RD 30", "N MAC MEN RE 10", "N MAC MEN RE 10-30", "N MAC MEN RE 30", "N MAC INTERVALO ANTERIOR"]

    if previous:
        columns.append("N MAC DOS INTERVALOS ANTERIORES")

    # Daily frames are collected and concatenated once at the end.
    frames = []
    for i in range(len(dataArray)):
        dayData = dataArray[i]
        timeSeries = generateTimeSeriesByHour(dayData)
        dataGroup = parseData(timeSeries, dayData.groupby("Timestamp").nunique())
        totalMAC = dataGroup["MAC"].values

        perRaspberry = getTotalDevicesByRaspberry(dayData, timeSeries)
        perPair = getTotalDevicesByPairRaspberries(dayData, timeSeries)
        perMessageRange = getTotalDeviceByMessageNumber(dayData, timeSeries)
        totalMACPreviousInterval = getTotalDevicesInPreviousInterval(dayData, timeSeries)

        timestamp = timeSeries.dt.strftime('%Y-%m-%d %H:%M:%S')

        # Column order must match ``columns``.
        data = [
            timestamp.values,
            fullPersonCountIntervalArray[i]["personCount"].values,
            totalMAC,
            *perRaspberry,
            *perPair,
            *perMessageRange,
            totalMACPreviousInterval,
        ]
        if previous:
            data.append(getTotalDevicesInTwoPreviousIntervals(dayData, timeSeries))

        savePlotColumns(data, columns)
        # Columns -> rows.
        frames.append(pd.DataFrame(np.transpose(data), columns=columns))

    if not frames:
        return pd.DataFrame(columns=columns)

    return pd.concat(frames, ignore_index=True)

def checkTrainingSet(trainingSet, dates):
    """Sanity-check the training set day by day.

    For every date two checks are run on the feature columns (everything
    except ``Timestamp`` and ``Person Count``), after replacing NaN by 0:

    * "Maximum number of MAC": no feature value exceeds ``N MAC TOTAL`` in
      any row;
    * "Maximum number of Pair MAC": the sum of the RCE, RDE, RCDE and RBE
      columns never exceeds ``N MAC TOTAL``.

    A check is "OK" only when it holds for every row of the day; a date
    without rows reports "ERROR" for both checks.

    Args:
        trainingSet: DataFrame as returned by ``getTrainingDataset``.
        dates: iterable of ``datetime.date`` values to check.

    Returns:
        DataFrame with columns ``Timestamp`` (the date), the two check
        results and ``Number of NaN`` (NaN count of ``N MAC RA``). The
        frame is also printed.
    """
    resultColumns = ["Timestamp", "Maximum number of MAC", "Maximum number of Pair MAC", "Number of NaN"]

    trainingSetCheck = trainingSet.copy()
    trainingSetCheck["Timestamp"] = pd.to_datetime(trainingSetCheck["Timestamp"])
    rowDates = trainingSetCheck["Timestamp"].dt.date

    rows = []
    for date in dates:
        group = trainingSetCheck.loc[rowDates == date]
        group = group.loc[:, (group.columns != 'Timestamp') & (group.columns != 'Person Count')]
        nanNumber = group["N MAC RA"].isna().sum()
        group = group.fillna(0)
        total = group["N MAC TOTAL"]
        hasRows = len(group) > 0

        # .all() requires every row to pass. Testing that the comparison has
        # a single distinct outcome would also accept "every row fails".
        pairTotal = group["N MAC RCE"] + group["N MAC RDE"] + group["N MAC RCDE"] + group["N MAC RBE"]
        pairsOk = hasRows and bool((total >= pairTotal).all())
        columnsOk = hasRows and all(bool((total >= group[column]).all()) for column in group.columns)

        checkNumberMAC = "OK" if columnsOk else "ERROR"
        checkNumberPairMAC = "OK" if pairsOk else "ERROR"
        rows.append([date, checkNumberMAC, checkNumberPairMAC, nanNumber])

    checkDataframe = pd.DataFrame(rows, columns=resultColumns)
    print(checkDataframe)

    return checkDataframe

## Procedimiento

En primer lugar, cargamos los archivos con los que vamos a trabajar, alojados en las carpetas *data* y *personcount*.

In [4]:
# Usage: dataArray, fullPersonCountIntervalArray, dates = readDataFromDirectory(dataPath, fullPersonCountPath)
# Reads the BLE files from docs/data/ and the people-counter files from docs/personcount/.
dataArrayList, fullPersonCountIntervalArrayList, datesList = readDataFromDirectory('docs/data/', 'docs/personcount/')

A continuación, generamos el conjunto de entrenamiento utilizando los archivos previamente cargados. Esta función nos devolverá un Dataframe con todo lo necesario además de guardar en la carpeta *figures* una gráfica por cada columna para cada día.

In [5]:
# Usage: trainingDataset = getTrainingDataset(dataArray, fullPersonCountIntervalArray, previous=True)
# `previous` keeps its default (True), so the "N MAC DOS INTERVALOS ANTERIORES" column is included.
trainingDatasetList = getTrainingDataset(dataArrayList, fullPersonCountIntervalArrayList)

  plt.figure(figsize=(10,6))


<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

Por último, guardamos el Dataframe generado en un csv para poder exportarlo.

In [6]:
# Export the generated training DataFrame to a semicolon-separated CSV file.
# The index is not written, and missing values are written as the literal "NaN".
trainingDatasetList.to_csv(
    'docs/training-set.csv',
    sep=';',
    index=False,
    na_rep='NaN',
)

In [7]:
# Reload the semicolon-separated training-set CSV into trainingDatasetList.
trainingDatasetList = pd.read_csv(
    'docs/training-set.csv',
    sep=';',
)
# Run checkTrainingSet on the reloaded data; it is left as the cell's last
# bare expression so the notebook displays its return value.
checkTrainingSet(trainingDatasetList, datesList)


    Timestamp Maximum number of MAC Maximum number of Pair MAC Number of NaN
0  2022-06-30                    OK                         OK             0
0  2022-07-01                    OK                         OK            78
0  2022-07-04                    OK                         OK             0
0  2022-07-05                    OK                         OK             0
0  2022-07-06                    OK                         OK             0
0  2022-07-07                    OK                         OK             0
0  2022-07-11                    OK                         OK             0
0  2022-07-12                    OK                         OK             0
0  2022-07-13                    OK                         OK             0
0  2022-07-14                    OK                         OK             0
0  2022-07-15                    OK                         OK             0


Unnamed: 0,Timestamp,Maximum number of MAC,Maximum number of Pair MAC,Number of NaN
0,2022-06-30,OK,OK,0
0,2022-07-01,OK,OK,78
0,2022-07-04,OK,OK,0
0,2022-07-05,OK,OK,0
0,2022-07-06,OK,OK,0
0,2022-07-07,OK,OK,0
0,2022-07-11,OK,OK,0
0,2022-07-12,OK,OK,0
0,2022-07-13,OK,OK,0
0,2022-07-14,OK,OK,0
