# 1. Importación de paquetes

In [14]:
#Import packages
from modules import preprocess
from modules import training

# 2. Preprocesamiento del *dataset*: [Room Occupancy Estimation Data Set](https://www.kaggle.com/ananthr1/room-occupancy-estimation-data-set)

## Paso 1: Lectura del *dataset* y obtención de datos

In [15]:
# Load the raw dataset through the project helper. The path is relative, so the
# CSV is presumably expected next to the notebook (TODO confirm working dir).
df = preprocess.load_csv_data('Occupancy_Estimation.csv')
# Bare last expression: displays (rows, columns) as the cell output.
df.shape

(10129, 19)

In [16]:
# Preview the first rows of the raw data: Date/Time, sensor readings and the target column.
df.head()

Unnamed: 0,Date,Time,S1_Temp,S2_Temp,S3_Temp,S4_Temp,S1_Light,S2_Light,S3_Light,S4_Light,S1_Sound,S2_Sound,S3_Sound,S4_Sound,S5_CO2,S5_CO2_Slope,S6_PIR,S7_PIR,Room_Occupancy_Count
0,2017/12/22,10:49:41,24.94,24.75,24.56,25.38,121,34,53,40,0.08,0.19,0.06,0.06,390,0.769231,0,0,1
1,2017/12/22,10:50:12,24.94,24.75,24.56,25.44,121,33,53,40,0.93,0.05,0.06,0.06,390,0.646154,0,0,1
2,2017/12/22,10:50:42,25.0,24.75,24.5,25.44,121,34,53,40,0.43,0.11,0.08,0.06,390,0.519231,0,0,1
3,2017/12/22,10:51:13,25.0,24.75,24.56,25.44,121,34,53,40,0.41,0.1,0.1,0.09,390,0.388462,0,0,1
4,2017/12/22,10:51:44,25.0,24.75,24.56,25.44,121,34,54,40,0.18,0.06,0.06,0.06,390,0.253846,0,0,1


## Paso 2: Recodificación de la columna `Room_Occupancy_Count`

En este caso, la columna que contiene la variable de salida (`Room_Occupancy_Count`) cuenta el número de personas en la sala. Para simplificar, nuestro objetivo va a ser detectar si la sala está vacía o si, por el contrario, hay alguien (no el número exacto de personas).

Por lo tanto, se **recodificará cualquier valor mayor que 0 en esa columna como 1**, de tal forma que la variable de salida sea binaria:
- 0 -> ausencia.
- 1 -> presencia.

In [17]:
# Show "Room_Occupancy_Count" before processing (raw per-sample people count, int64)
df["Room_Occupancy_Count"]

0        1
1        1
2        1
3        1
4        1
        ..
10124    0
10125    0
10126    0
10127    0
10128    0
Name: Room_Occupancy_Count, Length: 10129, dtype: int64

In [18]:
# Process "Room_Occupancy_Count": recode any value greater than 0 as 1 (binary target).
# NOTE(review): the return value is not assigned, so the helper appears to modify
# `df` in place (the displayed dtype changes from int64 to int32) - confirm in
# modules/preprocess.
preprocess.recode_dataset_output(df)

# Show "Room_Occupancy_Count" after processing
df["Room_Occupancy_Count"]

0        1
1        1
2        1
3        1
4        1
        ..
10124    0
10125    0
10126    0
10127    0
10128    0
Name: Room_Occupancy_Count, Length: 10129, dtype: int32

## Paso 3: Eliminación de las columnas temporales *Date* y *Time*

Los datos ofrecidos por las columnas *Date* y *Time* se van a filtrar y dejar fuera del proceso. Aunque los incluyéramos (por ejemplo, juntando ambas partes en una sola columna y convirtiéndola a formato 'epoch', como long int), los valores de muestras sucesivas de esa columna estarían totalmente correlacionados entre sí y, como consecuencia, perjudicaríamos a la mayoría de los algoritmos que vamos a emplear posteriormente.

Se debe tener en cuenta que, al descartar estas columnas, no estamos afirmando que cada valor muestreado (fila de la tabla) sea independiente de las demás filas: sabemos que no es así. Lo que estamos asumiendo es que esa información temporal no nos ofrece valor añadido para predecir si la habitación está ocupada o vacía. En caso contrario, se tendrían que usar modelos bastante más complicados, capaces de considerar esa relación temporal que indica que, en realidad, las muestras de dos filas adyacentes son consecutivas en el tiempo.

In [19]:
# Filter Date and Time columns
# NOTE(review): the result is not assigned, so `df` is presumably modified in place.
# Re-running this cell on the same kernel may fail if the helper does not tolerate
# the columns already being absent - TODO confirm.
preprocess.remove_time_columns(df)

# Show the result
df.head()

Unnamed: 0,S1_Temp,S2_Temp,S3_Temp,S4_Temp,S1_Light,S2_Light,S3_Light,S4_Light,S1_Sound,S2_Sound,S3_Sound,S4_Sound,S5_CO2,S5_CO2_Slope,S6_PIR,S7_PIR,Room_Occupancy_Count
0,24.94,24.75,24.56,25.38,121,34,53,40,0.08,0.19,0.06,0.06,390,0.769231,0,0,1
1,24.94,24.75,24.56,25.44,121,33,53,40,0.93,0.05,0.06,0.06,390,0.646154,0,0,1
2,25.0,24.75,24.5,25.44,121,34,53,40,0.43,0.11,0.08,0.06,390,0.519231,0,0,1
3,25.0,24.75,24.56,25.44,121,34,53,40,0.41,0.1,0.1,0.09,390,0.388462,0,0,1
4,25.0,24.75,24.56,25.44,121,34,54,40,0.18,0.06,0.06,0.06,390,0.253846,0,0,1


## Paso 4: División del *dataset* en *train* y *test*

In [20]:
# Split dataset into train and test
# NOTE(review): the split ratio and any random seed live inside the helper; the
# shuffled row indices in the previews suggest a random split - confirm that a
# fixed seed is used so the results are reproducible.
train_df, test_df = preprocess.split_dataset(df)
# Display both shapes to verify the split sizes.
train_df.shape, test_df.shape

((7596, 17), (2533, 17))

In [21]:
# Preview the first rows of the training partition.
train_df.head()

Unnamed: 0,S1_Temp,S2_Temp,S3_Temp,S4_Temp,S1_Light,S2_Light,S3_Light,S4_Light,S1_Sound,S2_Sound,S3_Sound,S4_Sound,S5_CO2,S5_CO2_Slope,S6_PIR,S7_PIR,Room_Occupancy_Count
4715,25.25,25.25,24.69,25.63,0,0,0,0,0.07,0.06,0.06,0.06,360,0.0,0,0,0
6192,25.44,25.44,25.31,26.19,2,3,14,10,0.08,0.05,0.06,0.11,350,0.011538,0,0,0
6934,25.44,25.44,25.0,25.81,0,0,0,0,0.07,0.06,0.06,0.1,355,0.0,0,0,0
7589,25.25,25.25,24.75,25.56,0,0,0,0,0.08,0.05,0.06,0.1,355,0.0,0,0,0
6321,25.44,25.44,25.25,26.06,0,0,0,0,0.07,0.05,0.06,0.11,350,0.0,1,0,0


In [22]:
# Preview the first rows of the test partition.
test_df.head()

Unnamed: 0,S1_Temp,S2_Temp,S3_Temp,S4_Temp,S1_Light,S2_Light,S3_Light,S4_Light,S1_Sound,S2_Sound,S3_Sound,S4_Sound,S5_CO2,S5_CO2_Slope,S6_PIR,S7_PIR,Room_Occupancy_Count
1857,25.25,25.25,24.75,25.63,0,0,0,0,0.07,0.05,0.07,0.07,365,-0.061538,0,0,0
7807,25.13,25.19,24.56,25.38,0,0,0,0,0.08,0.05,0.07,0.1,355,-0.061538,0,0,0
292,26.06,26.94,26.0,26.31,164,255,277,71,0.1,0.18,0.4,0.1,900,1.461538,1,1,1
8750,25.44,25.44,25.06,25.56,0,0,0,0,0.07,0.05,0.05,0.08,375,-0.088462,0,0,0
10119,25.06,25.13,24.69,25.19,6,7,33,22,0.07,0.05,0.05,0.08,345,0.0,0,0,0


## Paso 5: División del *dataset* de entrenamiento en *features* y *output*

In [23]:
# Get features columns (every column except the target, per the preview of X)
X = preprocess.get_features(train_df)

# Get output column ("Room_Occupancy_Count", per the preview of y)
y = preprocess.get_output(train_df)

In [24]:
# Preview the feature matrix: the target column should no longer be present.
X.head()

Unnamed: 0,S1_Temp,S2_Temp,S3_Temp,S4_Temp,S1_Light,S2_Light,S3_Light,S4_Light,S1_Sound,S2_Sound,S3_Sound,S4_Sound,S5_CO2,S5_CO2_Slope,S6_PIR,S7_PIR
4715,25.25,25.25,24.69,25.63,0,0,0,0,0.07,0.06,0.06,0.06,360,0.0,0,0
6192,25.44,25.44,25.31,26.19,2,3,14,10,0.08,0.05,0.06,0.11,350,0.011538,0,0
6934,25.44,25.44,25.0,25.81,0,0,0,0,0.07,0.06,0.06,0.1,355,0.0,0,0
7589,25.25,25.25,24.75,25.56,0,0,0,0,0.08,0.05,0.06,0.1,355,0.0,0,0
6321,25.44,25.44,25.25,26.06,0,0,0,0,0.07,0.05,0.06,0.11,350,0.0,1,0


In [25]:
# Preview the binary target values aligned with the rows of X.
y.head()

4715    0
6192    0
6934    0
7589    0
6321    0
Name: Room_Occupancy_Count, dtype: int32

# 3. Entrenamiento y seguimiento de la huella de carbono con **Cumulator**

In [26]:
# Train one model while tracking its carbon footprint with Cumulator.
# Only one call is active at a time; uncomment another line to measure a different
# model (LR / RF presumably stand for logistic regression / random forest - confirm
# in modules/training). The saved output corresponds to the SVC run.
#training.train_LR_cumulator(X,y)
#training.train_RF_cumulator(X,y)
training.train_SVC_cumulator(X,y)

CPU recognized: TDP set to 65
3.995503849453396e-05
########
Overall carbon footprint: 4.00e-05 gCO2eq
########
Carbon footprint due to computations: 4.00e-05 gCO2eq
Carbon footprint due to communications: 0.00e+00 gCO2eq
This carbon footprint is equivalent to 1.96e-12 garbage trucks of waste recycled instead of landfilled.


