# 1. Instalación de dependencias e importación de paquetes

In [7]:
# Install dependencies
!pip install pandas
!pip install numpy
!pip install -U scikit-learn
!pip install codecarbon
!pip install geopy
!pip install cumulator
!pip install GPUtil
!pip install py-cpuinfo
!pip install geocoder
!pip install matplotlib
!pip install carbontracker



In [8]:
# Import packages
import numpy as np
import pandas as pd
from carbontracker.tracker import CarbonTracker
from codecarbon import EmissionsTracker
from cumulator import base
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC

# 2. Preprocesamiento del *dataset*: [Room Occupancy Estimation Data Set](https://www.kaggle.com/ananthr1/room-occupancy-estimation-data-set)

In [9]:
# Read the room-occupancy CSV into a DataFrame and show its (rows, columns)
df = pd.read_csv(filepath_or_buffer='Occupancy_Estimation.csv')
df.shape

(10129, 19)

In [10]:
# Preview the first five rows of the freshly loaded dataset
df.head(n=5)

Unnamed: 0,Date,Time,S1_Temp,S2_Temp,S3_Temp,S4_Temp,S1_Light,S2_Light,S3_Light,S4_Light,S1_Sound,S2_Sound,S3_Sound,S4_Sound,S5_CO2,S5_CO2_Slope,S6_PIR,S7_PIR,Room_Occupancy_Count
0,2017/12/22,10:49:41,24.94,24.75,24.56,25.38,121,34,53,40,0.08,0.19,0.06,0.06,390,0.769231,0,0,1
1,2017/12/22,10:50:12,24.94,24.75,24.56,25.44,121,33,53,40,0.93,0.05,0.06,0.06,390,0.646154,0,0,1
2,2017/12/22,10:50:42,25.0,24.75,24.5,25.44,121,34,53,40,0.43,0.11,0.08,0.06,390,0.519231,0,0,1
3,2017/12/22,10:51:13,25.0,24.75,24.56,25.44,121,34,53,40,0.41,0.1,0.1,0.09,390,0.388462,0,0,1
4,2017/12/22,10:51:44,25.0,24.75,24.56,25.44,121,34,54,40,0.18,0.06,0.06,0.06,390,0.253846,0,0,1


## Paso 1: Recodificación de la columna `Room_Occupancy_Count`

En este caso, la columna que contiene la variable de salida (`Room_Occupancy_Count`) cuenta el número de personas en la sala. Para simplificar, nuestro objetivo va a ser detectar si la sala está vacía o si, por el contrario, hay alguien (no el número exacto de personas).

Por lo tanto, se **recodificará cualquier valor mayor que 0 en esa columna como 1**, de tal forma que la variable de salida sea binaria:
- 0 -> ausencia.
- 1 -> presencia.

In [11]:
# Inspect the raw target column (person count per sample) before recoding it
df.loc[:, "Room_Occupancy_Count"]

0        1
1        1
2        1
3        1
4        1
        ..
10124    0
10125    0
10126    0
10127    0
10128    0
Name: Room_Occupancy_Count, Length: 10129, dtype: int64

In [12]:
# Binarize the target: any positive person count becomes 1 (presence),
# a count of zero stays 0 (absence).
df["Room_Occupancy_Count"] = df["Room_Occupancy_Count"].gt(0).astype(int)

# Inspect the recoded target column
df["Room_Occupancy_Count"]

0        1
1        1
2        1
3        1
4        1
        ..
10124    0
10125    0
10126    0
10127    0
10128    0
Name: Room_Occupancy_Count, Length: 10129, dtype: int32

## Paso 2: Eliminación de las columnas temporales *Date* y *Time*

Respecto a los datos ofrecidos por las columnas *Date* y *Time*, vamos a filtrarlos y dejarlos fuera del proceso, ya que, aunque los incluyéramos (por ejemplo, juntando ambas partes en una sola columna y convirtiéndola a formato 'epoch', como long int), los valores de muestras sucesivas de esa columna estarían totalmente correlacionados entre sí y, como consecuencia, perjudicaríamos a la mayoría de los algoritmos que vamos a emplear posteriormente.

Se debe tener en cuenta que prescindir de la información temporal no significa que consideremos cada valor muestreado (fila de la tabla) como independiente de las demás filas. Sabemos que eso no es así, pero asumimos que esa información temporal no aporta valor añadido para predecir si la habitación está ocupada o vacía. En caso contrario, habría que usar modelos bastante más complejos que tuvieran en cuenta esa relación temporal, es decir, que las muestras de dos filas adyacentes son consecutivas en el tiempo.


In [13]:
# Remove the timestamp columns (Date and Time) from the dataset.
# The frame is rebound instead of mutated in place, and labels that are
# already absent are ignored, so this cell can be re-run safely: repeating
# an in-place drop would raise KeyError on the second execution.
df = df.drop(columns=['Date', 'Time'], errors='ignore')

# Show the result
df.head()

Unnamed: 0,S1_Temp,S2_Temp,S3_Temp,S4_Temp,S1_Light,S2_Light,S3_Light,S4_Light,S1_Sound,S2_Sound,S3_Sound,S4_Sound,S5_CO2,S5_CO2_Slope,S6_PIR,S7_PIR,Room_Occupancy_Count
0,24.94,24.75,24.56,25.38,121,34,53,40,0.08,0.19,0.06,0.06,390,0.769231,0,0,1
1,24.94,24.75,24.56,25.44,121,33,53,40,0.93,0.05,0.06,0.06,390,0.646154,0,0,1
2,25.0,24.75,24.5,25.44,121,34,53,40,0.43,0.11,0.08,0.06,390,0.519231,0,0,1
3,25.0,24.75,24.56,25.44,121,34,53,40,0.41,0.1,0.1,0.09,390,0.388462,0,0,1
4,25.0,24.75,24.56,25.44,121,34,54,40,0.18,0.06,0.06,0.06,390,0.253846,0,0,1


## Paso 3: División del *dataset* en *train* y *test*

In [14]:
# Split dataset into train (75%) and test (25%).
# A fixed random_state makes the shuffle reproducible, so the split (and
# everything trained on it) is the same after every kernel restart.
train_df, test_df = train_test_split(df, test_size=0.25, random_state=42)
train_df.shape, test_df.shape

((7596, 17), (2533, 17))

In [15]:
# Preview the first five rows of the training partition
train_df.head(n=5)

Unnamed: 0,S1_Temp,S2_Temp,S3_Temp,S4_Temp,S1_Light,S2_Light,S3_Light,S4_Light,S1_Sound,S2_Sound,S3_Sound,S4_Sound,S5_CO2,S5_CO2_Slope,S6_PIR,S7_PIR,Room_Occupancy_Count
5735,25.31,25.31,25.13,25.88,17,20,75,54,0.08,0.05,0.06,0.1,355,0.0,0,0,0
8847,25.38,25.38,25.0,25.5,0,0,0,0,0.07,0.05,0.05,0.08,360,0.0,0,0,0
350,26.19,26.88,26.06,26.44,20,24,101,62,0.07,0.05,0.06,0.06,970,-0.3,0,0,0
2363,25.0,25.0,24.44,25.31,3,3,17,12,0.08,0.05,0.08,0.06,355,0.003846,0,0,0
7759,25.19,25.19,24.63,25.44,0,0,0,0,0.08,0.05,0.06,0.1,355,0.0,0,0,0


In [16]:
# Preview the first five rows of the test partition
test_df.head(n=5)

Unnamed: 0,S1_Temp,S2_Temp,S3_Temp,S4_Temp,S1_Light,S2_Light,S3_Light,S4_Light,S1_Sound,S2_Sound,S3_Sound,S4_Sound,S5_CO2,S5_CO2_Slope,S6_PIR,S7_PIR,Room_Occupancy_Count
977,26.19,26.63,25.94,26.25,0,0,0,0,0.07,0.05,0.06,0.06,1155,-4.965385,0,0,0
9447,25.19,25.19,24.69,25.25,0,0,0,0,0.07,0.05,0.06,0.09,345,0.0,0,0,0
1767,25.31,25.31,24.81,25.69,0,0,0,0,0.06,0.05,0.06,0.06,365,0.0,0,0,0
353,26.19,26.81,26.06,26.44,19,23,99,60,0.08,0.05,0.06,0.06,970,-0.1,0,0,0
6325,25.44,25.44,25.25,26.0,0,0,0,0,0.08,0.05,0.06,0.1,350,0.0,0,0,0


## Paso 4: División del *dataset* de entrenamiento en *features* y *output*

In [17]:
# Features: every training column except the target
# (drop returns a new frame, so train_df itself is left untouched)
train_features = train_df.drop(columns=['Room_Occupancy_Count'])
# Output: the binary occupancy label of each training row
train_output = train_df['Room_Occupancy_Count']

In [18]:
# Preview the first five rows of the training features
train_features.head(n=5)

Unnamed: 0,S1_Temp,S2_Temp,S3_Temp,S4_Temp,S1_Light,S2_Light,S3_Light,S4_Light,S1_Sound,S2_Sound,S3_Sound,S4_Sound,S5_CO2,S5_CO2_Slope,S6_PIR,S7_PIR
5735,25.31,25.31,25.13,25.88,17,20,75,54,0.08,0.05,0.06,0.1,355,0.0,0,0
8847,25.38,25.38,25.0,25.5,0,0,0,0,0.07,0.05,0.05,0.08,360,0.0,0,0
350,26.19,26.88,26.06,26.44,20,24,101,62,0.07,0.05,0.06,0.06,970,-0.3,0,0
2363,25.0,25.0,24.44,25.31,3,3,17,12,0.08,0.05,0.08,0.06,355,0.003846,0,0
7759,25.19,25.19,24.63,25.44,0,0,0,0,0.08,0.05,0.06,0.1,355,0.0,0,0


In [19]:
# Preview the first five labels of the training output
train_output.head(n=5)

5735    0
8847    0
350     0
2363    0
7759    0
Name: Room_Occupancy_Count, dtype: int32

# 3. Seguimiento de la huella de carbono

## 1. *Logistic Regression*:


### CodeCarbon:

In [20]:
# Logistic-regression model: standardize the features, then classify.
# The pipeline is only built here; it is not fitted in this cell.
lg_pipeline = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("logistic_regression", LogisticRegression()),
    ]
)

# CodeCarbon tracking of this training is currently disabled (kept for reference):
#tracker = EmissionsTracker()
#tracker.start()
#lg_pipeline.fit(train_features, train_output)
#emissions: float = tracker.stop()
#print(f"Emissions: {emissions} kg")

### Cumulator

In [21]:
#cumulator = base.Cumulator()
#cumulator.run(lg_pipeline.fit, X=train_features, y=train_output)
#print(cumulator.total_carbon_footprint())
#cumulator.display_carbon_footprint()

### CarbonTracker

In [None]:
# Track the logistic-regression training with carbontracker.
# (CarbonTracker is imported in the notebook's import cell.)
max_epochs = 1

# ignore_errors=True: carbontracker raises NoComponentsAvailableError when it
# finds neither an Intel CPU exposing the RAPL interface nor an NVIDIA GPU.
# By default any carbontracker error interrupts training; with this flag the
# error is only logged and training continues without monitoring.
tracker = CarbonTracker(epochs=max_epochs, components="cpu", ignore_errors=True)

try:
    # Training loop: each fit of the pipeline is monitored as one epoch.
    for epoch in range(max_epochs):
        tracker.epoch_start()

        lg_pipeline.fit(train_features, train_output)

        tracker.epoch_end()
finally:
    # Stop the tracker even if training fails or ends before all monitored
    # epochs have run, so that the actual consumption is reported.
    tracker.stop()

CarbonTracker: CRITICAL - Traceback (most recent call last):
  File "C:\Users\INTEL I5\miniconda3\envs\TFG_Project\lib\site-packages\carbontracker\tracker.py", line 124, in run
    self.begin()
  File "C:\Users\INTEL I5\miniconda3\envs\TFG_Project\lib\site-packages\carbontracker\tracker.py", line 138, in begin
    self._components_remove_unavailable()
  File "C:\Users\INTEL I5\miniconda3\envs\TFG_Project\lib\site-packages\carbontracker\tracker.py", line 200, in _components_remove_unavailable
    raise exceptions.NoComponentsAvailableError()
carbontracker.exceptions.NoComponentsAvailableError: No components were available. CarbonTracker supports Intel CPUs with the RAPL interface and NVIDIA GPUs.

CarbonTracker: CRITICAL - Traceback (most recent call last):
  File "C:\Users\INTEL I5\miniconda3\envs\TFG_Project\lib\site-packages\carbontracker\tracker.py", line 124, in run
    self.begin()
  File "C:\Users\INTEL I5\miniconda3\envs\TFG_Project\lib\site-packages\carbontracker\tracker.py", l

## 2. *Random Forest*

### CodeCarbon:

In [23]:
# Track the random-forest training with the codecarbon tool.
# random_state pins the forest's random sampling so repeated runs train the
# same model and the measured workload is comparable between runs.
rnd_clf = RandomForestClassifier(
    n_estimators=100,
    max_leaf_nodes=16,
    n_jobs=-1,
    random_state=42,
)
tracker = EmissionsTracker()

tracker.start()
try:
    rnd_clf.fit(train_features, train_output)
finally:
    # Always stop the tracker so a failed fit does not leave it running.
    emissions: float = tracker.stop()
print(f"Emissions: {emissions} kg")

[codecarbon INFO @ 18:06:24] [setup] RAM Tracking...
[codecarbon INFO @ 18:06:24] [setup] GPU Tracking...
[codecarbon INFO @ 18:06:24] Tracking Nvidia GPU via pynvml
[codecarbon INFO @ 18:06:24] [setup] CPU Tracking...
[codecarbon INFO @ 18:06:27] CPU Model on constant consumption mode: Intel(R) Core(TM) i5-9400F CPU @ 2.90GHz
[codecarbon INFO @ 18:06:27] >>> Tracker's metadata:
[codecarbon INFO @ 18:06:27]   Platform system: Windows-10-10.0.19044-SP0
[codecarbon INFO @ 18:06:27]   Python version: 3.10.4
[codecarbon INFO @ 18:06:27]   Available RAM : 7.935 GB
[codecarbon INFO @ 18:06:27]   CPU count: 6
[codecarbon INFO @ 18:06:27]   CPU model: Intel(R) Core(TM) i5-9400F CPU @ 2.90GHz
[codecarbon INFO @ 18:06:27]   GPU count: 1
[codecarbon INFO @ 18:06:27]   GPU model: 1 x NVIDIA GeForce GT 710
[codecarbon INFO @ 18:06:27] Energy consumed for RAM : 0.000000 kWh. RAM Power : 2.97562837600708 W
[codecarbon INFO @ 18:06:27] Energy consumed for all GPUs : 0.000000 kWh. All GPUs Power : 0.0 

Emissions: 4.094688318526983e-07 kg


## 3. *Support Vector Machine* (SVM)

### CodeCarbon

In [24]:
# Track the linear-SVM training with the codecarbon tool.
# The pipeline standardizes the features before fitting the classifier;
# random_state pins the solver's randomness so repeated runs are comparable.
svm_pipeline = Pipeline(
    [
        ("scaler", StandardScaler()),
        ("linear_svc", LinearSVC(C=1, loss="hinge", random_state=42)),
    ]
)
tracker = EmissionsTracker()

tracker.start()
try:
    svm_pipeline.fit(train_features, train_output)
finally:
    # Always stop the tracker so a failed fit does not leave it running.
    emissions: float = tracker.stop()
print(f"Emissions: {emissions} kg")

[codecarbon INFO @ 18:06:27] [setup] RAM Tracking...
[codecarbon INFO @ 18:06:27] [setup] GPU Tracking...
[codecarbon INFO @ 18:06:27] Tracking Nvidia GPU via pynvml
[codecarbon INFO @ 18:06:27] [setup] CPU Tracking...
[codecarbon INFO @ 18:06:29] CPU Model on constant consumption mode: Intel(R) Core(TM) i5-9400F CPU @ 2.90GHz
[codecarbon INFO @ 18:06:29] >>> Tracker's metadata:
[codecarbon INFO @ 18:06:29]   Platform system: Windows-10-10.0.19044-SP0
[codecarbon INFO @ 18:06:29]   Python version: 3.10.4
[codecarbon INFO @ 18:06:29]   Available RAM : 7.935 GB
[codecarbon INFO @ 18:06:29]   CPU count: 6
[codecarbon INFO @ 18:06:29]   CPU model: Intel(R) Core(TM) i5-9400F CPU @ 2.90GHz
[codecarbon INFO @ 18:06:29]   GPU count: 1
[codecarbon INFO @ 18:06:29]   GPU model: 1 x NVIDIA GeForce GT 710
[codecarbon INFO @ 18:06:29] Energy consumed for RAM : 0.000000 kWh. RAM Power : 2.97562837600708 W
[codecarbon INFO @ 18:06:29] Energy consumed for all GPUs : 0.000000 kWh. All GPUs Power : 0.0 

Emissions: 2.0361904608943212e-07 kg
