In [12]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [13]:
from matplotlib.pyplot import rcParams

rcParams['figure.figsize'] = 15, 10
rcParams["font.weight"] = "bold"
rcParams["axes.labelweight"] = "bold"
rcParams["font.size"] = 12

In [14]:
df_placas = pd.read_csv('placas_solares.csv')

df_placas

Unnamed: 0,date,Cumulative_solar_power,kWh electricity/day,Gas/day
0,26/10/2011,0.1,15.1,9.0
1,27/10/2011,10.2,7.4,9.2
2,28/10/2011,20.2,5.8,8.0
3,29/10/2011,29.6,4.9,6.6
4,30/10/2011,34.2,11.7,5.3
...,...,...,...,...
3299,6/11/2020,36445.0,16.0,11.0
3300,7/11/2020,36453.0,13.0,13.0
3301,8/11/2020,36461.0,12.0,11.0
3302,9/11/2020,36466.0,14.0,10.0


In [15]:
df_placas.isna().sum()

date                      0
Cumulative_solar_power    0
kWh electricity/day       0
Gas/day                   0
dtype: int64

In [16]:
df_placas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3304 entries, 0 to 3303
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   date                    3304 non-null   object 
 1   Cumulative_solar_power  3304 non-null   float64
 2   kWh electricity/day     3304 non-null   float64
 3   Gas/day                 3304 non-null   float64
dtypes: float64(3), object(1)
memory usage: 103.4+ KB


In [17]:
df_placas["date"].isna()

0       False
1       False
2       False
3       False
4       False
        ...  
3299    False
3300    False
3301    False
3302    False
3303    False
Name: date, Length: 3304, dtype: bool

In [18]:
df_placas["date"].isna().sum()

0

In [19]:
df_placas["date"][df_placas["date"].isna()]

Series([], Name: date, dtype: object)

In [20]:
df_placas[4:3300][["date", "Gas/day", "Cumulative_solar_power"]]

Unnamed: 0,date,Gas/day,Cumulative_solar_power
4,30/10/2011,5.3,34.2
5,31/10/2011,5.7,38.0
6,1/11/2011,5.3,46.6
7,2/11/2011,5.4,51.6
8,3/11/2011,7.6,58.6
...,...,...,...
3295,2/11/2020,6.0,36422.0
3296,3/11/2020,5.0,36424.0
3297,4/11/2020,9.0,36430.0
3298,5/11/2020,11.0,36437.0


In [21]:
df_placas.drop(df_placas.dropna().index)

Unnamed: 0,date,Cumulative_solar_power,kWh electricity/day,Gas/day


In [22]:
# No hay nada que limpiar
df_placas.shape[0] - 0

3304

In [23]:
df_solar_filt = df_placas.dropna().reset_index(drop=True)
df_solar_filt

Unnamed: 0,date,Cumulative_solar_power,kWh electricity/day,Gas/day
0,26/10/2011,0.1,15.1,9.0
1,27/10/2011,10.2,7.4,9.2
2,28/10/2011,20.2,5.8,8.0
3,29/10/2011,29.6,4.9,6.6
4,30/10/2011,34.2,11.7,5.3
...,...,...,...,...
3299,6/11/2020,36445.0,16.0,11.0
3300,7/11/2020,36453.0,13.0,13.0
3301,8/11/2020,36461.0,12.0,11.0
3302,9/11/2020,36466.0,14.0,10.0


### Análisis exploratorio, tratamiento y limpieza de datos


In [24]:
df_solar_filt.describe()

Unnamed: 0,Cumulative_solar_power,kWh electricity/day,Gas/day
count,3304.0,3304.0,3304.0
mean,17616.116435,4.585048,8.343705
std,10577.158537,9.856726,6.334102
min,0.1,-24.0,0.0
25%,8089.5,-3.0,2.0
50%,17184.5,6.0,8.0
75%,27116.5,13.0,13.0
max,36469.0,34.0,29.0


In [25]:
# consultar el tipo de datos
df_solar_filt.dtypes

date                       object
Cumulative_solar_power    float64
kWh electricity/day       float64
Gas/day                   float64
dtype: object

In [26]:
df_solar_filt.dtypes[df_solar_filt.dtypes == "object"]

date    object
dtype: object

In [27]:
# esto no hace falta
df_solar_filt['date'].value_counts()

date
26/10/2011    1
9/11/2017     1
30/10/2017    1
31/10/2017    1
1/11/2017     1
             ..
3/11/2014     1
4/11/2014     1
5/11/2014     1
6/11/2014     1
10/11/2020    1
Name: count, Length: 3304, dtype: int64

In [28]:
# eliminar la variable date al ser tipo objeto y meramente informativa
df_solar_filt.drop(['date'], axis=1)

Unnamed: 0,Cumulative_solar_power,kWh electricity/day,Gas/day
0,0.1,15.1,9.0
1,10.2,7.4,9.2
2,20.2,5.8,8.0
3,29.6,4.9,6.6
4,34.2,11.7,5.3
...,...,...,...
3299,36445.0,16.0,11.0
3300,36453.0,13.0,13.0
3301,36461.0,12.0,11.0
3302,36466.0,14.0,10.0


In [29]:
df_solar_filt2 = df_solar_filt.drop(['date'], axis=1)
df_solar_filt2.dtypes.value_counts()

float64    3
Name: count, dtype: int64

### Reducción de variables
Importancia de variables

In [30]:
# generar la lista de features y la variable target
target = 'Cumulative_solar_power'
features = [x for x in df_solar_filt.columns if x != target]

print(target)
print(features)

Cumulative_solar_power
['date', 'kWh electricity/day', 'Gas/day']


In [None]:
df_solar_filt2