# **Test Preparation**

In [1]:
import pandas as pd

In [2]:
# Load the raw energy and weather test files (202301-202304 per the file names).
# na_filter=False switches off pandas' missing-value detection, so empty
# fields are read as '' instead of NaN.
csv_options = {'na_filter': False, 'encoding': "latin"}
test_en = pd.read_csv('daasbstp2023/energia_202301-202304.csv', **csv_options)
test_me = pd.read_csv('daasbstp2023/meteo_202301-202304.csv', **csv_options)

In [3]:
# Summarize the energy frame: column names, non-null counts and dtypes.
test_en.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2256 entries, 0 to 2255
Data columns (total 5 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Data                     2256 non-null   object 
 1   Hora                     2256 non-null   int64  
 2   Normal (kWh)             2256 non-null   float64
 3   Horário Económico (kWh)  2256 non-null   float64
 4   Autoconsumo (kWh)        2256 non-null   float64
dtypes: float64(3), int64(1), object(1)
memory usage: 88.2+ KB


In [4]:
# Per-column flag: does the energy frame contain any NaN?
# The CSV was read with na_filter=False, so empty fields are '' strings and
# are not reported as missing by this check.
test_en.isna().any()

Data                       False
Hora                       False
Normal (kWh)               False
Horário Económico (kWh)    False
Autoconsumo (kWh)          False
dtype: bool

In [5]:
# Summarize the weather frame: column names, non-null counts and dtypes.
test_me.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1752 entries, 0 to 1751
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   dt                   1752 non-null   int64  
 1   dt_iso               1752 non-null   object 
 2   city_name            1752 non-null   object 
 3   temp                 1752 non-null   float64
 4   feels_like           1752 non-null   float64
 5   temp_min             1752 non-null   float64
 6   temp_max             1752 non-null   float64
 7   pressure             1752 non-null   int64  
 8   sea_level            1752 non-null   object 
 9   grnd_level           1752 non-null   object 
 10  humidity             1752 non-null   int64  
 11  wind_speed           1752 non-null   float64
 12  rain_1h              1752 non-null   object 
 13  clouds_all           1752 non-null   int64  
 14  weather_description  1752 non-null   object 
dtypes: float64(5), int64(4), object(6)
mem

In [6]:
# Per-column flag: does the weather frame contain any NaN?
# The CSV was read with na_filter=False, so empty fields are '' strings and
# are not reported as missing by this check.
test_me.isna().any()

dt                     False
dt_iso                 False
city_name              False
temp                   False
feels_like             False
temp_min               False
temp_max               False
pressure               False
sea_level              False
grnd_level             False
humidity               False
wind_speed             False
rain_1h                False
clouds_all             False
weather_description    False
dtype: bool

In [7]:
# Report how many distinct values each column of the Energy dataset holds.
for col_name, col_values in test_en.items():
    print(f"{col_name}, Number of Unique Values: {len(col_values.unique())}")

Data, Number of Unique Values: 94
Hora, Number of Unique Values: 24
Normal (kWh), Number of Unique Values: 709
Horário Económico (kWh), Number of Unique Values: 490
Autoconsumo (kWh), Number of Unique Values: 485


In [8]:
# Report how many distinct values each column of the Meteorology dataset holds.
for col_name, col_values in test_me.items():
    print(f"{col_name}, Number of Unique Values: {len(col_values.unique())}")

dt, Number of Unique Values: 1752
dt_iso, Number of Unique Values: 1752
city_name, Number of Unique Values: 1
temp, Number of Unique Values: 889
feels_like, Number of Unique Values: 1007
temp_min, Number of Unique Values: 169
temp_max, Number of Unique Values: 174
pressure, Number of Unique Values: 31
sea_level, Number of Unique Values: 1
grnd_level, Number of Unique Values: 1
humidity, Number of Unique Values: 71
wind_speed, Number of Unique Values: 512
rain_1h, Number of Unique Values: 121
clouds_all, Number of Unique Values: 99
weather_description, Number of Unique Values: 8


In [9]:
# These three columns hold a single distinct value each (see the unique-value
# counts above), so they are removed.
test_me = test_me.drop(columns=['city_name', 'sea_level', 'grnd_level'])

In [10]:
# Combine the date string and the integer hour into one timestamp column.
hour_text = test_en['Hora'].astype(str)
timestamp_text = test_en['Data'] + ' ' + hour_text + ':00:00'

# Append 'datetime' and remove the two source columns it was built from.
test_en = (
    test_en
    .assign(datetime=pd.to_datetime(timestamp_text, format='%Y-%m-%d %H:%M:%S'))
    .drop(columns=['Data', 'Hora'])
)

# Show a single row (position 901) as a one-row frame to spot-check the result.
test_en.iloc[901].to_frame().T

Unnamed: 0,Normal (kWh),Horário Económico (kWh),Autoconsumo (kWh),datetime
901,0.0,0.0,0.311,2023-02-07 13:00:00


In [11]:
# Parse 'dt_iso' strings of the form '<date> <time> <offset> UTC'.
parsed_timestamps = pd.to_datetime(test_me['dt_iso'], format='%Y-%m-%d %H:%M:%S %z UTC')

# Strip the timezone so the values are timezone-naive, expose the column as
# 'datetime', and remove the 'dt' column, which is treated as redundant.
test_me = (
    test_me
    .assign(dt_iso=parsed_timestamps.dt.tz_localize(None))
    .rename(columns={"dt_iso": "datetime"})
    .drop(columns=['dt'])
)

# Show a single row (position 801) as a one-row frame to spot-check the result.
test_me.iloc[801].to_frame().T

Unnamed: 0,datetime,temp,feels_like,temp_min,temp_max,pressure,humidity,wind_speed,rain_1h,clouds_all,weather_description
801,2023-02-03 09:00:00,7.1,4.92,4.71,8.34,1033,81,3.16,,0,sky is clear


In [12]:
# Put both frames in chronological order so that consecutive rows can be
# compared to detect skipped timestamps.
test_en = test_en.sort_values('datetime')
test_me = test_me.sort_values('datetime')

In [13]:
# Gap between each row and the one before it, per frame.
one_hour = pd.Timedelta('0 days 01:00:00')
time_diff_en = test_en['datetime'].diff()
time_diff_me = test_me['datetime'].diff()

# Keep every gap that is not exactly one hour. The first row has no
# predecessor, so its NaT difference is always listed.
irregularities_en = time_diff_en.loc[time_diff_en != one_hour]
irregularities_me = time_diff_me.loc[time_diff_me != one_hour]

print("Irregular time intervals in df_en:", irregularities_en, sep="\n")
print("\n")
print("Irregular time intervals in df_me:", irregularities_me, sep="\n")

Irregular time intervals in df_en:
0   NaT
Name: datetime, dtype: timedelta64[ns]


Irregular time intervals in df_me:
0   NaT
Name: datetime, dtype: timedelta64[ns]


In [14]:
# Translate the Portuguese column names to English. rename() skips keys that
# are not present; the energy frame loaded above has no
# 'Injeção na rede (kWh)' column, so that entry changes nothing here.
portuguese_to_english = {
    'Injeção na rede (kWh)': 'Injection',
    'Horário Económico (kWh)': 'Economic (kWh)',
    'Autoconsumo (kWh)': 'Self-consumption (kWh)',
}
test_en = test_en.rename(columns=portuguese_to_english)

test_en.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2256 entries, 0 to 2255
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   Normal (kWh)            2256 non-null   float64       
 1   Economic (kWh)          2256 non-null   float64       
 2   Self-consumption (kWh)  2256 non-null   float64       
 3   datetime                2256 non-null   datetime64[ns]
dtypes: datetime64[ns](1), float64(3)
memory usage: 70.6 KB


In [15]:
# Outer join on 'datetime': every timestamp from either frame is kept, and
# columns from the frame that lacks that timestamp are left as NaN.
test_merged = test_en.merge(test_me, on='datetime', how='outer')

test_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2256 entries, 0 to 2255
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   Normal (kWh)            2256 non-null   float64       
 1   Economic (kWh)          2256 non-null   float64       
 2   Self-consumption (kWh)  2256 non-null   float64       
 3   datetime                2256 non-null   datetime64[ns]
 4   temp                    1752 non-null   float64       
 5   feels_like              1752 non-null   float64       
 6   temp_min                1752 non-null   float64       
 7   temp_max                1752 non-null   float64       
 8   pressure                1752 non-null   float64       
 9   humidity                1752 non-null   float64       
 10  wind_speed              1752 non-null   float64       
 11  rain_1h                 1752 non-null   object        
 12  clouds_all              1752 non-null   float64 

In [16]:
# Empty strings in 'rain_1h' are treated as zero rainfall, then the whole
# column is converted from text to numbers. NaN entries stay NaN.
rain_text = test_merged['rain_1h'].replace('', '0')
test_merged['rain_1h'] = pd.to_numeric(rain_text)

In [17]:
# Replace 'weather_description' with one indicator column per category,
# each named 'weather_<category>'.
test_merged = pd.get_dummies(
    data=test_merged,
    prefix='weather',
    columns=['weather_description'],
)

In [18]:
# Summarize the merged frame after the numeric conversion and one-hot encoding.
test_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2256 entries, 0 to 2255
Data columns (total 21 columns):
 #   Column                        Non-Null Count  Dtype         
---  ------                        --------------  -----         
 0   Normal (kWh)                  2256 non-null   float64       
 1   Economic (kWh)                2256 non-null   float64       
 2   Self-consumption (kWh)        2256 non-null   float64       
 3   datetime                      2256 non-null   datetime64[ns]
 4   temp                          1752 non-null   float64       
 5   feels_like                    1752 non-null   float64       
 6   temp_min                      1752 non-null   float64       
 7   temp_max                      1752 non-null   float64       
 8   pressure                      1752 non-null   float64       
 9   humidity                      1752 non-null   float64       
 10  wind_speed                    1752 non-null   float64       
 11  rain_1h                       

In [19]:
# Fill the missing values with the mean of their own column.
# numeric_only=True restricts the means to numeric/boolean columns, so the
# 'datetime' column is not averaged; columns without a computed mean are left
# untouched by fillna.
# NOTE(review): the means come from this test set itself - confirm whether the
# training-set means should be used instead.
test_merged_filled = test_merged.fillna(test_merged.mean(numeric_only=True))

In [20]:
# Derive the calendar features from 'datetime': hour of day and day of year.
# No 'year' column is created, since it was never kept as a feature.
timestamps = test_merged_filled['datetime']
test_merged_filled = (
    test_merged_filled
    .assign(hour=timestamps.dt.hour, day_of_year=timestamps.dt.dayofyear)
    # Only 'datetime' needs dropping: separate month/day columns are never created.
    .drop(columns=['datetime'])
)

In [21]:
# Show a single row (position 801) of the updated DataFrame as a one-row frame
# to spot-check the engineered columns.
test_merged_filled.iloc[801].to_frame().T

Unnamed: 0,Normal (kWh),Economic (kWh),Self-consumption (kWh),temp,feels_like,temp_min,temp_max,pressure,humidity,wind_speed,...,weather_broken clouds,weather_few clouds,weather_heavy intensity rain,weather_light rain,weather_moderate rain,weather_overcast clouds,weather_scattered clouds,weather_sky is clear,hour,day_of_year
801,0.0,0.0,0.358,7.1,4.92,4.71,8.34,1033.0,81.0,3.16,...,False,False,False,False,False,False,False,True,9,34


In [22]:
# Boolean rain flag: True where rain_1h > 0, False otherwise (a NaN compares
# as False). The vectorised comparison already yields a bool column, so no
# per-row function or separate cast is needed.
# NOTE(review): missing rain_1h values were mean-filled earlier, so rows that
# had no weather record are flagged as rain whenever that mean is positive -
# confirm this is intended.
test_merged_filled['rain_binary'] = test_merged_filled['rain_1h'] > 0

In [24]:
# Write the final dataframe to a csv file.
# index=False leaves the row index out of the file; the 'datasets' directory
# is not created here, so it presumably must already exist.
test_merged_filled.to_csv('datasets/test_merged_filled.csv', index=False, encoding='latin')