In [None]:
#DATASETS:  https://www.kaggle.com/competitions/daasbstp2023/data

In [None]:
#Let's start by importing all packages needed.
import sklearn as skl
import pandas as pd
import matplotlib.pyplot as plt
from ipywidgets import interact, widgets
import seaborn as sns
from sklearn import preprocessing
import numpy as np
import time
from dateutil import parser
import pytz

from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.tree import plot_tree, export_text


from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, KFold, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import make_column_transformer, ColumnTransformer
from sklearn.datasets import make_blobs
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.cluster import KMeans

from scipy.stats import uniform, poisson

from sklearn.metrics import confusion_matrix, \
                            accuracy_score, \
                            precision_score, \
                            recall_score, \
                            f1_score, \
                            fbeta_score, \
                            mean_squared_error, \
                            mean_absolute_error, \
                            roc_auc_score, \
                            roc_curve, \
                            classification_report


# **Sample and Assess**

In [None]:
# Read the four raw CSV exports (energy and weather, one file per period).
# na_filter=False turns off pandas' missing-value detection, so empty fields
# and textual labels are kept as literal strings instead of becoming NaN.
csv_read_options = dict(na_filter=False, encoding="latin")

df_en1 = pd.read_csv('daasbstp2023/energia_202109-202112.csv', **csv_read_options)
df_en2 = pd.read_csv('daasbstp2023/energia_202201-202212.csv', **csv_read_options)
df_me1 = pd.read_csv('daasbstp2023/meteo_202109-202112.csv', **csv_read_options)
df_me2 = pd.read_csv('daasbstp2023/meteo_202201-202212.csv', **csv_read_options)

In [None]:
# Stack the two periods of each dataset into one frame with a fresh 0..n-1 index.
# pd.concat is the public API for this; DataFrame._append is a private pandas
# method that may change or disappear without notice.
df_en = pd.concat([df_en1, df_en2], ignore_index=True)
df_me = pd.concat([df_me1, df_me2], ignore_index=True)

## Energy Consumption Dataset

This dataset provides information on energy consumption recorded at different timestamps. Each record includes the following columns:

- **Data (Date):** The timestamp associated with the record, indicating the date.
- **Hora (Hour):** The hour associated with the record.
- **Normal (kWh):** The amount of electrical energy consumed, in kilowatt-hours (kWh), from the electrical grid during a normal period in daily bi-hourly cycles (non-off-peak hours).
- **Horário Económico (kWh):** The amount of electrical energy consumed, in kilowatt-hours (kWh), from the electrical grid during an economic period in daily bi-hourly cycles (off-peak hours).
- **Autoconsumo (kWh):** The amount of electrical energy consumed, in kilowatt-hours (kWh), generated from solar panels (self-consumption).
- **Injeção na rede (kWh):** A qualitative feature indicating the level of energy injection into the grid on a scale of None, Low, Medium, High, and Very High.

This dataset is utilized in a competition and includes various features, with particular emphasis on the "Injeção na rede (kWh)" feature, which provides information about the qualitative scale of energy injection into the grid.

In [None]:
# Print column names, dtypes and non-null counts of the combined energy frame.
df_en.info()

In [None]:
# Display the record at position 901 as a one-row frame (one column per field).
df_en.iloc[901].to_frame().T

In [None]:
# Per-column flag: True if the column holds at least one NaN.
# NOTE: the CSVs were read with na_filter=False, so empty fields are strings
# and are not counted as missing here.
df_en.isna().any()

In [None]:
# Report how many distinct values each column of the energy data holds.
for column in df_en:
    unique_values = pd.unique(df_en[column])
    print(f"{column}, Number of Unique Values: {unique_values.size}")

## Weather Information Dataset

This dataset provides comprehensive information related to weather conditions at different timestamps. The columns in the dataset include:

- **dt (Timestamp):** The timestamp associated with the record.
- **dt_iso (ISO Date):** The date associated with the record, accurate to the second.
- **city_name:** The location for which weather data is recorded.
- **temp (Temperature):** The temperature in degrees Celsius.
- **feels_like (Feels Like):** The perceived temperature in degrees Celsius.
- **temp_min (Min Temperature):** The minimum felt temperature in degrees Celsius.
- **temp_max (Max Temperature):** The maximum felt temperature in degrees Celsius.
- **pressure:** Atmospheric pressure in atmospheres.
- **sea_level:** Atmospheric pressure at sea level in atmospheres.
- **grnd_level:** Atmospheric pressure at local altitude in atmospheres.
- **humidity:** Humidity level as a percentage.
- **wind_speed:** Wind speed in meters per second.
- **rain_1h (Rainfall):** Average precipitation value in the last hour.
- **clouds_all (Cloudiness):** Cloud coverage percentage.
- **weather_description:** Qualitative assessment of weather conditions.


In [None]:
# Print column names, dtypes and non-null counts of the combined weather frame.
df_me.info()

In [None]:
# Display the record at position 801 as a one-row frame (one column per field).
df_me.iloc[801].to_frame().T

In [None]:
# Per-column flag: True if the column holds at least one NaN.
# NOTE: the CSVs were read with na_filter=False, so empty fields are strings
# and are not counted as missing here.
df_me.isna().any()

In [None]:
# Report how many distinct values each column of the weather data holds.
for column in df_me:
    unique_values = pd.unique(df_me[column])
    print(f"{column}, Number of Unique Values: {unique_values.size}")

# **Data Processing**

After loading the data, we must process it before it can be used in our model. We will follow these steps:
1. Remove unnecessary columns
2. Handle the Date column on both datasets and unify the format
3. Join the datasets
4. Handle the missing values or rows (dates that are not present in both datasets) if any
5. Handle the categorical values

In [None]:
# city_name, sea_level and grnd_level each hold a single unique value,
# so they carry no information and are removed from the weather frame.
constant_columns = ['city_name', 'sea_level', 'grnd_level']
df_me = df_me.drop(columns=constant_columns)

In [None]:
# # Convert columns to categorical
# df_en["Injeção na rede (kWh)"] = df_en["Injeção na rede (kWh)"].astype("category")
# df_me["weather_description"] = df_me['weather_description'].astype("category")

# # For "Injeção na rede (kWh)" column in df_en
# print("Categories for 'Injeção na rede (kWh)' in df_en:")
# print(df_en["Injeção na rede (kWh)"].cat.categories)
# print("\n")

# # For "weather_description" column in df_me
# print("Categories for 'weather_description' in df_me:")
# print(df_me["weather_description"].cat.categories)
# print("\n")


In [None]:
# Combine the date string and the hour number into one 'YYYY-MM-DD H:00:00'
# text, then parse it into a single timestamp column.
timestamp_text = df_en['Data'] + ' ' + df_en['Hora'].astype(str) + ':00:00'
df_en['datetime'] = pd.to_datetime(timestamp_text, format='%Y-%m-%d %H:%M:%S')

# 'Data' and 'Hora' are now redundant
df_en = df_en.drop(columns=['Data', 'Hora'])

# Show one record of the updated frame
df_en.iloc[901].to_frame().T

In [None]:
# Re-check dtypes after the conversion: 'datetime' should now be a datetime column.
df_en.info()

In [None]:
# Parse the 'dt_iso' text (which carries an offset and a literal 'UTC' suffix),
# then strip the timezone so the values are naive timestamps like df_en's.
parsed_dt = pd.to_datetime(df_me['dt_iso'], format='%Y-%m-%d %H:%M:%S %z UTC')
df_me['dt_iso'] = parsed_dt.dt.tz_localize(None)

# Use the same column name as the energy frame so the two can be joined on it
df_me = df_me.rename(columns={"dt_iso": "datetime"})
df_me.iloc[801].to_frame().T

In [None]:
# 'dt' duplicates the information already held in 'datetime', so remove it.
df_me = df_me.drop(columns='dt')

In [None]:
# Sort both frames chronologically so that gaps between consecutive
# records can be detected.
df_en = df_en.sort_values('datetime')
df_me = df_me.sort_values('datetime')

In [None]:
# Gap between each record and the one before it (frames are sorted by
# 'datetime' in the previous cell).
expected_step = pd.Timedelta(hours=1)
time_diff_en = df_en['datetime'].diff()
time_diff_me = df_me['datetime'].diff()

# Keep only gaps that differ from one hour. The first row has no predecessor,
# so its diff is NaT; it is skipped explicitly, otherwise it would always be
# reported as an irregular interval.
irregularities_en = time_diff_en[time_diff_en.notna() & (time_diff_en != expected_step)]
irregularities_me = time_diff_me[time_diff_me.notna() & (time_diff_me != expected_step)]

# Print the irregular time intervals (empty Series means a clean hourly sequence)
print("Irregular time intervals in df_en:")
print(irregularities_en)
print("\n")
print("Irregular time intervals in df_me:")
print(irregularities_me)


In [None]:
# Outer join keeps every timestamp from either frame; columns from the side
# that has no matching row are filled with NaN.
outer_join_merged_df = df_en.merge(df_me, how='outer', on='datetime')

In [None]:
# NaN count per column after the outer join: a non-zero count on one dataset's
# columns means that many timestamps exist only in the other dataset.
outer_join_merged_df.isna().sum()

Since the datetime data exhibited no irregularities, the additional entries present in the Weather dataset but not in the Energy dataset can be attributed to the Weather dataset containing data from days before or after the Energy dataset's first or last entry, respectively. A manual analysis of the dataset reveals that the Weather dataset includes entries starting from 2021-09-01, while the Energy dataset commences from 2021-09-29. Consequently, it is necessary to exclude entries from the Weather dataset that precede 2021-09-29, as they will not contribute to the modeling process. In order to achieve this we will do an Inner Join between the two datasets on the datetime column, and the resulting dataset will be the one we use going forward.

In [None]:
# Keep only the timestamps present in both frames.
# (Author's note: this inner join discards about 5.7% of all data rows.)
merged_df = df_en.merge(df_me, how='inner', on='datetime')

In [None]:
# Save the merged dataset to disk.
# NOTE: the frame's index is written as an extra first column (to_csv default),
# and this snapshot is taken before the later renaming / rain_1h / encoding steps.
merged_df.to_csv('merged.csv')

In [None]:
# Give the target column a short ASCII name that is easier to reference.
merged_df = merged_df.rename(columns={'Injeção na rede (kWh)': 'Injection'})
merged_df.info()

In [None]:
# Display the merged record at position 901 as a one-row frame.
merged_df.iloc[901].to_frame().T

In [None]:
# 'rain_1h' was read as text; an empty string is treated as zero rainfall
# before the column is converted to a numeric dtype.
rain_text = merged_df['rain_1h'].replace('', '0')
merged_df['rain_1h'] = pd.to_numeric(rain_text)

In [None]:
# Re-check dtypes: 'rain_1h' should now be numeric.
merged_df.info()

In [None]:
# Frequency of every weather description, most common first.
weather_description_counts = merged_df.loc[:, 'weather_description'].value_counts()
print(weather_description_counts)

In [None]:
# Perform One-Hot Encoding for 'weather_description'.
# get_dummies replaces the listed column with one 'weather_<category>' indicator
# column per category, so 'weather_description' no longer exists afterwards.
merged_df = pd.get_dummies(merged_df, columns=['weather_description'], prefix='weather')


In [None]:
# pd.get_dummies(..., columns=['weather_description']) already removes the
# source column, so an unconditional drop raises KeyError on a clean run.
# errors='ignore' makes this cell a safe no-op when the column is gone and
# still drops it if it happens to be present.
merged_df = merged_df.drop(columns='weather_description', errors='ignore')


# ALTERNATIVE PATH
---


In [None]:
# Ordinal encoding of the injection levels: the position in the list is the rank.
order_mapping = {
    level: rank
    for rank, level in enumerate(['None', 'Low', 'Medium', 'High', 'Very High'])
}

# Work on a copy so the main frame keeps the original labels
merged_df_testing = merged_df.assign(
    Injection=merged_df['Injection'].map(order_mapping)
)

merged_df_testing.iloc[901].to_frame().T

In [None]:
plt.figure(figsize=(12, 6))

# Injection levels (ordinal codes 0-4) in ascending order
injecao_order = [0, 1, 2, 3, 4]

# Box plot with 'clouds_all' on the x-axis and one box per injection level on the y-axis.
# Both variables are numeric, so without an explicit orientation seaborn treats x
# ('clouds_all') as the grouping variable and applies `order` to cloud values.
# orient='h' makes 'Injection' the categorical axis, so `order` and the palette
# refer to the injection levels, matching the legend built below.
sns.boxplot(
    x='clouds_all',
    y='Injection',
    data=merged_df_testing,
    order=injecao_order,
    orient='h',
    palette='viridis',
)

plt.title('Injeção na Rede vs. Cloud Coverage')
plt.xlabel('Cloud Coverage (%)')
plt.ylabel('Injeção na Rede')

# Manually build a legend that maps each colour to its label and ordinal code
legend_labels = ['None', 'Low', 'Medium', 'High', 'Very High']
legend_colors = sns.color_palette('viridis', n_colors=len(legend_labels))
legend_handles = [
    plt.Line2D([0], [0], marker='o', color='w', label=f'{label} ({value})',
               markerfacecolor=color, markersize=10)
    for value, (label, color) in enumerate(zip(legend_labels, legend_colors))
]

# Place the legend outside the axes, on the right-hand side
plt.legend(handles=legend_handles, title='Injeção na Rede', bbox_to_anchor=(1.05, 1), loc='upper left')

# Explicitly show the plot
plt.show()

In [None]:
# Make sure 'rain_1h' is numeric (empty strings mean no rain). Re-applying this
# to an already numeric column leaves it unchanged.
merged_df['rain_1h'] = pd.to_numeric(merged_df['rain_1h'].replace('', '0'))

# Exclude 'Injection' and 'weather_description' from the correlation matrix.
# errors='ignore' is needed because 'weather_description' is removed by the
# one-hot encoding step, in which case a plain drop raises KeyError.
columns_to_exclude = ['Injection', 'weather_description']
numeric_features = (
    merged_df
    .drop(columns=columns_to_exclude, errors='ignore')
    # Correlation is only defined for numeric data: this leaves out the
    # 'datetime' column and any boolean indicator columns.
    .select_dtypes(include='number')
)
correlation_matrix = numeric_features.corr()

# Generate a heatmap of the correlation matrix
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", linewidths=.5)
plt.title('Correlation Matrix (Excluding "Injection" and "weather_description")')
plt.show()

In [None]:
# 'Injection' is an ordered category: None < Low < Medium < High < Very High
injection_order = ['None', 'Low', 'Medium', 'High', 'Very High']
merged_df['Injection'] = pd.Categorical(merged_df['Injection'], categories=injection_order, ordered=True)

# Numerical feature columns to plot ('Injection' is categorical, so it is not selected)
numerical_columns = merged_df.select_dtypes(include=['float64', 'int64']).columns

# Lay the plots out 4 per row. The grid is sized from the number of columns
# actually plotted (not from every column of the frame), so the figure is not
# padded with rows of empty axes.
num_columns = 4
num_rows = max(1, -(-len(numerical_columns) // num_columns))  # ceiling division

# squeeze=False always returns a 2-D array, so flatten() works even for one row
fig, axes = plt.subplots(num_rows, num_columns, figsize=(16, 4 * num_rows), squeeze=False)
axes = axes.flatten()

# One box plot per numerical feature, grouped by injection level
for ax, column in zip(axes, numerical_columns):
    sns.boxplot(x='Injection', y=column, data=merged_df, ax=ax, order=injection_order)
    ax.set_title(f'Box Plot of {column} by Injection')

# Remove the unused axes in the last row
for ax in axes[len(numerical_columns):]:
    fig.delaxes(ax)

# Adjust layout
plt.tight_layout()
plt.show()