In [51]:
import pandas as pd
import matplotlib.pyplot as plt
import math
import numpy as np
import gdown
from sklearn.cluster import KMeans
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import (mean_squared_error, r2_score, accuracy_score, confusion_matrix, f1_score)
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [21]:
url = 'https://drive.google.com/uc?id=1p-TGJx4b6HK5JhotLcdtLF3mdJOUPiE8'
output = "SolarPrediction.csv"
gdown.download(url, output, quiet=False)

df = pd.read_csv(
    'SolarPrediction.csv',
    sep=',',
    na_values=['?'],
    low_memory=False,
    on_bad_lines='skip'
)

Downloading...
From: https://drive.google.com/uc?id=1p-TGJx4b6HK5JhotLcdtLF3mdJOUPiE8
To: /content/SolarPrediction.csv
100%|██████████| 2.96M/2.96M [00:00<00:00, 71.8MB/s]


In [22]:
print(df.columns.tolist())

['UNIXTime', 'Data', 'Time', 'Radiation', 'Temperature', 'Pressure', 'Humidity', 'WindDirection(Degrees)', 'Speed', 'TimeSunRise', 'TimeSunSet']


In [23]:
y = df["Radiation"]

In [24]:
X = df[["Temperature", "Pressure", "Humidity", "WindDirection(Degrees)", "Speed"]]


In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [28]:
Models = {
    "LinearRegression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(random_state=0),
    "Random Forest": RandomForestRegressor(random_state=0, n_estimators=100)
}

In [33]:
for name, model in Models.items():
  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)

  r2 = r2_score(y_test, y_pred)
  mse = mean_squared_error(y_test, y_pred)
  rmse = np.sqrt(mse)

  print(f"Modelo: {name}")
  print(f"R²: {r2: .2f}")
  print(f"MSE: {mse: .2f}")
  print(f"RMSE: {rmse: .2f}")


Modelo: LinearRegression
R²:  0.57
MSE:  43953.69
RMSE:  209.65
Modelo: Decision Tree
R²:  0.52
MSE:  48815.01
RMSE:  220.94
Modelo: Random Forest
R²:  0.74
MSE:  26056.46
RMSE:  161.42


In [None]:
## O modelo que melhor explica é o Random Forest, modelo não linear.

In [40]:
url = "https://drive.google.com/uc?id=1gB4AXCUOaJvMFZe--Xlo42q92_DhYuhq"
output = "T1.csv"
gdown.download(url, output, quiet=False)

df2 = pd.read_csv(
    'T1.csv',
    sep=',',
    na_values=['?'],
    low_memory=False,
    on_bad_lines='skip'
)

Downloading...
From: https://drive.google.com/uc?id=1gB4AXCUOaJvMFZe--Xlo42q92_DhYuhq
To: /content/T1.csv
100%|██████████| 3.97M/3.97M [00:00<00:00, 34.6MB/s]


In [36]:
print(df.columns.tolist())

['Date/Time', 'LV ActivePower (kW)', 'Wind Speed (m/s)', 'Theoretical_Power_Curve (KWh)', 'Wind Direction (°)']


In [41]:
threshold = 1000
df2["stability"] = (df2["LV ActivePower (kW)"] >= threshold).astype(int)

In [42]:
y = df2["stability"]

In [43]:
X = df2[['LV ActivePower (kW)', 'Wind Speed (m/s)', 'Theoretical_Power_Curve (KWh)', 'Wind Direction (°)']]

In [55]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, shuffle=True)

In [56]:
Models = {
    "Decision Tree": DecisionTreeClassifier(random_state=0),
    "KNN": KNeighborsClassifier(),
    "Logistic Regression": LogisticRegression(max_iter=1000)
}

for name, model in Models.items():
  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)

  acc = accuracy_score(y_test, y_pred)
  cm = confusion_matrix(y_test, y_pred)
  f1 = f1_score(y_test, y_pred)

  print(f"Modelo: {name}")
  print(f"Acurácia: {acc: .2f}")
  print(f"Matriz de confusao: {cm}")
  print(f"F1 Score: {f1: .2f}")

Modelo: Decision Tree
Acurácia:  1.00
Matriz de confusao: [[5441    0]
 [   0 4665]]
F1 Score:  1.00
Modelo: KNN
Acurácia:  1.00
Matriz de confusao: [[5430   11]
 [   7 4658]]
F1 Score:  1.00
Modelo: Logistic Regression
Acurácia:  1.00
Matriz de confusao: [[5441    0]
 [   0 4665]]
F1 Score:  1.00


In [61]:
## O resultado saiu perfeito, não sei se está correto, mas os melhores modelos dentre os 3 são : Decision Tree e Logistic Regression
