In [51]:
import pandas as pd
import plotly.express as px

In [52]:
# Load the raw CSV. The timestamp column is parsed at read time so that every
# later cell sees real datetimes, instead of relying on filter_date_range
# having been run first to convert the column in place.
df = pd.read_csv(
    "../data/pvlive_2023_2024.csv",
    parse_dates=["start_datetime_utc"],
)

In [None]:
# Show the loaded frame in the notebook output.
display(df)

In [54]:
def filter_date_range(df, start_date="2023-01-01", end_date="2024-07-15"):
    """Return the rows of ``df`` whose start time lies strictly inside a window.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``start_datetime_utc`` column that ``pd.to_datetime`` can
        parse. Side effect (kept for backward compatibility): that column is
        converted to datetimes on the caller's frame itself.
    start_date, end_date : str or datetime-like
        Exclusive lower and upper bounds; a row is kept only when
        ``start_date < start_datetime_utc < end_date``.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the matching rows (original index preserved),
        so callers can add or modify columns without triggering
        ``SettingWithCopyWarning`` or touching ``df``.
    """
    # Idempotent: a no-op when the column already holds datetimes.
    df["start_datetime_utc"] = pd.to_datetime(df["start_datetime_utc"])

    timestamps = df["start_datetime_utc"]
    # Bounds are compared as given (pandas parses the strings), which keeps
    # the comparison valid for both tz-naive and tz-aware columns.
    in_window = (timestamps > start_date) & (timestamps < end_date)
    return df.loc[in_window].copy()

# df_1: rows inside the helper's default window
# (start_date="2023-01-01", end_date="2024-07-15", both exclusive).
df_1 = filter_date_range(df)

In [None]:
# Capacity over time for the df_1 window. df_1 is built with
# filter_date_range's default bounds, which end at 2024-07-15, so the title
# states that same end date.
# NOTE(review): an `installedcapacity_mwp` column was previously tried as the
# y-axis here; confirm which capacity series is the intended one.
px.scatter(
    df_1,
    x="start_datetime_utc",
    y="capacity_mwp",
    title="Solar Capacity 2023/01/01 - 2024/07/15",
)

In [None]:
# Tag every row of df_1 with its calendar date, then collapse to one row per
# day: the day's maximum generation and the first capacity value seen that day.
df_1['date'] = pd.to_datetime(df_1['start_datetime_utc']).dt.date
per_day = df_1.groupby('date')
daily_peak = per_day.agg(
    generation_mw=('generation_mw', 'max'),
    capacity_mwp=('capacity_mwp', 'first'),
).reset_index()

# Human-readable axis / hover names for the figure below.
axis_labels = {
    "capacity_mwp": "Capacity (MWp)",
    "generation_mw": "Peak Generation (MW)",
    "date": "Date",
}

# One point per day: peak generation (y) against capacity (x); the date is
# exposed on hover.
fig = px.scatter(
    daily_peak,
    x="capacity_mwp",
    y="generation_mw",
    title="Daily Peak Solar Generation vs Capacity",
    labels=axis_labels,
    hover_data=["date"],
)
fig.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import numpy as np

# Scatter the same window the trend line is fitted on.
# NOTE(review): this previously plotted the unfiltered `df`, which did not
# match the df_1-based fit and only had datetime x-values because
# filter_date_range converts the column in place; confirm df_1 is intended.
fig, ax = plt.subplots(figsize=(12, 6))
sns.scatterplot(data=df_1, x='start_datetime_utc', y='capacity_mwp', alpha=0.5, ax=ax)

# Whole seconds since the Unix epoch for every timestamp. Subtracting the
# epoch and floor-dividing by one second is independent of the column's
# storage resolution; astype(int64) // 10**9 is only correct when the dtype
# is nanosecond-based.
x_values = df_1['start_datetime_utc']
unix_epoch = pd.Timestamp(0, tz=x_values.dt.tz)
epoch_seconds = (x_values - unix_epoch) // pd.Timedelta(seconds=1)

# Least-squares line: capacity (MWp) as a function of epoch seconds.
# slope is therefore in MWp per second, intercept in MWp at the epoch.
slope, intercept, r_value, p_value, std_err = stats.linregress(
    x=epoch_seconds,
    y=df_1['capacity_mwp'],
)

# Express the slope per day (86400 s) so it is readable in the legend and
# in the printed summary.
daily_slope = slope * 86400

# Evaluate the fitted line at every observed timestamp and overlay it.
y_values = slope * epoch_seconds + intercept
ax.plot(x_values, y_values, color='red', label=f'Slope: {daily_slope:.2f} MWp/day')

ax.set_title('Solar Capacity Over Time with Trend Line')
ax.set_xlabel('Date')
ax.set_ylabel('Capacity (MWp)')
ax.legend()
ax.tick_params(axis='x', rotation=45)
fig.tight_layout()
plt.show()

print(f"Capacity is increasing by approximately {daily_slope:.2f} MWp per day")

In [58]:
# df_2: same start bound as df_1's default window, with the end bound
# extended to 2024-11-12 (both bounds exclusive).
df_2 = filter_date_range(df, start_date="2023-01-01", end_date="2024-11-12")

In [None]:
# Scatter the longer df_2 window.
fig, ax = plt.subplots(figsize=(12, 6))
sns.scatterplot(data=df_2, x='start_datetime_utc', y='capacity_mwp', alpha=0.5, ax=ax)

# Extend the trend line fitted earlier on df_1 (`slope` in MWp per second,
# `intercept` in MWp at the Unix epoch) across the df_2 timestamps.
# Epoch seconds are derived by subtracting the epoch and floor-dividing by one
# second, which does not depend on the column's storage resolution.
x_values = df_2['start_datetime_utc']
unix_epoch = pd.Timestamp(0, tz=x_values.dt.tz)
y_values = slope * ((x_values - unix_epoch) // pd.Timedelta(seconds=1)) + intercept

# The legend value is computed from the fitted slope (86400 s per day) rather
# than hard-coded, so it cannot go stale when the data or fit window changes.
ax.plot(x_values, y_values, color='red', label=f'Slope: {slope * 86400:.2f} MWp/day')

ax.set_title('Forecast Solar Capacity Retrospective Updates')
ax.set_xlabel('Date')
ax.set_ylabel('Capacity (MWp)')
ax.legend()
ax.tick_params(axis='x', rotation=45)
fig.tight_layout()
plt.show()