# Data preparation

## Data do datasetu s 5minutovým intervalem

### Data z FVE

In [34]:
import pandas as pd
from datetime import timedelta


df_generation = pd.read_csv("data\solax\datacsvnaexport_final.csv", sep = ";")
df_generation['update time'] = df_generation['update time'].str.rstrip('.')
df_generation['timestamp'] = pd.to_datetime(df_generation['update time'], format='%Y-%m-%d %H:%M:%S', errors='coerce')

### Data z meteostanic 

In [35]:
# load
primary_df = pd.read_csv('data\wunderground\IVELKO9.csv')
secondary_df = pd.read_csv('data\wunderground\IPODBR33.csv')

# pridan nazev stanice
primary_df = primary_df.assign(dataset='IVELKO9')
secondary_df = secondary_df.assign(dataset='IPODBR33')

# vytvoren timestamp sloupec
primary_df['timestamp'] = primary_df['Date'] + ' ' + primary_df['Time']
secondary_df['timestamp'] = secondary_df['Date'] + ' ' + secondary_df['Time']
# konvert
primary_df['timestamp'] = pd.to_datetime(primary_df['timestamp'], format='%Y/%m/%d %I:%M %p', errors='coerce')
secondary_df['timestamp'] = pd.to_datetime(secondary_df['timestamp'], format='%Y/%m/%d %I:%M %p', errors='coerce')
# drop NAT
primary_df = primary_df.dropna(subset=['timestamp'])
secondary_df = secondary_df.dropna(subset=['timestamp'])
# zaokrouhleno
primary_df["rounded_timestamp"] = primary_df["timestamp"].dt.round("5min")
secondary_df["rounded_timestamp"] = secondary_df["timestamp"].dt.round("5min")

# reseni missing values vypadku stanic a concat
primary_mask = primary_df['Temperature_C'].isna()
missing_timestamps = primary_df.loc[primary_mask, 'rounded_timestamp']
secondary_subset = secondary_df[secondary_df['rounded_timestamp'].isin(missing_timestamps)]
merged_df = pd.concat([primary_df, secondary_subset], ignore_index=True)

# seřadit podle rounded_timestamp pro časovou posloupnost
merged_df = merged_df.sort_values('rounded_timestamp')

#drop zbytečné sloupce
merged_df_dropped = merged_df.drop(columns=['Date','Time',])

# rounded_timestamp na index0 pro přehlednost
cols = list(merged_df_dropped.columns)
cols.insert(0, cols.pop(cols.index('rounded_timestamp')))
merged_df_dropped = merged_df.loc[:, cols]


df_weather = merged_df_dropped.dropna(subset=['Temperature_C'])

In [36]:
df_reference = pd.read_csv('data_final/reference_table.csv')
#df_consumption = pd.read_csv('data/cez_data_elektromer/pnd_spotreba3.csv',sep = ";", encoding='ISO-8859-1')


df_generation.drop(['EPS active power R(W)','EPS active power S(W)','EPS active power T(W)','EPS apparent power R(VA)','EPS apparent power S(VA)'
                    ,'EPS apparent power T(VA)'], axis=1, inplace=True)


# Custom rounding function
def custom_rounding(timestamp, interval):
    """
    Custom rounding function that rounds timestamps to the nearest interval and sets seconds to zero.
    Breaks ties by rounding down.
    
    :param timestamp: The original timestamp.
    :param interval: The interval to round to, in minutes.
    :return: The rounded timestamp with seconds set to zero.
    """
    # Convert interval to a timedelta
    delta = timedelta(minutes=interval)
    
    # Find the remainder when dividing the timestamp by the interval
    remainder = timestamp.minute % interval
    
    # If the remainder is less than half the interval, round down
    if remainder < interval / 2:
        rounded = timestamp - timedelta(minutes=remainder)
    # If the remainder is exactly half the interval, also round down
    elif remainder == interval / 2:
        rounded = timestamp - timedelta(minutes=remainder)
    # Otherwise, round up
    else:
        rounded = timestamp + (delta - timedelta(minutes=remainder))
    
    # Set seconds (and microseconds) to zero
    return rounded.replace(second=0, microsecond=0)

# Convert timestamps to datetime and set as index
df_reference['timestamp'] = pd.to_datetime(df_reference['datetime'])
df_generation['timestamp'] = pd.to_datetime(df_generation['timestamp'])
df_weather['timestamp'] = pd.to_datetime(df_weather['timestamp'])
#df_consumption['timestamp'] = pd.to_datetime(df_consumption['Datum'], dayfirst=True)

df_generation['timestamp_rounded'] = df_generation['timestamp'].apply(lambda x: custom_rounding(x, 5))
df_weather['timestamp_rounded'] = df_weather['timestamp'].apply(lambda x: custom_rounding(x, 5))

df_reference.set_index('timestamp', inplace=True)
df_generation.set_index('timestamp_rounded', inplace=True)
df_weather.set_index('timestamp_rounded', inplace=True)
#df_consumption.set_index('timestamp', inplace=True)

df_merged_with_reference = pd.merge(df_reference, df_generation, how='left', left_index=True, right_index=True)
df_final_merged = pd.merge(df_merged_with_reference, df_weather, how='left', left_index=True, right_index=True)

df_final_merged.index.name = 'timestamp'
df_final_merged.drop(['update time','timestamp_x','rounded_timestamp','timestamp_y','datetime'], axis=1, inplace=True)

df_final_merged.to_csv("merged_5min.csv")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_weather['timestamp'] = pd.to_datetime(df_weather['timestamp'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_weather['timestamp_rounded'] = df_weather['timestamp'].apply(lambda x: custom_rounding(x, 5))


## Vytvoření datasetu s intervaly 1d

In [42]:
print(df_final_merged['daily yield(kWh)'].dtype)

object


In [47]:
df_final_merged['consume energy(kWh)'] = df_final_merged['consume energy(kWh)'].str.replace(',', '.').astype(float)



In [54]:
# Define a dictionary with the resampling method for each column
resample_dict = {
    'daily yield(kWh)': 'max',
    'consume energy(kWh)': 'last',
    'feed-in energy(kWh)': 'last',
    'PV1 input power(W)': 'max',
    'PV2 input power(W)': 'max',
    'feed-in power(W)': 'min',
    'feed-in power(W)': 'max',
    # Add other columns and their respective methods here
}

# Resample the DataFrame using the specified dictionary
df_filtered = df_final_merged.between_time('00:05', '23:55')
daily_df = df_filtered.resample('D').agg(resample_dict)

df_filtered['Temperature_C'] = pd.to_numeric(df_filtered['Temperature_C'], errors='coerce')

daily_df['peak_production_time'] = df_filtered['output power(W)'].resample('D').apply(lambda x: x.idxmax().time())
daily_df['peak_PV1_production_time'] = df_filtered['PV1 input power(W)'].resample('D').apply(lambda x: x.idxmax().time())
daily_df['peak_PV2_production_time'] = df_filtered['PV2 input power(W)'].resample('D').apply(lambda x: x.idxmax().time())
daily_df[''] = df_filtered['PV1 input power(W)'].resample('D').apply(lambda x: x.idxmax().time())
daily_df['average_temperature'] = df_filtered['Temperature_C'].resample('D').mean()
daily_df['total_precipitation'] = df_filtered['Precip_Accum_mm'].resample('D').last()

# For the consumed and feed-in energy, subtract the first value of the day from the last value
daily_df['daily_consumption'] = daily_df['consume energy(kWh)'] - df_final_merged['consume energy(kWh)'].resample('D').first()
daily_df['daily_feed_in_energy'] = daily_df['feed-in energy(kWh)'] - df_final_merged['feed-in energy(kWh)'].resample('D').first()

# Drop the original 'last' columns as they are no longer needed after the subtraction
daily_df.drop(columns=['consume energy(kWh)', 'feed-in energy(kWh)'], inplace=True)

print(daily_df.head())



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['Temperature_C'] = pd.to_numeric(df_filtered['Temperature_C'], errors='coerce')


            daily yield(kWh)  PV1 input power(W)  PV2 input power(W)  \
timestamp                                                              
2023-03-16              22.6              3837.0              3634.0   
2023-03-17              20.5              3440.0              4075.0   
2023-03-18              22.1              3751.0              3689.0   
2023-03-19              24.4              4075.0              3288.0   
2023-03-20              17.3              4230.0              3892.0   

           peak_production_time peak_PV1_production_time  \
timestamp                                                  
2023-03-16             14:50:00                 13:20:00   
2023-03-17             15:05:00                 10:00:00   
2023-03-18             11:30:00                 12:45:00   
2023-03-19             11:15:00                 12:25:00   
2023-03-20             18:00:00                 11:40:00   

           peak_PV2_production_time  average_temperature total_precipitati