# Data preparation

## Data do datasetu s 5minutovým intervalem

### Data z FVE

In [1]:
import pandas as pd
from datetime import timedelta


df_generation = pd.read_csv("data\solax\datacsvnaexport_final.csv", sep = ";")
df_generation['update time'] = df_generation['update time'].str.rstrip('.')
df_generation['timestamp'] = pd.to_datetime(df_generation['update time'], format='%Y-%m-%d %H:%M:%S', errors='coerce')

### Data z meteostanic 

In [2]:
# load
primary_df = pd.read_csv('data\wunderground\IVELKO9.csv')
secondary_df = pd.read_csv('data\wunderground\IPODBR33.csv')

# pridan nazev stanice
primary_df = primary_df.assign(dataset='IVELKO9')
secondary_df = secondary_df.assign(dataset='IPODBR33')

# vytvoren timestamp sloupec
primary_df['timestamp'] = primary_df['Date'] + ' ' + primary_df['Time']
secondary_df['timestamp'] = secondary_df['Date'] + ' ' + secondary_df['Time']
# konvert
primary_df['timestamp'] = pd.to_datetime(primary_df['timestamp'], format='%Y/%m/%d %I:%M %p', errors='coerce')
secondary_df['timestamp'] = pd.to_datetime(secondary_df['timestamp'], format='%Y/%m/%d %I:%M %p', errors='coerce')
# drop NAT
primary_df = primary_df.dropna(subset=['timestamp'])
secondary_df = secondary_df.dropna(subset=['timestamp'])
# zaokrouhleno
primary_df["rounded_timestamp"] = primary_df["timestamp"].dt.round("5min")
secondary_df["rounded_timestamp"] = secondary_df["timestamp"].dt.round("5min")

# reseni missing values vypadku stanic a concat
primary_mask = primary_df['Temperature_C'].isna()
missing_timestamps = primary_df.loc[primary_mask, 'rounded_timestamp']
secondary_subset = secondary_df[secondary_df['rounded_timestamp'].isin(missing_timestamps)]
merged_df = pd.concat([primary_df, secondary_subset], ignore_index=True)

# seřadit podle rounded_timestamp pro časovou posloupnost
merged_df = merged_df.sort_values('rounded_timestamp')

#drop zbytečné sloupce
merged_df_dropped = merged_df.drop(columns=['Date','Time',])

# rounded_timestamp na index0 pro přehlednost
cols = list(merged_df_dropped.columns)
cols.insert(0, cols.pop(cols.index('rounded_timestamp')))
merged_df_dropped = merged_df.loc[:, cols]


df_weather = merged_df_dropped.dropna(subset=['Temperature_C'])

### Spojení dataset
Napojovat se bude na timestamp, takže je provedeno zaokrouhlení na 5min

In [3]:
df_reference = pd.read_csv('data_final/reference_table.csv')
#df_consumption = pd.read_csv('data/cez_data_elektromer/pnd_spotreba3.csv',sep = ";", encoding='ISO-8859-1')


df_generation.drop(['EPS active power R(W)','EPS active power S(W)','EPS active power T(W)','EPS apparent power R(VA)','EPS apparent power S(VA)'
                    ,'EPS apparent power T(VA)'], axis=1, inplace=True)


# fce na zaokrouhlení
def custom_rounding(timestamp, interval):
    delta = timedelta(minutes=interval)
    remainder = timestamp.minute % interval
    if remainder < interval / 2:
        rounded = timestamp - timedelta(minutes=remainder)
    elif remainder == interval / 2:
        rounded = timestamp - timedelta(minutes=remainder)
    else:
        rounded = timestamp + (delta - timedelta(minutes=remainder))
    return rounded.replace(second=0, microsecond=0)

# timestamp na datetime format
df_reference['timestamp'] = pd.to_datetime(df_reference['datetime'])
df_generation['timestamp'] = pd.to_datetime(df_generation['timestamp'])
df_weather['timestamp'] = pd.to_datetime(df_weather['timestamp'])

#call rounding fce
df_generation['timestamp_rounded'] = df_generation['timestamp'].apply(lambda x: custom_rounding(x, 5))
df_weather['timestamp_rounded'] = df_weather['timestamp'].apply(lambda x: custom_rounding(x, 5))

df_reference.set_index('timestamp', inplace=True)
df_generation.set_index('timestamp_rounded', inplace=True)
df_weather.set_index('timestamp_rounded', inplace=True)

df_merged_with_reference = pd.merge(df_reference, df_generation, how='left', left_index=True, right_index=True)
df_final_merged = pd.merge(df_merged_with_reference, df_weather, how='left', left_index=True, right_index=True)

df_final_merged.index.name = 'timestamp'

#drop nepotrebnych
df_final_merged.drop(['update time','timestamp_x','rounded_timestamp','timestamp_y','datetime'], axis=1, inplace=True)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_weather['timestamp'] = pd.to_datetime(df_weather['timestamp'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_weather['timestamp_rounded'] = df_weather['timestamp'].apply(lambda x: custom_rounding(x, 5))


## Vytvoření datasetu s intervaly 1d

In [4]:
# dict na resampling metodu
resample_dict = {
    'daily yield(kWh)': 'max',
    'consume energy(kWh)': 'last',
    'feed-in energy(kWh)': 'last',
    'PV1 input power(W)': 'max',
    'PV2 input power(W)': 'max',
    'feed-in power(W)': 'min',
    'feed-in power(W)': 'max',
}

# resample
df_filtered = df_final_merged.between_time('00:05', '23:55')
daily_df = df_filtered.resample('D').agg(resample_dict)

df_filtered['Temperature_C'] = pd.to_numeric(df_filtered['Temperature_C'], errors='coerce')

daily_df['peak_production_time'] = df_filtered['output power(W)'].resample('D').apply(lambda x: x.idxmax().time())
daily_df['peak_PV1_production_time'] = df_filtered['PV1 input power(W)'].resample('D').apply(lambda x: x.idxmax().time())
daily_df['peak_PV2_production_time'] = df_filtered['PV2 input power(W)'].resample('D').apply(lambda x: x.idxmax().time())
daily_df['peak_feed_inpower_time'] = df_filtered['feed-in power(W)'].resample('D').apply(lambda x: x.idxmax().time())
daily_df['low_feed_inpower_time'] = df_filtered['feed-in power(W)'].resample('D').apply(lambda x: x.idxmin().time())
daily_df['average_temperature'] = df_filtered['Temperature_C'].resample('D').mean()
daily_df['total_precipitation'] = df_filtered['Precip_Accum_mm'].resample('D').last()

# kalkulace feature
daily_df['daily_consumption'] = daily_df['consume energy(kWh)'] - df_final_merged['consume energy(kWh)'].resample('D').first()
daily_df['daily_feed_in_energy'] = daily_df['feed-in energy(kWh)'] - df_final_merged['feed-in energy(kWh)'].resample('D').first()

# drop zbytecnychy
daily_df.drop(columns=['consume energy(kWh)', 'feed-in energy(kWh)'], inplace=True)

daily_df.to_csv("daily.csv")



TypeError: '>=' not supported between instances of 'str' and 'float'

## Kategorizace atributu pro CleverMiner

### Sjednocení formátu float values
Některé features měly decimal oddělený "," a některé "."

In [6]:
columns_to_convert = ['PV1 voltage (V)', 'PV1 current (A)', 'PV1 input power(W)',
       'PV2 voltage (V)', 'PV2 current (A)', 'PV2 input power(W)',
       'AC current R(A)', 'AC voltage R(V)', 'AC current S(A)',
       'AC voltage S(V)', 'AC current T(A)', 'AC voltage T(V)',
       'output power(W)', 'feed-in power(W)', 'daily yield(kWh)',
       'total yield(kWh)', 'feed-in energy(kWh)', 'consume energy(kWh)',
       'Temperature_C', 'Dew_Point_C', 'Humidity_%', 'Speed_kmh', 'Gust_kmh',
       'Pressure_hPa', 'Precip_Rate_mm', 'Precip_Accum_mm', 'UV', 'Solar_w/m2']


def replace_zeros(value):
    if isinstance(value, str):
        if value.strip() == '0':
            return '0.0'
        return value.replace(',', '.')
    return value

df = df.applymap(replace_zeros)

for column in columns_to_convert:
    df[column] = pd.to_numeric(df[column], errors='coerce')





NameError: name 'df' is not defined

### Bohužel. ruční kategorizace pomocí vlastního definování hranic a názvů kategorií

In [None]:
# FVE
bins = [-1, 50, 300, 400, 500, 602]
bins2 = [-1, 50, 270, 350, 440, 506]
bins3 = [-1, 0.1, 3, 7, 10, 15.4]
bins4 = [-1, 15, 500, 1500, 3000, 4500, 6010]
bins5 = [-1, 15, 500, 1000, 2750, 4000, 5800]
bins6 = [-500, -1, 1, 1000, 3000, 5000, 7000, 12000]
bins7 = [-10500, -7500, -5000, -2500, -1000, -1, 1, 1000, 3000, 5000, 7000, 12000]

labels = ['Nula', 'Extremely Low', 'Low', 'Normal', 'High']
labels2 = ['Nula', 'A<3', '3<A<7', '7<A<10', '10<A<15.4']
labels3 = ['Nula', 'Extremely Low', 'Low', 'Normal', 'High', 'Very High']
labels4 = ['Negative', 'Nula', 'Extremely Low', 'Low', 'Normal', 'High', 'Very High']
labels5 = ['Neg Very High', 'Neg High', 'Neg Normal', 'Neg Low', 'Neg Extremely Low',
 'Nula', 'Extremely Low', 'Low', 'Normal', 'High', 'Very High']

df['PV1_voltage_(V)_cat'] = pd.cut(df['PV1 voltage (V)'], bins=bins, labels=labels)
df['PV2_voltage_(V)_cat'] = pd.cut(df['PV2 voltage (V)'], bins=bins2, labels=labels)
df['PV1_current_(A)_cat'] = pd.cut(df['PV1 current (A)'], bins=bins3, labels=labels2)
df['PV2_current_(A)_cat'] = pd.cut(df['PV2 current (A)'], bins=bins3, labels=labels2)
df['PV1_input_power(W)_cat'] = pd.cut(df['PV1 input power(W)'], bins=bins4, labels=labels3)
df['PV2_input_power(W)_cat'] = pd.cut(df['PV2 input power(W)'], bins=bins5, labels=labels3)
df['output_power(W)_cat'] = pd.cut(df['output power(W)'], bins=bins6, labels=labels4)
df['feed-in_power(W)_cat'] = pd.cut(df['feed-in power(W)'], bins=bins7, labels=labels5)

#pocasi
bin = [-14, -10, -5, 0, 5, 10, 15, 20, 25, 30, 35, 37]
bin1 = [-15, -10, -5, 0, 5, 10, 15, 20, 25]
bin2 = [20, 40, 60, 80, 100]
bin3 = [-1, 2.5, 5, 10, 15, 20]
bin4 = [-1, 0.1, 0.2, 0.5, 1, 2, 5, 10, 20, 40, 60, 80, 100]
bin5 = [-1, 1, 100]

label = ['<-10', '-10 až -5', '-5 až 0', '0 až 5', '5 až 10', '10 až 15', '15 až 20', '20 až 25', '25 až 30', '30 až 35', '>35']
label1 = ['<-10', '-10 až -5', '-5 až 0', '0 až 5', '5 až 10', '10 až 15', '15 až 20', '20 až 25']
label2 = ['20-39', '40-59', '60-79', '80-99']
label3 = ['0-2.5','2.5-5', '5-9', '10-14', '15-19']
label4 = ['0', '0.1-0.19', '0.2-0.49', '0.5-0.99', '1-1.99', '2-4.99', '5-9.99', '10-19.99', '20-39.99', '40-59.99', '60-79.99', '80-99.99']
label5 = ['neprší', "prší"]

df['Temperature_cat'] = pd.cut(df['Temperature_C'], bins=bin, labels=label, include_lowest=True)
df['Dew_Point_C_cat'] = pd.cut(df['Dew_Point_C'], bins=bin1, labels=label1, include_lowest=True)
df['Humidity_%_cat'] = pd.cut(df['Humidity_%'], bins=bin2, labels=label2, include_lowest=True)
df['Speed_kmh_cat'] = pd.cut(df['Speed_kmh'], bins=bin3, labels=label3, include_lowest=True)
df['Precip_Rate_mm_cat'] = pd.cut(df['Precip_Rate_mm'], bins=bin4, labels=label4, include_lowest=True)
df['prší?'] = pd.cut(df['Precip_Rate_mm'], bins=bin5, labels=label5, include_lowest=True)


### vytvoření sloupců s informací o dni v týdnu a hodině a zdali se jedná o pracovní den

In [None]:
df['timestamp'] = pd.to_datetime(df['timestamp'])
df['hour'] = df['timestamp'].dt.hour

# 0=Monday, 6=Sunday
df['day_of_week'] = df['timestamp'].dt.dayofweek
df['month'] = df['timestamp'].dt.month

df['is_workday'] = df['day_of_week'].apply(lambda x: "pracovní den" if x < 5 else "víkend")

df.to_csv('finalintoclever.csv')

## Resampling na hodinové intervaly 

In [10]:
import pandas as pd
df = pd.read_csv('df_final.csv')

print(df.columns)

Index(['Unnamed: 0', 'timestamp', 'PV1 voltage (V)', 'PV1 current (A)',
       'PV1 input power(W)', 'PV2 voltage (V)', 'PV2 current (A)',
       'PV2 input power(W)', 'AC current R(A)', 'AC voltage R(V)',
       'AC current S(A)', 'AC voltage S(V)', 'AC current T(A)',
       'AC voltage T(V)', 'output power(W)', 'feed-in power(W)',
       'daily yield(kWh)', 'total yield(kWh)', 'feed-in energy(kWh)',
       'consume energy(kWh)', 'Inverter Status', 'Battery operating status',
       'Temperature_C', 'Dew_Point_C', 'Humidity_%', 'Wind', 'Speed_kmh',
       'Gust_kmh', 'Pressure_hPa', 'Precip_Rate_mm', 'Precip_Accum_mm', 'UV',
       'Solar_w/m2', 'dataset', 'PV1_voltage_(V)_cat', 'PV2_voltage_(V)_cat',
       'PV1_current_(A)_cat', 'PV2_current_(A)_cat', 'PV1_input_power(W)_cat',
       'PV2_input_power(W)_cat', 'output_power(W)_cat', 'feed-in_power(W)_cat',
       'Temperature_cat', 'Dew_Point_C_cat', 'Humidity_%_cat', 'Speed_kmh_cat',
       'Precip_Rate_mm_cat', 'prší?', 'hour', 'da

In [21]:
import pandas as pd

# Read the CSV file into a DataFrame
df_hod = pd.read_csv('df_final.csv', sep=',')

# Convert 'timestamp' to datetime and set as index
df_hod['timestamp'] = pd.to_datetime(df_hod['timestamp'])
df_hod.set_index('timestamp', inplace=True)

# Define the columns that need to be converted to numeric
columns_to_convert = [
    'PV1 voltage (V)', 'PV1 current (A)', 'PV1 input power(W)', 
    'PV2 voltage (V)', 'PV2 current (A)', 'PV2 input power(W)', 
    'AC current R(A)', 'AC voltage R(V)', 'AC current S(A)', 
    'AC voltage S(V)', 'AC current T(A)', 'AC voltage T(V)', 
    'output power(W)', 'feed-in power(W)', 'daily yield(kWh)', 
    'total yield(kWh)', 'feed-in energy(kWh)', 'consume energy(kWh)',
    'Temperature_C', 'Dew_Point_C', 'Humidity_%', 'Wind', 'Speed_kmh',
    'Gust_kmh', 'Pressure_hPa', 'Precip_Rate_mm', 'Precip_Accum_mm', 'UV',
    'Solar_w/m2'
]

# Convert columns to numeric, coercing errors to NaN
for column in columns_to_convert:
    df_hod[column] = pd.to_numeric(df_hod[column], errors='coerce')

# Define the aggregation dictionary for resampling
aggregation = {
    'PV1 voltage (V)': 'mean',
    'PV1 current (A)': 'mean',
    'PV1 input power(W)': 'mean',
    'PV2 voltage (V)': 'mean',
    'PV2 current (A)': 'mean',
    'PV2 input power(W)': 'mean',
    'AC current R(A)': 'mean',
    'AC voltage R(V)': 'mean',
    'AC current S(A)': 'mean',
    'AC voltage S(V)': 'mean',
    'AC current T(A)': 'mean',
    'AC voltage T(V)': 'mean',
    'output power(W)': 'mean',
    'feed-in power(W)': 'mean',
    'daily yield(kWh)': 'mean',
    'total yield(kWh)': 'mean',
    'feed-in energy(kWh)': 'mean',
    'consume energy(kWh)': 'mean',
    'Inverter Status': lambda x: x.mode()[0] if not x.mode().empty else 'N/A',
    'Battery operating status': lambda x: x.mode()[0] if not x.mode().empty else 'N/A',
    'Temperature_C': 'mean',
    'Dew_Point_C': 'mean',
    'Humidity_%': 'mean',
    'Wind': 'mean',
    'Speed_kmh': 'mean',
    'Gust_kmh': 'mean',
    'Pressure_hPa': 'mean',
    'Precip_Rate_mm': 'mean',
    'Precip_Accum_mm': 'mean',
    'UV': 'mean',
    'Solar_w/m2': 'mean',
    'dataset': lambda x: x.mode()[0] if not x.mode().empty else 'N/A',
    'PV1_voltage_(V)_cat': lambda x: x.mode()[0] if not x.mode().empty else 'N/A',
    'PV2_voltage_(V)_cat': lambda x: x.mode()[0] if not x.mode().empty else 'N/A',
    'PV1_current_(A)_cat': lambda x: x.mode()[0] if not x.mode().empty else 'N/A',
    'PV2_current_(A)_cat': lambda x: x.mode()[0] if not x.mode().empty else 'N/A',
    'PV1_input_power(W)_cat': lambda x: x.mode()[0] if not x.mode().empty else 'N/A',
    'PV2_input_power(W)_cat': lambda x: x.mode()[0] if not x.mode().empty else 'N/A',
    'output_power(W)_cat': lambda x: x.mode()[0] if not x.mode().empty else 'N/A',
    'feed-in_power(W)_cat': lambda x: x.mode()[0] if not x.mode().empty else 'N/A',
    'Temperature_cat': lambda x: x.mode()[0] if not x.mode().empty else 'N/A',
    'Dew_Point_C_cat': lambda x: x.mode()[0] if not x.mode().empty else 'N/A',
    'Humidity_%_cat': lambda x: x.mode()[0] if not x.mode().empty else 'N/A',
    'Speed_kmh_cat': lambda x: x.mode()[0] if not x.mode().empty else 'N/A',
    'Precip_Rate_mm_cat': lambda x: x.mode()[0] if not x.mode().empty else 'N/A',
    'prší?': lambda x: x.mode()[0] if not x.mode().empty else 'N/A',
    'hour': lambda x: x.mode()[0] if not x.mode().empty else 'N/A',
    'day_of_week': lambda x: x.mode()[0] if not x.mode().empty else 'N/A',
    'is_workday': lambda x: x.mode()[0] if not x.mode().empty else 'N/A'
}

# Resample the dataset to hourly data and aggregate
hourly_data = df_hod.resample('H').agg(aggregation)

hourly_data['hourly_yield'] = hourly_data['total yield(kWh)'].diff().shift(-1)
hourly_data['hourly_feed_in_energy'] = hourly_data['feed-in energy(kWh)'].diff().shift(-1)
hourly_data['hourly_consumed'] = hourly_data['consume energy(kWh)'].diff().shift(-1)

numeric_columns3 = [
    'PV1 voltage (V)', 'PV1 current (A)', 'PV1 input power(W)', 
    'PV2 voltage (V)', 'PV2 current (A)', 'PV2 input power(W)', 
    'AC current R(A)', 'AC voltage R(V)', 'AC current S(A)', 
    'AC voltage S(V)', 'AC current T(A)', 'AC voltage T(V)', 
    'output power(W)', 'feed-in power(W)', 'daily yield(kWh)', 
    'total yield(kWh)', 'feed-in energy(kWh)', 'consume energy(kWh)',
    'Temperature_C', 'Dew_Point_C', 'Humidity_%', 'Wind', 'Speed_kmh',
    'Gust_kmh', 'Pressure_hPa', 'Precip_Rate_mm', 'Precip_Accum_mm', 'UV',
    'Solar_w/m2','hourly_yield','hourly_feed_in_energy','hourly_consumed'
]

for column in numeric_columns3:
    hourly_data[column] = hourly_data[column].round(2)

# Save the hourly data to a CSV file
hourly_data.to_csv('hourly_data.csv')





In [22]:
hourly_data['sobestacnost'] = hourly_data['hourly_consumed'].apply(lambda x: 'ano' if x == 0 else 'ne')
hourly_data['system_pracuje'] = hourly_data['hourly_yield'].apply(lambda x: 'ne' if x == 0 else 'ano')
hourly_data['pretok'] = hourly_data['hourly_feed_in_energy'].apply(lambda x: 'ne' if x == 0 else 'ano')

hourly_data.to_csv('hourly_data.csv')


# Kategorizovat 

In [29]:
# dodatečná kategorizace FVE
bins1 = [-1, 0.1, 1.25, 2, 3, 20]
bins2 = [-1000, 0.001, 0.5, 1.5, 2.5, 4, 6, 1000]
bins3 = [-1000, 0.001, 0.5, 1.5, 2.5, 4, 6, 1000]

labels1 = ['<100 Wh', '1000-1250Wh', '1250Wh-2000Wh', '2000-3000Wh', '>3000Wh']
labels2 = ['Vypnuto', '<500Wh', '500-1500Wh', '1500-2500Wh', '2500-4000Wh', '4000-6000Wh', '>6000Wh']
labels3 = ['Nedodava', '<500Wh', '500-1500Wh', '1500-2500Wh', '2500-4000Wh', '4000-6000Wh', '>6000Wh']

hourly_data['h_spotreba_cat'] = pd.cut(hourly_data['hourly_consumed'], bins=bins1, labels=labels1)
hourly_data['h_yield_cat'] = pd.cut(hourly_data['hourly_yield'], bins=bins2, labels=labels2)
hourly_data['h_feedin_cat'] = pd.cut(hourly_data['hourly_feed_in_energy'], bins=bins3, labels=labels3)

hourly_data.to_csv('hourly_data1.csv')

# CleverMiner

### Příprava datasetů

In [39]:
data_h = hourly_data.apply(lambda col: col.astype('category') if col.dtype == 'object' else col)

In [40]:

print(data_h.describe([0, 0.05, 0.50, 0.95, 0.99, 1]))
print(data_h.shape)
print(data_h.dtypes.to_frame('Data Type').style.set_caption("Data Types"))
print(data_h.info())

       PV1 voltage (V)  PV1 current (A)  PV1 input power(W)  PV2 voltage (V)  \
count      9502.000000      9502.000000         9502.000000      9502.000000   
mean        222.142145         1.431563          664.309753       201.773104   
std         221.600057         2.389906         1095.218063       188.719313   
min           0.000000         0.000000            0.000000         0.000000   
0%            0.000000         0.000000            0.000000         0.000000   
5%            0.000000         0.000000            0.000000         0.000000   
50%         174.725000         0.040000           21.540000       254.665000   
95%         489.979500         7.190000         3299.017500       412.709500   
99%         510.527800         9.400000         4178.550900       447.079600   
100%        565.200000        10.730000         4782.750000       479.560000   
max         565.200000        10.730000         4782.750000       479.560000   

       PV2 current (A)  PV2 input power

In [41]:
dhfloat64 = data_h.select_dtypes(include=['float64'])
dhcm = data_h.select_dtypes(exclude=['float64'])

print(dhfloat64.describe([0, 0.05, 0.50, 0.95, 0.99, 1]))
print(dhfloat64.shape)
print(dhfloat64.dtypes.to_frame('Data Type').style.set_caption("Data Types"))
print(dhfloat64.info())
print(dhcm.describe([0, 0.05, 0.50, 0.95, 0.99, 1]))
print(dhcm.shape)
print(dhcm.dtypes.to_frame('Data Type').style.set_caption("Data Types"))
print(dhcm.info())

       PV1 voltage (V)  PV1 current (A)  PV1 input power(W)  PV2 voltage (V)  \
count      9502.000000      9502.000000         9502.000000      9502.000000   
mean        222.142145         1.431563          664.309753       201.773104   
std         221.600057         2.389906         1095.218063       188.719313   
min           0.000000         0.000000            0.000000         0.000000   
0%            0.000000         0.000000            0.000000         0.000000   
5%            0.000000         0.000000            0.000000         0.000000   
50%         174.725000         0.040000           21.540000       254.665000   
95%         489.979500         7.190000         3299.017500       412.709500   
99%         510.527800         9.400000         4178.550900       447.079600   
100%        565.200000        10.730000         4782.750000       479.560000   
max         565.200000        10.730000         4782.750000       479.560000   

       PV2 current (A)  PV2 input power

In [None]:
from cleverminer import cleverminer as cm

## 4FTMINER #1

In [None]:
clm = cm(df=df_copy,proc='4ftMiner',
               quantifiers= {'Conf': 0.70},
               ante ={
                    'attributes':[
                        {'name': 'hour', 'type': 'seq', 'minlen': 1, 'maxlen': 3},
                        {'name': 'month', 'type': 'seq', 'minlen': 1, 'maxlen': 3},
                    ], 'minlen':2, 'maxlen':2, 'type':'con'},
               succ ={
                    'attributes':[
                        {'name': 'feed-in_power(W)_cat', 'type': 'lcut', 'minlen': 1, 'maxlen': 3}
                    ], 'minlen':1, 'maxlen':1, 'type':'con'},
               )


clm.print_summary()
clm.print_rulelist()
clm.print_rule(1)

Cleverminer version 1.0.8.
Starting data preparation ...
Automatically reordering numeric categories ...
Encoding columns into bit-form...
Encoding columns into bit-form...done
Data preparation finished.
Will go for  4ftMiner
Starting to mine rules.
  0%|                                                    |Elapsed Time: 0:00:00
  5%|##                                                  |Elapsed Time: 0:00:00
  9%|####                                                |Elapsed Time: 0:00:00
 13%|#######                                             |Elapsed Time: 0:00:00
 17%|#########                                           |Elapsed Time: 0:00:00
 21%|###########                                         |Elapsed Time: 0:00:00
 26%|#############                                       |Elapsed Time: 0:00:00
 29%|###############                                     |Elapsed Time: 0:00:00
 34%|#################                                   |Elapsed Time: 0:00:00
 36%|##################       

## 4FTMINER #2

In [None]:
clm = cm(df=df_copy,proc='4ftMiner',
               quantifiers= {'Conf':0.8},
               ante ={
                    'attributes':[
                        {'name': 'PV1_current_(A)_cat', 'type': 'subset', 'minlen': 1, 'maxlen': 1},
                    ], 'minlen':1, 'maxlen':2, 'type':'con'},
               succ ={
                    'attributes':[
                        {'name': 'PV1_input_power(W)_cat', 'type': 'subset', 'minlen': 1, 'maxlen': 1} 
                    ], 'minlen':1, 'maxlen':2, 'type':'con'},
                cond ={
                    'attributes':[
                        {'name': 'PV1_voltage_(V)_cat', 'type': 'subset', 'minlen': 1, 'maxlen': 1},
                    ], 'minlen':1, 'maxlen':1, 'type':'con'}
               )


clm.print_summary()
clm.print_rulelist()
clm.print_rule(9)

Cleverminer version 1.0.8.
Starting data preparation ...
Automatically reordering numeric categories ...
Encoding columns into bit-form...
Encoding columns into bit-form...done
Data preparation finished.
Will go for  4ftMiner
Starting to mine rules.
  0%|                                                    |Elapsed Time: 0:00:00
100%|####################################################|Elapsed Time: 0:00:00
Done. Total verifications : 150, rules 11, times: prep 0.30sec, processing 0.03sec

CleverMiner task processing summary:

Task type : 4ftMiner
Number of verifications : 150
Number of rules : 11
Total time needed : 00h 00m 00s
Time of data preparation : 00h 00m 00s
Time of rule mining : 00h 00m 00s


List of rules:
RULEID BASE  CONF  AAD    Rule
     1   848 0.805 +0.715 PV1_current_(A)_cat(A<3) => PV1_input_power(W)_cat(Extremely Low) | PV1_voltage_(V)_cat(Extremely Low)
     2   214 0.986 +7.453 PV1_current_(A)_cat(3<A<7) => PV1_input_power(W)_cat(Low) | PV1_voltage_(V)_cat(Extremel

In [None]:
clm = cm(df=df_copy,target='PV1_input_power(W)_cat',proc='CFMiner',
               quantifiers= {'S_Up':4},
               cond ={
                    'attributes':[
                        {'name': 'month', 'type': 'subset', 'minlen': 1, 'maxlen': 5},
                    ], 'minlen':1, 'maxlen':1, 'type':'con'}
               )


clm.print_summary()
clm.print_rulelist()
clm.print_rule(1)

Cleverminer version 1.0.8.
Starting data preparation ...
Automatically reordering numeric categories ...
Encoding columns into bit-form...
Encoding columns into bit-form...done
Data preparation finished.
Will go for  CFMiner
Starting to mine rules.
  0%|                                                    |Elapsed Time: 0:00:00
 15%|#######                                             |Elapsed Time: 0:00:00
 39%|####################                                |Elapsed Time: 0:00:00
100%|####################################################|Elapsed Time: 0:00:00
Done. Total verifications : 1585, rules 1, times: prep 0.25sec, processing 0.16sec

CleverMiner task processing summary:

Task type : CFMiner
Number of verifications : 1585
Number of rules : 1
Total time needed : 00h 00m 00s
Time of data preparation : 00h 00m 00s
Time of rule mining : 00h 00m 00s


List of rules:
RULEID BASE  S_UP  S_DOWN Condition
     1  5826     4     1 month(5)



Rule id : 1

Base :  5826  Relative base : 

In [None]:

clm = cm(df=constructed_data,proc='4ftMiner',
               quantifiers= {'Base':5},
               ante ={
                    'attributes':[
                        {'name': 'day_of_week', 'type': 'seq', 'minlen': 1, 'maxlen': 3}
                    ], 'minlen':1, 'maxlen':1, 'type':'con'},
               succ ={
                    'attributes':[
                        {'name': 'daily_consumption_binned', 'type': 'subset', 'minlen': 1, 'maxlen':1},
                    ], 'minlen':1, 'maxlen':1, 'type':'con'},
                cond ={
                    'attributes':[
                        {'name': 'month', 'type': 'subset', 'minlen': 1, 'maxlen': 1},
                    ], 'minlen':1, 'maxlen':2, 'type':'con'}
               )

clm.print_summary()
clm.print_rulelist()
clm.print_rule(1)

Cleverminer version 1.0.8.
Starting data preparation ...
Automatically reordering numeric categories ...
Encoding columns into bit-form...
Encoding columns into bit-form...done
Data preparation finished.
Will go for  4ftMiner
Starting to mine rules.
  0%|                                                    |Elapsed Time: 0:00:00
100%|####################################################|Elapsed Time: 0:00:00
Done. Total verifications : 156, rules 156, times: prep 0.04sec, processing 0.06sec

CleverMiner task processing summary:

Task type : 4ftMiner
Number of verifications : 156
Number of rules : 156
Total time needed : 00h 00m 00s
Time of data preparation : 00h 00m 00s
Time of rule mining : 00h 00m 00s


List of rules:
RULEID BASE  CONF  AAD    Rule
     1     5 0.500 +0.292 day_of_week(0 1) => daily_consumption_binned(high) | month(1)
     2     5 0.333 +0.033 day_of_week(0 1 2) => daily_consumption_binned(medium) | month(1)
     3     6 0.400 +0.033 day_of_week(0 1 2) => daily_consump

# Regrese