In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import itertools

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix, r2_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler

import tensorflow as tf

import keras
from keras.models import Sequential
from keras.utils import to_categorical
from keras.utils import plot_model
from keras.optimizers import Adam
from keras.layers import Dense, Activation, Dropout, Flatten, Conv1D, MaxPooling1D, BatchNormalization, ZeroPadding2D, Add, ReLU, LSTM, Bidirectional, Input
from keras.callbacks import EarlyStopping, ReduceLROnPlateau

In [None]:
data_frame = pd.read_excel('data.xlsx')

In [None]:
data_frame.head(11)
df_preprocess = data_frame.dropna(subset='Unnamed: 0')

In [None]:
df_preprocess = df_preprocess.reset_index(drop=True)
df_preprocess.columns = df_preprocess.iloc[0]
df_preprocess = df_preprocess.drop(0).reset_index(drop=True)
df_preprocess

In [None]:
df = df_preprocess.iloc[:, [1, 4]]
df

In [None]:
df = df.dropna(subset='Điện áp (V)')
df = df.reset_index(drop=True)
df.shape

In [None]:
df['Điện áp (V)'] = df['Điện áp (V)'].astype(float)

In [None]:
df['Thời điểm đo'] = pd.to_datetime(df['Thời điểm đo'], errors='coerce')
df['Year'] = df['Thời điểm đo'].dt.year
df['Month'] = df['Thời điểm đo'].dt.month
df['Day'] = df['Thời điểm đo'].dt.day
df['Hour'] = df['Thời điểm đo'].dt.hour
df['Minute'] = df['Thời điểm đo'].dt.minute

In [None]:
# Check for null values in the 'Year' column
null_years = df['Điện áp (V)'].isnull().sum()
print(f'Number of null values in the Year column: {null_years}')
df = df.dropna(subset='Điện áp (V)')
df = df.reset_index(drop=True)

In [None]:
df['DayOfWeek'] = df['Thời điểm đo'].dt.dayofweek 
df['DayOfYear'] = df['Thời điểm đo'].dt.dayofyear
df['Quarter'] = df['Thời điểm đo'].dt.quarter
df['IsWeekend'] = df['DayOfWeek'].apply(lambda x: 1 if x >= 5 else 0)


In [None]:
# def get_season(month):
#     if month in [12, 1, 2]:
#         return 'Winter'
#     elif month in [3, 4, 5]:
#         return 'Spring'
#     elif month in [6, 7, 8]:
#         return 'Summer'
#     else:
#         return 'Fall'

# df['Season'] = df['Month'].apply(get_season)

In [None]:
df['TimeDifference'] = df['Thời điểm đo'] - df['Thời điểm đo'].shift(1) 

In [None]:
# from sklearn.preprocessing import OneHotEncoder

# encoder = OneHotEncoder()
# season_encoded = encoder.fit_transform(df[['Season']]).toarray()

In [None]:
# Map the season names to numerical values
season_mapping = {'Winter': 0.0, 'Spring': 1.0, 'Summer': 2.0, 'Fall': 3.0}
df['Season'] = df['Season'].map(season_mapping).astype(float)

In [None]:
# from sklearn.impute import KNNImputer

# # Select the columns to impute
# columns_to_impute = ['Điện áp (V)', 'Year', 'Month', 'Day', 'Hour', 'Minute',
#        'DayOfWeek', 'DayOfYear', 'Quarter', 'IsWeekend', 'Season', 'Hour_sin',
#        'Hour_cos', 'Month_sin', 'Month_cos', 'TimeDifference']

# # Initialize the KNNImputer
# imputer = KNNImputer(n_neighbors=5)

# # Apply the imputer to the selected columns
# df[columns_to_impute] = imputer.fit_transform(df[columns_to_impute])

# # Verify the imputation
# df.info()

In [None]:
import seaborn as sns

# Select the relevant columns
cols = ['Điện áp (V)', 'Year', 'Month', 'Day', 'Hour', 'Minute',
       'DayOfWeek', 'DayOfYear', 'Quarter', 'IsWeekend', 'Season', 'TimeDifference']
corr_matrix = df[cols].corr()

# Plot the heatmap
plt.figure(figsize=(10, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Matrix')
plt.show()

In [None]:
df = df.dropna(subset='TimeDifference')
df = df.reset_index(drop=True)
df

In [None]:

df['TimeDifference_hours'] = df['TimeDifference'].dt.components['hours']
df['TimeDifference_minutes'] = df['TimeDifference'].dt.components['minutes']
df['TimeDifference_day'] = df['TimeDifference'].dt.components['days']


df.head()

In [None]:
df.columns

In [None]:
columns_to_convert = ['Điện áp (V)', 'Year', 'Month', 'Day', 'Hour', 'Minute',
    'DayOfWeek', 'DayOfYear', 'Quarter', 'IsWeekend', 'Season', 'TimeDifference_hours', 
    'TimeDifference_minutes', 'TimeDifference_day']
df_preprocess = df[columns_to_convert].astype(float)
df_preprocess.dtypes

In [None]:
minmax_scaler = MinMaxScaler()
standard_scaler = StandardScaler()
columns_to_normalize = [ 'Year', 'Month', 'Day', 'Hour', 'Minute',
       'DayOfWeek', 'DayOfYear', 'Quarter', 'IsWeekend', 'Season', 'TimeDifference_hours',
       'TimeDifference_minutes', 'TimeDifference_day']

df_preprocess['Điện áp (V)'] = minmax_scaler.fit_transform(df_preprocess[['Điện áp (V)']])
df_preprocess['Điện áp (V)'] = standard_scaler.fit_transform(df_preprocess[['Điện áp (V)']])
df_preprocess[columns_to_normalize] = minmax_scaler.fit_transform(df_preprocess[columns_to_normalize])
df_preprocess[columns_to_normalize] = standard_scaler.fit_transform(df_preprocess[columns_to_normalize])


In [None]:
df_preprocess.info()

In [2]:
data_frame = pd.read_excel("C:/Users/admin/Documents/DPL302m/DPL302m_Material/Code/EduNext_Slot5/data.xlsx")

In [3]:
data_frame.head(11)
df_preprocess = data_frame.dropna(subset='Unnamed: 0')

In [4]:
df_preprocess = df_preprocess.reset_index(drop=True)
df_preprocess.columns = df_preprocess.iloc[0]
df_preprocess = df_preprocess.drop(0).reset_index(drop=True)
df_preprocess

Unnamed: 0,STT,Thời điểm đo,NaN,NaN.1,Điện áp (V),NaN.2,NaN.3,Dòng điện (A),NaN.4,NaN.5,...,NaN.6,NaN.7,Tần số (Hz),NaN.8,NaN.9,NaN.10,I (đm),U (đm),P (đm),Mã trạm
0,1,01/01/22 00:00,01/01/2022 00:01:00,Serial: ML31717088282 - ML3 - Meter NURI\n- TU...,237.234,236.063,237.32,304.2,348.75,334.35,...,,,50.13,50.12,50.12,,811.59,230,560,020343
1,2,01/01/2022 00:30:00,-,Serial: - - -\n- TU: - - TI: - - HSN: -,,,,,,,...,,,,,,,,,,-
2,3,01/01/22 01:00,01/01/2022 00:57:00,Serial: ML31717088282 - ML3 - Meter NURI\n- TU...,234.406,233.089,234.427,279.6,297.9,286.05,...,,,50.1,50.1,50.1,,811.59,230,560,020343
3,4,01/01/22 01:30,01/01/2022 01:25:00,Serial: ML31717088282 - ML3 - Meter NURI\n- TU...,235.978,234.657,235.904,260.25,270.3,277.05,...,,,50.02,50.02,50.02,,811.59,230,560,020343
4,5,01/01/22 02:00,01/01/2022 01:54:00,Serial: ML31717088282 - ML3 - Meter NURI\n- TU...,233.935,232.942,234.02,294,279.3,287.85,...,,,49.99,49.99,49.99,,811.59,230,560,020343
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26663,26664,20/06/23 13:00,20/06/2023 13:00:00,Serial: ML31717088282 - ML3 - Meter NURI\n- TU...,232.302,232.09,233.033,413.7,455.25,441.45,...,,,49.83,49.83,49.83,,811.59,230,560,020343
26664,26665,20/06/23 13:30,20/06/2023 13:27:00,Serial: ML31717088282 - ML3 - Meter NURI\n- TU...,233.709,233.969,234.602,417,439.5,460.5,...,,,50.03,50.03,50.03,,811.59,230,560,020343
26665,26666,20/06/23 14:00,20/06/2023 13:54:00,Serial: ML31717088282 - ML3 - Meter NURI\n- TU...,234.291,234.486,235.393,411.75,434.1,393.45,...,,,50.08,50.08,50.07,,811.59,230,560,020343
26666,26667,20/06/23 14:30,20/06/2023 14:22:00,Serial: ML31717088282 - ML3 - Meter NURI\n- TU...,235.209,235.167,236.066,407.85,449.55,438.6,...,,,50.04,50.03,50.02,,811.59,230,560,020343


In [5]:
df = df_preprocess.iloc[:, [1, 4]]
df

Unnamed: 0,Thời điểm đo,Điện áp (V)
0,01/01/22 00:00,237.234
1,01/01/2022 00:30:00,
2,01/01/22 01:00,234.406
3,01/01/22 01:30,235.978
4,01/01/22 02:00,233.935
...,...,...
26663,20/06/23 13:00,232.302
26664,20/06/23 13:30,233.709
26665,20/06/23 14:00,234.291
26666,20/06/23 14:30,235.209


In [6]:
df = df.dropna(subset='Điện áp (V)')
df = df.reset_index(drop=True)
df.shape

(25277, 2)

In [7]:
df['Điện áp (V)'] = df['Điện áp (V)'].astype(float)

In [8]:
df['Thời điểm đo'] = pd.to_datetime(df['Thời điểm đo'], errors='coerce')
df['Year'] = df['Thời điểm đo'].dt.year
df['Month'] = df['Thời điểm đo'].dt.month
df['Day'] = df['Thời điểm đo'].dt.day
df['Hour'] = df['Thời điểm đo'].dt.hour
df['Minute'] = df['Thời điểm đo'].dt.minute

In [9]:
# Check for null values in the 'Year' column
null_years = df['Điện áp (V)'].isnull().sum()
print(f'Number of null values in the Year column: {null_years}')
df = df.dropna(subset='Điện áp (V)')
df = df.reset_index(drop=True)

Number of null values in the Year column: 0


In [10]:
df['DayOfWeek'] = df['Thời điểm đo'].dt.dayofweek 
df['DayOfYear'] = df['Thời điểm đo'].dt.dayofyear
df['IsWeekend'] = df['DayOfWeek'].apply(lambda x: 1 if x >= 5 else 0)


In [11]:
# def get_season(month):
#     if month in [12, 1, 2]:
#         return 'Winter'
#     elif month in [3, 4, 5]:
#         return 'Spring'
#     elif month in [6, 7, 8]:
#         return 'Summer'
#     else:
#         return 'Fall'

# df['Season'] = df['Month'].apply(get_season)

In [12]:
# df['TimeDifference'] = df['Thời điểm đo'] - df['Thời điểm đo'].shift(1) 

In [13]:
# from sklearn.preprocessing import OneHotEncoder

# encoder = OneHotEncoder()
# season_encoded = encoder.fit_transform(df[['Season']]).toarray()

In [14]:
# # Map the season names to numerical values
# season_mapping = {'Winter': 0.0, 'Spring': 1.0, 'Summer': 2.0, 'Fall': 3.0}
# df['Season'] = df['Season'].map(season_mapping).astype(float)

In [15]:
df.columns

Index(['Thời điểm đo', 'Điện áp (V)', 'Year', 'Month', 'Day', 'Hour', 'Minute',
       'DayOfWeek', 'DayOfYear', 'IsWeekend'],
      dtype='object', name=0)

In [16]:
from sklearn.impute import KNNImputer

# Select the columns to impute
columns_to_impute = ['Year', 'Month', 'Day', 'Hour', 'Minute',
       'DayOfWeek', 'DayOfYear', 'IsWeekend']

# Initialize the KNNImputer
imputer = KNNImputer(n_neighbors=5)

# Apply the imputer to the selected columns
df[columns_to_impute] = imputer.fit_transform(df[columns_to_impute])

# Verify the imputation
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25277 entries, 0 to 25276
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   Thời điểm đo  25277 non-null  datetime64[ns]
 1   Điện áp (V)   25277 non-null  float64       
 2   Year          25277 non-null  float64       
 3   Month         25277 non-null  float64       
 4   Day           25277 non-null  float64       
 5   Hour          25277 non-null  float64       
 6   Minute        25277 non-null  float64       
 7   DayOfWeek     25277 non-null  float64       
 8   DayOfYear     25277 non-null  float64       
 9   IsWeekend     25277 non-null  float64       
dtypes: datetime64[ns](1), float64(9)
memory usage: 1.9 MB


In [17]:
# import seaborn as sns

# # Select the relevant columns
# cols = ['Điện áp (V)', 'Year', 'Month', 'Day', 'Hour', 'Minute',
#        'DayOfWeek', 'DayOfYear', 'IsWeekend']
# corr_matrix = df[cols].corr()

# # Plot the heatmap
# plt.figure(figsize=(10, 6))
# sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f')
# plt.title('Correlation Matrix')
# plt.show()

In [18]:
# df = df.dropna(subset='TimeDifference')
# df = df.reset_index(drop=True)
# df

In [19]:

# df['TimeDifference_hours'] = df['TimeDifference'].dt.components['hours']
# df['TimeDifference_minutes'] = df['TimeDifference'].dt.components['minutes']
# df['TimeDifference_day'] = df['TimeDifference'].dt.components['days']


# df.head()

In [20]:
df.columns

Index(['Thời điểm đo', 'Điện áp (V)', 'Year', 'Month', 'Day', 'Hour', 'Minute',
       'DayOfWeek', 'DayOfYear', 'IsWeekend'],
      dtype='object', name=0)

In [21]:
columns_to_convert = df.columns.to_list()
columns_to_convert.remove('Thời điểm đo')
df_preprocess = df[columns_to_convert].astype(float)
df_preprocess.dtypes

0
Điện áp (V)    float64
Year           float64
Month          float64
Day            float64
Hour           float64
Minute         float64
DayOfWeek      float64
DayOfYear      float64
IsWeekend      float64
dtype: object

In [22]:
df_preprocess.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25277 entries, 0 to 25276
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Điện áp (V)  25277 non-null  float64
 1   Year         25277 non-null  float64
 2   Month        25277 non-null  float64
 3   Day          25277 non-null  float64
 4   Hour         25277 non-null  float64
 5   Minute       25277 non-null  float64
 6   DayOfWeek    25277 non-null  float64
 7   DayOfYear    25277 non-null  float64
 8   IsWeekend    25277 non-null  float64
dtypes: float64(9)
memory usage: 1.7 MB


In [23]:
def encode(data, col, max_val):
    data[col + '_sin'] = np.sin(2 * np.pi * data[col]/max_val)
    data[col + '_cos'] = np.cos(2 * np.pi * data[col]/max_val)

    return data

df_preprocess = encode(df_preprocess, 'Month', 12)
df_preprocess = encode(df_preprocess, 'Day', 31) 
df_preprocess = encode(df_preprocess, 'Hour', 24)
df_preprocess = encode(df_preprocess, 'Minute', 60)
df_preprocess = encode(df_preprocess, 'DayOfWeek', 7)
df_preprocess = encode(df_preprocess, 'DayOfYear', 366)
# df_preprocess = encode(df_preprocess, 'Quarter', 4)


In [24]:
df_preprocess.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25277 entries, 0 to 25276
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Điện áp (V)    25277 non-null  float64
 1   Year           25277 non-null  float64
 2   Month          25277 non-null  float64
 3   Day            25277 non-null  float64
 4   Hour           25277 non-null  float64
 5   Minute         25277 non-null  float64
 6   DayOfWeek      25277 non-null  float64
 7   DayOfYear      25277 non-null  float64
 8   IsWeekend      25277 non-null  float64
 9   Month_sin      25277 non-null  float64
 10  Month_cos      25277 non-null  float64
 11  Day_sin        25277 non-null  float64
 12  Day_cos        25277 non-null  float64
 13  Hour_sin       25277 non-null  float64
 14  Hour_cos       25277 non-null  float64
 15  Minute_sin     25277 non-null  float64
 16  Minute_cos     25277 non-null  float64
 17  DayOfWeek_sin  25277 non-null  float64
 18  DayOfW

In [25]:
df_preprocess

Unnamed: 0,Điện áp (V),Year,Month,Day,Hour,Minute,DayOfWeek,DayOfYear,IsWeekend,Month_sin,...,Day_sin,Day_cos,Hour_sin,Hour_cos,Minute_sin,Minute_cos,DayOfWeek_sin,DayOfWeek_cos,DayOfYear_sin,DayOfYear_cos
0,237.234,2022.0,1.0,1.0,0.0,0.0,5.0,1.0,1.0,5.000000e-01,...,0.201299,0.979530,0.000000e+00,1.000000,0.000000e+00,1.0,-0.974928,-0.222521,0.017166,0.999853
1,234.406,2022.0,1.0,1.0,1.0,0.0,5.0,1.0,1.0,5.000000e-01,...,0.201299,0.979530,2.588190e-01,0.965926,0.000000e+00,1.0,-0.974928,-0.222521,0.017166,0.999853
2,235.978,2022.0,1.0,1.0,1.0,30.0,5.0,1.0,1.0,5.000000e-01,...,0.201299,0.979530,2.588190e-01,0.965926,5.665539e-16,-1.0,-0.974928,-0.222521,0.017166,0.999853
3,233.935,2022.0,1.0,1.0,2.0,0.0,5.0,1.0,1.0,5.000000e-01,...,0.201299,0.979530,5.000000e-01,0.866025,0.000000e+00,1.0,-0.974928,-0.222521,0.017166,0.999853
4,235.131,2022.0,1.0,1.0,2.0,30.0,5.0,1.0,1.0,5.000000e-01,...,0.201299,0.979530,5.000000e-01,0.866025,5.665539e-16,-1.0,-0.974928,-0.222521,0.017166,0.999853
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25272,230.992,2023.0,6.0,20.0,12.0,30.0,1.0,171.0,0.0,1.224647e-16,...,-0.790776,-0.612106,1.224647e-16,-1.000000,5.665539e-16,-1.0,0.781831,0.623490,0.204552,-0.978856
25273,232.302,2023.0,6.0,20.0,13.0,0.0,1.0,171.0,0.0,1.224647e-16,...,-0.790776,-0.612106,-2.588190e-01,-0.965926,0.000000e+00,1.0,0.781831,0.623490,0.204552,-0.978856
25274,233.709,2023.0,6.0,20.0,13.0,30.0,1.0,171.0,0.0,1.224647e-16,...,-0.790776,-0.612106,-2.588190e-01,-0.965926,5.665539e-16,-1.0,0.781831,0.623490,0.204552,-0.978856
25275,234.291,2023.0,6.0,20.0,14.0,0.0,1.0,171.0,0.0,1.224647e-16,...,-0.790776,-0.612106,-5.000000e-01,-0.866025,0.000000e+00,1.0,0.781831,0.623490,0.204552,-0.978856


In [26]:
df_cols= df_preprocess.columns.to_list()
df_cols.remove('Điện áp (V)')

In [28]:
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=6, random_state=42)
kmeans.fit(df_preprocess[df_cols])
df_preprocess['Cluster'] = kmeans.labels_
# Balance the sample sizes for each cluster
# min_samples = df_preprocess['Cluster'].value_counts().min()

# # Create a balanced dataframe
# df_preprocess = df_preprocess.groupby('Cluster').apply(lambda x: x.sample(min_samples)).reset_index(drop=True)
# df_preprocess

  super()._check_params_vs_input(X, default_n_init=10)


AttributeError: 'NoneType' object has no attribute 'split'

In [None]:
df_preprocess = shuffle(df_preprocess, random_state=64)
df_preprocess.reset_index(drop=True, inplace=True)

In [None]:
df_cols

['Điện áp (V)',
 'Year',
 'Month',
 'Day',
 'Hour',
 'Minute',
 'DayOfWeek',
 'DayOfYear',
 'IsWeekend',
 'Month_sin',
 'Month_cos',
 'Month_mean',
 'Month_median',
 'Month_variance',
 'Day_sin',
 'Day_cos',
 'Day_mean',
 'Day_median',
 'Day_variance',
 'Hour_sin',
 'Hour_cos',
 'Hour_mean',
 'Hour_median',
 'Hour_variance',
 'Minute_sin',
 'Minute_cos',
 'Minute_mean',
 'Minute_median',
 'Minute_variance',
 'DayOfWeek_sin',
 'DayOfWeek_cos',
 'DayOfWeek_mean',
 'DayOfWeek_median',
 'DayOfWeek_variance',
 'DayOfYear_sin',
 'DayOfYear_cos',
 'DayOfYear_mean',
 'DayOfYear_median',
 'DayOfYear_variance']

In [None]:
# minmax_scaler = MinMaxScaler()
# standard_scaler = StandardScaler()
# columns_to_normalize = ['Year', 'Month', 'Day',
#  'Hour', 'Minute', 'DayOfWeek', 'DayOfYear', 'IsWeekend']

# df_preprocess['Điện áp (V)'] = standard_scaler.fit_transform(df_preprocess[['Điện áp (V)']])
# df_preprocess[columns_to_normalize] = minmax_scaler.fit_transform(df_preprocess[columns_to_normalize])
# df_preprocess[columns_to_normalize] = standard_scaler.fit_transform(df_preprocess[columns_to_normalize])

In [None]:
from sklearn.cluster import DBSCAN


def get_cluster_data(df, cluster_num):
    return df[df['Cluster'] == cluster_num]
df_preprocess_cluster_0 = get_cluster_data(df_preprocess, 0)
df_preprocess_cluster_1 = get_cluster_data(df_preprocess, 1)
df_preprocess_cluster_2 = get_cluster_data(df_preprocess, 2)
df_preprocess_cluster_3 = get_cluster_data(df_preprocess, 3)
df_preprocess_cluster_4 = get_cluster_data(df_preprocess, 4)
df_preprocess_cluster_5 = get_cluster_data(df_preprocess, 5)


def cluster(df):
    kmeans = KMeans(n_clusters=6, random_state=42)
    kmeans.fit(df[['Month_sin','Month_cos','Day_sin','Day_cos']])
    df['Date_Cluster'] = kmeans.labels_
    
    kmeans = KMeans(n_clusters=4, random_state=42)
    kmeans.fit(df[['Hour_sin','Hour_cos']])
    df['Time_Cluster'] = kmeans.labels_
    
    kmeans = KMeans(n_clusters=4, random_state=42)
    kmeans.fit(df)
    df['No_Cluster'] = kmeans.labels_
    
    minmax_scaler = MinMaxScaler()
    standard_scaler = StandardScaler()
    columns_to_normalize = ['Year', 'Month', 'Day',
    'Hour', 'Minute', 'DayOfWeek', 'DayOfYear', 'IsWeekend']

    df['Điện áp (V)'] = standard_scaler.fit_transform(df[['Điện áp (V)']])
    df[columns_to_normalize] = minmax_scaler.fit_transform(df[columns_to_normalize])
    df[columns_to_normalize] = standard_scaler.fit_transform(df[columns_to_normalize])
    return df
df_preprocess_cluster_0 =cluster(df_preprocess_cluster_0)
df_preprocess_cluster_1 =cluster(df_preprocess_cluster_1)
df_preprocess_cluster_2 =cluster(df_preprocess_cluster_2)
df_preprocess_cluster_3 =cluster(df_preprocess_cluster_3)
df_preprocess_cluster_4 =cluster(df_preprocess_cluster_4)
df_preprocess_cluster_5 =cluster(df_preprocess_cluster_5)


KeyError: 'Cluster'

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import ExtraTreesClassifier

# Function to train and evaluate a model with a specified train-test split
def train_and_evaluate_model_train_test_split(df_cluster, test_size):
    corr_matrix = df_cluster.corr()

    # Select the upper triangle of the correlation matrix
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

    # Find features with correlation greater than 0.95
    to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]

    # Drop the features with high correlation
    df_cluster = df_cluster.drop(columns=to_drop)
    
    X = df_cluster.drop(columns=['Điện áp (V)', 'Cluster'])
    y = df_cluster['Điện áp (V)']
    
    # Split the data with specified test size
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
    
    # Initialize the model
    model = RandomForestRegressor(n_estimators=100,random_state=42)
    
    
    # Train the model
    model = model.fit(X_train, y_train)
    
    # Make predictions
    y_pred = model.predict(X_test)
    
    # Calculate metrics
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    return mse, r2

# List of dataframes for each cluster (ensure these are defined)
df_clusters = [
    df_preprocess_cluster_0, df_preprocess_cluster_1, df_preprocess_cluster_2, df_preprocess_cluster_3,
    df_preprocess_cluster_4, df_preprocess_cluster_5
]

mses = []
r2_scores = []

for i, df_cluster in enumerate(df_clusters):
    mse, r2 = train_and_evaluate_model_train_test_split(df_cluster, test_size=0.03)
    
    mses.append(mse)
    r2_scores.append(r2)
    print(f'Cluster {i}: Mean Squared Error = {mse}, R² Score = {r2}')

# model, mse, r2 = train_and_evaluate_model_train_test_split(df_ble, test_size=0.03)
# models.append(model)
# mses.append(mse)
# r2_scores.append(r2)
# print(f'Cluster: Mean Squared Error = {mse}, R² Score = {r2}')


Cluster 0: Mean Squared Error = 0.5353124824847705, R² Score = 0.4613173935535594
Cluster 1: Mean Squared Error = 0.7395483044922939, R² Score = 0.27765124135871144
Cluster 2: Mean Squared Error = 0.7042838436306453, R² Score = 0.387791574904915
Cluster 3: Mean Squared Error = 0.6865622887515218, R² Score = 0.33781014109787166
Cluster 4: Mean Squared Error = 0.6347137893417414, R² Score = 0.20840456393570717
Cluster 5: Mean Squared Error = 0.7891462689815562, R² Score = 0.21672511594672605


In [None]:
mean_r2_score = np.mean(r2_scores)
mean_r2_score

0.31495000513291516

In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor

# Function to train and evaluate a model for each cluster using K-Fold cross-validation
def train_and_evaluate_model_kfold(df_cluster, n_splits=12):
    X = df_cluster.drop(columns=['Điện áp (V)', 'Cluster'])
    y = df_cluster['Điện áp (V)']
    
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    
    mses = []
    r2_scores = []
    
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        # Initialize the model
        model = RandomForestRegressor(random_state=42)
        
        # Train the model
        model.fit(X_train, y_train)
        
        # Make predictions
        y_pred = model.predict(X_test)
        
        # Calculate the mean squared error
        mse = mean_squared_error(y_test, y_pred)
        mses.append(mse)
        
        # Calculate the R² score
        r2 = r2_score(y_test, y_pred)
        r2_scores.append(r2)
    
    return np.mean(mses), np.mean(r2_scores)

# List of dataframes for each cluster
df_clusters = [
    df_preprocess_cluster_0, df_preprocess_cluster_1, df_preprocess_cluster_2,
    df_preprocess_cluster_3, df_preprocess_cluster_4, df_preprocess_cluster_5
]

# Train and evaluate a model for each cluster using K-Fold cross-validation
models = []
mses = []
r2_scores = []

for i, df_cluster in enumerate(df_clusters):
    mse, r2 = train_and_evaluate_model_kfold(df_cluster)
    mses.append(mse)
    r2_scores.append(r2)
    print(f'Cluster {i}: Mean Squared Error = {mse}, R² Score = {r2}')


Cluster 0: Mean Squared Error = 0.637259380048302, R² Score = 0.3583840788051247
Cluster 1: Mean Squared Error = 0.7375133001570101, R² Score = 0.2604398129282829
Cluster 2: Mean Squared Error = 0.7185240415356976, R² Score = 0.27956485488247096
Cluster 3: Mean Squared Error = 0.7398550824308155, R² Score = 0.2590723465817655
Cluster 4: Mean Squared Error = 0.7424530859310629, R² Score = 0.25234029205080094
Cluster 5: Mean Squared Error = 0.7696985148197127, R² Score = 0.22649795356347732


In [None]:
mean_r2_score = np.mean(r2_scores)
mean_r2_score

0.27271655646865367