In [9]:
from gensim.models import Word2Vec
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score


In [2]:
df= pd.read_csv('df_preprocess.csv')
df.columns

Index(['Điện áp (V)', 'Year', 'Month', 'Day', 'Hour', 'Minute', 'DayOfWeek',
       'DayOfYear', 'Quarter', 'IsWeekend', 'Season', 'Hour_sin', 'Hour_cos',
       'Month_sin', 'Month_cos', 'TimeDifference_hours',
       'TimeDifference_minutes', 'TimeDifference_hours_sin',
       'TimeDifference_hours_cos', 'KMean_Cluster_YMD', 'KMean_Cluster_HM',
       'KMean_Cluster_DoWY', 'KMean_Cluster_sincos', 'KMean_Cluster'],
      dtype='object')

In [3]:
sentences = df.apply(lambda row: [
    str(row['Year']),
    str(row['Month']),
    str(row['Day']),
    str(row['Hour']),
    str(row['Minute']),
    str(row['DayOfWeek']),
    str(row['DayOfYear']),
    str(row['Quarter']),
    str(row['IsWeekend']),
    str(row['Season']),
    str(row['Hour_sin']),
    str(row['Hour_cos']),
    str(row['Month_sin']),
    str(row['Month_cos']),
    str(row['TimeDifference_hours']),
    str(row['TimeDifference_minutes']),
    str(row['TimeDifference_hours_sin']),
    str(row['TimeDifference_hours_cos']),
    str(row['KMean_Cluster_YMD']),
    str(row['KMean_Cluster_HM']),
    str(row['KMean_Cluster_DoWY']),
    str(row['KMean_Cluster_sincos']),
    str(row['KMean_Cluster']),
], axis=1).tolist()

In [10]:
best_r2 = -np.inf
best_params = {}

for vector_size in [10, 15, 20]:
    for window in [2, 5, 10]:
        for sg in [0, 1]:
            model = Word2Vec(sentences=sentences, vector_size=vector_size, window=window, min_count=1, sg=sg)
            
            def embed_sentence(sentence):
                return np.mean([model.wv[word] for word in sentence], axis=0)
            
            embedded_sentences = np.array([embed_sentence(sentence) for sentence in sentences])
            
            df_word2vec = pd.DataFrame(embedded_sentences, columns=[f'vec_{i}' for i in range(embedded_sentences.shape[1])])
            
            df_combined = pd.concat([df_word2vec, df['Điện áp (V)']], axis=1)

            X = df_combined.drop(columns=['Điện áp (V)'])
            y = df_combined['Điện áp (V)']

            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
            rf = RandomForestRegressor(n_estimators=100, random_state=42)
            rf.fit(X_train, y_train)

            y_pred = rf.predict(X_test)

            r2 = r2_score(y_test, y_pred)
            
            if r2 > best_r2:
                best_r2 = r2
                best_params = {
                    'vector_size': vector_size,
                    'window': window,
                    'sg': sg
                }

print(f"Best R²: {best_r2}")
print(f"Best Parameters: {best_params}")

Best R²: 0.12920673796697246
Best Parameters: {'vector_size': 20, 'window': 10, 'sg': 1}


In [13]:
from sklearn.model_selection import GridSearchCV

def embed_sentence(sentence):
    return np.mean([model.wv[word] for word in sentence if word in model.wv], axis=0)

model = Word2Vec(sentences=sentences, vector_size=best_params['vector_size'], window=best_params['window'], sg=best_params['sg'])
embedded_sentences = np.array([embed_sentence(sentence) for sentence in sentences])
df_word2vec = pd.DataFrame(embedded_sentences, columns=[f'vec_{i}' for i in range(embedded_sentences.shape[1])])
df_combined = pd.concat([df_word2vec, df['Điện áp (V)']], axis=1)

X = df_combined.drop(columns=['Điện áp (V)'])
y = df_combined['Điện áp (V)']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf = RandomForestRegressor(random_state=42)

grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, scoring='r2')
grid_search.fit(X_train, y_train)

best_rf_model = grid_search.best_estimator_
y_pred = best_rf_model.predict(X_test)
best_r2_score = r2_score(y_test, y_pred)
print(f"Best R² after tuning Random Forest: {best_r2_score}")
