In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
import warnings
from sklearn.preprocessing import MinMaxScaler, RobustScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Silence all warnings for the rest of the session.
warnings.filterwarnings(action='ignore')

# CONFIGURATION
# One entry per house: a display name and the CSV file to read for it.
HOUSES = [
    dict(name='House1', file='CLEAN_House1.csv'),
    dict(name='House2', file='CLEAN_House2.csv'),
]

WINDOW_SIZE = 60      # number of consecutive rows in each input sequence
TRAIN_SPLIT = 0.8     # fraction of rows (from the start) used for training
SAMPLE_SIZE = 50_000  # maximum number of rows sampled when searching for the best KMeans k

In [None]:
# --- FUNCTION 1: LOADING AND FEATURES ---
def process_dataframe(filename):
    """Load one house CSV and build the 1-minute feature frame.

    The CSV must provide a ``Time`` column (parsed with ``pd.to_datetime``)
    and an ``Aggregate`` column. Non-numeric ``Aggregate`` entries are
    coerced to NaN, the series is averaged into 1-minute bins and gaps are
    forward-filled.

    Parameters
    ----------
    filename : str or path-like
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        Indexed by the 1-minute ``dt`` timestamps, with the columns
        ``Aggregate``, ``Hour``, ``DayOfWeek``, ``Hour_Sin``, ``Hour_Cos``,
        ``IsWeekend`` (1 for ``DayOfWeek >= 5``, else 0) and
        ``DayOfWeek_Scaled`` (``DayOfWeek / 6``).
    """
    print(f"\n[INFO] Procesare fisier: {filename}...")
    raw = pd.read_csv(filename)
    raw['dt'] = pd.to_datetime(raw['Time'])

    # Explicit copy: the coercion below must not write into a slice of `raw`.
    power = raw.set_index('dt')[['Aggregate']].copy()
    power['Aggregate'] = pd.to_numeric(power['Aggregate'], errors='coerce')

    # Resampling. `.ffill()` replaces the deprecated `fillna(method='ffill')`.
    df_1min = power.resample('1min').mean().ffill()
    # Forward fill cannot repair NaNs at the very start of the series; drop
    # those rows so no NaN reaches the clustering / scaling steps.
    df_1min = df_1min.dropna(subset=['Aggregate'])

    # Calendar features, computed once from the index (vectorised).
    hour = np.asarray(df_1min.index.hour)
    day_of_week = np.asarray(df_1min.index.dayofweek)
    hour_angle = 2 * np.pi * hour / 24.0

    return df_1min.assign(
        Hour=hour,
        DayOfWeek=day_of_week,
        Hour_Sin=np.sin(hour_angle),
        Hour_Cos=np.cos(hour_angle),
        IsWeekend=(day_of_week >= 5).astype(int),
        DayOfWeek_Scaled=day_of_week / 6.0,
    )

# --- FUNCTION 2: CLUSTERING ---
def apply_clustering(df):
    """Cluster the ``Aggregate`` column with KMeans and add a ``Cluster`` column.

    The number of clusters is chosen from 2..5 by silhouette score on a
    random sample of at most ``SAMPLE_SIZE`` rows; the final model is then
    fitted on every row.

    Cluster labels are renumbered so that they increase with the cluster
    centre (label 0 = lowest ``Aggregate`` centre). The same reordering is
    applied to the returned model's ``cluster_centers_`` and ``labels_`` so
    that ``kmeans.predict`` agrees with the ``Cluster`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``Aggregate`` column. It is modified in place (a
        ``Cluster`` column is added).

    Returns
    -------
    tuple
        ``(df, kmeans_final)``: the same frame and the fitted ``KMeans``.
    """
    print("[INFO] Optimizare si aplicare KMeans...")
    # Work on a sample to keep the k search fast.
    data_sample = df[['Aggregate']].sample(n=min(SAMPLE_SIZE, len(df)), random_state=42).values

    # Look for the best k (simplified for automation).
    range_n_clusters = [2, 3, 4, 5]
    scores = []
    for k in range_n_clusters:
        km = KMeans(n_clusters=k, random_state=42, n_init=10)
        labels = km.fit_predict(data_sample)
        scores.append(silhouette_score(data_sample, labels))

    best_k = range_n_clusters[int(np.argmax(scores))]
    print(f" -> Cel mai bun k detectat: {best_k}")

    # Final fit on all rows.
    kmeans_final = KMeans(n_clusters=best_k, random_state=42, n_init=10)
    raw_labels = kmeans_final.fit_predict(df[['Aggregate']].values)

    # Order clusters by centre for any k (previously only k == 2 was handled).
    # `order[j]` is the old label that becomes new label j; `rank` is the
    # inverse mapping old label -> new label.
    order = np.argsort(kmeans_final.cluster_centers_.ravel())
    rank = np.empty_like(order)
    rank[order] = np.arange(order.size)

    # Keep the model consistent with the relabelled column: predict() assigns
    # the index of the nearest row of cluster_centers_.
    kmeans_final.cluster_centers_ = kmeans_final.cluster_centers_[order]
    kmeans_final.labels_ = rank[kmeans_final.labels_].astype(raw_labels.dtype)

    df['Cluster'] = rank[raw_labels].astype(raw_labels.dtype)

    return df, kmeans_final

# --- FUNCTION 3: SEQUENCES AND SCALING ---
def create_sequences(data, window_size):
    """Build sliding-window inputs and next-step targets.

    For every row index ``i >= window_size``, the input is
    ``data[i - window_size:i]`` and the target is ``data[i, 0]``.

    Parameters
    ----------
    data : array-like, at least 2-D
        Rows along axis 0; column 0 supplies the target values.
    window_size : int
        Number of consecutive rows per input sequence (must be >= 1).

    Returns
    -------
    tuple of numpy.ndarray
        ``X`` with shape ``(len(data) - window_size, window_size, ...)`` and
        ``y`` with ``len(data) - window_size`` entries. When ``data`` has no
        more than ``window_size`` rows both are empty, and ``X`` still keeps
        its trailing ``(window_size, ...)`` dimensions.

    Raises
    ------
    ValueError
        If ``window_size`` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError("window_size must be a positive integer")

    data = np.asarray(data)
    n_sequences = len(data) - window_size

    if n_sequences <= 0:
        # Not enough rows for a single (window, target) pair; keep the
        # dimensionality callers expect instead of returning shape (0,).
        empty_X = np.empty((0, window_size) + data.shape[1:], dtype=data.dtype)
        empty_y = np.empty((0,) + data.shape[2:], dtype=data.dtype)
        return empty_X, empty_y

    # The view has shape (len - window + 1, ..., window). The last window has
    # no following row to use as target, so it is dropped.
    windows = np.lib.stride_tricks.sliding_window_view(
        data, window_size, axis=0
    )[:n_sequences]

    # Move the window axis next to the sample axis and materialise a writable,
    # C-ordered copy that does not share memory with `data`.
    X = np.array(np.moveaxis(windows, -1, 1), order='C')
    y = data[window_size:, 0].copy()
    return X, y


def prepare_and_save(df, kmeans_model, house_name):
    """Scale, window and pickle the data of one house.

    The first ``TRAIN_SPLIT`` fraction of rows is the training part. The
    ``RobustScaler`` is fitted on the training rows only, so no statistics
    of the test period leak into the scaling, and is then applied to every
    row. Test sequences are built with the last ``WINDOW_SIZE`` training
    rows prepended, so the first test target is the first test row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns ``Aggregate``, ``Hour_Sin``, ``Hour_Cos``,
        ``DayOfWeek_Scaled``, ``IsWeekend`` and ``Cluster``. It is modified
        in place (an ``Aggregate_Scaled`` column is added).
    kmeans_model : object
        Fitted clustering model; stored in the pickle unchanged.
    house_name : str
        Used to build the output file name ``processed_data_<house_name>.pkl``.

    Raises
    ------
    ValueError
        If the training part is empty.
    """
    print(f"[INFO] Generare secvente pentru {house_name}...")

    train_len = int(len(df) * TRAIN_SPLIT)
    if train_len < 1:
        raise ValueError(
            f"Not enough rows ({len(df)}) to build a training split for {house_name}"
        )

    # Fit on the training rows only, then transform the full series.
    scaler = RobustScaler()
    scaler.fit(df[['Aggregate']].iloc[:train_len])
    df['Aggregate_Scaled'] = np.asarray(scaler.transform(df[['Aggregate']])).ravel()

    features_cols = ['Aggregate_Scaled', 'Hour_Sin', 'Hour_Cos', 'DayOfWeek_Scaled', 'IsWeekend', 'Cluster']
    dataset = df[features_cols].values

    train_data = dataset[:train_len]
    test_data_values = dataset[train_len:]

    X_train, y_train = create_sequences(train_data, WINDOW_SIZE)
    # Prepend the tail of the training data so every test row has a full window.
    test_inputs = np.concatenate((train_data[-WINDOW_SIZE:], test_data_values))
    X_test, y_test = create_sequences(test_inputs, WINDOW_SIZE)

    # Save everything in a single pickle file.
    save_filename = f'processed_data_{house_name}.pkl'

    data_package = {
        'X_train': X_train, 'y_train': y_train,
        'X_test': X_test, 'y_test': y_test,
        'scaler': scaler, 'kmeans': kmeans_model,
        'test_data': df.iloc[train_len:],
        'df_1min': df,
        'WINDOW_SIZE': WINDOW_SIZE,
        'train_size': train_len
    }

    with open(save_filename, 'wb') as f:
        pickle.dump(data_package, f)

    print(f"[SUCCESS] Salvat: {save_filename} | Train: {X_train.shape}, Test: {X_test.shape}")

In [3]:
# Process every configured house. A failure in one house must not stop the
# others, but it is recorded so the final summary does not claim success.
failed_houses = []

for house in HOUSES:
    print(f"\n{'='*40}")
    print(f"START PROCESARE: {house['name']}")
    print(f"{'='*40}")

    try:
        # 1. Loading and features
        df = process_dataframe(house['file'])

        # 2. Clustering
        df, kmeans_model = apply_clustering(df)

        # 3. Sequences and saving
        prepare_and_save(df, kmeans_model, house['name'])

    except Exception as e:
        failed_houses.append(house['name'])
        # Include the exception type: the message alone is often ambiguous.
        print(f"[EROARE] Problema la {house['name']}: {type(e).__name__}: {e}")

print("\n" + "="*50)
if failed_houses:
    # Do not point the reader to the next notebook when some .pkl files are missing.
    print(f"--- PROCESARE INCOMPLETA: {len(failed_houses)} din {len(HOUSES)} case au esuat ---")
    print(f"Case cu erori: {', '.join(failed_houses)}")
    print("Rezolva erorile de mai sus inainte de a trece la '02_Models_Training.ipynb'")
else:
    print("--- TOATE CASELE AU FOST PROCESATE ---")
    print("GATA! Fisierele .pkl sunt generate.")
    print("Poti trece acum la urmatorul notebook: '02_Models_Training.ipynb'")
print("="*50)


START PROCESARE: House1

[INFO] Procesare fisier: CLEAN_House1.csv...
[INFO] Optimizare si aplicare KMeans...
 -> Cel mai bun k detectat: 2
[INFO] Generare secvente pentru House1...
[SUCCESS] Salvat: processed_data_House1.pkl | Train: (736012, 60, 6), Test: (184019, 60, 6)

START PROCESARE: House2

[INFO] Procesare fisier: CLEAN_House2.csv...
[INFO] Optimizare si aplicare KMeans...
 -> Cel mai bun k detectat: 2
[INFO] Generare secvente pentru House2...
[SUCCESS] Salvat: processed_data_House2.pkl | Train: (711202, 60, 6), Test: (177816, 60, 6)

--- TOATE CASELE AU FOST PROCESATE ---
GATA! Fisierele .pkl sunt generate.
Poti trece acum la urmatorul notebook: '02_Models_Training.ipynb'
