In [16]:
import pickle
import traceback
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import MinMaxScaler, RobustScaler

# Silence every warning category for the rest of the notebook.
warnings.filterwarnings('ignore')

# --- Configuration ---
# One entry per household: a display name and the cleaned CSV to read.
HOUSES = [
    dict(name='House1', file='CLEAN_House1.csv'),
    dict(name='House2', file='CLEAN_House2.csv'),
]

WINDOW_SIZE = 60       # number of past time steps in each input sequence
TRAIN_SPLIT = 0.8      # leading fraction of the rows used for training
SAMPLE_SIZE = 50_000   # max rows sampled when searching for the best KMeans k

In [17]:
# --- FUNCTIE 1: INCARCARE SI FEATURES ---
def process_dataframe(filename):
    """Load one household CSV and build the 1-minute feature frame.

    The file must contain a ``Time`` column (parsed with ``pd.to_datetime``)
    and an ``Aggregate`` column. Non-numeric ``Aggregate`` entries are coerced
    to NaN, readings are averaged into 1-minute bins, and bins without a value
    are forward-filled from the previous bin.

    Parameters
    ----------
    filename : str or path-like
        CSV file handed to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Indexed by the 1-minute timestamps (index name ``dt``) with columns
        ``Aggregate``, ``Hour``, ``DayOfWeek``, ``Hour_Sin``, ``Hour_Cos``,
        ``IsWeekend`` and ``DayOfWeek_Scaled``, in that order.
    """
    print(f"\n[INFO] Procesare fisier: {filename}...")
    raw = pd.read_csv(filename)

    # Build the single-column frame through `assign` so no slice of `raw` is
    # mutated (assigning into `df[['Aggregate']]` raises SettingWithCopyWarning).
    df = (
        raw.assign(dt=pd.to_datetime(raw['Time']))
        .set_index('dt')[['Aggregate']]
        .assign(Aggregate=lambda frame: pd.to_numeric(frame['Aggregate'], errors='coerce'))
    )

    # Resample to 1-minute means. `.ffill()` replaces the deprecated
    # `fillna(method='ffill')` and carries the last known value into empty bins.
    df_1min = df.resample('1min').mean().ffill()

    # Calendar features derived from the timestamp index.
    df_1min['Hour'] = df_1min.index.hour
    df_1min['DayOfWeek'] = df_1min.index.dayofweek
    # Cyclical encoding of the hour on a 24-hour circle.
    df_1min['Hour_Sin'] = np.sin(2 * np.pi * df_1min['Hour'] / 24.0)
    df_1min['Hour_Cos'] = np.cos(2 * np.pi * df_1min['Hour'] / 24.0)
    # dayofweek is Monday=0 .. Sunday=6, so 5 and 6 are the weekend.
    df_1min['IsWeekend'] = (df_1min['DayOfWeek'] >= 5).astype(int)
    df_1min['DayOfWeek_Scaled'] = df_1min['DayOfWeek'] / 6.0

    return df_1min

# --- FUNCTIE 2: CLUSTERING ---
def apply_clustering(df, train_len):
    """Add a ``Cluster`` column from a KMeans model fitted on the train rows only.

    The number of clusters is chosen from {2, 3, 4, 5} by silhouette score on a
    random sample (at most ``SAMPLE_SIZE`` rows) of the training ``Aggregate``
    values. The final model is fitted on all training rows and then used to
    label the remaining rows, so the test rows never influence the centres.

    Cluster ids are ordered by centre value (0 = lowest ``Aggregate`` centre).
    The same ordering is written back into the returned model, so that
    ``kmeans.predict`` yields the ids stored in the ``Cluster`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``Aggregate`` column, in temporal order.
    train_len : int
        Number of leading rows that form the training split.

    Returns
    -------
    (pandas.DataFrame, sklearn.cluster.KMeans)
        The input rows (train followed by test) with the extra ``Cluster``
        column, and the fitted, relabelled model.
    """
    print("[INFO] Optimizare si aplicare KMeans (fit pe train, predict pe test)...")

    # Split the frame so KMeans is trained without leakage from the test rows.
    df_train = df.iloc[:train_len].copy()
    df_test = df.iloc[train_len:].copy()

    # Only a sample of the train split is used to pick k.
    data_sample = df_train[['Aggregate']].sample(
        n=min(SAMPLE_SIZE, len(df_train)),
        random_state=42
    ).values

    range_n_clusters = [2, 3, 4, 5]
    scores = []
    for k in range_n_clusters:
        km = KMeans(n_clusters=k, random_state=42, n_init=10)
        labels = km.fit_predict(data_sample)
        scores.append(silhouette_score(data_sample, labels))

    best_k = range_n_clusters[np.argmax(scores)]
    print(f" -> Cel mai bun k detectat (train): {best_k}")

    # Fit on the full train split.
    kmeans_final = KMeans(n_clusters=best_k, random_state=42, n_init=10)
    kmeans_final.fit(df_train[['Aggregate']].values)

    # Order the cluster ids by centre value for any k, not just k=2.
    # order[new_id] = old_id, rank[old_id] = new_id.
    order = np.argsort(kmeans_final.cluster_centers_.ravel())
    rank = np.empty_like(order)
    rank[order] = np.arange(order.size)

    # Permute the model itself, not only the DataFrame column; otherwise the
    # returned model would keep predicting the pre-ordering ids.
    label_dtype = kmeans_final.labels_.dtype
    kmeans_final.cluster_centers_ = kmeans_final.cluster_centers_[order]
    kmeans_final.labels_ = rank[kmeans_final.labels_].astype(label_dtype)

    df_train['Cluster'] = kmeans_final.labels_
    # The centres are already reordered, so predict() returns ordered ids.
    df_test['Cluster'] = kmeans_final.predict(df_test[['Aggregate']].values)

    # Recombine; same rows and order as the input frame.
    df_out = pd.concat([df_train, df_test], axis=0)
    return df_out, kmeans_final


# --- FUNCTIE 3: SECVENTE ---
def create_sequences(data, window_size):
    """Slice a 2-D feature matrix into overlapping look-back windows.

    For every row ``i >= window_size`` the sample is the ``window_size`` rows
    immediately before it and the target is column 0 of row ``i``.

    Parameters
    ----------
    data : array-like, shape (n_rows, n_features)
        Time-ordered feature matrix; column 0 is the target series.
    window_size : int
        Number of past rows per sample; must be at least 1.

    Returns
    -------
    X : numpy.ndarray, shape (n_rows - window_size, window_size, n_features)
    y : numpy.ndarray, shape (n_rows - window_size,)
        When ``n_rows <= window_size`` both arrays are empty but keep these
        shapes (``X`` is ``(0, window_size, n_features)``).

    Raises
    ------
    ValueError
        If ``window_size`` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError("window_size must be a positive integer")

    data = np.asarray(data)
    n_rows, n_features = data.shape

    # Not enough rows for a single (window, target) pair.
    if n_rows <= window_size:
        return (
            np.empty((0, window_size, n_features), dtype=data.dtype),
            np.empty((0,), dtype=data.dtype),
        )

    # The view has shape (n_rows - window_size + 1, n_features, window_size).
    # The last window has no following row to predict, so it is dropped; the
    # transpose puts time before features.
    windows = np.lib.stride_tricks.sliding_window_view(data, window_size, axis=0)
    # .copy() materialises a writable array that is independent of `data`.
    X = windows[:-1].transpose(0, 2, 1).copy()
    y = data[window_size:, 0].copy()
    return X, y


def prepare_and_save(df, kmeans_model, house_name):
    """Scale, window and pickle the model-ready arrays for one house.

    The rows are split temporally at ``TRAIN_SPLIT``. A ``RobustScaler`` is
    fitted on the training ``Aggregate`` column only and applied to both
    splits; the other features are passed through unchanged. Sequences of
    ``WINDOW_SIZE`` steps are built with ``create_sequences`` and everything is
    written to ``processed_data_<house_name>.pkl`` in the working directory.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame containing the columns ``Aggregate``, ``Hour_Sin``,
        ``Hour_Cos``, ``DayOfWeek_Scaled``, ``IsWeekend`` and ``Cluster``.
    kmeans_model : object
        Fitted clustering model; stored in the pickle as-is.
    house_name : str
        Used in the log messages and in the output file name.

    Returns
    -------
    None
    """
    print(f"[INFO] Generare secvente pentru {house_name}...")

    # 1. Raw (unscaled) feature matrix; column 0 is the target series.
    feature_names = [
        'Aggregate', 'Hour_Sin', 'Hour_Cos',
        'DayOfWeek_Scaled', 'IsWeekend', 'Cluster',
    ]
    raw_matrix = df[feature_names].values

    # 2. Temporal train / test split.
    train_len = int(len(raw_matrix) * TRAIN_SPLIT)
    raw_train, raw_test = raw_matrix[:train_len], raw_matrix[train_len:]

    # 3. Fit the scaler on the training Aggregate column only (no leakage),
    #    then apply the same transform to the test column.
    scaler = RobustScaler()
    scaled_train_target = scaler.fit_transform(raw_train[:, [0]])
    scaled_test_target = scaler.transform(raw_test[:, [0]])

    # 4. Reassemble: scaled Aggregate followed by the untouched features.
    train_matrix = np.hstack([scaled_train_target, raw_train[:, 1:]])
    test_matrix = np.hstack([scaled_test_target, raw_test[:, 1:]])

    # 5. Training sequences.
    X_train, y_train = create_sequences(train_matrix, WINDOW_SIZE)

    # 6. Test sequences: prepend the last WINDOW_SIZE training rows so the
    #    first test target already has a full look-back window.
    test_with_context = np.vstack((train_matrix[-WINDOW_SIZE:], test_matrix))
    X_test, y_test = create_sequences(test_with_context, WINDOW_SIZE)

    # 7. Persist everything the next notebook needs.
    save_filename = f'processed_data_{house_name}.pkl'
    data_package = dict(
        X_train=X_train,
        y_train=y_train,
        X_test=X_test,
        y_test=y_test,
        scaler=scaler,
        kmeans=kmeans_model,
        test_data=df.iloc[train_len:],   # unscaled test rows, kept for analysis
        df_1min=df,                      # the complete processed frame
        WINDOW_SIZE=WINDOW_SIZE,
        train_size=train_len,
    )

    with open(save_filename, 'wb') as out_file:
        pickle.dump(data_package, out_file)

    print(f"[SUCCESS] Salvat: {save_filename} | Train: {X_train.shape}, Test: {X_test.shape}")

In [18]:
# Run the full pipeline for every configured house. A failure in one house
# does not stop the others, but it is recorded so that the final banner does
# not claim success when files are missing.
failed_houses = []

for house in HOUSES:
    print(f"\n{'='*40}")
    print(f"START PROCESARE: {house['name']}")
    print(f"{'='*40}")

    try:
        # 1. Load the CSV and build the features.
        df = process_dataframe(house['file'])

        # 2. Clustering (fitted on the train split only).
        train_len = int(len(df) * TRAIN_SPLIT)
        df, kmeans_model = apply_clustering(df, train_len)

        # 3. Build the sequences and save the pickle.
        prepare_and_save(df, kmeans_model, house['name'])

    except Exception as e:
        failed_houses.append(house['name'])
        print(f"[EROARE] Problema la {house['name']}: {e}")
        # Keep the full traceback visible; the one-line message alone does not
        # show where the failure happened.
        traceback.print_exc()

print("\n" + "="*50)
if failed_houses:
    print(f"--- PROCESARE INCOMPLETA: {len(failed_houses)} din {len(HOUSES)} case au esuat ---")
    print("Case cu erori: " + ", ".join(failed_houses))
    print("Fisierele .pkl pentru aceste case NU au fost generate.")
else:
    print("--- TOATE CASELE AU FOST PROCESATE ---")
    print("GATA! Fisierele .pkl sunt generate.")
    print("Poti trece acum la urmatorul notebook: '02_Models_Training.ipynb'")
print("="*50)


START PROCESARE: House1

[INFO] Procesare fisier: CLEAN_House1.csv...
[INFO] Optimizare si aplicare KMeans (fit pe train, predict pe test)...
 -> Cel mai bun k detectat (train): 2
[INFO] Generare secvente pentru House1...
[SUCCESS] Salvat: processed_data_House1.pkl | Train: (736012, 60, 6), Test: (184019, 60, 6)

START PROCESARE: House2

[INFO] Procesare fisier: CLEAN_House2.csv...
[INFO] Optimizare si aplicare KMeans (fit pe train, predict pe test)...
 -> Cel mai bun k detectat (train): 2
[INFO] Generare secvente pentru House2...
[SUCCESS] Salvat: processed_data_House2.pkl | Train: (711202, 60, 6), Test: (177816, 60, 6)

--- TOATE CASELE AU FOST PROCESATE ---
GATA! Fisierele .pkl sunt generate.
Poti trece acum la urmatorul notebook: '02_Models_Training.ipynb'
