In [1]:
import copy
import csv
import logging
import sys
import time
import warnings

import importlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas.plotting import scatter_matrix
from sklearn.metrics import r2_score

import pypsa
from n_dimensional_datasets import *
from plotter import *

from IPython.display import display # for better Pandas printing 

warnings.filterwarnings('ignore')
logger = logging.getLogger("pypsa")
logger.setLevel("WARNING")
%matplotlib inline

path_to_powerflow_example = "../../pypsa/examples/ieee-13/"
path_to_powerflow_data = path_to_powerflow_example + "/ieee-13-with-load-gen/"
path_to_powerflow_results = path_to_powerflow_data + "results/"

sys.path.append(path_to_powerflow_example)
from ieee13_pf import run

from change_powerflow_data import set_sample_size

def reject_outliers(data, m=3, return_positions=False):
    positions = abs(data - np.mean(data)) < m * np.std(data)
    if return_positions:
        return positions
    return data[positions]

## data sampling

In [2]:
n_original_samples = 2

if n_original_samples < 2:
    raise ValueError("n_original_samples must be an integer >1")

sample_size = 1000

In [3]:
def personalise_column_names(df, name):
        new_columns = []
        for column in df.columns:
            new_columns.append(name +  "-" + str(column))
        df.columns = new_columns
        return pd.DataFrame(df)

def collect_data(data):
    data["loads"] = personalise_column_names(pd.read_csv(path_to_powerflow_data + "loads-p_set.csv"), "load")
    data["vmags"] = personalise_column_names(pd.read_csv(path_to_powerflow_results + "vmags.csv"), "vmag")
    data["vangs"] = personalise_column_names(pd.read_csv(path_to_powerflow_results + "vangs.csv"), "vang")
    data["qmags"] = personalise_column_names(pd.read_csv(path_to_powerflow_results + "qmags.csv"), "qmag")
    data["linemags"] = personalise_column_names(pd.read_csv(path_to_powerflow_results + "linemags.csv"), "linemag")

In [4]:
# data_to_change = ["loads-p_set", "snapshots", "loads-q_set"]

# set_sample_size(path_to_powerflow_data, data_to_change, sample_size, n_original_samples, seed=None)
# network = run()


data = {"loads": [], "vmags": [], "vangs": [], "qmags": [], "linemags": []}
collect_data(data)

## train

In [5]:
from sklearn.ensemble import RandomForestRegressor
from scipy.interpolate import LinearNDInterpolator
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from custom_transformers import DataFrameSelector, RejectOutliers
from sklearn.decomposition import PCA

In [10]:
features = data["loads"].drop("load-name", axis=1)
labels = data["vangs"].loc[:,["vang-632", "vang-671", "vang-675"]]
features_and_labels = features.join(labels)

In [36]:
# data transformations

all_pipeline = Pipeline([
    ("outliers", RejectOutliers(labels.columns)),
])

features_and_labels = pd.DataFrame(all_pipeline.fit_transform(features_and_labels)) 
features = features_and_labels[[col for col in features_and_labels if 'load' in col]]
labels = features_and_labels[[col for col in features_and_labels if 'vang' in col]]

feature_pipeline = Pipeline([
#     ("selector", DataFrameSelector(["load-671", "load-675"])),
    ("scaler", StandardScaler()),
    ("pca", PCA())
])
feature_pipeline.set_params(pca__n_components=0.95)

features = pd.DataFrame(feature_pipeline.fit_transform(features)) # usually columns=features.columns but with pca it doesn't always work

In [37]:
training_percentage = 80
n_samples = features.shape[0]
n_training_samples = int(n_samples*(training_percentage/100))

random_seed=0
X_train = features.sample(n_training_samples, random_state=random_seed)
y_train = labels.sample(n_training_samples, random_state=random_seed)
X_val = features[~features.isin(X_train)].dropna()
y_val = labels[~labels.isin(y_train)].dropna()

X_train = X_train.values
y_train = y_train.values
X_val = X_val.values
y_val = y_val.values

In [38]:
training_stats = {"n_training_samples": n_training_samples}
stats = {"trainscore": [], 
         "trainscorevar": [],
         "valscore": [],
         "valscorevar": [],
         "mae": [],
         "time": []}
approx_type = {"svr": copy.deepcopy(stats),
               "rf": copy.deepcopy(stats),
               "interp": copy.deepcopy(stats)}
approx_type["interp"].pop("trainscorevar")
approx_type["interp"].pop("valscorevar")


time_start = time.time()
### setup approximators


## random forest
forest = RandomForestRegressor()

time_forest = time.time()
approx_type["rf"]["time"].append(time_forest-time_start)

forest_xval_training_score = reject_outliers(cross_val_score(forest, X_train, y_train, cv=5, n_jobs=-1))
forest_xval_val_score = reject_outliers(cross_val_score(forest, X_val, y_val, cv=5, n_jobs=-1))
approx_type["rf"]["trainscore"].append(forest_xval_training_score.mean())
approx_type["rf"]["trainscorevar"].append(forest_xval_training_score.std())
approx_type["rf"]["valscore"].append(forest_xval_val_score.mean())
approx_type["rf"]["valscorevar"].append(forest_xval_val_score.std())

forest.fit(X_train, y_train)
approx_type["rf"]["mae"].append(np.mean(y_val-forest.predict(X_val)))

                                
## support vector regression
n_labels = y_train.shape[1]
svr = copy.deepcopy(stats)
svr_labels = {"y_train": None, "y_val": None}
for idx in range(n_labels):
    svr_labels["y_train"] = y_train.T[idx].T
    svr_labels["y_val"] = y_val.T[idx].T
    clf = SVR(gamma='scale', C=1.0, epsilon=0.0002, kernel='linear')
    '''
    Scikit-Learn cross-validation features expect a utility function (greater is better) rather than a cost function
    (lower is better), so the scoring function is actually the opposite of the MSE (i.e., a negative value), 
    which is why the preceding code computes -scores before calculating the square root.
    - A. Geron, Hands on Machine Learning pg 101 
    '''
    svr_xval_training_score = reject_outliers(cross_val_score(clf, X_train, svr_labels["y_train"], cv=5, n_jobs=-1))
    svr_xval_val_score = reject_outliers(cross_val_score(clf, X_val, svr_labels["y_val"], cv=5, n_jobs=-1))    
    svr["trainscore"].append(svr_xval_training_score.mean())
    svr["trainscorevar"].append(svr_xval_training_score.std())
    svr["valscore"].append(svr_xval_val_score.mean())
    svr["valscorevar"].append(svr_xval_val_score.std())

    clf.fit(X_train, svr_labels["y_train"])
    svr["mae"].append(np.mean(svr_labels["y_val"]-clf.predict(X_val)))

    time_svr = time.time()
    svr["time"].append(time_svr - time_forest)

approx_type["svr"]["trainscore"].append(np.mean(svr["trainscore"]))
approx_type["svr"]["trainscorevar"].append(np.mean(svr["trainscorevar"]))
approx_type["svr"]["valscore"].append(np.mean(svr["valscore"]))
approx_type["svr"]["valscorevar"].append(np.mean(svr["valscorevar"]))
approx_type["svr"]["mae"].append(np.mean(svr["mae"]))
approx_type["svr"]["time"].append(np.mean(svr["time"]))


## interpolation
# interp training gets very slow as the number of features grows
if X_train.shape[1] < 4:
    interp = LinearNDInterpolator(X_train, y_train, fill_value=0)

    time_interp = time.time()
    approx_type["interp"]["time"].append(time_interp-time_svr)

    approx_type["interp"]["trainscore"].append(r2_score(y_train, interp(X_train)))
    approx_type["interp"]["valscore"].append(r2_score(y_val, interp(X_val)))
    approx_type["interp"]["mae"].append(np.mean(y_val-interp(X_val)))


## print stats
print("Training Stats: {}".format(training_stats))
for t in approx_type:
    print("\n", t + " Stats: \n")
    display(pd.DataFrame(approx_type[t]))

print("svr training score - non crossvalidation: ", r2_score(svr_labels["y_train"], clf.predict(X_train)))
print("svr validation score - non crossvalidation: ", r2_score(svr_labels["y_val"], clf.predict(X_val)))
print("rf training score - non crossvalidation: ", r2_score(y_train, forest.predict(X_train)))
print("rf validation score - non crossvalidation: ", r2_score(y_val, forest.predict(X_val)))

Training Stats: {'n_training_samples': 797}

 svr Stats: 



Unnamed: 0,trainscore,trainscorevar,valscore,valscorevar,mae,time
0,0.762583,0.026486,0.756792,0.047527,-5.7e-05,50.0124



 rf Stats: 



Unnamed: 0,trainscore,trainscorevar,valscore,valscorevar,mae,time
0,0.582559,0.016742,0.57299,0.054713,-0.00011,7.5e-05



 interp Stats: 



Unnamed: 0,trainscore,valscore,mae,time


svr training score - non crossvalidation:  0.683335363518906
svr validation score - non crossvalidation:  0.6896891295959366
rf training score - non crossvalidation:  0.9224637799921137
rf validation score - non crossvalidation:  0.6480399664406761


In [39]:
# plotting
from plotter import plot3d_approximation

if X_train.shape[1] == 1:
    plt.plot(X_train, clf.predict(X_train), "x", X_train, y_train, "o")
    plt.show()

    plt.figure()
    plt.plot(X_val, clf.predict(X_val), "x", X_val, y_val, "o")
    plt.show()
    
elif X_train.shape[1] == 2:
    plot3d_approximation(X_train.T, y_train, clf.predict(X_train))

In [40]:
import eli5
from eli5.sklearn import PermutationImportance

perm = PermutationImportance(forest, random_state=1).fit(X_val, y_val)
eli5.show_weights(perm, feature_names=features.columns.tolist())

AttributeError: 'int' object has no attribute 'startswith'