In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import r2_score


In [2]:
path = './data/'
ll_data = 'leaf_level/group/'
csv = '.csv'

In [3]:
mkt_data = pd.read_csv(path+'power_market_EWM'+csv, parse_dates = ["date"])
mkt_data.sort_values("date", inplace=True)
mkt_data.drop('Unnamed: 0', axis=1, inplace=True)
mkt_data.set_index('date', inplace=True)
mkt_data = mkt_data/1000
mkt_data.reset_index(inplace=True)
mkt_data

Unnamed: 0,date,spot_EWM7,spot_EWM30,spot_EWM60,spot_EWM100,base_d00_EWM7,base_d00_EWM30,base_d00_EWM60,base_d00_EWM100,base_d01_EWM7,...,peak_y04_EWM60,peak_y04_EWM100,peak_y05_EWM7,peak_y05_EWM30,peak_y05_EWM60,peak_y05_EWM100,peak_y06_EWM7,peak_y06_EWM30,peak_y06_EWM60,peak_y06_EWM100
0,2021-05-25,0.035688,0.049980,0.051158,0.050463,0.039654,0.051701,0.052374,0.051457,0.044685,...,0.065687,0.064227,0.068391,0.067111,0.065602,0.064314,0.068500,0.067368,0.065892,0.064629
1,2021-05-26,0.041711,0.050612,0.051441,0.050647,0.044685,0.052223,0.052617,0.051622,0.049917,...,0.065803,0.064326,0.068550,0.067234,0.065714,0.064407,0.068712,0.067496,0.066006,0.064722
2,2021-05-27,0.047685,0.051580,0.051905,0.050943,0.049917,0.053086,0.053043,0.051899,0.055380,...,0.065924,0.064429,0.068613,0.067335,0.065815,0.064494,0.068772,0.067590,0.066102,0.064806
3,2021-05-28,0.053705,0.052882,0.052556,0.051356,0.055380,0.054292,0.053657,0.052292,0.059935,...,0.066021,0.064517,0.068692,0.067438,0.065918,0.064582,0.068904,0.067700,0.066207,0.064895
4,2021-05-29,0.058679,0.054219,0.053246,0.051796,0.059935,0.055537,0.054311,0.052714,0.059074,...,0.066089,0.064588,0.068669,0.067513,0.066006,0.064661,0.068923,0.067783,0.066298,0.064976
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
503,2022-10-10,0.161124,0.261446,0.311049,0.318351,0.161124,0.261447,0.311049,0.318357,0.161366,...,0.205437,0.198425,0.169934,0.166714,0.164490,0.159582,0.149933,0.146538,0.145526,0.142793
504,2022-10-11,0.161366,0.255036,0.306165,0.315257,0.161366,0.255037,0.306165,0.315262,0.185992,...,0.205706,0.198726,0.171106,0.167224,0.164822,0.159880,0.151105,0.147059,0.145824,0.143027
505,2022-10-12,0.185993,0.255348,0.304647,0.314160,0.185992,0.255348,0.304648,0.314165,0.210129,...,0.205915,0.198990,0.171594,0.167601,0.165092,0.160141,0.151594,0.147447,0.146062,0.143226
506,2022-10-13,0.210129,0.257102,0.303922,0.313534,0.210129,0.257103,0.303923,0.313539,0.221962,...,0.206111,0.199245,0.171918,0.167942,0.165348,0.160394,0.151918,0.147798,0.146285,0.143417


In [4]:
files = pd.read_csv(path+'supplier_list'+csv)
files.drop(columns = ['Unnamed: 0'], inplace = True)
files.rename(columns = {"0" : "File_name"}, inplace=True)

In [5]:
data_files = files['File_name'].to_list()

In [6]:
supp_reg_coeff_list = []

In [7]:
def R_forest(data_file):
    supp_data = pd.read_csv(path+ll_data+data_file, parse_dates = ["date"])
    supp_data.sort_values("date", inplace=True)
    
    if supp_data.shape[0]>100: 
        supp_data = supp_data.groupby('date').agg({'price_per_kwh':'mean'})

        supp_data = supp_data.merge(mkt_data, how='inner', on='date')

        supp_data.corr()

        new_mkt = supp_data.copy()
        new_mkt.drop(columns = ['price_per_kwh'], inplace = True)
        new_mkt.set_index('date', inplace=True)

        X= new_mkt
        X
        
        y = supp_data['price_per_kwh']
        
        # Using PCA to find the most relevant price instruments
        pca = PCA(0.99)
        X_pca = pca.fit_transform(X)
        pca_comp = pca.components_
        num_ins = pca.n_components_
        
        # Train - test split
        X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)
        
        # Train Random Forest Regression
        model = RandomForestRegressor(n_estimators=150, max_depth=20, max_features='auto')
        model.fit(X_train, y_train)
        
        #Test the model
        predictions = model.predict(X_test)
        
        # Evaluate the model's performance
        # Mean absolute error
        mae = np.mean(abs(predictions - y_test))
        print(f'Mean absolute error: {mae:.2f}')
        # R^2
        r = r2_score(y_test, predictions)
        print('R squared:', r)
        
            
        # Calculating model accuracy
        acc = model.score(X_test, y_test)
    
        #pc1_coeff = pca.components_[0]
        #ex_var = 0
        #for n_ins in range(100):
         #   if ex_var<0.99:
          #      ex_var += model.feature_importances_[n_ins]
        
        n_ins = 10
        imp = model.feature_importances_# Importance factor of features
        ex_var_ins = np.zeros(shape=X.shape[1]) #Explanatory power of each instrument is stored
        for k, comp in enumerate(pca_comp):# for each PC
            ex_var_ins += comp*imp[k] #Explained variance of instrument
        ex_var_ins = ex_var_ins / ex_var_ins.sum() #Proportion of exp var
        ind = ex_var_ins.argsort()[-20:][::-1] # Indices of most significant features
        feature = X.columns
        sig_features = [feature[i] for i in ind] # Get the most significant features
        sig_features_imp = ex_var_ins[ind] # Importance of significant features
        
        for j in range(20):
            sup_data = {'File': str(data_file),'Price instrument':sig_features[j], 'Variance': sig_features_imp[j], 'R sq':r}
            supp_reg_coeff_list.append(sup_data)
        print(str(data_file))
        
        #reg_dim = sorted(zip(X.columns, model.feature_importances_), key=lambda x: abs(x[1]), reverse=True)[:num_ins]
        #for dim, coeff in reg_dim:
        #   sup_data = {'File': str(data_file),'Price instrument':str(dim), 'Variance': float(coeff), 'R sq':r}
        #   supp_reg_coeff_list.append(sup_data)
       #print(str(data_file))
        
    else:
        return pd.DataFrame()
    
    return acc

for data_file in data_files:
    R_forest(data_file)
    print(data_files.index(data_file))
    


  supp_data.corr()
  warn(


Mean absolute error: 0.02
R squared: 0.7415090377372644
Criador.csv
0
Mean absolute error: 0.00
R squared: 0.9669712963183428


  supp_data.corr()
  warn(


Knik.csv
1


  supp_data.corr()
  warn(


Mean absolute error: 0.00
R squared: 0.9785484533848938
Galerne.csv
2


  supp_data.corr()
  warn(


Mean absolute error: 0.00
R squared: 0.9717259894851004
Gregale.csv
3


  supp_data.corr()
  warn(


Mean absolute error: 0.00
R squared: 0.9955117533059981
Ko.csv
4
5


  supp_data.corr()
  warn(


Mean absolute error: 0.01
R squared: 0.9911624468699957
Pulenat.csv
6


  supp_data.corr()
  warn(


Mean absolute error: 0.00
R squared: 0.9916073935807183
Auster.csv
7
8
9


  supp_data.corr()
  warn(


Mean absolute error: 0.01
R squared: 0.9608361067663194
Belat.csv
10
11


  supp_data.corr()
  warn(


Mean absolute error: 0.01
R squared: 0.9811080487043605
Sarma.csv
12


  supp_data.corr()
  warn(


Mean absolute error: 0.01
R squared: 0.9343640907604536
Downburst.csv
13
14


  supp_data.corr()
  warn(


Mean absolute error: 0.02
R squared: 0.8674813358011069
Kusi.csv
15


  supp_data.corr()
  warn(


Mean absolute error: 0.01
R squared: 0.9924177686394343
Canterbury.csv
16
17


  supp_data.corr()
  warn(


Mean absolute error: 0.01
R squared: 0.8391471858629156
Aquilo.csv
18


  supp_data.corr()
  warn(


Mean absolute error: 0.01
R squared: 0.9536627395020639
Elvegust.csv
19
20


  supp_data.corr()
  warn(


Mean absolute error: 0.01
R squared: 0.9859615267265287
Caurus.csv
21
Mean absolute error: 0.01
R squared: 0.7663100758170308
Passat.csv
22
23


  supp_data.corr()
  warn(
  supp_data.corr()
  warn(


Mean absolute error: 0.01
R squared: 0.9812784880778757
Werra.csv
24
25


  supp_data.corr()
  warn(


Mean absolute error: 0.02
R squared: 0.9179855985282612
Drau.csv
26


  supp_data.corr()
  warn(


Mean absolute error: 0.02
R squared: 0.9113275354290084
Papagayos.csv
27
28
29
30
31
32


  supp_data.corr()
  warn(


Mean absolute error: 0.01
R squared: 0.971781365845831
Tornado.csv
33


  supp_data.corr()
  warn(


Mean absolute error: 0.01
R squared: 0.9892698379602042
Oder.csv
34


  supp_data.corr()
  warn(


Mean absolute error: 0.02
R squared: 0.959970669268592
Bergwind.csv
35


  supp_data.corr()
  warn(


Mean absolute error: 0.01
R squared: 0.9793023088606765
Favonius.csv
36


  supp_data.corr()
  warn(


Mean absolute error: 0.01
R squared: 0.9927940981687052
Sirkos.csv
37


  supp_data.corr()
  warn(


Mean absolute error: 0.00
R squared: 0.9879400787847716
Collada.csv
38


  supp_data.corr()
  warn(


Mean absolute error: 0.03
R squared: 0.7036058427077723
Bhoot.csv
39
40


  supp_data.corr()
  warn(


Mean absolute error: 0.01
R squared: 0.9591216746763764
Norte.csv
41


  supp_data.corr()
  warn(


Mean absolute error: 0.03
R squared: 0.8872251994890694
Wambra.csv
42


  supp_data.corr()
  warn(


Mean absolute error: 0.00
R squared: 0.9836903075469426
Sureot.csv
43


  supp_data.corr()
  warn(


Mean absolute error: 0.04
R squared: 0.9201533951020702
Main.csv
44
45
46
47


  supp_data.corr()
  warn(


Mean absolute error: 0.01
R squared: 0.9941613963377777
Austru.csv
48
49


  supp_data.corr()
  warn(


Mean absolute error: 0.01
R squared: 0.996417189097568
Lule älv.csv
50
51


  supp_data.corr()
  warn(


Mean absolute error: 0.02
R squared: 0.9811729917067309
Reshabar.csv
52


  supp_data.corr()
  warn(


Mean absolute error: 0.02
R squared: 0.9896635894737033
Solanus.csv
53


  supp_data.corr()
  warn(


Mean absolute error: 0.02
R squared: 0.9874168917280682
Tehuantepecers.csv
54


  supp_data.corr()
  warn(


Mean absolute error: 0.01
R squared: 0.985585571318262
Kachchan.csv
55


  supp_data.corr()
  warn(


Mean absolute error: 0.01
R squared: 0.9854619605292861
Purga.csv
56
57


  supp_data.corr()
  warn(


Mean absolute error: 0.03
R squared: 0.5682611998006846
Leste.csv
58


  supp_data.corr()
  warn(


Mean absolute error: 0.01
R squared: 0.9911590922050822
Yalca.csv
59
60


  supp_data.corr()
  warn(


Mean absolute error: 0.01
R squared: 0.9851202679222921
Boreas.csv
61
62
63
64
65


  supp_data.corr()
  warn(


Mean absolute error: 0.01
R squared: 0.952881331297027
Zyklon.csv
66
67


  supp_data.corr()
  warn(


Mean absolute error: 0.01
R squared: 0.8726569954164142
Ångermanälven.csv
68
69


  supp_data.corr()
  warn(


Mean absolute error: 0.01
R squared: 0.9943637530976839
Hangwind.csv
70


  supp_data.corr()
  warn(


Mean absolute error: 0.01
R squared: 0.8342428596380231
Rhone.csv
71


  supp_data.corr()
  warn(


Mean absolute error: 0.01
R squared: 0.9733476819638809
Wischera.csv
72
73
74


  supp_data.corr()
  warn(


Mean absolute error: 0.01
R squared: 0.9689515311824115
Monsunwind.csv
75
76
77
78
79


  supp_data.corr()
  warn(


Mean absolute error: 0.00
R squared: 0.9983117832162787
Severn.csv
80
81
82


  supp_data.corr()
  warn(


Mean absolute error: 0.01
R squared: 0.9871368200833722
Piteraq.csv
83
84


  supp_data.corr()
  warn(


Mean absolute error: 0.00
R squared: 0.9893739465929859
Suracon.csv
85


  supp_data.corr()
  warn(


Mean absolute error: 0.01
R squared: 0.9966864703584514
Belaja.csv
86
87
88
89
90
91
92
93
94
95


  supp_data.corr()
  warn(


Mean absolute error: 0.01
R squared: 0.8559553162165567
Paramitos.csv
96
97
98
99
100
101
102


  supp_data.corr()
  warn(


Mean absolute error: 0.02
R squared: 0.8573580348724732
Aspr.csv
103


  supp_data.corr()
  warn(


Mean absolute error: 0.00
R squared: 0.9940905230465547
Helm.csv
104
105
106
107
108
109


  supp_data.corr()
  warn(


Mean absolute error: 0.00
R squared: 0.9775106631926724
Nördliche Dwina.csv
110
111
112
113
114
115
116
117


  supp_data.corr()
  warn(


Mean absolute error: 0.02
R squared: 0.9722790035238235
Poniente.csv
118
119
120


  supp_data.corr()
  warn(


Mean absolute error: 0.01
R squared: 0.9555317830724326
Santa.csv
121


  supp_data.corr()
  warn(


Mean absolute error: 0.01
R squared: 0.9867506221152896
Vento.csv
122
123


  supp_data.corr()
  warn(


Mean absolute error: 0.01
R squared: 0.9290904513077054
Coromell.csv
124


  supp_data.corr()
  warn(


Mean absolute error: 0.02
R squared: 0.9442716295716468
Kata.csv
125
126
127


  supp_data.corr()
  warn(


Mean absolute error: 0.01
R squared: 0.9420873417574649
Joran.csv
128
129
130


  supp_data.corr()
  warn(


Mean absolute error: 0.01
R squared: 0.9693848223201305
Mureș.csv
131


  supp_data.corr()
  warn(


Mean absolute error: 0.00
R squared: 0.9920404722531377
Baguio.csv
132
133
134
135
136
137
138
139


In [8]:
df = pd.DataFrame(supp_reg_coeff_list)    
df.to_csv('D:/DAiA/Eon/Master data/output/DAiA-EON/data/Supplier_Regression_coeff_list.csv')