In [1]:
# NOTE(review): pd, np, plt and string are used in later cells without explicit imports;
# presumably they are re-exported by this star import — confirm in utils.py.
from utils import *
from matplotlib import rcParams
# Font fallback order for sans-serif text in every figure drawn below.
# NOTE(review): presumably chosen so the circle glyphs used as alphabet symbols
# further down render in tick labels — confirm.
rcParams['font.family'] = 'sans-serif'
rcParams['font.sans-serif'] = ['Segoe UI Symbol','simHei','Arial','sans-serif']

## Load Data

In [2]:
# Years to load and the resampling period handed to the loader.
# NOTE(review): '60T' looks like a pandas offset alias for 60 minutes; newer pandas
# prefers '60min' — confirm what load_resample_data expects before changing it.
years = [2016,2017]
resampling_period = '60T'

# load_resample_data is not defined in this notebook (it comes from the utils star import).
original_data = load_resample_data(years, resampling_period)
# Column labels of the loaded frame, kept as a plain list.
windfarms_names = original_data.columns.tolist()    

year:2016	(105408, 20)
year:2017	(105120, 20)
resampled to  (17544, 20)


### Normalize data = Capacity Factor (optional) 

In [3]:
# Wind-farm metadata sheet, ordered by 'Asset ID' and re-indexed 0..n-1.
windfarms = pd.read_excel('NRGstreamData/WindFarms.xlsx').sort_values(by='Asset ID').reset_index(drop = True)
# windfarms.head(4)
# Divide each column of original_data by the entry of 'Capacity' at the same position
# (the .values array is matched to columns by position, not by label).
# NOTE(review): this assumes original_data's columns are in the same order as
# windfarms sorted by 'Asset ID' — nothing here checks that; confirm.
normalized_data = original_data / windfarms['Capacity'].values
# normalized_data.head(4)

## Clustering (optional)

In [4]:
from sklearn.cluster import KMeans

# Group the wind farms into n_clusters groups based on their normalized time series.
n_clusters = 4
# K-means starts from random centroids: without a fixed random_state the labels (and
# therefore every "Cluster k" column below) change from run to run. n_init is spelled
# out because its default differs between scikit-learn versions.
kmin = KMeans(n_clusters=n_clusters, n_init=10, random_state=0)
# Transpose so that each wind farm (a column of normalized_data) becomes one sample.
X = np.transpose(normalized_data.iloc[:,:])
kmin.fit(X)
clusters = kmin.labels_

# clusters = np.array([1, 1, 2, 1, 4, 4, 0, 0, 0, 1, 3, 0, 0, 0, 3, 0, 1, 2, 3, 2])

# Aggregate capacity factor of each cluster: summed output of its members divided by
# their summed capacity.
# NOTE(review): unlike 'Total' below, this denominator does not discount members whose
# value is missing in a given row — confirm whether that matters for this data.
a = {}
for c in range(n_clusters):
    ind = clusters == c # OR: ind = np.where(clusters==c)[0] --> this generate an np.array of only indices
    # a[c] = (normalized_data.iloc[:,ind]).mean(axis=1) : old version
    a[c] = (original_data.iloc[:,ind]).sum(axis=1) / (windfarms['Capacity'].iloc[ind].sum())


clustered_data = pd.DataFrame.from_dict(a)
clustered_data.columns = ['Cluster {:d}'.format(d) for d in range(1,n_clusters+1)] 

# Capacity that actually reported in each row: only wind farms with a non-null value
# contribute. A single matrix-vector product replaces the original per-row Python loop
# and yields the same numbers.
sum_capacities_each_row = original_data.notnull().values.dot(windfarms['Capacity'].values)
clustered_data['Total'] = original_data.sum(axis=1)/ sum_capacities_each_row
# clustered_data['Total'] = clustered_data.mean(axis=1)
# clustered_data.head(4)

In [5]:
# Display the cluster label assigned to each column of the data (one entry per wind farm).
clusters

array([1, 1, 0, 1, 3, 3, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 0, 2, 0])

### Decide which data to use for further analysis (original_data | normalized_data | clustered_data)

In [None]:
# #### Be cautious! The categorical conversion below is computed from whatever `data` is bound to here.
data = clustered_data  # alternatives: normalized_data | original_data
data.head()

### Convert original numerical data into categorical representation

In [None]:
## Set initial variables
# Symbols used for the categorical levels.
# NOTE(review): presumably ordered from lowest to highest level (a later cell treats the
# last symbol as the one of interest) — confirm against categorical_rep_mycode_df.
# Alternatives: list(string.ascii_uppercase[:5]) -> ['A', 'B', 'C', 'D', 'E']
#           OR  '▁ ','▃ ', '▄ ', '▆ ', '█ '
alphabets = [ "○" , '◔', '◑', '◕', "●"]
# Derived from the list so the two can never disagree.
alphabet_size = len(alphabets)

# NOTE(review): the frame is limited to its last 20 columns while the full data.columns is
# passed as labels; the two only match when data has at most 20 columns — confirm.
categorical_reps_df = categorical_rep_mycode_df(data.iloc[:,-20:],data.columns, alphabet_size, alphabets)
categorical_reps_df.head(3)

## Find the frequency of a few predefined patterns of interest in a single series (one wind farm or one cluster)

In [None]:
def plot_one_dataframe_in_several_axes(dataframe,total_plots,
                                       n_rows = 5, figsize_r = 10, n_cols = 2 , figsize_c = 8, savefig=0,
                                       title=None):
    """Draw one bar chart per column of ``dataframe`` on grids of ``n_rows`` x ``n_cols`` axes.

    As many figures as needed are created; each holds up to ``n_rows * n_cols`` panels.
    Every panel is labelled with its column name and uses a fixed y-range of 0..1.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Each column becomes one panel; the index supplies the bar labels.
    total_plots : int
        Number of leading columns to draw. Capped at the number of columns available.
    n_rows, n_cols : int
        Grid layout of every figure.
    figsize_r, figsize_c : float
        Figure height and width in inches.
    savefig : bool or int
        When truthy, every figure is written to ``<title><figure number>.jpg``.
    title : str, optional
        File-name stem used when saving. When omitted, the notebook-level ``Title``
        variable is used, so existing callers keep working.
    """
    n_each_fig_plots = n_rows * n_cols
    # Asking for more panels than there are columns would index past the last column.
    total_plots = min(total_plots, dataframe.shape[1])

    # -(-a // b) is ceiling division: number of figures needed.
    for p in range(-(-total_plots // n_each_fig_plots)):
        # squeeze=False keeps axar two-dimensional even for one-row or one-column grids,
        # so axar[r, c] is always valid and r / c are always defined.
        fig, axar = plt.subplots(n_rows, n_cols, figsize=(figsize_c,figsize_r), dpi=100,
                                 squeeze=False)
        fig.subplots_adjust(hspace=0.25, wspace=0.15)

        first = n_each_fig_plots * p
        last = min(n_each_fig_plots * (p + 1), total_plots)
        for ind, i in enumerate(range(first, last)):
            r, c = divmod(ind, n_cols)
            ax = axar[r, c]

            this_df = dataframe.iloc[:,i]
            ax = this_df.plot(kind='bar', ax = ax)
            # Label the panel from the frame being plotted, not from a notebook global.
            ax.text(0,0.85,dataframe.columns[i], fontsize=10, bbox=dict(boxstyle="round",
                       ec='gray',
                       fc='w',
                       ))

            ax.tick_params(axis='x', labelsize = 8, which='major', pad=0, rotation=90 , color = 'w')
            ax.tick_params(axis='y', labelsize = 8, which='major', pad=0, rotation=0 , color = 'k')
            # White tick labels: invisible on every row except the bottom one (see below).
            ax.set_xticklabels(dataframe.index, color = 'w')
            ax.set_ylim(bottom=0, top = 1)
            if c == 0:
                ax.set_ylabel('Probability of the pattern',fontsize=9)
            if r == n_rows-1:
                ax.set_xlabel('Patterns',fontsize=11)
                ax.set_xticklabels(dataframe.index, color = 'k')

        # Save once per finished figure instead of once per panel. The 'papertype'
        # keyword is gone: it only affected PostScript output and current Matplotlib
        # releases reject it.
        if savefig:
            file_stem = Title if title is None else title
            fig.savefig(file_stem+str(p)+'.jpg', dpi = 300, bbox_inches='tight')
        #plt.close()

In [None]:
# Candidate patterns: the last alphabet symbol repeated 2, 3, ..., 12 times.
candidate_motifs = [alphabets[-1] * i for i in range(2,13)]
print("Number of desired motifs =", len(candidate_motifs))
print(candidate_motifs)

In [None]:
# Count how often each candidate motif occurs in every series and collect the
# results into motifs_freqs_df (rows = motifs, columns = series).
# NOTE: data_here is a notebook-level name that is rebound again in the plotting cell below.
data_here = categorical_reps_df 

values = []       # one list of frequencies per series
tick_labels = []  # one list of motif strings per series
legends = []      # one column name per series
for i in range(data_here.shape[1]):
    # Join all symbols of this column into one long string.
    one_time_series = data_here.iloc[:,i].str.cat(sep='')
    # Each returned item is indexed as (motif, frequency) below.
    # NOTE(review): the meaning of the argument 10 and of method='re_findall' is defined in
    # find_most_freq_motifs2 (not visible here) — confirm.
    motifs_freqs = find_most_freq_motifs2(one_time_series, candidate_motifs, 10, method='re_findall')
    legend = data_here.columns[i]
    freqs = [x[1] for x in motifs_freqs]
    motifs = [x[0] for x in motifs_freqs]
    values.append(freqs)
    tick_labels.append(motifs)
    legends.append(legend)
    
# Turn counts into fractions of the number of rows, then put motifs on the rows.
values = np.array(values)/ (data_here.shape[0]) #* 100
values = values.transpose()
motifs_freqs_df = pd.DataFrame.from_dict(values)
motifs_freqs_df.columns = data_here.columns
# NOTE(review): `motifs` here is whatever the LAST loop iteration produced. This is only
# correct if find_most_freq_motifs2 returns the motifs in the same order for every
# series — confirm, otherwise rows are mislabelled for the other columns.
motifs_freqs_df.index= motifs
motifs_freqs_df

In [None]:
# NOTE(review): `leg` is not defined in any cell of this notebook, so this cell raises
# NameError on a fresh kernel unless utils happens to provide it. It looks like leftover
# scratch work — confirm and remove.
# It also rebinds `a`, the name the clustering cell used for its per-cluster dictionary.
a = leg.get_frame()
# Attribute access only: nothing is called here, the cell just displays `a.set`.
a.set

In [None]:
data_here = motifs_freqs_df # should be a fvDF

# At most 6 panels (one 3 x 2 grid), but never more than there are columns: with
# data = clustered_data there are only n_clusters + 1 columns, and asking for a panel
# beyond the last column raises IndexError inside the plotting function.
total_plots = min(6, data_here.shape[1])
no_rows = 3 ; figsize_row = 6
no_cols = 2 ; figsize_col = 8
# File-name stem for the saved figures (read by the plotting function as a global).
Title = 'Probabilty_of_consequent_high_power_generation'
savefig = 1

plot_one_dataframe_in_several_axes(data_here,total_plots,
                                   n_rows = no_rows, figsize_r = figsize_row, n_cols = no_cols, figsize_c = figsize_col, 
                                   savefig = savefig)

In [None]:
def plot_hbar_v1(values, tick_labels, legends, ylabel='Patterns', xlabel="Frequency of Patterns",
                 total_plots=20, n_rows=5, figsize_r=10, n_cols=2, figsize_c=8, savefig=0):
    """Draw one horizontal bar chart per series on grids of ``n_rows`` x ``n_cols`` axes.

    Parameters
    ----------
    values : sequence of sequences
        ``values[i]`` holds the bar lengths of series ``i``.
    tick_labels : sequence of sequences
        ``tick_labels[i]`` holds the y tick labels of series ``i``, one per bar.
    legends : sequence of str
        ``legends[i]`` is the legend entry of series ``i``.
    ylabel, xlabel : str
        Axis labels; shown on the first column and on the last grid row respectively.
    total_plots : int
        Number of leading series to draw. Capped at ``len(values)``.
    n_rows, n_cols : int
        Grid layout of every figure.
    figsize_r, figsize_c : float
        Figure height and width in inches.
    savefig : bool or int
        When truthy, every figure is written to a ``.jpg`` file named after the
        built-in title followed by the figure number.
    """
    n_plots = n_rows * n_cols
    length_time_period = 4
    Title = 'Most frequent patterns in {} consecutive hours '.format(length_time_period)
    # Asking for more panels than there are series would index past the end of values.
    total_plots = min(total_plots, len(values))

    # -(-a // b) is ceiling division: number of figures needed.
    for p in range(-(-total_plots // n_plots)):
        # squeeze=False keeps axar two-dimensional even for one-row or one-column grids.
        fig, axar = plt.subplots(n_rows, n_cols, figsize=(figsize_c,figsize_r), dpi=100,
                                 squeeze=False)
        fig.subplots_adjust(hspace=0.15, wspace=0.12*length_time_period)

        for ind, i in enumerate(range(n_plots*p, min(n_plots*(p+1), total_plots))):
            r, c = divmod(ind, n_cols)

            ax = axar[r, c]
            positions = range(len(values[i]))
            ax.barh(positions, values[i], color='g', alpha=0.95)
            ax.legend([legends[i]],loc='best', fontsize = 9)
            # Label the bars with this series' own labels (the tick_labels argument)
            # instead of a notebook-level variable, so labels and bars always line up.
            ax.set_yticks(positions)
            ax.set_yticklabels(tick_labels[i], fontsize=8) # fontname='Arial'
            ax.tick_params(axis='x', labelsize = 8, which='major', pad=0, color = 'k')
            ax.tick_params(axis='y', labelsize = 9, which='major', pad=-1, color = 'w')

            if c == 0:
                ax.set_ylabel(ylabel,fontsize=9)
            if r == n_rows-1:
                ax.set_xlabel(xlabel,fontsize=11)

            # Pad margins so that markers don't get clipped by the axes
            ax.margins(0.05)

        # Save once per finished figure instead of once per panel. The 'papertype'
        # keyword is gone: it only affected PostScript output and current Matplotlib
        # releases reject it.
        if savefig:
            fig.savefig(Title+str(p)+'.jpg', dpi = 300, bbox_inches='tight')

In [None]:
# this is the same plot as above, however I don't like it
# values is stored as (motifs x series); transpose back so that row i belongs to series i.
# total_plots is capped at the number of series so the call cannot index past the last one.
plot_hbar_v1(values.transpose(), tick_labels, legends, total_plots=min(6, len(legends)), n_rows=3, figsize_r=6, n_cols=2, figsize_c=8, savefig=1)