In [1]:
from utils import *
from matplotlib import rcParams
# Draw all text with a sans-serif family and list the candidate fonts in
# order of preference. NOTE(review): 'Segoe UI Symbol' is presumably listed
# first so that the circle glyphs used as tick labels later in the notebook
# render correctly -- verify on the target machine.
rcParams['font.family'] = 'sans-serif'
rcParams['font.sans-serif'] = ['Segoe UI Symbol','simHei','Arial','sans-serif']

## Load Data

In [5]:
# Optional profiling hooks, left disabled:
# %load_ext line_profiler
# %lprun -f 
# Years to load and the resampling period; both are handed unchanged to
# load_resample_data. NOTE(review): '60T' looks like the pandas offset alias
# for 60 minutes -- confirm that the loader interprets it that way.
years = [2016,2017]
resampling_period = '60T'

# Load and resample the measurements; the column labels are kept as a plain
# list (presumably one column per wind farm -- verify against the loader).
original_data = load_resample_data(years, resampling_period)
windfarms_names = original_data.columns.tolist()    

year:2016	(105408, 20)
year:2017	(105120, 20)
resampled to  (17544, 20)


### Normalize data by installed capacity to get the capacity factor (optional)

In [None]:
# Wind-farm metadata, ordered by 'Asset ID' and re-indexed from 0.
windfarms = (
    pd.read_excel('NRGstreamData/WindFarms.xlsx')
    .sort_values(by='Asset ID')
    .reset_index(drop=True)
)

# Divide every column by the matching 'Capacity' entry. The match is purely
# positional: the i-th column of original_data is paired with the i-th row
# of the sorted metadata table.
normalized_data = original_data.div(windfarms['Capacity'].values, axis='columns')
normalized_data.head(4)

### Clustering (optional)

In [None]:
from sklearn.cluster import KMeans

# Group the series of normalized_data into n_clusters clusters.
n_clusters = 5
# random_state pins the centroid initialisation so that a re-run of the
# notebook yields the same labels; n_init is set explicitly so the number of
# restarts does not depend on the installed scikit-learn version's default.
kmin = KMeans(n_clusters=n_clusters, n_init=10, random_state=0)
# KMeans clusters rows, so transpose: one row per column of normalized_data.
# NOTE(review): KMeans rejects NaN input; if normalized_data contains gaps
# they must be filled or dropped before this cell can run -- confirm.
X = normalized_data.T
Y = kmin.fit(X)  # fit() returns the fitted estimator itself
clusters = kmin.labels_
clusters

# Pinned cluster label for each column of original_data (positional order).
# This assignment replaces any labels computed earlier, so the aggregation
# below always uses exactly these groups.
clusters = np.array([1, 1, 2, 1, 4, 4, 0, 0, 0, 1, 3, 0, 0, 0, 3, 0, 1, 2, 3, 2])

# Capacity-weighted series per cluster:
#   (sum of the member columns) / (sum of the members' 'Capacity').
a = {}
for c in range(n_clusters):
    # Boolean mask over columns; np.where(clusters == c)[0] would give the
    # integer positions instead.
    ind = clusters == c
    # NOTE(review): this denominator always counts every member's capacity,
    # even on rows where a member's reading is missing (the row sum skips
    # NaN), unlike the 'Total' column below -- confirm this is intended.
    a[c] = (original_data.iloc[:,ind]).sum(axis=1) / (windfarms['Capacity'].iloc[ind].sum())

clustered_data = pd.DataFrame.from_dict(a)
# Labels 0..n_clusters-1 are shown as 'Cluster 1'..'Cluster n_clusters'.
clustered_data.columns = ['Cluster {:d}'.format(d) for d in range(1,n_clusters+1)]

# Overall series: total output divided by the capacity of the columns that
# actually have a reading on that row. A single matrix-vector product
# (non-null mask x capacities) replaces the former per-row Python loop and
# produces the same per-row sums.
sum_capacities_each_row = np.dot(original_data.notna().values,
                                 windfarms['Capacity'].values)
clustered_data['Total'] = original_data.sum(axis=1) / sum_capacities_each_row

### Choose which data to use for further analysis (original_data | normalized_data | clustered_data)

In [None]:
# Be cautious: later cells read `data`, so this assignment decides which
# dataset the rest of the analysis runs on.
# Alternatives: normalized_data | original_data
data = clustered_data
data.head()

### Convert original numerical data into categorical representation

In [None]:
## Set initial variables
alphabet_size=5
# One display symbol per category, five in total to match alphabet_size.
# Alternatives: list(string.ascii_uppercase[:alphabet_size]) -> ['A', 'B', ...]
#               or the bar glyphs '▁ ','▃ ', '▄ ', '▆ ', '█ '
alphabets = [ "○" , '◔', '◑', '◕', "●"]

# Encode at most the last 20 columns of `data`, and pass the labels of exactly
# those columns so names and values stay aligned even when `data` has more
# than 20 columns.
data_to_encode = data.iloc[:, -20:]
categorical_reps_df = categorical_rep_mycode_df(data_to_encode, data_to_encode.columns,
                                                alphabet_size, alphabets)
categorical_reps_df.head(3)

In [3]:
# Unicode code point of '◔', one of the symbols used in `alphabets`.
ord('◔')

9684

## Find the most frequent motifs of a single series (one wind farm or one cluster)

### Define Motifs

In [None]:
# Number of consecutive symbols that make up one motif.
lenght_time_period = 6

# Candidate motifs: every possible sequence of that length over the alphabet
# (the Cartesian product of the alphabet with itself), joined into one string.
candidate_motifs = list(map(''.join, product(alphabets, repeat=lenght_time_period)))
print("Number of desired motifs =", len(candidate_motifs))

In [None]:
# Bar charts of the 10 most frequent motifs, one panel per column of
# data_here, laid out on pages of n_rows x n_cols panels; each page is saved
# as a JPEG named after Title plus the page number.
data_here = categorical_reps_df
# Never ask for more panels than there are columns to plot.
total_plots = min(6, data_here.shape[1])
n_rows = 3 ; figsize_r = 7
n_cols = 2 ; figsize_c = 8
n_plots = n_rows * n_cols
method = 're_findall' #re_findall  OR count
Title = 'Most frequent patterns in {} consecutive hours '.format(lenght_time_period)

# Ceiling division: number of pages needed to hold total_plots panels.
for p in range(int(-(-total_plots//n_plots))):
    # squeeze=False keeps axar two-dimensional even when n_rows or n_cols is 1,
    # so axar[r, c] below is always valid.
    fig, axar = plt.subplots(n_rows, n_cols, figsize=(figsize_c,figsize_r),
                             dpi=100, squeeze=False)
    fig.subplots_adjust(hspace=0.25, wspace=0.07*lenght_time_period)

    first_col = n_plots*p
    last_col = min(n_plots*(p+1), total_plots)
    for ind, i in enumerate(range(first_col, last_col)):
        # Panel position on the current page, filled row by row.
        r, c = divmod(ind, n_cols)

        # Concatenate the whole column into one long string of symbols.
        one_time_series = data_here.iloc[:,i].str.cat(sep='')
        motifs_freqs = find_most_freq_motifs2(one_time_series, candidate_motifs, 10, method)
        legend = data_here.columns[i]
        # Each entry is indexed as (motif, frequency).
        freqs = [x[1] for x in motifs_freqs]
        motifs = [x[0] for x in motifs_freqs]
        # NOTE(review): the denominator is the row count of `data`, not of
        # `data_here` -- confirm both have the same number of rows.
        values = np.array(freqs)/(data.shape[0])

        ax = axar[r, c]
        ax.barh(range(len(freqs)), values, color='g', alpha=0.95)
        ax.legend([legend],loc='best', fontsize = 9)
        ax.set_yticks(range(len(motifs)))
        ax.set_yticklabels(motifs) # fontname='Arial'
        ax.tick_params(axis='x', labelsize = 8, which='major', pad=0, color = 'b')
        ax.tick_params(axis='y', labelsize = 9, which='major', pad=0, color = 'b')

        # Label only the outer panels to keep the grid uncluttered.
        if c == 0:
            ax.set_ylabel('Most Frequent Patterns',fontsize=9)
        if r == n_rows-1:
            ax.set_xlabel('Probability',fontsize=11)

        # Pad margins so that markers don't get clipped by the axes
        ax.margins(0.05)

    # 'papertype' applies to PostScript output only and is rejected for JPEG
    # by recent Matplotlib releases, so it is no longer passed.
    fig.savefig(Title+str(p)+'.jpg', dpi = 300, bbox_inches='tight')