# Degradation Data Clustering

Titan Hartono (titan.hartono@helmholtz-berlin.de)
Data collected and cleaned from: Paolo Graniero, Hans Koebler

ver 20221122

## 1. Import libraries and load the dataset

For this section,

**input**: .pkl datasets that were converted from raw .json data in the previous notebook

**process**:
1. load the libraries
2. filter out bad pixels
3. filter out the one with empty MPPT_EFF
4. drop the data with length < 150 hours
5. cut the series to only MPPT_t and MPPT_EFF (there are other things in the dataset, but we're focusing on the MPPT_EFF for now)
6. resample to every 10 minutes 
7. drop the length to 150 hours, hence: 1 hour = 6 data points, 150 hours = 900 data points
8. interpolate if there are NaN values (selected method: akima)

**output**: one whole .pkl file for good, further-processable data

### 1.1. Load libraries

In [None]:
# Install the following packages if they haven't been installed

# pip install "dask[complete]"
# pip install -U kaleido
# pip install "kaleido"

In [None]:
# Import all the packages needed for the notebook to run

# %matplotlib inline
import matplotlib.pyplot as plt
import sys
import os
# import rdkit
import numpy as np
import pandas as pd
from pandas import DataFrame, read_csv
from IPython.display import display_html
import seaborn as sns
import json
import pickle5 as pickle
import dask
from PIL import ImageColor

from minisom import MiniSom
from tslearn.barycenters import dtw_barycenter_averaging
from tslearn.clustering import TimeSeriesKMeans
from sklearn.cluster import KMeans

from sklearn.decomposition import PCA

In [None]:
# Common path prefix for the output files of this notebook: figures and CSV
# files are written to filedirname + '<suffix>' (e.g. 'distribution_hour.png').
# NOTE(review): the directory part of this prefix presumably has to exist
# already; nothing in the visible cells creates it.
filedirname = '20230303_run_revision_excN2/sigma_0p5_learningrate_0p1_150h_only3CAT/20230303_sigma_0p5_learningrate_0p1_only3CAT_'

### 1.2. Load dataset in pickle

In [None]:
# Read one reference pickle only to obtain its column labels
with open('dataset/pkl_4/JSON_Fengjiu-2.pkl', mode="rb") as fh:
    test = pickle.load(fh)

# Column labels of the reference frame as a plain Python list
columns_load = test.columns.tolist()

In [None]:
# Load the pickle of every measurement file and stack them into one frame

# Go to the dataset
directory = 'dataset/pkl_4/'

# Start from an empty frame so the combined result keeps the reference column order
wholeSeries = pd.DataFrame(columns=columns_load)
namesofMySeries = []

# Collect the per-file frames and concatenate once at the end.
# DataFrame.append re-copied the accumulated frame on every call and no longer
# exists in pandas 2.x; pd.concat gives the same result on every pandas version.
# NOTE: the os.listdir order is deliberately left untouched, because the
# manually curated row positions used further down depend on the row order.
loaded_frames = []
for filename in os.listdir(directory):
    if filename.endswith(".pkl"):

        # NOTE(review): unpickling can execute arbitrary code; load trusted files only
        with open(directory+filename, "rb") as fh:
            df = pickle.load(fh)
        print(filename)

        loaded_frames.append(df)

wholeSeries = pd.concat([wholeSeries] + loaded_frames)

# Reindexing: number the stacked rows 0..n-1
wholeSeries.index = range(wholeSeries.shape[0])
wholeSeries

### 1.3. Filter out bad devices, empty rows, and data that is too short
Next: not all devices are good. We are adding the pixel information from 'Pixelfilter' (if it's bad or good), labeled by the researcher fabricating the devices directly.

In [None]:
# Expand the per-cell 'Pixelfilter' entry into one flag per pixel row
print('Series length: ',len(wholeSeries)/6) # Series: cells, pixel: *6 of the cells

# The code treats every 6 consecutive rows as one cell: the entry stored on the
# first of those rows is indexed 0..5 to obtain the flag of each pixel
list_pixel = []
for i in range(int(len(wholeSeries)/6)):
    cell_flags = wholeSeries['Pixelfilter'][i*6]
    for k in range(6):
        list_pixel.append(cell_flags[k])
wholeSeries['Pixelfilter_ind'] = list_pixel

# Remove, in place, every row whose flag equals 0
indexNames = wholeSeries.index[wholeSeries['Pixelfilter_ind'] == 0]
wholeSeries.drop(indexNames, inplace=True)

# Renumber the remaining rows 0..n-1
wholeSeries.index = range(len(wholeSeries))
wholeSeries

Next: 
1. Drop the row with no MPPTdata
2. Drop the row with data length < hour limit (in our case: 150 hours)

In [None]:
# Placeholder columns for the average temperature / irradiation
wholeSeries['T_avg'] = 0.0
wholeSeries['Irr_avg'] = 0.0

# Labels of the rows whose MPPTdata entry has length 0
indexDrop = [num for num in range(len(wholeSeries))
             if len(wholeSeries['MPPTdata'][num]) == 0]
print('# gets dropped due to empty MPPTdata: ', len(indexDrop))

# mySeries = the remaining rows, renumbered from 0 (wholeSeries itself is kept)
mySeries = wholeSeries.drop(index=indexDrop).reset_index(drop=True)
print('# rows surviving: ', len(mySeries))

In [None]:
# Export the sample number, pixel and source filename of every remaining row
# (used to look at how the data is distributed over researchers/batches).
# NOTE(review): filedirname already ends with '_' and the suffix starts with
# '_', so the written file name contains a double underscore.
mySeries[['SampleNumber','Pixel','Filename']].to_csv(filedirname+'_myseries_batchname_pixel_sample.csv',index=False)

In [None]:
# Largest MPPT_dur_h value (hours) found in each device's MPPT table
max_hour = [max(mySeries['MPPTdata'][i]['MPPT_dur_h']) for i in range(len(mySeries))]

# Histogram of those durations in 50-hour bins with edges 50, 100, ..., 2100
sns.set_style('darkgrid')
sns.set(rc={'figure.figsize':(5,4)})
hour_bins = list(range(50, 2101, 50))
g = sns.distplot(np.array((max_hour)), kde=False, bins=hour_bins)

plt.rcParams['font.family'] = 'Arial'
plt.xlabel('Degradation length (hours)')
plt.ylabel('Count')
g.set(xticks=([0, 150, 500, 1000, 1500, 2000]))

# Save figure
plt.savefig(filedirname+'distribution_hour.png', dpi=600)

# Report the shortest and the longest track
hours_arr = np.array(max_hour)
print('min hour: ',min(hours_arr))
print('max hour: ',max(hours_arr))

In [None]:
# Remove the devices whose MPPT track ends before hour_limit
hour_limit = 150 #hours
mySeries['T_avg'] = 0.0
mySeries['Irr_avg'] = 0.0

# Labels of the rows whose largest MPPT_dur_h is below the limit
indexDrop = [num for num in range(len(mySeries))
             if max(mySeries['MPPTdata'][num]['MPPT_dur_h']) < hour_limit]
print('# gets dropped due to short MPPTdata: ', len(indexDrop))

# Keep the other rows and renumber them 0..n-1
mySeries = mySeries.drop(index=indexDrop).reset_index(drop=True)
print('# rows surviving: ', len(mySeries))

mySeries

In [None]:
# Save the 'shell' dataframe (without actual data): the five listed columns
# (e.g. 'MPPTdata', whose entries are tables themselves) are removed and only
# the remaining columns are written to CSV
series_nodata = mySeries.drop(['Temperature', 'Irradiation','MPPTdata','IVdataFor','IVdataRev'], axis=1)  # axis=1: the listed names are column labels
series_nodata.to_csv('dataset/pkl_complete/20230303_series_nodata.csv',index=False)

In [None]:
# Save the filtered dataset (including the nested per-device tables) as pickle;
# the next section reloads it from this path
mySeries.to_pickle('dataset/pkl_complete/20230303_selected_pixel.pkl')
# os.getcwd()

### 1.4. Further pre-processing: cut the dataset to MPPT_EFF, resample for 10 minutes, drop the length to 150 hours, drop if the max. PCE is reached after 150 hours, interpolate if there is NaN, and (optional) select a subset of the dataset

In [None]:
# Load the .pkl file containing mySeries (written at the end of section 1.3)
# NOTE(review): unpickling can execute arbitrary code; load trusted files only
with open('dataset/pkl_complete/20230303_selected_pixel.pkl', "rb") as fh:
    mySeries = pickle.load(fh)

In [None]:
# List every column available in the reloaded dataset
print('Column names: ',mySeries.columns.values.tolist())

In [None]:
# Check the irradiation dataframe from row 0
# (every entry of the 'Irradiation' column is a table of its own)
mySeries['Irradiation'][0]

In [None]:
# Check the temperature dataframe from row 0
# (every entry of the 'Temperature' column is a table of its own)
mySeries['Temperature'][0]

In [None]:
# Check the MPPT dataframe of the row labelled 300 (note: not row 0)
((mySeries.loc[300])['MPPTdata'])

In [None]:
# Add empty columns
mySeries['MPPT_t_delta_max'] = 0.0
mySeries['MPPT_t_delta_min'] = 0.0
count_t_delta_max = 0
list_row_drop = []

# For every device: time (hours since its first MPPT sample) at which the
# efficiency is highest / lowest, plus diagnostics for devices whose maximum
# is reached after hour_limit

for i,row in mySeries.iterrows():
    print('row: ',i)

    ################### delta time ###################

    # Nested MPPT table of this row. It is the very object stored in mySeries,
    # so the numeric conversion below is kept in the dataset.
    mppt = mySeries.at[i, 'MPPTdata']
    mppt['MPPT_EFF'] = pd.to_numeric(mppt['MPPT_EFF'])

    # Row positions of the extreme efficiencies (first occurrence, NaN ignored).
    # Positions are used instead of idxmax()/idxmin() labels so that the .iloc
    # look-ups below stay correct even when the nested table does not carry a
    # default 0..n-1 index.
    eff_values = mppt['MPPT_EFF'].values
    idmax = int(np.nanargmax(eff_values))
    idmin = int(np.nanargmin(eff_values))

    # Find time at maximum and minimum MPPT_EFF
    MPPT_t_max = pd.to_datetime(mppt['MPPT_t'].iloc[idmax])
    MPPT_t_min = pd.to_datetime(mppt['MPPT_t'].iloc[idmin])

    # Find the initial time of MPPT
    MPPT_t_initial = pd.to_datetime(mppt['MPPT_t'].iloc[0])

    # Calculate the delta
    MPPT_t_delta_max = (MPPT_t_max-MPPT_t_initial).total_seconds()/3600 # Convert to hours
    MPPT_t_delta_min = (MPPT_t_min-MPPT_t_initial).total_seconds()/3600 # Convert to hours

    # Assign back to df in a single indexing step; chained indexing
    # (df[col][i] = value) may silently write to a temporary copy
    mySeries.at[i, 'MPPT_t_delta_max'] = MPPT_t_delta_max
    mySeries.at[i, 'MPPT_t_delta_min'] = MPPT_t_delta_min

    if MPPT_t_delta_max > hour_limit:
        count_t_delta_max +=1
        print('row: ',i, ' has MPPT_t_delta_max > ',str(hour_limit))

        # The n highest efficiencies and the durations at which they occur
        n = 10
        nlargest_df = mppt.nlargest(columns='MPPT_EFF',n=n)
        nlargest_index_list = list(nlargest_df.index.values)
        nlargest_dur_list = nlargest_df['MPPT_dur_h'].values
        print('row: ',i,' largest ',n,' eff index: ',nlargest_index_list)
        print('row: ',i,' largest ',n,' eff duration: ',nlargest_dur_list)

        # To interpret this: if the top 10 PCE values have ~similar time duration, they have
        # to be excluded because they are > hour_limit (150, 300, 500) hours ('real' max. PCE, not a 'glitch').

        # Let's see the plot
        sns.set_style('darkgrid')
        sns.set(rc={'figure.figsize':(6,4)})
        sns.lineplot(data=mppt, x="MPPT_dur_h", y="MPPT_EFF")

        plt.rcParams['font.family'] = 'Arial'
        plt.xlabel('Degradation time (hours)')
        plt.ylabel('PCE (%)')

        # Save figure
        plt.tight_layout()
        plt.savefig(filedirname+'MPPT_PCE_row_'+str(i)+'.png', dpi=600)

        plt.close('all')

        # Flag the row when at least 5 of its top values occur after hour_limit
        # (the limit itself is used, matching the printed message)
        if sum(p > hour_limit for p in nlargest_dur_list) >=5:
            print ('More than/ equal to 5 values above ',str(hour_limit),'h duration.')
            list_row_drop.append(i)
        

In [None]:
# Summary of the late-maximum screening
print('t_delta_max above ',str(hour_limit),'h: ',count_t_delta_max)
# A row is listed when at least 5 of its top-10 values lie above hour_limit,
# hence '>= 5' in the message
print('but only ', len(list_row_drop), ' rows have >= 5 in the top 10 values max. PCE above', str(hour_limit),' h')

In [None]:
# DROP IF MPPT_t_delta_max > HOUR_LIMIT

# LIST FOR 150, 300, 500 HOURS (manually decided after looking at individual MPPT tracks)
# The entries are row positions in the current mySeries.
# NOTE(review): they are presumably only valid for the exact dataset and file
# loading order they were derived from; re-check them when the input changes.

if hour_limit == 150:
    list_row_drop_manual = [28,33,80,107,141,146,147,157,158,159,
                            160,162,269,294,348,349,350,352,359,494,
                            496,864,928,1007,1009,1016,1020,1022,1024,1028,
                            1029,1030,1243,1245,1248,1249,1313,1320,1322,1328,
                            1450,1452,1454,1469,1473,1475,1476,1632,1634,1635,
                            1639,1640,1641,1670,1671,1674,1676,1934,1935,1936,
                            1937,1943,1988,2141,2171,2173,2175,2189,2190,2191,
                            2192,2193,2194,2195,2196]
elif hour_limit == 300:
    list_row_drop_manual = [135,137,213,415,697,761,792,794,801,805,
                            807,809,813,814,815,1002,1004,1008,1188,1190,
                            1192,1347,1349,1351,1352,1586,1587,1588,1594,1792,
                            1840,1841,1842,1843,1844,1845,1846,1847]
elif hour_limit == 500:
    list_row_drop_manual = [468,565,572,576,578,580,584,918,920,922,
                            1077,1079,1384,1385,1387,1389,1390,1391]
else:
    # Without this branch an unsupported limit only surfaced as a NameError below
    raise ValueError('No manually curated drop list for hour_limit = ' + str(hour_limit)
                     + ' (available: 150, 300, 500)')

print('t_delta_max above ',str(hour_limit),'h (semi-manually picked): ',len(list_row_drop_manual))

# Drop based on the index list (positions are translated to labels first)
mySeries = (mySeries.drop(mySeries.index[list_row_drop_manual])).reset_index(drop=True)#(inplace=True)

mySeries

In [None]:
# Next: drop Atmosphere == air, only include N2
# First look at how many rows there are per atmosphere value

mySeries['Atmosphere'].value_counts()

In [None]:
# Keep only the rows measured in N2 (every other atmosphere, e.g. air, is removed)

is_n2 = mySeries['Atmosphere'] == 'N2'
mySeries = mySeries.loc[is_n2].reset_index(drop=True)
mySeries

# Optional: additionally restrict the dataset to the 3CAT absorber
# NOTE(review): filedirname mentions 'only3CAT' while this filter is disabled - confirm

# mySeries = mySeries[mySeries['ABS'] == '3CAT']
# mySeries = mySeries.reset_index(drop=True)
# mySeries

In [None]:
# Combination: convert to datetime + resample and cut down to a certain length

from datetime import datetime

# Defined length to drop (to 150 hours): 6 ten-minute bins per hour, plus the
# first bin, which is discarded after resampling
len_drop = hour_limit*6+1


def _resample_and_trim(frame, time_col, value_col, n_keep, row_label, nan_message):
    """Return a 10-minute-resampled, length-limited copy of one nested table.

    Parameters
    ----------
    frame : pandas.DataFrame
        Nested table holding at least `time_col` and `value_col`.
    time_col, value_col : str
        Timestamp column and measured-value column to keep.
    n_keep : int
        The resampled table must have more than `n_keep` rows; rows 1..n_keep-1
        (positional) are kept, i.e. the first bin is discarded.
    row_label, nan_message
        Printed together when the trimmed table contains NaN and is interpolated.

    Returns
    -------
    pandas.DataFrame or None
        The processed table indexed by time, or None when the resampled table
        is too short (len <= n_keep).
    """
    trimmed = frame.loc[:, [time_col, value_col]].copy()
    trimmed[time_col] = pd.to_datetime(trimmed[time_col])
    trimmed = trimmed.set_index(time_col).resample('10min').mean()  # Every 10 minutes

    if len(trimmed) <= n_keep:
        return None
    trimmed = trimmed.iloc[1:n_keep, :]  # Drop first row/ NaN as well

    # Interpolate if there's a NaN
    if trimmed.isnull().values.any():
        print(row_label, nan_message)
        trimmed = trimmed.interpolate(method='akima')
    return trimmed


# (column, time column, value column, message printed when NaN are interpolated)
_channels = [
    ('MPPTdata', 'MPPT_t', 'MPPT_EFF', ' MPPTdata has NaN'),
    ('Temperature', 'Temperature_t', 'Temperature', ' temperature has NaN'),
    ('Irradiation', 'Irradiation_t', 'Irradiation', ' irradiation has NaN'),
]

# Processed tables are written into copies of the columns and assigned back in
# one step after the loop; this avoids chained assignment (df[col][i] = ...).
_processed_columns = {column: mySeries[column].copy() for column, _, _, _ in _channels}

# Rows that are too short are only recorded here and removed after the loop:
# dropping them while iterating made the following accesses to the same label fail.
_rows_too_short = []

# Going through the data for each one, and resample (MPPT, Temperature, Irradiation)
for i,row in mySeries.iterrows():
    print('row: ',i)

    for column, time_col, value_col, nan_message in _channels:
        processed = _resample_and_trim(row[column], time_col, value_col,
                                       len_drop, i, nan_message)
        if processed is None:
            _rows_too_short.append(i)
            break  # the row is removed anyway; skip its remaining channels
        _processed_columns[column][i] = processed

for column, values in _processed_columns.items():
    mySeries[column] = values

print('# gets dropped due to short resampled data: ', len(_rows_too_short))

# Remove the short rows and renumber, so that later cells can address the rows
# as 0..len(mySeries)-1
mySeries = mySeries.drop(index=_rows_too_short).reset_index(drop=True)

In [None]:
# Report every row whose MPPT table still contains a missing value

for i in range(len(mySeries)):
    still_missing = mySeries['MPPTdata'][i].isnull().values.any()
    if still_missing:
        print(i,' has NaN')

In [None]:
# Re-calculate T_avg and Irr_avg: mean of the 'Temperature' / 'Irradiation'
# column of each row's nested table.
# The whole column is assigned at once; element-wise chained assignment
# (mySeries['T_avg'][num] = ...) may silently write to a temporary copy.
mySeries['T_avg'] = [frame['Temperature'].mean() for frame in mySeries['Temperature']]
mySeries['Irr_avg'] = [frame['Irradiation'].mean() for frame in mySeries['Irradiation']]

mySeries

In [None]:
# Some measurement files have periods without data / big jumps; they are
# excluded as soon as the analysed window reaches the affected hours:
#   'JSON_Lea-1'         when hour_limit >= 176
#   'JSON_FloS-2_Lea-2'  when hour_limit >= 246

for first_bad_hour, bad_file in ((176, 'JSON_Lea-1'), (246, 'JSON_FloS-2_Lea-2')):
    if hour_limit >= first_bad_hour:
        mySeries = mySeries[mySeries['Filename'] != bad_file].reset_index(drop=True)

In [None]:
# Visual check of the pre-processed data: the first 100 efficiency traces on a 10x10 grid
fig, axs = plt.subplots(nrows=10, ncols=10, figsize=(35,35))

n_series = len(mySeries)
for i, j in np.ndindex(10, 10):
    series_pos = i*10+j
    if series_pos >= n_series: # leave the panels we can't fill empty
        continue
    axs[i, j].plot(mySeries['MPPTdata'][series_pos]['MPPT_EFF'])
plt.show()

In [None]:
# Saving as csv
# NOTE(review): the cells holding nested tables are presumably written as their
# text representation, so this file is for inspection rather than for reloading
mySeries.to_csv('./dataset/pkl_complete/20230303_mySeries.csv')

In [None]:
# Saving as pickle (this file is reloaded in section 2.2)
mySeries.to_pickle('./dataset/pkl_complete/20230303_mySeries.pkl')

## 2. Load from another jupyter notebook & do further pre-processing

**input**: whole .pkl of good pixels

**process**:

1. check: if there is NaN & length of the data, making sure it's uniform
2. group the data into max. PCE group (< 8%, 8-12%, 12-16%, > 16%)
3. calculate the relative change of max. PCE group after 150 hours
4. plot the data
5. scaling/normalization (selected method: MaxAbsScaler)
6. smoothing (selected method: savgol / Savitzky-Golay)

**output**: plots, pre-cleaned data ready for SOM

### 2.1. (OPTIONAL) Check the time length to reach min. and max. PCE

**For wholeSeries/ before pre-processing!**

In [None]:
# Placeholder columns for the average temperature / irradiation
wholeSeries['T_avg'] = 0.0
wholeSeries['Irr_avg'] = 0.0

# Labels of the rows whose MPPTdata entry has length 0
indexDrop = [num for num in range(len(wholeSeries))
             if len(wholeSeries['MPPTdata'][num]) == 0]
print('# gets dropped due to empty MPPTdata: ', len(indexDrop))

# wholeSeriesNew = the remaining rows, renumbered from 0 (wholeSeries itself is kept)
wholeSeriesNew = wholeSeries.drop(index=indexDrop).reset_index(drop=True)
print('# rows surviving: ', len(wholeSeriesNew))

In [None]:
# Check how long it takes (hours since the first MPPT sample) for every device
# of the unprocessed dataset to reach its maximum and its minimum PCE

# Add empty columns
wholeSeriesNew['MPPT_t_delta_max'] = 0.0
wholeSeriesNew['MPPT_t_delta_min'] = 0.0

# Iterate over wholeSeriesNew itself. Looping over mySeries (a different,
# filtered frame) only visited the labels that exist there and left the
# remaining rows of wholeSeriesNew at their 0.0 placeholder.
for i,row in wholeSeriesNew.iterrows():
    print ('row: ', i)

    # Nested MPPT table of this row (the same object that is stored in the frame)
    mppt = wholeSeriesNew.at[i, 'MPPTdata']
    mppt['MPPT_EFF'] = pd.to_numeric(mppt['MPPT_EFF'])

    # Row positions of the extreme efficiencies (first occurrence, NaN ignored);
    # positions rather than idxmax()/idxmin() labels, because they feed .iloc
    eff_values = mppt['MPPT_EFF'].values
    idmax = int(np.nanargmax(eff_values))
    idmin = int(np.nanargmin(eff_values))

    # Find time at maximum and minimum MPPT_EFF
    MPPT_t_max = pd.to_datetime(mppt['MPPT_t'].iloc[idmax])
    MPPT_t_min = pd.to_datetime(mppt['MPPT_t'].iloc[idmin])

    # Find the initial time of MPPT
    MPPT_t_initial = pd.to_datetime(mppt['MPPT_t'].iloc[0])

    # Calculate the delta
    MPPT_t_delta_max = (MPPT_t_max-MPPT_t_initial).total_seconds()/3600 # Convert to hours
    MPPT_t_delta_min = (MPPT_t_min-MPPT_t_initial).total_seconds()/3600 # Convert to hours

    # Assign back to df in a single indexing step (no chained assignment)
    wholeSeriesNew.at[i, 'MPPT_t_delta_max'] = MPPT_t_delta_max
    wholeSeriesNew.at[i, 'MPPT_t_delta_min'] = MPPT_t_delta_min

In [None]:
# Show the MPPT table of row i, where i is whatever value the most recently
# executed loop left behind
((wholeSeriesNew.loc[i])['MPPTdata'])

In [None]:
# Histogram of the time needed to reach the minimum PCE (wholeSeriesNew)

sns.set_style('darkgrid')
sns.set(rc={'figure.figsize':(4.5,3)})
delta_t_min = wholeSeriesNew['MPPT_t_delta_min']
sns.distplot(delta_t_min, kde=False)

plt.rcParams['font.family'] = 'Arial'
plt.xlabel('Degradation time length to reach min. PCE (hours)')
plt.ylabel('Count')

# Save figure
plt.tight_layout()
plt.savefig(filedirname+'distribution_delta_t_min_wholeSeries.png', dpi=600)

# Summary statistics of the column, and the number of rows above 150 hours
print('min MPPT_t_delta_min: ',min(delta_t_min))
print('mean MPPT_t_delta_min: ',delta_t_min.mean())
print('median MPPT_t_delta_min: ',delta_t_min.median())
print('max MPPT_t_delta_min: ',max(delta_t_min))
print('MPPT_t_delta_min above 150 hours: ', len(wholeSeriesNew[delta_t_min >150]))

In [None]:
# Histogram of the time needed to reach the maximum PCE (wholeSeriesNew)

sns.set_style('darkgrid')
sns.set(rc={'figure.figsize':(4.5,3)})
delta_t_max = wholeSeriesNew['MPPT_t_delta_max']
sns.distplot(delta_t_max, kde=False)

plt.rcParams['font.family'] = 'Arial'
plt.xlabel('Degradation time length to reach max. PCE (hours)')
plt.ylabel('Count')

# Save figure
plt.tight_layout()
plt.savefig(filedirname+'distribution_delta_t_max_wholeSeries.png', dpi=600)

# Summary statistics of the column, and the number of rows above 150 hours
print('min MPPT_t_delta_max: ',min(delta_t_max))
print('mean MPPT_t_delta_max: ',delta_t_max.mean())
print('median MPPT_t_delta_max: ',delta_t_max.median())
print('max MPPT_t_delta_max: ',max(delta_t_max))
print('MPPT_t_delta_max above 150 hours: ', len(wholeSeriesNew[delta_t_max >150]))

**Look at the mySeries max MPPT and min MPPT**

Check if max MPPT> min MPPT!


In [None]:
# Per device: mean of the 10 highest and mean of the 10 lowest MPPT_EFF values,
# plus a flag (1.0 / 0.0) telling whether the former is larger than the latter

max_values = []
min_values = []

for i,row in mySeries.iterrows():
    print ('row: ', i)

    # .mean() yields one value per column and .item() requires exactly one
    # value, so the nested table must hold a single column here (MPPT_EFF)
    track = row['MPPTdata']
    max_MPPT = track.nlargest(10,['MPPT_EFF']).mean().item() # mean of the top 10
    min_MPPT = track.nsmallest(10,['MPPT_EFF']).mean().item() # mean of the bottom 10

    max_values.append(max_MPPT)
    min_values.append(min_MPPT)

# Whole-column assignment; element-wise chained assignment
# (mySeries['MPPT_max'][i] = ...) may silently write to a temporary copy
mySeries['MPPT_max'] = max_values
mySeries['MPPT_min'] = min_values
mySeries['MPPT_maxmin'] = (mySeries['MPPT_max'] > mySeries['MPPT_min']).astype(float)

In [None]:
# Count the flag values: if 1.0 is the only value listed, MPPT_max is > MPPT_min for every row
mySeries['MPPT_maxmin'].value_counts()

### 2.2. Checking the dataset

In [None]:
# Load the .pkl file containing the whole pre-processed dataset
# NOTE(review): unpickling can execute arbitrary code; load trusted files only
with open('dataset/pkl_complete/20230303_mySeries.pkl', "rb") as fh:
    mySeries = pickle.load(fh)

In [None]:
# # PICK CERTAIN ARCHITECTURE
# mySeriesDevice = mySeries.loc[mySeries['FrontContact'] == 'FTO']

# # Drop the other columns, and only focus on MPPTdata
# mySeriesDrop = mySeriesDevice['MPPTdata']
# print('Initial length: ',len(mySeriesDrop))

# # Reindexing for the mySeriesDrop
# mySeriesDrop.index = range(mySeriesDrop.shape[0])

# # Checking random row
# mySeriesDrop[155]

In [None]:
# Display the loaded dataset
mySeries

In [None]:
# Drop the other columns, and only focus on MPPTdata
# (mySeriesDrop is the 'MPPTdata' column: one nested table per device)
mySeriesDrop = mySeries['MPPTdata']
print('Initial length: ',len(mySeriesDrop))

# Reindexing for the mySeriesDrop: label the entries 0..n-1
# NOTE(review): this re-labels the column object obtained above in place rather
# than a copy - confirm that this is intended
mySeriesDrop.index = range(mySeriesDrop.shape[0])

# Checking random row (fails when there are fewer than 156 entries)
mySeriesDrop[155]

In [None]:
# CHECKING LENGTH
# The data needs to be uniform in length, so collect the distinct series lengths
series_lengths = set(map(len, mySeriesDrop))
max_len = max(series_lengths)
min_len = min(series_lengths)

# Remember the last series that has the longest / the shortest length
short_name = None
longest_series = None
shortest_series = None
for series in mySeriesDrop:
    current_len = len(series)
    if current_len == max_len:
        longest_series = series
    if current_len == min_len:
        shortest_series = series
print('max length in the series: ',max_len)
print('min length in the series: ',min_len)

# Looking at the distribution of the distinct series lengths
sns.set_style('darkgrid')
sns.distplot(np.array(list(series_lengths)))

### 2.3. Now, let's plot the data

1. Calculate the relative change in max. PCE (after 150 hours). 

Relative change in max. PCE, in percent:

$$\Delta_{\mathrm{PCE}} = 100 \times \frac{\mathrm{PCE}_{\max} - \mathrm{PCE}_{150\,\mathrm{h}}}{\mathrm{PCE}_{\max}}$$

For calculating these PCE values, we take the mean of 3 or 5 points around the values of interest, to make sure that we are not only picking up noise.

2. Group the max. PCE

3. Plot the following figures:
    - Max. PCE group vs. relative change in max. PCE (after 150 hours)
    - Each max. PCE group's degradation traces over time 

In [None]:
# Per device: maximum PCE, PCE at the end of the 150 h window, and the relative
# change between the two in percent
PCEbefore_list = []
PCEafter_list = []
PCEdelta_list = []

for i in range(len(mySeriesDrop)):
    track = mySeriesDrop[i]

    ## Initial PCE value (not used in the relative change below)
    PCEbefore_1 = track.iloc[0]['MPPT_EFF'] # only the initial
    PCEbefore_2 = track['MPPT_EFF'].head(3).mean() # take the mean

    ## Final PCE value at 150 h: position 899 = 900th ten-minute sample
    PCEafter_1 = track.iloc[899]['MPPT_EFF'] # only the tail
    PCEafter_2 = track['MPPT_EFF'].tail(3).mean() # take the mean (not used below)

    ## Maximum PCE: mean of the 3 highest values, so that a single noisy
    ## sample does not define the maximum
    PCEtopbef_1 = (track.nlargest(3,['MPPT_EFF'])).mean().item() # top 3

    ## Calculate the relative change (in percent of the maximum PCE)
    PCEdelta = (PCEtopbef_1-PCEafter_1)*100/PCEtopbef_1 # top efficiency

    PCEbefore_list.append(PCEtopbef_1)
    PCEafter_list.append(PCEafter_1)
    PCEdelta_list.append(PCEdelta)

# Combine the results in one dataframe (one row per device, same order as mySeriesDrop)
PCE_df = pd.DataFrame({'PCE_before': PCEbefore_list,
                       'PCE_after': PCEafter_list,
                       'PCE_delta': PCEdelta_list},
                      columns=['PCE_before', 'PCE_after', 'PCE_delta'])

# Load libraries for plotting
import plotly.express as px
import plotly.graph_objs as go

# Plot the overview
# NOTE(review): the x values are the top-3 mean (max. PCE), while the axis is
# titled 'Initial PCE (%)' - confirm which label is intended
fig = go.Figure(data=go.Scatter(x=PCEbefore_list, y=PCEdelta_list, mode='markers'))
fig.update_layout(yaxis=dict(range=[-2.1,2.2]),xaxis_title="Initial PCE (%)",yaxis_title="Relative change in max. PCE (after 150 hrs.) (%)")
fig.update_yaxes(type='log')

# To display the figure in the output screen
fig.show()

In [None]:
# Sort by PCE_before
PCE_df = PCE_df.sort_values(by=['PCE_before'])
# Inspect the relative change of the rows at sorted positions 209..465
PCE_df['PCE_delta'].iloc[209:466]

In [None]:
# How many groups do we want?
n_group = 5
lengthdf = len(PCE_df)
lengthpergroup = np.round(lengthdf/n_group)
last_n = 0

# Sort by PCE_before, so that consecutive row positions form the groups
PCE_df = PCE_df.sort_values(by=['PCE_before'])

# PCE_before rounded up; the four *_x columns start as placeholders with the
# same values (which keeps them float-typed) and are overwritten group by group
PCE_df['PCE_before_ceil'] = PCE_df['PCE_before'].apply(np.ceil)
for group_column in ('PCE_before_x', 'PCE_before_ceil_x',
                     'PCE_before_median_x', 'PCE_before_mean_x'):
    PCE_df[group_column] = PCE_df['PCE_before'].apply(np.ceil)

# Column positions for single-step positional assignment. Chained assignment
# (PCE_df[col].iloc[a:b] = ...) may silently write to a temporary copy.
pos_x = PCE_df.columns.get_loc('PCE_before_x')
pos_ceil = PCE_df.columns.get_loc('PCE_before_ceil_x')
pos_median = PCE_df.columns.get_loc('PCE_before_median_x')
pos_mean = PCE_df.columns.get_loc('PCE_before_mean_x')

# Loop for all the groups
for i in range(n_group):
    start = int(last_n)
    if i!= n_group-1:
        stop = int((i+1)*lengthpergroup)
        # Upper boundary = first PCE of the next group (clamped to the last row
        # so that rounding of the group size cannot index past the end)
        upper_value = PCE_df['PCE_before'].iloc[min(stop, lengthdf-1)]
        last_n = (i+1)*lengthpergroup
    else:
        # The last group takes every remaining row
        stop = lengthdf
        upper_value = PCE_df['PCE_before'].iloc[-1]

    # Statistics are taken from the rows of this group only. Selecting them via
    # PCE_before_x == i+1 could also pick up rows that still carry a
    # placeholder value equal to the group number.
    members = PCE_df['PCE_before'].iloc[start:stop]

    PCE_df.iloc[start:stop, pos_x] = i+1
    PCE_df.iloc[start:stop, pos_ceil] = upper_value
    PCE_df.iloc[start:stop, pos_median] = members.median()
    PCE_df.iloc[start:stop, pos_mean] = members.mean()

# Unique values for each group
unique_ceil = PCE_df['PCE_before_ceil_x'].unique()
unique_median = PCE_df['PCE_before_median_x'].unique()
unique_mean = PCE_df['PCE_before_mean_x'].unique()

# Sort by PCE_before_ceil_x
PCE_df = PCE_df.sort_values(by=['PCE_before_ceil_x'])

# Print the number of rows per upper boundary
for i in unique_ceil:
    print("{:.1f}".format(i), ' ceil: ', len(PCE_df[PCE_df['PCE_before_ceil_x']==i]))

# Save as csv
PCE_df.to_csv(filedirname+'PCE_df_grouping.csv')

In [None]:
# Box plot of the relative PCE change for every PCE group
fig = px.box(data_frame=PCE_df, x="PCE_before_x", y="PCE_delta")
fig.show()

In [None]:
# Violin plot (with inner box and all points) of the relative PCE change per group;
# every column of PCE_df is shown on hover
fig = px.violin(
    data_frame=PCE_df,
    x="PCE_before_x",
    y="PCE_delta",
    box=True,
    points="all",
    hover_data=PCE_df.columns,
)

fig.show()

In [None]:
import plotly.io as pio
import colorlover as cl
from plotly.colors import n_colors
import matplotlib

fig = go.Figure()

# Label for the groups (not used by the plotting code in this cell)
a = ['PCE < 10.4%','PCE 10.4-14.2%','PCE 14.2-16.8%','PCE 16.8-19.2%', 'PCE > 19.2%']

print('Unique ceil: ',unique_ceil)
print('Median: ',unique_median)
print('Mean: ',unique_mean)

# One fill / box / line colour per group, interpolated between two RGB end points
colors = n_colors('rgb(8,29,88)', 'rgb(127,205,187)', n_group, colortype='rgb')
colors_box = n_colors('rgb(2,7,22)', 'rgb(30,50,45)', n_group, colortype='rgb')
colors_line = n_colors('rgb(0,5,15)', 'rgb(15,25,23)', n_group, colortype='rgb')

# (group boundary, fill colour, line colour) for every group
group_styles = list(zip(unique_ceil, colors, colors_line))

# All violins are added first ...
for (i,color,color_line) in group_styles:
    in_group = PCE_df['PCE_before_ceil_x'] == i
    fig.add_trace(go.Violin(x=PCE_df.loc[in_group, 'PCE_before_x'],
                            y=PCE_df.loc[in_group, 'PCE_delta'],
                            box_visible=False,
                            fillcolor = color,
                            opacity = 0.4,
                            line = dict(color=color_line),
                            jitter=True,
                            meanline_visible=False))

# ... then all box plots
for (i,color,color_line) in group_styles:
    in_group = PCE_df['PCE_before_ceil_x'] == i
    fig.add_trace(go.Box(x=PCE_df.loc[in_group, 'PCE_before_x'],
                         y=PCE_df.loc[in_group, 'PCE_delta'],
                         marker_color = color,
                         opacity = 0.55,
                         line_color = color_line,
                         fillcolor = color,
                         jitter=True,
                         boxmean=True))

fig.update_layout(xaxis_title="Max. PCE group (%)",
                  yaxis_title="Relative change in max. PCE (after 150 hrs.) (%)",
                  boxgap = 0.85,
                  font_family='Arial',
                  showlegend=False)

fig.show()

# Save the figure in three sizes: (file suffix, width, height, scale)
for suffix, width, height, scale in (('all_data_changedegradation_4.png', 900, 600, 22),
                                     ('all_data_changedegradation_3.png', 700, 450, 25),
                                     ('all_data_changedegradation_5.png', 600, 400, 25)):
    pio.write_image(fig, filedirname+suffix,
                    width=width, height=height, scale=scale)



In [None]:
# Selecting specific group and plotting their general degradation trends

import random

# Time axis used by the group plots. The original cell assigned `x` twice and
# only the second assignment (0-500 h, 3000 points) ever took effect.
# NOTE(review): the series are described as 150 h / 900 points - confirm
# which axis is actually intended.
x = np.linspace(0, 500, 3000, endpoint=True)
k = 1  # sample size for the (currently disabled) random selection below

# Row labels of PCE_df for each max. PCE group (1-5);
# widen the range to cover additional groups
index_1, index_2, index_3, index_4, index_5 = (
    PCE_df.index[PCE_df['PCE_before_x'] == group].tolist()
    for group in range(1, 6)
)

# Degradation series of every member of each group (no sub-sampling)
(mySeries_selected_1,
 mySeries_selected_2,
 mySeries_selected_3,
 mySeries_selected_4,
 mySeries_selected_5) = (
    mySeriesDrop.iloc[rows]
    for rows in (index_1, index_2, index_3, index_4, index_5)
)

# mySeries_selected_6 = mySeriesDrop.iloc[index_6] # all
# mySeries_selected_7 = mySeriesDrop.iloc[index_7] # all
# mySeries_selected_8 = mySeriesDrop.iloc[index_8] # all
# mySeries_selected_9 = mySeriesDrop.iloc[index_9] # all
# mySeries_selected_10 = mySeriesDrop.iloc[index_10] # all

# Randomly selected
# mySeries_selected_8 = mySeries_selected.iloc[random.choices(index_8, k=k)] #random selection
# mySeries_selected_12 = mySeries_selected.iloc[random.choices(index_12, k=k)] #random selection
# mySeries_selected_16 = mySeries_selected.iloc[random.choices(index_16, k=k)] #random selection
# mySeries_selected_20 = mySeries_selected.iloc[random.choices(index_20, k=k)] #random selection

# Function to convert the series to df for specific, selected indexes
def convert_to_df(series, index_specified):
    """Collect the MPPT_EFF traces of the selected entries and summarise them.

    Parameters
    ----------
    series : mapping-like
        Object indexable by the labels in ``index_specified``; each element
        must expose an ``MPPT_EFF`` column after ``reset_index()``.
    index_specified : iterable
        Labels of the entries of ``series`` to include.

    Returns
    -------
    pandas.DataFrame
        One row per time step with the columns ``mean``, ``median``,
        ``lower_quartile``, ``upper_quartile`` and ``lower_quartile_graph``
        (the lower quartile in reversed row order, for drawing a closed band).
        An empty frame with these columns is returned when nothing is selected.
    """
    stat_columns = ['mean', 'median', 'lower_quartile', 'upper_quartile',
                    'lower_quartile_graph']

    # Gather all traces first and concatenate once (concat inside a loop
    # re-copies the growing frame on every iteration).
    traces = [series[i].reset_index()['MPPT_EFF'] for i in index_specified]
    if not traces:
        return pd.DataFrame(columns=stat_columns)
    data = pd.concat(traces, axis='columns')

    # Every statistic is computed from the raw traces only. Writing the
    # results into a separate frame keeps an earlier statistic (e.g. the
    # median) from leaking into a later one (e.g. the mean).
    stats = pd.DataFrame(index=data.index)
    stats['mean'] = data.mean(axis=1)
    stats['median'] = data.median(axis=1)
    stats['lower_quartile'] = data.quantile(0.25, axis=1)
    stats['upper_quartile'] = data.quantile(0.75, axis=1)

    # Reverse the values, not the Series: assigning a reversed Series would be
    # re-aligned on the index by pandas and end up unchanged.
    stats['lower_quartile_graph'] = stats['lower_quartile'].to_numpy()[::-1]

    return stats[stat_columns]
        
# Summary statistics (mean, median, quartiles) of the traces in each group
stat_1, stat_2, stat_3, stat_4, stat_5 = (
    convert_to_df(selected, rows)
    for selected, rows in (
        (mySeries_selected_1, index_1),
        (mySeries_selected_2, index_2),
        (mySeries_selected_3, index_3),
        (mySeries_selected_4, index_4),
        (mySeries_selected_5, index_5),
    )
)

# stat_6 = convert_to_df(mySeries_selected_6, index_6)
# stat_7 = convert_to_df(mySeries_selected_7, index_7)
# stat_8 = convert_to_df(mySeries_selected_8, index_8)
# stat_9 = convert_to_df(mySeries_selected_9, index_9)
# stat_10 = convert_to_df(mySeries_selected_10, index_10)

In [None]:
# Plotting for degradation traces for specific max. PCE group

x_rev = x[::-1]

colors = n_colors('rgb(8,29,88)', 'rgb(127,205,187)', 5, colortype='rgb')
# NOTE(review): `fill_colors` is not used by this figure (the band colours
# below are hard-coded); it is kept only in case another cell reads it.
fill_colors = n_colors('rgba(8,29,88,0.2)', 'rgb(127,205,187,0.2)', 5, colortype='rgb')

# Label for the groups
a = ['PCE < 10.4%','PCE 10.4-14.2%','PCE 14.2-16.8%','PCE 16.8-19.2%', 'PCE > 19.2%']
b = ['< 10.4%','10.4-14.2%','14.2-16.8%','16.8-19.2%', '> 19.2%']

# Semi-transparent fill of the inter-quartile band, one entry per group
# (values reproduced exactly as they were hard-coded per group).
band_fill_colors = [
    'rgba(8,29,88,0.2)',
    'rgba(31.8, 64.2, 107.8,0.2)',
    'rgba(55.6, 99.4, 127.6,0.2)',
    'rgba(79.4, 134.60000000000002, 147.4,0.2)',
    'rgba(103.2, 169.8, 167.2,0.2)',
]

# Statistics per group, in plotting order; append here (and to `b`,
# `band_fill_colors`, and the colour count) to plot more groups.
group_stats = [stat_1, stat_2, stat_3, stat_4, stat_5]

invisible_line = 'rgba(255,255,255,0)'

fig = go.Figure()

for stats, label, median_color, band_color in zip(group_stats, b, colors,
                                                  band_fill_colors):
    # Upper quartile: drawn invisibly, it only serves as the upper edge
    # that the next trace fills towards.
    fig.add_trace(go.Scatter(
        x=x,
        y=stats['upper_quartile'],
        fill=None,
        line_color=invisible_line,
        mode='lines',
        showlegend=False,
        name=label,
    ))

    # Lower quartile: 'tonexty' fills the area up to the previous trace,
    # producing the inter-quartile band.
    fig.add_trace(go.Scatter(
        x=x,
        y=stats['lower_quartile'],
        fill='tonexty',
        mode='lines',
        fillcolor=band_color,
        line_color=invisible_line,
        showlegend=False,
        name=label,
    ))

    # Median: the only trace of the group shown in the legend
    fig.add_trace(go.Scatter(
        x=x, y=stats['median'],
        line_color=median_color,
        name=label,
    ))

fig.update_traces(mode='lines')

# Save a figure
pio.write_image(fig, filedirname+'stat_all_1.png', width=1*600, height=600, scale=12)

fig.show()

#### Now, looking at different layers that the devices have

In [None]:
# Look at the unique types for each column

# Show the distinct values found in each device-layer column
layer_columns = ['BackContact', 'FrontContact',
                 'CSL1', 'CSL2', 'CSL3', 'CSL4', 'CSL5', 'CSL6',
                 'ABS']
for layer in layer_columns:
    print(layer + ': ', mySeries[layer].unique())

# Full list of available columns
print('Column name: ', mySeries.columns.values.tolist())

In [None]:
# Now let's combine the CSL1, CSL2, and CSL3; CSL4, CSL5, and CSL6

# Build the combined architecture labels "<FrontContact>_<CSLa>_<CSLb>_<CSLc>"
for combined_name, csl_columns in (
    ('FrontContact_CSL1_CSL2_CSL3', ('CSL1', 'CSL2', 'CSL3')),
    ('FrontContact_CSL4_CSL5_CSL6', ('CSL4', 'CSL5', 'CSL6')),
):
    combined = mySeries['FrontContact']
    for csl in csl_columns:
        combined = combined + '_' + mySeries[csl]
    mySeries[combined_name] = combined

In [None]:
# Look at the specific entries and their values

# Plotting the distribution of temperature
sns.set_style('darkgrid')
sns.set(rc={'figure.figsize':(5,4)})
# NOTE(review): distplot is deprecated in newer seaborn releases; histplot is
# the suggested replacement but may bin differently - switch deliberately.
sns.distplot(mySeries['T_avg'],kde=False)

plt.rcParams['font.family'] = 'Arial'
plt.xlabel('Average degradation temperature (deg. C)')
plt.ylabel('Count')

# Save figure
plt.tight_layout()
plt.savefig(filedirname+'distribution_temperature.png', dpi=600)

# Print the min and max temperature. Series.min()/max() are used instead of
# the builtins so that missing values cannot distort the result.
print('Min temp: ',mySeries['T_avg'].min())
print('Max temp: ',mySeries['T_avg'].max())

In [None]:
# Look at the specific entries and their values

# Plotting the distribution of irradiation
sns.set_style('darkgrid')
sns.set(rc={'figure.figsize':(5,4)})
# NOTE(review): distplot is deprecated in newer seaborn releases; histplot is
# the suggested replacement but may bin differently - switch deliberately.
sns.distplot(mySeries['Irr_avg'],kde=False)

plt.rcParams['font.family'] = 'Arial'
plt.xlabel('Average degradation irradiation (W/m2)')
plt.ylabel('Count')

# Save figure (tight_layout first, as in the temperature plot, so the axis
# labels are not clipped in the saved file)
plt.tight_layout()
plt.savefig(filedirname+'distribution_irradiation.png', dpi=600)

# Print the min and max irradiation. Series.min()/max() are used instead of
# the builtins so that missing values cannot distort the result.
print('Min irradiation: ',mySeries['Irr_avg'].min())
print('Max irradiation: ',mySeries['Irr_avg'].max())

In [None]:
# Count how many rows of mySeries carry each distinct FrontContact value

mySeries['FrontContact'].value_counts()

In [None]:
# Count how many rows of mySeries carry each distinct BackContact value

mySeries['BackContact'].value_counts()

In [None]:
# Count how many rows of mySeries carry each distinct CSL1 value

mySeries['CSL1'].value_counts()

In [None]:
# Count how many rows of mySeries carry each distinct CSL2 value

mySeries['CSL2'].value_counts()

In [None]:
# Count how many rows of mySeries carry each distinct ABS value

mySeries['ABS'].value_counts()

In [None]:
# Count how many rows of mySeries carry each distinct CSL3 value

mySeries['CSL3'].value_counts()

In [None]:
# Count how many rows of mySeries carry each distinct CSL4 value

mySeries['CSL4'].value_counts()

In [None]:
# Count how many rows of mySeries carry each distinct CSL5 value

mySeries['CSL5'].value_counts()

In [None]:
# Count how many rows of mySeries carry each distinct CSL6 value

mySeries['CSL6'].value_counts()

In [None]:
# Count how many rows share each combined FrontContact/CSL1/CSL2/CSL3 label

mySeries['FrontContact_CSL1_CSL2_CSL3'].value_counts()

In [None]:
# Count how many rows share each combined FrontContact/CSL4/CSL5/CSL6 label

mySeries['FrontContact_CSL4_CSL5_CSL6'].value_counts()

In [None]:
# Count how many rows of mySeries carry each distinct Encapsulated value

mySeries['Encapsulated'].value_counts()

In [None]:
# Count how many rows of mySeries carry each distinct Area value

mySeries['Area'].value_counts()

In [None]:
# Count how many rows of mySeries carry each distinct Atmosphere value

mySeries['Atmosphere'].value_counts()

In [None]:
# Count how many rows of mySeries carry each distinct Filter value

mySeries['Filter'].value_counts()

In [None]:
# Count how many rows of mySeries carry each distinct T_avg value

mySeries['T_avg'].value_counts()

In [None]:
# Range of the average temperature over all rows
t_avg = mySeries['T_avg']

print('Max T: ', t_avg.max())
print('Min T: ', t_avg.min())

In [None]:
# Persist mySeriesDrop (MPPT data only) as a pickle so that the
# scaling/normalization section can be started from this file
mySeriesDrop.to_pickle('./dataset/pkl_complete/20230303_mySeriesDrop.pkl')

### 2.4. Scaling/ normalization

There are two types of scaling/ normalization:

1. sklearn.preprocessing.MinMaxScaler -> scaling between min-max of the data (https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html#sklearn.preprocessing.MinMaxScaler)

2. sklearn.preprocessing.MaxAbsScaler -> scaling each series by its maximum absolute value, so non-negative data ends up between 0 and 1 (https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MaxAbsScaler.html#sklearn.preprocessing.MaxAbsScaler)

In [None]:
# Load the mySeriesDrop that only has MPPTdata (the pickle written at the end
# of the previous section). Only unpickle files you produced yourself.
with open('dataset/pkl_complete/20230303_mySeriesDrop.pkl', "rb") as fh:
    mySeriesDrop = pickle.load(fh)

In [None]:
# Preprocessing: scaling/ normalization
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MaxAbsScaler

# Function to scale, normalize and plot it
def normalize(mySeriesDrop,normalizationMethod):
    '''
    Scale every series independently and plot the first 100 results.

    input:
    1. mySeriesDrop (only contains MPPTdata); each element is passed to the
       scaler's fit_transform and is replaced by the flattened result
    2. normalizationMethod: a string of normalization type, 'MinMaxScaler',
       'MaxAbsScaler'

    output:
    a copy of mySeriesDrop whose elements are the scaled series; the input
    container itself is not modified

    raises:
    ValueError if normalizationMethod is not one of the two supported names
    (previously an unknown name silently returned un-normalized data)
    '''
    # Validate first, so that a misspelled method cannot yield a
    # "normalized" result that was never scaled.
    if normalizationMethod not in ('MinMaxScaler', 'MaxAbsScaler'):
        raise ValueError(
            "normalizationMethod must be 'MinMaxScaler' or 'MaxAbsScaler', "
            "got " + repr(normalizationMethod))

    if normalizationMethod == 'MinMaxScaler':
        scaler_class = MinMaxScaler
    else:
        scaler_class = MaxAbsScaler

    mySeriesDrop_norm = mySeriesDrop.copy()

    # A fresh scaler per series: each curve is scaled by its own statistics
    for i in range(len(mySeriesDrop_norm)):
        scaled = scaler_class().fit_transform(mySeriesDrop_norm[i])
        mySeriesDrop_norm[i] = scaled.reshape(len(scaled))

    # Plot the first 100 of data
    fig, axs = plt.subplots(10,10,figsize=(30,30), sharex=True, sharey=True)
    for position, ax in enumerate(axs.flat):
        if position >= len(mySeriesDrop_norm): # leave the remaining panels empty
            break
        ax.plot(mySeriesDrop_norm[position])

    plt.ylim([0,1])
    plt.show()

    return mySeriesDrop_norm

In [None]:
# Scale every series with MaxAbsScaler (also shows the first 100 curves)
mySeriesDrop_maxAbs = normalize(mySeriesDrop,'MaxAbsScaler')

In [None]:
# mySeriesDrop_minMax = normalize(mySeriesDrop,'MinMaxScaler')

In [None]:
# Saving the MaxAbs-normalized series as .pkl file (only has MPPTdata);
# the commented line is the file name of the earlier 20230116 run
# mySeriesDrop_maxAbs.to_pickle('./dataset/pkl_complete/20230116_mySeriesDropNorm.pkl')
mySeriesDrop_maxAbs.to_pickle('./dataset/pkl_complete/20230303_mySeriesDropNorm.pkl')

### 2.5. Smoothing 

Because the data is noisy, we are going to do some 'smoothing' using Savitzky-Golay filter (https://docs.scipy.org/doc/scipy/reference/generated/scipy.signal.savgol_filter.html).

In [None]:
# Load the normalized series written by the previous section into
# mySeriesDrop (from here on mySeriesDrop holds normalized data);
# the commented lines load the earlier 20230116 run instead
# with open('dataset/pkl_complete/20230116_mySeriesDropNorm.pkl', "rb") as fh:
#     mySeriesDrop = pickle.load(fh)
with open('dataset/pkl_complete/20230303_mySeriesDropNorm.pkl', "rb") as fh:
    mySeriesDrop = pickle.load(fh)

In [None]:
# Use the in-memory normalized series. NOTE(review): this overrides whatever
# was just loaded from the pickle and fails if the normalization cell has
# not been run in this session - skip this cell when starting from the file.
mySeriesDrop = mySeriesDrop_maxAbs

In [None]:
# Overview of the MPPT data: the first 100 series on a 10 x 10 grid

fig, axs = plt.subplots(10,10,figsize=(30,30),sharex=True,sharey=True)

# axs.flat walks the grid row by row, so panel number == series number
for position, ax in enumerate(axs.flat):
    if position >= len(mySeriesDrop): # fewer than 100 series: leave the rest empty
        break
    ax.plot(mySeriesDrop[position])

plt.ylim([0,1])
plt.show()

We are showing 3 different methods for smoothing:

1. np.convolve (rolling average)

2. scipy.signal.lfilter

3. scipy.signal.savgol_filter (Savitzky-Golay), which is eventually chosen

In [None]:
# Convert to rolling average/ np.convolve

def smooth(y, box_pts):
    """Return the moving average of `y` over a window of `box_pts` points.

    The averaging is a convolution with a uniform kernel in 'same' mode, so
    values near both ends are averaged against implicit zeros.
    """
    kernel = np.full(box_pts, 1.0) / box_pts
    return np.convolve(y, kernel, mode='same')

curve_interest = mySeriesDrop[200]

# Moving averages with a narrow (3) and a wide (50) window
y_smooth_3 = smooth(curve_interest,3)
y_smooth_50 = smooth(curve_interest,50)

# Raw points in the background, smoothed curves on top
plt.figure(figsize=(4,2),dpi=300)
plt.plot(curve_interest,'o',alpha=0.05)
plt.plot(y_smooth_3, 'r-', lw=1)
plt.plot(y_smooth_50, 'g-', lw=1)

plt.legend(['actual data','convolve:3','convolve:50'])

# The smoothed curve should have as many points as the input
print(len(curve_interest), len(y_smooth_3))

In [None]:
# Smoothing using l filter

from scipy.signal import lfilter

# Two moving-average FIR filters: n equal coefficients of 1/n, denominator 1
n1 = 15  # the larger n is, the smoother curve will be
b1 = [1.0 / n1] * n1
a1 = 1

n2 = 30  # the larger n is, the smoother curve will be
b2 = [1.0 / n2] * n2
a2 = 1

curve_interest = mySeriesDrop[200]

# Filter once per setting and reuse the result for plotting and the length
# check (the unused smooth() call was removed, so this cell no longer depends
# on the rolling-average cell).
filtered_15 = lfilter(b1,a1,curve_interest)
filtered_30 = lfilter(b2,a2,curve_interest)

# Plotting with l filter
plt.figure(figsize=(4,2),dpi=300)
plt.plot(curve_interest,'o',alpha=0.05)
plt.plot(filtered_15, 'r-', lw=1)
plt.plot(filtered_30, 'g-', lw=1)

plt.legend(['actual data','lfilter n:15','lfilter n:30'])

print(len(curve_interest), len(filtered_15))

In [None]:
# Using savitzky-golay filter

from scipy.signal import savgol_filter

curve_interest = mySeriesDrop[200] #1195-1224 JSON-lea_1 sample 1 pixel 0

# Second-order Savitzky-Golay fits with a short and a long window
polyorder = 2
w1 = savgol_filter(curve_interest, 71, polyorder)
w2 = savgol_filter(curve_interest, 201, polyorder)

# Raw points in the background, both fits on top
plt.figure(figsize=(4,2),dpi=300)
plt.plot(curve_interest,'o',alpha=0.05)
for fitted, line_style in ((w1, 'r-'), (w2, 'g-')):
    plt.plot(fitted, line_style, lw=1)

# Lengths of the input and both filtered curves
print(len(curve_interest), len(w1), len(w2))

plt.legend(['actual data','savgol window:71','savgol window:201'])

Since the Savitzky-Golay filter with a window length of 71 seems to work best at smoothing, we are going to use that.

**CHECK IF SAVGOL WORKS IN ALL THE ROWS, IF THE FOLLOWING CELL CAN BE RUN WITH NO ERROR, CONTINUE**

In [None]:
# Convert to savgol: 71

from scipy.signal import savgol_filter

n = 71  # Savitzky-Golay window length
mySeriesDrop_savgol = []

# Filter every row; the row number is printed first so that a failing row
# can be identified from the output
for i in range(len(mySeriesDrop)):
    print('row :',i)
    savgol = savgol_filter(mySeriesDrop[i], n,2)
    mySeriesDrop_savgol.append(savgol)

# Compare raw (blue dots) and filtered (red line) data for the first 49 rows
fig, axs = plt.subplots(7,7,figsize=(18,18),sharex=True)
sns.set_style('darkgrid')

# axs.flat walks the grid row by row, so panel number == row number
for position, ax in enumerate(axs.flat):
    if position >= len(mySeriesDrop_savgol): # fewer than 49 rows: leave the rest empty
        break
    ax.plot(mySeriesDrop[position],'o',color='b',alpha=0.05)
    ax.plot(mySeriesDrop_savgol[position],color='r',lw=2)

plt.ylim([0,1])
plt.show()

In [None]:
# Save the list of filtered series as a .npy file instead of .pkl
# (np.save converts the list to an array before writing)
np.save('dataset/pkl_complete/20230303_mySeriesDrop_savgol.npy',mySeriesDrop_savgol)

## 3. SOM/ self-organizing map

Read more about SOM here: https://en.wikipedia.org/wiki/Self-organizing_map.

**input**: clean, pre-processed MPPT data

**process**:

1. cluster them using SOM, exploring 3 different parameter combinations to see how consistent the clustering results are
2. plot the clusters and distribution
3. split based on the device architecture, plot them
4. look at both clusters and max. PCE group (see if certain clusters correlate with certain max. PCE group more)
5. trendline of relative change -150hrs and the max. PCE group

**output**: 
1. som clusters
2. plots
3. trendline (what is the x-intercept?)

### 3.1. SOM clustering

In [None]:
# Load the libraries

import math
from minisom import MiniSom
from tslearn.barycenters import dtw_barycenter_averaging
from tslearn.clustering import TimeSeriesKMeans
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from PIL import ImageColor

# Load libraries for plotting
import plotly.express as px
import plotly.graph_objs as go

In [None]:
# Preparing color palettes and opacity

opacity = 0.5

# First four colours of the pastel and the solid palette as RGB tuples
colors = [ImageColor.getcolor(shade, 'RGB')
          for shade in px.colors.qualitative.Pastel1[:4]]

colors_solid = [ImageColor.getcolor(shade, 'RGB')
                for shade in px.colors.qualitative.Set1[:4]]

# Same colours as 'rgba(r, g, b,opacity)' strings: the tuple's closing
# bracket is cut off and the opacity appended as a fourth component
colors_rgba = ['rgba'+str(rgb)[:-1]+','+str(opacity)+')' for rgb in colors]
colors_solid_rgba = ['rgba'+str(rgb)[:-1]+','+str(opacity)+')'
                     for rgb in colors_solid]

In [None]:
# Plot with plotly

import plotly.io as pio
import colorlover as cl
from plotly.colors import n_colors
import matplotlib
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Functions to plot the series

# Plot using averaged center
    
def plot_som_series_averaged_center(som_x, som_y, win_map, name):
    """Plot every SOM cluster with its member series and their mean curve.

    One subplot is drawn per SOM node: all series mapped to the node as
    faint coloured lines, plus their point-wise mean in black. The figure is
    written to '<name>averagedcenter_<som_x>_<som_y>.png' and
    '<name>averagedcenter_big_<som_x>_<som_y>.png' and then shown.

    Parameters
    ----------
    som_x, som_y : int
        Number of rows and columns of the SOM grid (and of the subplot grid).
    win_map : mapping
        Maps a node coordinate (x, y) to the list of series won by that node.
    name : str
        Path prefix for the saved images.

    Notes
    -----
    Reads the module-level variable `hour_limit` for the end of the time
    axis. NOTE(review): its assignment in the parameter cell is commented
    out, so it has to be defined earlier in the notebook - confirm.
    The time axis always has 900 points.
    """
    fig = make_subplots(
        rows=som_x, cols=som_y,
        shared_xaxes=True,
        shared_yaxes=True,
        vertical_spacing=0.1,
    )

    # One faint line colour per cluster, as 'rgba(r, g, b,opacity)' strings
    opacity = 0.04
    palette = px.colors.qualitative.Antique
    colors_rgba = [
        'rgba'+str(ImageColor.getcolor(palette[k],'RGB'))[:-1]+','+str(opacity)+')'
        for k in (4, 9, 6, 8)
    ]

    # Colour slot per node; every node not listed here uses the last slot
    color_slot = {(0,0): 0, (0,1): 1, (1,0): 2}

    # Time
    time = np.linspace(0,hour_limit, 900, endpoint=True)

    # Create the subplots
    for x in range(som_x):
        for y in range(som_y):
            cluster = (x,y)
            cluster_number = x*som_y+y

            # Skip nodes that are missing or empty (an empty member list
            # cannot be stacked or averaged)
            if cluster in win_map and len(win_map[cluster]) > 0:
                members = win_map[cluster]
                line_color = colors_rgba[color_slot.get(cluster, 3)]

                # Plot the traces
                for series in members:
                    fig.add_trace(go.Scatter(x=time, y=series, mode='lines',
                                             name=f"Cluster {cluster_number}",
                                             opacity=0.2,
                                             line=dict(color=line_color),
                                             showlegend=False),
                                  row=x+1, col=y+1)

                # Calculate the average
                cluster_mean= np.average(np.vstack(members),axis=0)

                # Plot the average
                fig.add_trace(go.Scatter(x=time, y= cluster_mean, mode='lines',
                                         name=f"Cluster mean {cluster_number}",
                                         line_color='black',
                                         showlegend=False),
                              row=x+1, col=y+1)

            # Same y range on every subplot, including empty ones
            fig.update_yaxes(range=[-0.1,1.1], row=x+1, col=y+1)

    # Figure-wide setting, applied once
    fig.update_layout(font_family='Arial')

    # Save the figure
    pio.write_image(fig, name+'averagedcenter_'+str(som_x)+'_'+str(som_y)+'.png', 
                    width=1.8*600, height=0.6*600, scale=15) # width=1*600, height=600, scale=15)
    pio.write_image(fig, name+'averagedcenter_big_'+str(som_x)+'_'+str(som_y)+'.png',
                    width=1.8*800, height=0.6*800, scale=15) # width=1*800, height=800, scale=15)
    
    # Showing the figure
    fig.show()
    

# Plot using barycenter 
def plot_som_series_dba_center(som_x, som_y, win_map, name):
    """Plot every SOM cluster with its member series and their DTW barycenter.

    One subplot is drawn per SOM node: all series mapped to the node as
    faint lines, plus the barycenter returned by dtw_barycenter_averaging.
    The figure is written to '<name>barryaverage.png' and then shown.

    Parameters
    ----------
    som_x, som_y : int
        Number of rows and columns of the SOM grid (and of the subplot grid).
    win_map : mapping
        Maps a node coordinate (x, y) to the list of series won by that node.
    name : str
        Path prefix for the saved image.

    Notes
    -----
    The time axis is fixed to 0-150 with 900 points.
    """
    fig = make_subplots(
        rows=som_x, cols=som_y,
        shared_xaxes=True,
        shared_yaxes=True,
        vertical_spacing=0.2,
    )
    
    # Time
    time = np.linspace(0,150, 900, endpoint=True)
    
    # Create the subplots
    for x in range(som_x):
        for y in range(som_y):
            cluster = (x,y)
            cluster_number = x*som_y+y
            if cluster in win_map:
                members = win_map[cluster]

                # Plot the traces
                for series in members:
                    fig.add_trace(go.Scatter(x=time, y=series, mode='lines',
                                             name=f"Cluster {cluster_number}",
                                             line_color='rgba(130,179,196,0.12)',
                                             showlegend=False),
                                  row=x+1, col=y+1)
                
                # Calculate the barycenter average; transposed so that the
                # barycenter curve is row 0
                cluster_dtw = np.transpose(dtw_barycenter_averaging(np.vstack(members)))
                
                # Plot the barycenter average. The trace is named after the
                # cluster number (it used to interpolate the whole array).
                fig.add_trace(go.Scatter(x=time, y= cluster_dtw[0], mode='lines',
                                         name=f"Cluster dtw {cluster_number}",
                                         line_color='rgb(57,103,119)',
                                         showlegend=False),
                              row=x+1, col=y+1)
                    
            # Same y range on every subplot, including empty ones
            fig.update_yaxes(range=[-0.1,1.1], row=x+1, col=y+1)

    # Figure-wide setting, applied once
    fig.update_layout(font_family='Arial')

    # Save the figure
    pio.write_image(fig, name+'barryaverage.png', width=1*600, height=600, scale=15)
    fig.show()
    

In [None]:
# Set up sigma and learning_rate (both are passed to MiniSom below)
sigma = 0.5
learning_rate= 0.1
# NOTE(review): plot_som_series_averaged_center reads the global `hour_limit`;
# with this assignment disabled it must already be defined by an earlier
# cell - confirm, otherwise the plot call raises NameError.
# hour_limit=150

# Set the number of clusters (SOM grid of som_x rows by som_y columns)
som_x = 2
som_y = 2
cluster_count= som_x*som_y

In [None]:
# Reset the seaborn styling applied by earlier plotting cells
sns.reset_orig()

# Calculate the SOM: one node per cluster, input length = length of a series
# NOTE(review): no random seed is passed, so the initial weights and hence
# the cluster-to-node assignment presumably differ between runs; the manual
# key re-ordering in the next cells depends on that assignment - confirm.
som = MiniSom(som_x, som_y,len(mySeriesDrop_savgol[0]), sigma=sigma, learning_rate = learning_rate)

# Initialise the weights from the data, then train for 50000 iterations
som.random_weights_init(mySeriesDrop_savgol)
som.train(mySeriesDrop_savgol, 50000, verbose=True)

# Group the filtered (savgol) series by their winning node
win_map = som.win_map(mySeriesDrop_savgol)

# Plot every node with its series and their mean
plot_som_series_averaged_center(som_x, som_y, win_map, filedirname)
# plot_som_series_dba_center(som_x, som_y, win_map, filedirname)

In [None]:
# Sorting the SOM results on win_map keys
# Step 1 of 2: move each node's series from its (x, y) key to a temporary
# integer key that encodes the desired position. Integer keys are used so
# the second step cannot collide with tuple keys that are still present.

# CHANGE SEQUENCE HERE: Turn into single digit keys
# (pop() raises KeyError if win_map has no entry for one of these nodes)
win_map[0] = win_map.pop((1,1))
win_map[1] = win_map.pop((1,0))
win_map[2] = win_map.pop((0,1))
win_map[3] = win_map.pop((0,0))

# Display the keys to check that only 0-3 remain
win_map.keys()

In [None]:
# Now turn back into the mapping (DON'T CHANGE ANYTHING)
# Step 2 of 2: temporary key 0..3 -> node (0,0), (0,1), (1,0), (1,1).
# Must be run exactly once after the previous cell: running it again fails
# because the integer keys have already been popped.
win_map[(0,0)] = win_map.pop(0)
win_map[(0,1)] = win_map.pop(1)
win_map[(1,0)] = win_map.pop(2)
win_map[(1,1)] = win_map.pop(3)

# Re-plot the clusters in their new order
plot_som_series_averaged_center(som_x, som_y, win_map, filedirname)

Now, after doing the clustering, let's store the data in pandas dataframe.

In [None]:
# Find out which data row belongs to which cluster
som_shape = (som_x,som_y)

# Winning node (x, y) of every series, transposed to shape (2, n_series)
winner_coordinates = np.array([som.winner(x) for x in mySeriesDrop_savgol]).T

# Flatten the 2-dimensional node coordinates into one index per series
cluster_index_unfixed = np.ravel_multi_index(winner_coordinates, som_shape)

# Re-label so that the labels follow the re-ordered SOM plot:
# 0 -> 3, 1 -> 2, 2 -> 1 and everything else is shifted down by 3
label_shift = np.select(
    [cluster_index_unfixed == 0,
     cluster_index_unfixed == 1,
     cluster_index_unfixed == 2],
    [3, 1, -1],
    default=-3,
)
cluster_index = cluster_index_unfixed + label_shift

# Cluster names and the number of series in each cluster
unique_labels, label_counts = np.unique(cluster_index, return_counts=True)
cluster_c = label_counts.tolist()
cluster_n = ["cluster_"+str(label) for label in unique_labels]

# One row per series: where it comes from and which cluster it belongs to
fancy_names_for_labels = [str(label) for label in cluster_index]
result = pd.DataFrame(
    zip(mySeries['Filename'], mySeries['Pixel'],
        mySeries['SampleNumber'], fancy_names_for_labels),
    columns=["Series", 'Pixel', "SampleNumber", "Cluster"],
).sort_values(by="Cluster")

# Attach the PCE columns (pandas aligns them on the row index)
for pce_column in ('PCE_before_x', 'PCE_before_ceil_x', 'PCE_before_median_x',
                   'PCE_before_mean_x', 'PCE_delta'):
    result[pce_column] = PCE_df[pce_column]

# Save result on the .csv file, in original row order
result.sort_index().to_csv(filedirname+'clusters.csv')

result.sort_index()

### 3.2. General SOM cluster plots

Now, let's plot some of the results.

In [None]:
import plotly.express as px

# Both distribution charts share the same bar color
distribution_bar_color = 'rgba(57,103,119,0.7)'

# Vertical chart: one bar per cluster, bar height = number of series
fig = px.bar(x=cluster_n, y=cluster_c, labels={'x': 'Clusters', 'y': 'Count'})
fig.update_traces(marker_color=distribution_bar_color)
fig.update_layout(font_family='Arial')

# Save and display the vertical chart
pio.write_image(fig, filedirname+'distribution_v.png',
                width=400, height=400, scale=16)
fig.show()

# Horizontal chart: same data with the axes swapped
horizontal_bars = go.Bar(y=cluster_n,
                         x=cluster_c,
                         orientation='h',
                         marker={'color': distribution_bar_color})
fig = go.Figure()
fig.add_trace(horizontal_bars)
fig.update_layout(font_family='Arial',
                  xaxis={'title': 'Count'},
                  yaxis={'title': 'Cluster'})

# Save and display the horizontal chart
pio.write_image(fig, filedirname+'distribution_h.png',
                width=400, height=400, scale=16)
fig.show()

In [None]:
# Color of each max. PCE group (keys are the float group ids 1.0 ... 5.0)
colormap_PCE = dict(zip(
    (1.0, 2.0, 3.0, 4.0, 5.0),
    ("#034e7b", "#0570b0", "#3690c0", "#74a9cf", "#a6bddb"),
))

# Color of each cluster (keys are the cluster labels stored as strings)
colormap_cluster = {
    str(cluster_label): px.colors.qualitative.Antique[palette_position]
    for cluster_label, palette_position in enumerate((4, 9, 6, 8))
}

# Both figures use the same rows, ordered by cluster and then by PCE group
result_sorted = result.sort_values(by=['Cluster', 'PCE_before_x'])

# Grouped histogram: clusters on the x-axis, one bar per max. PCE group
fig = px.histogram(result_sorted,
                   x="Cluster",
                   color='PCE_before_x',
                   color_discrete_map=colormap_PCE,
                   opacity=0.75)
fig.update_layout(barmode='group',
                  font_family='Arial',
                  xaxis_title="Cluster",
                  yaxis_title="Count",
                  legend_title='Max. PCE Group')

# Save and display the figure
pio.write_image(fig, filedirname+'group_2.png', width=400, height=400, scale=16)
fig.show()

# Grouped histogram: max. PCE groups on the x-axis, one bar per cluster
fig = px.histogram(result_sorted,
                   x="PCE_before_x",
                   color='Cluster',
                   color_discrete_map=colormap_cluster,
                   opacity=0.7)
fig.update_layout(barmode='group',
                  font_family='Arial',
                  xaxis_title="Max. PCE Group (%)",
                  yaxis_title="Count")

# Save and display the figure
pio.write_image(fig, filedirname+'group_3.png', width=400, height=400, scale=16)
fig.show()

In [None]:
# Colormap PCE (keys are the float max. PCE group ids)
colormap_PCE = {1.0: "#034e7b",
                2.0: "#0570b0",
                3.0: "#3690c0",
                4.0: "#74a9cf",
                5.0: "#a6bddb"}

# Colormap cluster (keys are the cluster labels stored as strings)
colormap_cluster = {"0": px.colors.qualitative.Antique[4],
                    "1": px.colors.qualitative.Antique[9],
                    "2": px.colors.qualitative.Antique[6],
                    "3": px.colors.qualitative.Antique[8]}

# Sort once so both figures list clusters and PCE groups in the same order
result_sorted = result.sort_values(by=['Cluster', 'PCE_before_x'])

# Plot stacked bar based on PCE group.
# histnorm='percent' normalizes each colored trace to percentages, so the
# y-axis is labelled as a percentage rather than as a raw count.
fig = px.histogram(result_sorted,
                   x="Cluster", color='PCE_before_x',
                   color_discrete_map=colormap_PCE, opacity=0.75,
                   histnorm='percent',
                   )

# Update layout
fig.update_layout(xaxis_title="Cluster",
                  legend_title='Max. PCE Group',
                  yaxis_title="Percent (%)",
                  font_family='Arial')

# Save figure
pio.write_image(fig, filedirname+'percent_2.png', width=1*400, height=1*400, scale=16)
fig.show()

# Plot stacked bar based on cluster (same normalization, axes swapped)
fig = px.histogram(result_sorted,
                   x="PCE_before_x", color='Cluster',
                   color_discrete_map=colormap_cluster, opacity=0.75,
                   histnorm='percent',
                   )

# Update layout
fig.update_layout(xaxis_title="Max. PCE Group",
                  yaxis_title="Percent (%)", font_family='Arial', bargap=0.1)

# Save figure
pio.write_image(fig, filedirname+'percent_3.png', width=1*400, height=1*400, scale=16)
fig.show()

In [None]:
# Colormap PCE (keys are the float max. PCE group ids)
colormap_PCE = {1.0: "#034e7b",
                2.0: "#0570b0",
                3.0: "#3690c0",
                4.0: "#74a9cf",
                5.0: "#a6bddb"}

# Colormap cluster (keys are the cluster labels stored as strings)
colormap_cluster = {"0": px.colors.qualitative.Antique[4],
                    "1": px.colors.qualitative.Antique[9],
                    "2": px.colors.qualitative.Antique[6],
                    "3": px.colors.qualitative.Antique[8]}

# Sort once so both figures list clusters and PCE groups in the same order
result_sorted = result.sort_values(by=['Cluster', 'PCE_before_x'])

# Plot normalized bar based on PCE group.
# barnorm='fraction' rescales the bars at each x position so they sum to 1,
# so the y-axis is labelled as a fraction rather than as a raw count.
fig = px.histogram(result_sorted,
                   x="Cluster", color='PCE_before_x',
                   color_discrete_map=colormap_PCE, opacity=0.75,
                   )

# Update layout
fig.update_layout(xaxis_title="Cluster",
                  legend_title='Max. PCE Group',
                  yaxis_title="Fraction",
                  font_family='Arial', barnorm='fraction')

# Save figure
pio.write_image(fig, filedirname+'fraction_2.png', width=1*400, height=1*400, scale=16)
fig.show()

# Plot normalized bar based on cluster (same normalization, axes swapped)
fig = px.histogram(result_sorted,
                   x="PCE_before_x", color='Cluster',
                   color_discrete_map=colormap_cluster, opacity=0.75,
                   )

# Update layout
fig.update_layout(xaxis_title="Max. PCE Group",
                  yaxis_title="Fraction", font_family='Arial', barnorm='fraction',
                  bargap=0.1)

# Save figure
pio.write_image(fig, filedirname+'fraction_3.png', width=1*400, height=1*400, scale=16)
fig.show()

### 3.3. Split based on n-i-p or p-i-n

Let's split based on the architecture (n-i-p: the 'normal' architecture, p-i-n: the 'inverted' architecture).

In [None]:
# Put the clustering result back into row order and renumber the
# FrontContact column from 0 so both pieces can be joined side by side
result_in_row_order = result.sort_index()
front_contact = mySeries['FrontContact'].reset_index(drop=True)

# Append FrontContact as an extra column of the result dataframe
result_arch = pd.concat([result_in_row_order, front_contact], axis=1)

# Save the combined table as a new .csv file
result_arch.sort_index().to_csv(filedirname+'architecture.csv')
result_arch

In [None]:
# Functions to plot the histogram based on cluster

def plot_histogram_cluster_plotly(som_x, som_y, result_arch, name):
    """Plot per-cluster histograms of the max. PCE group, split by front contact.

    One subplot is drawn for every SOM node (``som_x`` rows by ``som_y``
    columns). Each subplot overlays a histogram of ``PCE_before_x`` for the
    ITO-based devices with one for the FTO-based devices of that cluster.
    The figure is written to ``name`` and then shown.

    Parameters
    ----------
    som_x, som_y : int
        Number of rows and columns of the SOM grid, reused as subplot grid.
    result_arch : pandas.DataFrame
        Needs the columns 'Cluster' (cluster number stored as a string),
        'FrontContact' and 'PCE_before_x'.
    name : str
        Output path passed to ``pio.write_image``.

    Returns
    -------
    pandas.DataFrame
        The rows of ``result_arch`` of the last cluster visited, i.e.
        cluster ``som_x * som_y - 1``.
    """
    # Build one subplot spec per SOM node so the grid is not limited to 2x2
    subplot_specs = [[{"type": "scatter"} for _ in range(som_y)]
                     for _ in range(som_x)]

    # Create subplots and define their properties
    fig = make_subplots(
        rows=som_x, cols=som_y,
        shared_xaxes=True,
        shared_yaxes=True,
        vertical_spacing=0.2,
        specs=subplot_specs,
    )

    # Front contacts to overlay in every subplot, with their bar color.
    # NOTE(review): the FTO color is an 'rgb(...)' string carrying a fourth
    # (alpha-like) component; kept unchanged - confirm 'rgba' was not intended.
    contact_colors = (('ITO', 'rgb(57,103,119)'),
                      ('FTO', 'rgb(130,179,196,0.12)'))

    # Going through SOM results and plot them
    for x in range(som_x):
        for y in range(som_y):
            # Row-major cluster number of the node at (x, y)
            cluster_number = x*som_y+y

            # Select specific cluster
            selected = result_arch.loc[result_arch['Cluster'] == str(cluster_number)]

            # Plot one histogram per front contact into the same subplot
            for contact, color in contact_colors:
                selected_contact = selected.loc[selected['FrontContact'] == contact]
                fig.add_trace(go.Histogram(x=selected_contact['PCE_before_x'],
                                           marker_color=color,
                                           opacity=0.75),
                              row=x+1, col=y+1)

    # Update figure properties
    fig.update_xaxes(showticklabels=False)
    fig.update_yaxes(range=[0,380])
    fig.update_layout(boxgap=0.05,
                      font_family='Arial',
                      showlegend=False)

    pio.write_image(fig, name, width=1*400, height=400, scale=16)

    fig.show()

    return selected

def plot_histogram_cluster_plotly_percent_subplot(som_x, som_y, result_arch, name):
    """Plot, for every SOM cluster, a histogram of the front-contact types.

    One subplot is drawn for every SOM node (``som_x`` rows by ``som_y``
    columns); each shows a histogram over the 'FrontContact' values of the
    rows assigned to that cluster. The figure is written to ``name`` and
    then shown.

    Parameters
    ----------
    som_x, som_y : int
        Number of rows and columns of the SOM grid, reused as subplot grid.
    result_arch : pandas.DataFrame
        Needs the columns 'Cluster' (cluster number stored as a string) and
        'FrontContact'.
    name : str
        Output path passed to ``pio.write_image``.

    Returns
    -------
    pandas.DataFrame
        The rows of ``result_arch`` of the last cluster visited, i.e.
        cluster ``som_x * som_y - 1``.
    """
    # Build one subplot spec per SOM node so the grid is not limited to 2x2
    subplot_specs = [[{"type": "scatter"} for _ in range(som_y)]
                     for _ in range(som_x)]

    # Create subplots and define their properties
    fig = make_subplots(
        rows=som_x, cols=som_y,
        shared_xaxes=True,
        shared_yaxes=True,
        vertical_spacing=0.06,
        specs=subplot_specs,
    )

    # Going through SOM results and plot them
    for x in range(som_x):
        for y in range(som_y):
            # Row-major cluster number of the node at (x, y)
            cluster_number = x*som_y+y

            # Select specific cluster
            selected = result_arch.loc[result_arch['Cluster'] == str(cluster_number)]

            # Plot the histogram of front contacts for this cluster
            fig.add_trace(go.Histogram(x=selected['FrontContact'],
                                       opacity=0.75),
                          row=x+1, col=y+1)

    # Update figure properties
    fig.update_xaxes(showticklabels=False)
    fig.update_layout(boxgap=0.2,
                      font_family='Arial',
                      showlegend=False)

    pio.write_image(fig, name, width=1*400, height=400, scale=16)

    fig.show()

    return selected

def plot_histogram_cluster_plotly_percent_all(som_x, som_y, result_arch, name):
    """Plot, for every SOM cluster, bars per front contact for each max. PCE group.

    One subplot is drawn for every SOM node (``som_x`` rows by ``som_y``
    columns). Inside each subplot one bar trace is added per max. PCE group
    (1 to 5) with 'FrontContact' on the x-axis and 'PCE_before_ceil_x' as bar
    height. The figure is written to ``name`` and then shown.

    Parameters
    ----------
    som_x, som_y : int
        Number of rows and columns of the SOM grid, reused as subplot grid.
    result_arch : pandas.DataFrame
        Needs the columns 'Cluster' (cluster number stored as a string),
        'FrontContact', 'PCE_before_x' and 'PCE_before_ceil_x'.
    name : str
        Output path passed to ``pio.write_image``.

    Returns
    -------
    pandas.DataFrame
        The rows of ``result_arch`` of the last cluster visited, i.e.
        cluster ``som_x * som_y - 1``.
    """
    # Build one subplot spec per SOM node so the grid is not limited to 2x2
    subplot_specs = [[{"type": "scatter"} for _ in range(som_y)]
                     for _ in range(som_x)]

    # Create subplots and define their properties
    fig = make_subplots(
        rows=som_x, cols=som_y,
        shared_xaxes=True,
        shared_yaxes=True,
        vertical_spacing=0.1,
        horizontal_spacing=0.1,
        specs=subplot_specs,
    )

    # Going through SOM results and plot them
    for x in range(som_x):
        for y in range(som_y):
            # Row-major cluster number of the node at (x, y)
            cluster_number = x*som_y+y

            # Select specific cluster
            selected = result_arch.loc[result_arch['Cluster'] == str(cluster_number)]

            # Add one bar trace per max. PCE group. The first group has no
            # base; every later group uses the previous group's
            # 'PCE_before_x' values as its base.
            # NOTE(review): that base is a group id column, not a bar height -
            # confirm this is the intended stacking offset.
            previous_group_rows = None
            for pce_group in range(1, 6):
                group_rows = (selected.loc[selected['PCE_before_x'] == pce_group]
                              .sort_values(by=['Cluster', 'PCE_before_x']))
                base = (None if previous_group_rows is None
                        else previous_group_rows['PCE_before_x'])

                fig.add_trace(go.Bar(x=group_rows['FrontContact'],
                                     y=group_rows['PCE_before_ceil_x'],
                                     offsetgroup=0, base=base),
                              row=x+1, col=y+1)
                previous_group_rows = group_rows

    # Update the figure properties once, after all traces have been added
    fig.update_layout(font_family='Arial',
                      showlegend=False)

    pio.write_image(fig, name, width=1*400, height=400, scale=16)

    fig.show()

    return selected

In [None]:
# Draw the ITO/FTO histograms of every cluster and save them as an image
selected = plot_histogram_cluster_plotly(
    som_x=som_x,
    som_y=som_y,
    result_arch=result_arch,
    name=filedirname+'histogram_contact.png',
)

In [None]:
# Draw the per-cluster front-contact bars for each max. PCE group and save them
selected = plot_histogram_cluster_plotly_percent_all(
    som_x=som_x,
    som_y=som_y,
    result_arch=result_arch,
    name=filedirname+'histogram_contact_percentage.png',
)

In [None]:
### Separated by cluster, for each max. PCE group

colormap_PCE = ['#034e7b', '#0570b0', '#3690c0', '#74a9cf', '#a6bddb']
opacity = 0.75

# In this cell FTO devices are plotted as 'n-i-p' and ITO devices as 'p-i-n'
x = ['n-i-p', 'p-i-n']

n_cluster = result_arch['Cluster'].nunique()
groupCount = result_arch.groupby(['Cluster','PCE_before_x','FrontContact'])['Cluster'].count()


def _group_count(cluster, pce_group, contact):
    """Return the number of rows in one (cluster, PCE group, contact) group.

    Returns 0 when the combination does not occur in ``groupCount``. The
    lookup goes through the MultiIndex of ``groupCount``; testing with
    ``set.issubset(groupCount)`` would compare against the count *values*
    and therefore never find a group.
    """
    return groupCount.get((cluster, pce_group, contact), 0)


fig = make_subplots(
    rows=som_x, cols=som_y,
    shared_xaxes=True,
    shared_yaxes=True,
    vertical_spacing=0.1,
    horizontal_spacing=0.1,
    # One subplot spec per SOM node so the grid is not limited to 2x2
    specs=[[{"type": "scatter"} for _ in range(som_y)]
           for _ in range(som_x)],
    )

# Counts of groups that may be absent (0 if absent). These names are kept
# because the following butterfly-plot cells read them.
val_1 = _group_count('1', 4.0, 'FTO')
val_2 = _group_count('2', 3.0, 'FTO')
val_3 = _group_count('2', 4.0, 'FTO')
val_4 = _group_count('3', 4.0, 'FTO')
val_5 = _group_count('1', 3.0, 'FTO')
val_6 = _group_count('1', 5.0, 'FTO')
val_7 = _group_count('2', 5.0, 'FTO')
val_8 = _group_count('3', 5.0, 'FTO')
val_9 = _group_count('3', 5.0, 'ITO')
val_10 = _group_count('3', 3.0, 'FTO')
val_11 = _group_count('0', 3.0, 'FTO')
val_12 = _group_count('0', 4.0, 'FTO')
val_13 = _group_count('0', 5.0, 'FTO')
val_14 = _group_count('2', 5.0, 'ITO')
val_15 = _group_count('0', 5.0, 'ITO')
# val_16 is always plotted in the FTO (n-i-p) slot, so it must count FTO rows
val_16 = _group_count('3', 2.0, 'FTO')
val_17 = _group_count('3', 1.0, 'FTO')

# Max. PCE group ids with their legend names, in plotting order
pce_group_ids = [1.0, 2.0, 3.0, 4.0, 5.0]
pce_group_names = ['< 10%', '10-14%', '14-16.6%', '16.6-19.2%', '> 19.2%']

# One subplot per cluster (row-major), one stacked bar trace per PCE group
for cluster_position in range(som_x * som_y):
    subplot_row, subplot_col = divmod(cluster_position, som_y)
    cluster_label = str(cluster_position)

    for pce_group_id, pce_group_name, pce_group_color in zip(
            pce_group_ids, pce_group_names, colormap_PCE):
        fig.add_trace(go.Bar(x=x,
                             y=[_group_count(cluster_label, pce_group_id, 'FTO'),
                                _group_count(cluster_label, pce_group_id, 'ITO')],
                             name=pce_group_name,
                             marker_color=pce_group_color,
                             opacity=opacity),
                      row=subplot_row+1, col=subplot_col+1)

# Update layout
fig.update_layout(barmode='stack', font_family='Arial', showlegend=False)

# Save figure
pio.write_image(fig, filedirname+'cluster_nip.png', width=1*600, height=600, scale=12)
fig.show()

In [None]:
### Cluster-based, percent

colormap_PCE = ['#034e7b', '#0570b0', '#3690c0', '#74a9cf', '#a6bddb']
colormap_cluster = ['#045a8d', '#2b8cbe', '#74a9cf', '#bdc9e1']
opacity = 0.75
width = 0.5

x = ['Cluster 1', 'Cluster 2', 'Cluster 3', 'Cluster 4']

n_cluster = result_arch['Cluster'].nunique()
groupCount = result_arch.groupby(['Cluster','PCE_before_x','FrontContact'])['Cluster'].count()


def _group_count(cluster, pce_group, contact):
    """Return the number of rows in one (cluster, PCE group, contact) group.

    Returns 0 when the combination does not occur in ``groupCount``, so the
    cell neither raises ``KeyError`` on missing groups nor depends on
    ``val_*`` variables left behind by an earlier cell.
    """
    return groupCount.get((cluster, pce_group, contact), 0)


fig = make_subplots(
    rows=1, cols=2,
    shared_xaxes=False,
    shared_yaxes=True,
    horizontal_spacing = 0.005,
    )

# Cluster labels as stored in result_arch, in the order of the names in x
cluster_labels = ['0', '1', '2', '3']

# Max. PCE group ids with their legend names, in plotting order
pce_group_ids = [1.0, 2.0, 3.0, 4.0, 5.0]
pce_group_names = ['< 10%', '10-14%', '14-16.6%', '16.6-19.2%', '> 19.2%']

# Left panel: n-i-p (FTO devices); right panel: p-i-n (ITO devices)
for panel_col, front_contact in enumerate(['FTO', 'ITO'], start=1):
    for pce_group_id, pce_group_name, pce_group_color in zip(
            pce_group_ids, pce_group_names, colormap_PCE):
        fig.add_trace(go.Bar(y=x,
                             x=[_group_count(cluster_label, pce_group_id, front_contact)
                                for cluster_label in cluster_labels],
                             name=pce_group_name, marker_color=pce_group_color,
                             opacity=opacity, orientation='h', width=width),
                      row=1, col=panel_col)

# The left axis is reversed so the two panels mirror each other
fig.update_xaxes(showticklabels=False,title_text="n-i-p", row=1, col=1, range=[1.05,0])
fig.update_xaxes(showticklabels=False,title_text="p-i-n", row=1, col=2, range=[0,1.05])

fig.update_layout(barmode='relative', barnorm='fraction',font_family='Arial', showlegend=False,
                  xaxis1={'side': 'top'},
                  xaxis2={'side': 'top'},)

pio.write_image(fig, filedirname+'butterfly_arch_cluster_based_percent.png', width=1*600, height=0.75*600, scale=12)

fig.show()

In [None]:
### Cluster based, count

colormap_PCE = ['#034e7b', '#0570b0', '#3690c0', '#74a9cf', '#a6bddb']
colormap_cluster = ['#045a8d', '#2b8cbe', '#74a9cf', '#bdc9e1']
opacity = 0.75
width = 0.5

x = ['Cluster 1', 'Cluster 2', 'Cluster 3', 'Cluster 4']

n_cluster = result_arch['Cluster'].nunique()
groupCount = result_arch.groupby(['Cluster','PCE_before_x','FrontContact'])['Cluster'].count()


def _group_count(cluster, pce_group, contact):
    """Return the number of rows in one (cluster, PCE group, contact) group.

    Returns 0 when the combination does not occur in ``groupCount``, so the
    cell neither raises ``KeyError`` on missing groups nor depends on
    ``val_*`` variables left behind by an earlier cell.
    """
    return groupCount.get((cluster, pce_group, contact), 0)


fig = make_subplots(
    rows=1, cols=2,
    shared_xaxes=False,
    shared_yaxes=True,
    horizontal_spacing = 0.005,
    )

# Cluster labels as stored in result_arch, in the order of the names in x
cluster_labels = ['0', '1', '2', '3']

# Max. PCE group ids with their legend names, in plotting order
pce_group_ids = [1.0, 2.0, 3.0, 4.0, 5.0]
pce_group_names = ['< 10%', '10-14%', '14-16.6%', '16.6-19.2%', '> 19.2%']

# Left panel: n-i-p (FTO devices); right panel: p-i-n (ITO devices)
for panel_col, front_contact in enumerate(['FTO', 'ITO'], start=1):
    for pce_group_id, pce_group_name, pce_group_color in zip(
            pce_group_ids, pce_group_names, colormap_PCE):
        fig.add_trace(go.Bar(y=x,
                             x=[_group_count(cluster_label, pce_group_id, front_contact)
                                for cluster_label in cluster_labels],
                             name=pce_group_name, marker_color=pce_group_color,
                             opacity=opacity, orientation='h', width=width),
                      row=1, col=panel_col)

# The left axis is reversed so the two panels mirror each other
fig.update_xaxes(showticklabels=False,title_text="n-i-p", row=1, col=1, range=[1100,0])
fig.update_xaxes(showticklabels=False,title_text="p-i-n", row=1, col=2, range=[0,1100])

# Alternative, tighter axis range:
# fig.update_xaxes(showticklabels=False,title_text="n-i-p", row=1, col=1, range=[500,0])
# fig.update_xaxes(showticklabels=False,title_text="p-i-n", row=1, col=2, range=[0,500])

fig.update_layout(barmode='stack',font_family='Arial', showlegend=False,
                  xaxis1={'side': 'top'},
                  xaxis2={'side': 'top'},)

pio.write_image(fig, filedirname+'butterfly_arch_cluster_based.png', width=1*600, height=0.75*600, scale=12)

fig.show()

In [None]:
## PCE-based, percentage
# Mirrored ("butterfly") horizontal bar chart. The left panel is titled n-i-p
# and is fed with the 'FTO' counts; the right panel is titled p-i-n and is fed
# with the 'ITO' counts. Every cluster contributes one bar segment per PCE bin
# and barnorm='fraction' normalises the segments of each bin to fractions.

colormap_PCE = ['#034e7b', '#0570b0', '#3690c0', '#74a9cf', '#a6bddb']

# Only the Antique-based palette is assigned here: the hex palette that used
# to precede it was overwritten before it was ever read.
colormap_cluster = [px.colors.qualitative.Antique[_pos] for _pos in (4, 9, 6, 8)]

opacity = 0.75
width = 0.5

# Labels of the PCE bins (shared y axis of both panels)
x = ['< 10%', '10-14%', '14-16.6%', '16.6-19.2%', '> 19.2%']

n_cluster = result_arch['Cluster'].nunique()
groupCount = (result_arch
              .groupby(['Cluster', 'PCE_before_x', 'FrontContact'])['Cluster']
              .count())

fig = make_subplots(rows=1, cols=2,
                    shared_xaxes=False, shared_yaxes=True,
                    horizontal_spacing=0.005)

# One row of bar lengths per cluster (Cluster 1..4), ordered like `x`.
# NOTE(review): the val_* names are defined elsewhere in the notebook; they
# presumably stand in for combinations that are absent from groupCount --
# confirm against the cell that defines them.
_nip_lengths = [
    [groupCount['0'][1.0]['FTO'], groupCount['0'][2.0]['FTO'], val_11, val_12, val_13],
    [groupCount['1'][1.0]['FTO'], groupCount['1'][2.0]['FTO'], val_5, val_1, val_6],
    [groupCount['2'][1.0]['FTO'], groupCount['2'][2.0]['FTO'], val_2, val_3, val_7],
    [val_17, val_16, val_10, val_4, val_8],
]
_pin_lengths = [
    [groupCount['0'][1.0]['ITO'], groupCount['0'][2.0]['ITO'],
     groupCount['0'][3.0]['ITO'], groupCount['0'][4.0]['ITO'], val_15],
    [groupCount['1'][1.0]['ITO'], groupCount['1'][2.0]['ITO'],
     groupCount['1'][3.0]['ITO'], groupCount['1'][4.0]['ITO'], groupCount['1'][5.0]['ITO']],
    [groupCount['2'][1.0]['ITO'], groupCount['2'][2.0]['ITO'],
     groupCount['2'][3.0]['ITO'], groupCount['2'][4.0]['ITO'], val_14],
    [groupCount['3'][1.0]['ITO'], groupCount['3'][2.0]['ITO'],
     groupCount['3'][3.0]['ITO'], groupCount['3'][4.0]['ITO'], val_9],
]

# Left panel (n-i-p) first, then right panel (p-i-n); within a panel the
# clusters are added in order. This figure colours the clusters with the
# first four entries of colormap_PCE.
for _panel_col, _panel_lengths in ((1, _nip_lengths), (2, _pin_lengths)):
    for _cluster_pos, _lengths in enumerate(_panel_lengths):
        _bar = go.Bar(y=x, x=_lengths,
                      name=f'Cluster {_cluster_pos + 1}',
                      marker_color=colormap_PCE[_cluster_pos],
                      opacity=opacity, orientation='h', width=width)
        fig.add_trace(_bar, row=1, col=_panel_col)

# The left axis runs from 1.05 down to 0 so that its bars grow leftwards
fig.update_xaxes(showticklabels=False, title_text="n-i-p", row=1, col=1, range=[1.05, 0])
fig.update_xaxes(showticklabels=False, title_text="p-i-n", row=1, col=2, range=[0, 1.05])

fig.update_layout(barmode='relative', barnorm='fraction',
                  font_family='Arial', showlegend=False,
                  xaxis1=dict(side='top'),
                  xaxis2=dict(side='top'))

pio.write_image(fig, filedirname+'butterfly_arch_PCE_based_percent.png',
                width=1*600, height=0.75*600, scale=12)

fig.show()

In [None]:
### PCE-based, count
# Mirrored ("butterfly") horizontal stacked-bar chart of raw counts. The left
# panel is titled n-i-p and is fed with the 'FTO' counts; the right panel is
# titled p-i-n and is fed with the 'ITO' counts. Every cluster contributes one
# stacked bar segment per PCE bin.

colormap_PCE = ['#034e7b', '#0570b0', '#3690c0', '#74a9cf', '#a6bddb']

# Cluster palette taken from plotly's qualitative "Antique" colours
colormap_cluster = [px.colors.qualitative.Antique[_pos] for _pos in (4, 9, 6, 8)]

opacity = 0.75
width = 0.5

# Labels of the PCE bins (shared y axis of both panels)
x = ['< 10%', '10-14%', '14-16.6%', '16.6-19.2%', '> 19.2%']

n_cluster = result_arch['Cluster'].nunique()
groupCount = (result_arch
              .groupby(['Cluster', 'PCE_before_x', 'FrontContact'])['Cluster']
              .count())

fig = make_subplots(rows=1, cols=2,
                    shared_xaxes=False, shared_yaxes=True,
                    horizontal_spacing=0.005)

# One row of bar lengths per cluster (Cluster 1..4), ordered like `x`.
# NOTE(review): the val_* names are defined elsewhere in the notebook; they
# presumably stand in for combinations that are absent from groupCount --
# confirm against the cell that defines them.
_nip_lengths = [
    [groupCount['0'][1.0]['FTO'], groupCount['0'][2.0]['FTO'], val_11, val_12, val_13],
    [groupCount['1'][1.0]['FTO'], groupCount['1'][2.0]['FTO'], val_5, val_1, val_6],
    [groupCount['2'][1.0]['FTO'], groupCount['2'][2.0]['FTO'], val_2, val_3, val_7],
    [val_17, val_16, val_10, val_4, val_8],
]
_pin_lengths = [
    [groupCount['0'][1.0]['ITO'], groupCount['0'][2.0]['ITO'],
     groupCount['0'][3.0]['ITO'], groupCount['0'][4.0]['ITO'], val_15],
    [groupCount['1'][1.0]['ITO'], groupCount['1'][2.0]['ITO'],
     groupCount['1'][3.0]['ITO'], groupCount['1'][4.0]['ITO'], groupCount['1'][5.0]['ITO']],
    [groupCount['2'][1.0]['ITO'], groupCount['2'][2.0]['ITO'],
     groupCount['2'][3.0]['ITO'], groupCount['2'][4.0]['ITO'], val_14],
    [groupCount['3'][1.0]['ITO'], groupCount['3'][2.0]['ITO'],
     groupCount['3'][3.0]['ITO'], groupCount['3'][4.0]['ITO'], val_9],
]

# Left panel (n-i-p) first, then right panel (p-i-n); within a panel the
# clusters are added in order and coloured with colormap_cluster.
for _panel_col, _panel_lengths in ((1, _nip_lengths), (2, _pin_lengths)):
    for _cluster_pos, _lengths in enumerate(_panel_lengths):
        _bar = go.Bar(y=x, x=_lengths,
                      name=f'Cluster {_cluster_pos + 1}',
                      marker_color=colormap_cluster[_cluster_pos],
                      opacity=opacity, orientation='h', width=width)
        fig.add_trace(_bar, row=1, col=_panel_col)

# The left axis runs from 500 down to 0 so that its bars grow leftwards
fig.update_xaxes(showticklabels=False, title_text="n-i-p", row=1, col=1, range=[500, 0])
fig.update_xaxes(showticklabels=False, title_text="p-i-n", row=1, col=2, range=[0, 500])

# Alternative, tighter axis limits:
# fig.update_xaxes(showticklabels=False,title_text="n-i-p", row=1, col=1, range=[400,0])
# fig.update_xaxes(showticklabels=False,title_text="p-i-n", row=1, col=2, range=[0,400])

fig.update_layout(barmode='stack',
                  font_family='Arial', showlegend=False,
                  xaxis1=dict(side='top'),
                  xaxis2=dict(side='top'))

pio.write_image(fig, filedirname+'butterfly_arch_PCE_based.png',
                width=1*600, height=0.75*600, scale=12)

fig.show()

### 3.4. Looking at both clusters and max. PCE group

In [None]:
# Load the table stored as filedirname + 'architecture.csv' and drop its
# 'Unnamed: 0' column (presumably the index written by an earlier to_csv
# call -- TODO confirm).
cluster_PCE = pd.read_csv(filedirname+'architecture.csv').drop(['Unnamed: 0'],axis=1)
cluster_PCE

In [None]:
# Number of rows carrying each cluster label
print('Cluster list:')
cluster_PCE['Cluster'].value_counts()

In [None]:
# Number of rows in each PCE_before_x group
cluster_PCE['PCE_before_x'].value_counts()

In [None]:
# Number of rows in each PCE_before_x group
# (alternative, disabled: count rows per 'Series' value instead)
# cluster_PCE['Series'].value_counts()
cluster_PCE['PCE_before_x'].value_counts()

In [None]:
# Number of rows for each PCE_before_ceil_x value
# (alternative, disabled: count rows per 'Series' value instead)
# cluster_PCE['Series'].value_counts()
cluster_PCE['PCE_before_ceil_x'].value_counts()

In [None]:
# Attach the cluster labels to PCE_df: sort PCE_df by its index, renumber the
# rows 0..n-1, then add cluster_PCE's 'Cluster' column (matched on the index).
# NOTE(review): this assumes cluster_PCE's rows are in the same order as the
# index-sorted PCE_df -- verify.
PCE_df_sorted = (
    PCE_df
    .sort_index()
    .reset_index(drop=True)
    .assign(Cluster=cluster_PCE['Cluster'])
)
PCE_df_sorted

In [None]:
import plotly.io as pio
import colorlover as cl
from plotly.colors import n_colors
import matplotlib
import random
import plotly.graph_objects as go

# Violin of the relative change in max. PCE for every PCE_before_x group,
# overlaid with the individual data points (horizontally jittered and coloured
# by cluster label).

fig = go.Figure()

unique_x = PCE_df['PCE_before_x'].unique()

colors = n_colors('rgb(8,29,88)', 'rgb(127,205,187)', 6, colortype='rgb')
colors_box = n_colors('rgb(2,7,22)', 'rgb(30,50,45)', 6, colortype='rgb')
colors_line = n_colors('rgb(0,5,15)', 'rgb(15,25,23)', 6, colortype='rgb')
colors_scatter = n_colors('rgb(0,109,44)', 'rgb(178,226,226)', 4, colortype='rgb')

# Plot the violin plot on the background
for (i,color,color_line) in zip(unique_x, colors, colors_line):
    in_group = PCE_df['PCE_before_x'] == i
    fig.add_trace(go.Violin(x=PCE_df['PCE_before_x'][in_group]+0,
                            y=PCE_df['PCE_delta'][in_group],
                            box_visible=False,
                            fillcolor = color,
                            opacity = 0.4,
                            line = dict(color=color_line),
                            jitter=True,
                            meanline_visible=True
                           )
                 )

# Pin the colour range to the full span of cluster labels so that every trace
# maps a given label to the same colour, even when a group lacks some clusters.
cluster_cmin = float(cluster_PCE['Cluster'].min())
cluster_cmax = float(cluster_PCE['Cluster'].max())

# Plot the scattered points
# Iterate over the group values directly: zip() over a single iterable yields
# 1-tuples, which only compared correctly thanks to NumPy broadcasting.
for i in unique_x:
    in_group = cluster_PCE['PCE_before_x'] == i
    x = cluster_PCE['PCE_before_x'][in_group]
    fig.add_trace(go.Scatter(x= x + 0.65*np.random.rand(len(x))-0.325, # uniform jitter in [-0.325, 0.325)
                             y=cluster_PCE['PCE_delta'][in_group],
                             mode='markers',
                             # Colours must be filtered with the same mask as
                             # x and y; the unfiltered column would colour the
                             # points with the labels of unrelated rows.
                             marker_color=cluster_PCE['Cluster'][in_group],
                             marker_cmin=cluster_cmin,
                             marker_cmax=cluster_cmax,
                             marker_colorscale='deep_r',
                             marker_size=6,
                             opacity = 0.8,
                            )
                 )


# Update properties of the figure
fig.update_layout(xaxis_title="Maximum PCE group",
                  yaxis_title="Relative change in max. PCE (after 150 hrs.) (%)",
                  boxgap = 0.85,
                  font_family='Arial',
                  showlegend=False)

fig.show()

# Save a figure
pio.write_image(fig, filedirname+'group_violin_1.png',
                width=600, height=400, scale=12)

In [None]:
# Separate for different cluster
# One figure per cluster label: violins of the relative change in max. PCE for
# every PCE_before_x group, restricted to that cluster, overlaid with the
# jittered data points. Each figure is shown and saved to its own PNG.

import plotly.io as pio
import colorlover as cl
from plotly.colors import n_colors
import matplotlib
import random

# Colors_scatter (RGB tuples; not read in this cell)
colors_scatter=[ImageColor.getcolor(px.colors.qualitative.Antique[4],'RGB'),
                ImageColor.getcolor(px.colors.qualitative.Antique[9],'RGB'),
                ImageColor.getcolor(px.colors.qualitative.Antique[6],'RGB'),
                ImageColor.getcolor(px.colors.qualitative.Antique[8],'RGB')]

# Marker colour for cluster labels 0, 1, 2, 3 (in that order)
cluster_palette = [px.colors.qualitative.Antique[k] for k in (4, 9, 6, 8)]

# Plot for each cluster
for cluster in range(4): # Number of clusters: 4

    fig = go.Figure()

    colors = n_colors('rgb(8,29,88)', 'rgb(127,205,187)', 6, colortype='rgb')
    colors_box = n_colors('rgb(2,7,22)', 'rgb(30,50,45)', 6, colortype='rgb')
    colors_line = n_colors('rgb(0,5,15)', 'rgb(15,25,23)', 6, colortype='rgb')

    in_cluster = PCE_df_sorted['Cluster'] == cluster

    # Plot the violin plot for specific cluster
    for (i,color,color_line) in zip(unique_x, colors, colors_line):
        # x and y are selected with the same mask from the same table so that
        # they always have the same length (x used to be filtered by group
        # only, y by group and cluster).
        selected = (PCE_df_sorted['PCE_before_x'] == i) & in_cluster
        fig.add_trace(go.Violin(x=PCE_df_sorted['PCE_before_x'][selected]+0,
                                y=PCE_df_sorted['PCE_delta'][selected],
                                box_visible=False,
                                fillcolor = color,
                                opacity = 0.4,
                                line = dict(color=color_line),
                                jitter=True,
                                meanline_visible=True
                               )
                     )

    # Colors scatter
    color_scatter = cluster_palette[cluster]

    # Plot the scattered data points
    # Iterate over the group values directly: zip() over a single iterable
    # yields 1-tuples, which only compared correctly thanks to broadcasting.
    for i in unique_x:
        selected = (PCE_df_sorted['PCE_before_x'] == i) & in_cluster
        x = PCE_df_sorted['PCE_before_x'][selected]
        fig.add_trace(go.Scatter(x= x + 0.65*np.random.rand(len(x))-0.325, # uniform jitter in [-0.325, 0.325)
                                 y=PCE_df_sorted['PCE_delta'][selected],
                                 mode='markers',
                                 marker_color=color_scatter,
                                 marker_size=6,
                                 opacity = 0.7,
                                )
                     )

    # Update properties of the figure
    # (this sets the marker size of every trace to 4, overriding the size 6
    # given to the scatter traces above)
    fig.update_traces(marker={'size': 4})

    fig.update_layout(xaxis_title="Maximum PCE group (%)",
                      yaxis_title="Relative change in max. PCE (after 150 hrs.) (%)",
                      boxgap = 0.85,
                      font_family='Arial',
                      showlegend=False)
    fig.update_yaxes(range=[-20,120],showgrid=True)
    fig.update_xaxes(showgrid=False,showticklabels=False)
    fig.update_layout(hovermode="y unified")

    fig.show()

    # Save a figure
    pio.write_image(fig, filedirname+'group_violin_cluster_'+str(cluster)+'_2.png',
                    width=425, height=400, scale=20)

### 3.5. Plot the trendline of the mean and interquartile

This gives us an idea of the maximum feasible PCE at which the relative change in max. PCE after 150 hours is still 0%.

In [None]:
import plotly.io as pio
import colorlover as cl
from plotly.colors import n_colors
import matplotlib

# Per PCE_before_x group, compute the median, mean, 25th and 75th percentile of
# both PCE_delta (y axis of the later fits) and PCE_before (x axis), collect
# them in PCE_delta_quartiles and save that table as CSV.

# NOTE: this is an alias, not a copy -- the column added below is also added
# to PCE_df_sorted.
PCE_df_sorted_upto20 = PCE_df_sorted

# Add the PCE_before_mean_x
# Start from the per-group median column, then overwrite groups 1-5 with the
# mean PCE_before of the group. A single .loc[rows, column] assignment is used
# because chained indexing (df[column].loc[rows] = ...) may write to a
# temporary copy and then has no effect on the frame.
PCE_df_sorted_upto20['PCE_before_mean_x'] = PCE_df_sorted_upto20['PCE_before_median_x'].astype(float)
for group in (1, 2, 3, 4, 5):
    in_group = PCE_df_sorted_upto20['PCE_before_x'] == group
    PCE_df_sorted_upto20.loc[in_group, 'PCE_before_mean_x'] = \
        PCE_df_sorted_upto20.loc[in_group, 'PCE_before'].mean()


def _per_group(column, reducer, label):
    """Reduce `column` per PCE_before_x group.

    Returns a two-column frame: 'PCE_before_x' and the reduced values under
    the name `label`.
    """
    reduced = reducer(PCE_df_sorted_upto20.groupby('PCE_before_x')[column])
    return reduced.rename(label).reset_index()


##### CALCULATION FOR THE Y-AXIS, THE RELATIVE CHANGE IN MAX.PCE
PCE_delta_median = _per_group('PCE_delta', lambda g: g.median(), 'PCE_delta_median')
PCE_delta_mean = _per_group('PCE_delta', lambda g: g.mean(), 'PCE_delta_mean')
PCE_delta_quartile_1 = _per_group('PCE_delta', lambda g: g.quantile(0.25), 'PCE_delta_25th')
PCE_delta_quartile_3 = _per_group('PCE_delta', lambda g: g.quantile(0.75), 'PCE_delta_75th')

##### CALCULATION FOR THE X-AXIS, THE PCE_before_mean, PCE_before_median, PCE_before_25th, PCE_before_75th
PCE_before_median = _per_group('PCE_before', lambda g: g.median(), 'PCE_before_median')
PCE_before_mean = _per_group('PCE_before', lambda g: g.mean(), 'PCE_before_mean')
PCE_before_quartile_1 = _per_group('PCE_before', lambda g: g.quantile(0.25), 'PCE_before_25th')
PCE_before_quartile_3 = _per_group('PCE_before', lambda g: g.quantile(0.75), 'PCE_before_75th')

##### Merge dataframe
# Every frame has one row per group in the same (sorted) group order, so they
# can be placed side by side; the group column is kept from the first frame
# only.
PCE_delta_quartiles = pd.concat([PCE_delta_median,
                                 PCE_delta_mean.drop(columns=['PCE_before_x']),
                                 PCE_delta_quartile_1.drop(columns=['PCE_before_x']),
                                 PCE_delta_quartile_3.drop(columns=['PCE_before_x']),
                                 PCE_before_median.drop(columns=['PCE_before_x']),
                                 PCE_before_mean.drop(columns=['PCE_before_x']),
                                 PCE_before_quartile_1.drop(columns=['PCE_before_x']),
                                 PCE_before_quartile_3.drop(columns=['PCE_before_x'])],
                                axis=1)

# Save PCE_delta_quartiles
PCE_delta_quartiles.to_csv(filedirname+'PCE_delta_quartiles_inc24.csv')

PCE_delta_quartiles

In [None]:
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import median_absolute_error
# from sklearn.datasets import make_regression

# Straight-line fit of the per-group median PCE_delta against the per-group
# median PCE_before; the fitted line is drawn over the stretched range 0-35.

# Doing linear regression on MEDIAN
print('For MEDIAN:')
X = (PCE_delta_quartiles['PCE_before_median'].to_numpy()).reshape(-1,1)
y = (PCE_delta_quartiles['PCE_delta_median'].to_numpy()).reshape(-1,1)

# Fitting
model= LinearRegression().fit(X,y)
# statsmodels' OLS does not add an intercept by itself. Add the constant
# column so that the summary below describes the same line (slope + intercept)
# as the sklearn model, rather than a fit forced through the origin.
model_2 = sm.OLS(y,sm.add_constant(X)).fit()
y_hat = model.predict(X)

# Fitting stretched
X_stretch = np.array([0,35]).reshape(-1,1)
y_hat_stretch = model.predict(X_stretch)

MSE=mean_squared_error(y,y_hat)
MAE=median_absolute_error(y,y_hat)  # NOTE: median (not mean) absolute error

print('MSE: ',MSE)
print('MAE: ',MAE)
print('Model coefficients: ',model.coef_)
print('y-intercept: ',model.intercept_)
print('x-intercept: ',-model.intercept_/model.coef_)

# Prediction
combined_array = np.column_stack((X,y_hat))
prediction=pd.DataFrame(combined_array,columns=['X','y_hat'])

combined_array_stretch = np.column_stack((X_stretch,y_hat_stretch))
prediction_stretch = pd.DataFrame(combined_array_stretch,
                                  columns=['X','y_hat'])

MAE_np = MAE*np.ones(len(combined_array))
MSE_np = MSE*np.ones(len(combined_array))

# Plot figure
fig = go.Figure()

# Plot the regression line (over the stretched x range)
fig.add_trace(go.Scatter(x=prediction_stretch['X'],y=prediction_stretch['y_hat'],
                         mode='lines', name='Regression',
                         line=dict(dash='dash',color='rgb(116,169,207)')))

# Plot the group medians; the error bars reach the 25th and 75th percentiles
fig.add_trace(go.Scatter(x=PCE_delta_quartiles['PCE_before_median'],
                         y=PCE_delta_quartiles['PCE_delta_median'],
                         error_y=dict(type='data', symmetric=False,
                                      array=PCE_delta_quartiles['PCE_delta_75th']-PCE_delta_quartiles['PCE_delta_median'],
                                      arrayminus=PCE_delta_quartiles['PCE_delta_median']-PCE_delta_quartiles['PCE_delta_25th']),
                         error_x=dict(type='data', symmetric=False,
                                      array=PCE_delta_quartiles['PCE_before_75th']-PCE_delta_quartiles['PCE_before_median'],
                                      arrayminus=PCE_delta_quartiles['PCE_before_median']-PCE_delta_quartiles['PCE_before_25th']),
                         mode='markers',name='Median',
                         line=dict(color='rgb(5,112,176)')))

# Update figure properties
fig.update_layout(xaxis_title="Maximum PCE group (%)",
                  yaxis_title="Relative change in max. PCE (after 150 hrs.) (%)",
                  font_family='Arial',
                  showlegend=True)


fig.show()

# Save the figure (non-stretched variant kept for reference)
# pio.write_image(fig, filedirname+'median_PCE_delta_fit_inc24.png',
#                 width=0.7*800, height=0.7*600, scale=12)

pio.write_image(fig, filedirname+'median_PCE_delta_fit_inc24_stretched.png',
                width=0.7*800, height=0.7*600, scale=12)

# Model OLS summary
model_2.summary()

In [None]:
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import median_absolute_error
# from sklearn.datasets import make_regression

# Straight-line fit of the per-group median PCE_delta against the per-group
# median PCE_before; the fitted line is drawn over the fitted x values only.

# Doing linear regression on MEDIAN
print('For MEDIAN:')
X = (PCE_delta_quartiles['PCE_before_median'].to_numpy()).reshape(-1,1)
y = (PCE_delta_quartiles['PCE_delta_median'].to_numpy()).reshape(-1,1)

# Fitting
model= LinearRegression().fit(X,y)
# statsmodels' OLS does not add an intercept by itself. Add the constant
# column so that the summary below describes the same line (slope + intercept)
# as the sklearn model, rather than a fit forced through the origin.
model_2 = sm.OLS(y,sm.add_constant(X)).fit()
y_hat = model.predict(X)

# Fitting stretched
X_stretch = np.array([0,35]).reshape(-1,1)
y_hat_stretch = model.predict(X_stretch)

MSE=mean_squared_error(y,y_hat)
MAE=median_absolute_error(y,y_hat)  # NOTE: median (not mean) absolute error

print('MSE: ',MSE)
print('MAE: ',MAE)
print('Model coefficients: ',model.coef_)
print('y-intercept: ',model.intercept_)
print('x-intercept: ',-model.intercept_/model.coef_)

# Prediction
combined_array = np.column_stack((X,y_hat))
prediction=pd.DataFrame(combined_array,columns=['X','y_hat'])

combined_array_stretch = np.column_stack((X_stretch,y_hat_stretch))
prediction_stretch = pd.DataFrame(combined_array_stretch,
                                  columns=['X','y_hat'])

MAE_np = MAE*np.ones(len(combined_array))
MSE_np = MSE*np.ones(len(combined_array))

# Plot figure
fig = go.Figure()

# Plot the regression line (over the fitted x values)
fig.add_trace(go.Scatter(x=prediction['X'],y=prediction['y_hat'],
                         mode='lines', name='Regression',
                         line=dict(dash='dash',color='rgb(116,169,207)')))

# Plot the group medians; the error bars reach the 25th and 75th percentiles
fig.add_trace(go.Scatter(x=PCE_delta_quartiles['PCE_before_median'],
                         y=PCE_delta_quartiles['PCE_delta_median'],
                         error_y=dict(type='data', symmetric=False,
                                      array=PCE_delta_quartiles['PCE_delta_75th']-PCE_delta_quartiles['PCE_delta_median'],
                                      arrayminus=PCE_delta_quartiles['PCE_delta_median']-PCE_delta_quartiles['PCE_delta_25th']),
                         error_x=dict(type='data', symmetric=False,
                                      array=PCE_delta_quartiles['PCE_before_75th']-PCE_delta_quartiles['PCE_before_median'],
                                      arrayminus=PCE_delta_quartiles['PCE_before_median']-PCE_delta_quartiles['PCE_before_25th']),
                         mode='markers',name='Median',
                         line=dict(color='rgb(5,112,176)')))

# Update figure properties
fig.update_layout(xaxis_title="Maximum PCE group (%)",
                  yaxis_title="Relative change in max. PCE (after 150 hrs.) (%)",
                  font_family='Arial',
                  showlegend=True)


fig.show()

# Save the figure
pio.write_image(fig, filedirname+'median_PCE_delta_fit_inc24.png',
                width=0.7*800, height=0.7*600, scale=12)

# Model OLS summary
model_2.summary()

In [None]:
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import median_absolute_error
# from sklearn.datasets import make_regression

# Straight-line fit of the per-group mean PCE_delta against the per-group mean
# PCE_before; the fitted line is drawn over the stretched range 0-35.

# Doing linear regression on MEAN
print('For MEAN:')
X = (PCE_delta_quartiles['PCE_before_mean'].to_numpy()).reshape(-1,1)
y = (PCE_delta_quartiles['PCE_delta_mean'].to_numpy()).reshape(-1,1)

# Fitting
model= LinearRegression().fit(X,y)
# statsmodels' OLS does not add an intercept by itself. Add the constant
# column so that the summary below describes the same line (slope + intercept)
# as the sklearn model, rather than a fit forced through the origin.
model_2 = sm.OLS(y,sm.add_constant(X)).fit()
y_hat = model.predict(X)

# Fitting stretched
X_stretch = np.array([0,35]).reshape(-1,1)
y_hat_stretch = model.predict(X_stretch)

MSE=mean_squared_error(y,y_hat)
MAE=median_absolute_error(y,y_hat)  # NOTE: median (not mean) absolute error

print('MSE: ',MSE)
print('MAE: ',MAE)
print('Model coefficients: ',model.coef_)
print('y-intercept: ',model.intercept_)
print('x-intercept: ',-model.intercept_/model.coef_)

# Prediction
combined_array = np.column_stack((X,y_hat))
prediction=pd.DataFrame(combined_array,columns=['X','y_hat'])

combined_array_stretch = np.column_stack((X_stretch,y_hat_stretch))
prediction_stretch = pd.DataFrame(combined_array_stretch,
                                  columns=['X','y_hat'])

MAE_np = MAE*np.ones(len(combined_array))
MSE_np = MSE*np.ones(len(combined_array))

# Plot figure
fig = go.Figure()

# Plot the regression line (over the stretched x range)
fig.add_trace(go.Scatter(x=prediction_stretch['X'],y=prediction_stretch['y_hat'],
                         mode='lines', name='Regression',
                         line=dict(dash='dash',color='rgb(116,169,207)')))

# Plot the group means; the error bars reach the 25th and 75th percentiles
# (measured from the mean)
fig.add_trace(go.Scatter(x=PCE_delta_quartiles['PCE_before_mean'],
                         y=PCE_delta_quartiles['PCE_delta_mean'],
                         error_y=dict(type='data', symmetric=False,
                                      array=PCE_delta_quartiles['PCE_delta_75th']-PCE_delta_quartiles['PCE_delta_mean'],
                                      arrayminus=PCE_delta_quartiles['PCE_delta_mean']-PCE_delta_quartiles['PCE_delta_25th']),
                         error_x=dict(type='data', symmetric=False,
                                      array=PCE_delta_quartiles['PCE_before_75th']-PCE_delta_quartiles['PCE_before_mean'],
                                      arrayminus=PCE_delta_quartiles['PCE_before_mean']-PCE_delta_quartiles['PCE_before_25th']),
                         mode='markers',name='Mean',
                         line=dict(color='rgb(5,112,176)')))

# Update figure properties
fig.update_layout(xaxis_title="Maximum PCE group (%)",
                  yaxis_title="Relative change in max. PCE (after 150 hrs.) (%)",
                  font_family='Arial',
                  showlegend=True)

fig.show()

# Save the figure
pio.write_image(fig, filedirname+'mean_PCE_delta_fit_inc24_stretched.png',
                width=0.7*800, height=0.7*600, scale=12)

# Model OLS summary
model_2.summary()

In [None]:
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import median_absolute_error
# from sklearn.datasets import make_regression

# Straight-line fit of the per-group mean PCE_delta against the per-group mean
# PCE_before; the fitted line is drawn over the fitted x values only.

# Doing linear regression on MEAN
print('For MEAN:')
X = (PCE_delta_quartiles['PCE_before_mean'].to_numpy()).reshape(-1,1)
y = (PCE_delta_quartiles['PCE_delta_mean'].to_numpy()).reshape(-1,1)

# Fitting
model= LinearRegression().fit(X,y)
# statsmodels' OLS does not add an intercept by itself. Add the constant
# column so that the summary below describes the same line (slope + intercept)
# as the sklearn model, rather than a fit forced through the origin.
model_2 = sm.OLS(y,sm.add_constant(X)).fit()
y_hat = model.predict(X)

# Fitting stretched
X_stretch = np.array([0,35]).reshape(-1,1)
y_hat_stretch = model.predict(X_stretch)

MSE=mean_squared_error(y,y_hat)
MAE=median_absolute_error(y,y_hat)  # NOTE: median (not mean) absolute error

print('MSE: ',MSE)
print('MAE: ',MAE)
print('Model coefficients: ',model.coef_)
print('y-intercept: ',model.intercept_)
print('x-intercept: ',-model.intercept_/model.coef_)

# Prediction
combined_array = np.column_stack((X,y_hat))
prediction=pd.DataFrame(combined_array,columns=['X','y_hat'])

combined_array_stretch = np.column_stack((X_stretch,y_hat_stretch))
prediction_stretch = pd.DataFrame(combined_array_stretch,
                                  columns=['X','y_hat'])

MAE_np = MAE*np.ones(len(combined_array))
MSE_np = MSE*np.ones(len(combined_array))

# Plot figure
fig = go.Figure()

# Plot the regression line (over the fitted x values)
fig.add_trace(go.Scatter(x=prediction['X'],y=prediction['y_hat'],
                         mode='lines', name='Regression',
                         line=dict(dash='dash',color='rgb(116,169,207)')))

# Plot the group means; the error bars reach the 25th and 75th percentiles
# (measured from the mean)
fig.add_trace(go.Scatter(x=PCE_delta_quartiles['PCE_before_mean'],
                         y=PCE_delta_quartiles['PCE_delta_mean'],
                         error_y=dict(type='data', symmetric=False,
                                      array=PCE_delta_quartiles['PCE_delta_75th']-PCE_delta_quartiles['PCE_delta_mean'],
                                      arrayminus=PCE_delta_quartiles['PCE_delta_mean']-PCE_delta_quartiles['PCE_delta_25th']),
                         error_x=dict(type='data', symmetric=False,
                                      array=PCE_delta_quartiles['PCE_before_75th']-PCE_delta_quartiles['PCE_before_mean'],
                                      arrayminus=PCE_delta_quartiles['PCE_before_mean']-PCE_delta_quartiles['PCE_before_25th']),
                         mode='markers',name='Mean',
                         line=dict(color='rgb(5,112,176)')))

# Update figure properties
fig.update_layout(xaxis_title="Maximum PCE group (%)",
                  yaxis_title="Relative change in max. PCE (after 150 hrs.) (%)",
                  font_family='Arial',
                  showlegend=True)

fig.show()

# Save the figure
pio.write_image(fig, filedirname+'mean_PCE_delta_fit_inc24.png',
                width=0.7*800, height=0.7*600, scale=12)

# Model OLS summary
model_2.summary()

#### What if we used all the individual data points instead of the groups?

In [None]:
# Overview: every row of PCE_df_sorted as one marker, PCE_delta against
# PCE_before, on a logarithmic y axis.
_overview_points = go.Scatter(
    x=PCE_df_sorted['PCE_before'],
    y=PCE_df_sorted['PCE_delta'],
    mode='markers',
)
fig = go.Figure(data=_overview_points)

# On a log axis plotly reads `range` as log10 of the axis limits.
fig.update_layout(
    xaxis_title="Maximum PCE (%)",
    yaxis_title="Relative change in max. PCE (after 150 hrs.) (%)",
    yaxis=dict(range=[-2.1, 2.2], type='log'),
)

# Display the figure in the cell output
fig.show()

In [None]:
# Scatter of every row of PCE_df_sorted (PCE_delta against PCE_before) with an
# ordinary-least-squares trendline drawn in red by plotly express.

fig = px.scatter(
    PCE_df_sorted,
    x='PCE_before',
    y='PCE_delta',
    trendline='ols',  # trendline_scope='overall',
    # trendline_options=dict(log_x=True),
    trendline_color_override='red',
)

fig.update_xaxes(title_text="Maximum PCE (%)")
fig.update_yaxes(title_text="Relative change in max. PCE (after 150 hrs.) (%)")
fig.update_layout(font_family='Arial')

# Write the figure to disk (800 x 600, scale factor 5)
pio.write_image(fig, filedirname+'ols_trendline.png',
                width=800, height=600, scale=5)

# Disabled alternatives for the y axis:
# yaxis=dict(range=[-2.1,2.2]),
# fig.update_yaxes(type='log')

# Display the figure in the cell output
fig.show()

# Fetch the fit behind the trendline and display its summary
results = px.get_trendline_results(fig)
results.px_fit_results.iloc[0].summary()

### What if we dropped cluster 4 and replotted the violin plot?

In [None]:
# Display the full table
PCE_df_sorted

In [None]:
# Exclude cluster 4: keep every row whose 'Cluster' label is not 3 (the bar
# charts name label '3' "Cluster 4", i.e. labels are zero-based).
PCE_df_sorted_excCluster4 = PCE_df_sorted[PCE_df_sorted['Cluster'] != 3]

# Display the filtered table
PCE_df_sorted_excCluster4

In [None]:
# Violin + box plot of the relative PCE change per maximum-PCE group, using the table
# without cluster 4. Relies on `go`, `unique_ceil`, `unique_median`, `unique_mean`,
# `n_group` and `filedirname` defined in earlier cells.
import plotly.io as pio
import colorlover as cl
from plotly.colors import n_colors
import matplotlib

fig = go.Figure()

print('Unique ceil: ',unique_ceil)
print('Median: ',unique_median)
print('Mean: ',unique_mean)

# Color palette for the figure to make it pretty: n_group colours interpolated between
# two RGB end points (colors_box is built but not used in this cell)
colors = n_colors('rgb(8,29,88)', 'rgb(127,205,187)', n_group, colortype='rgb')
colors_box = n_colors('rgb(2,7,22)', 'rgb(30,50,45)', n_group, colortype='rgb')
colors_line = n_colors('rgb(0,5,15)', 'rgb(15,25,23)', n_group, colortype='rgb')

# Plotting the violin and boxplot
# One violin per PCE group: rows are selected where 'PCE_before_ceil_x' equals the group value.
# NOTE(review): the scatter cells use the column 'PCE_before' while this cell uses
# 'PCE_before_x' / 'PCE_before_ceil_x' -- confirm these columns exist in PCE_df_sorted.
for (i,color,color_line) in zip(unique_ceil, colors, colors_line):
    fig.add_trace(go.Violin(x=PCE_df_sorted_excCluster4['PCE_before_x'][PCE_df_sorted_excCluster4['PCE_before_ceil_x'] == i],
                            y=PCE_df_sorted_excCluster4['PCE_delta'][PCE_df_sorted_excCluster4['PCE_before_ceil_x'] == i],
                            box_visible=False,
                            fillcolor = color,
                            opacity = 0.4,
                            line = dict(color=color_line),
                            jitter=True,
                            meanline_visible=False))

# One box per PCE group, drawn on top of the violins with the same colours; boxmean=True
# also marks the mean
for (i,color,color_line) in zip(unique_ceil, colors, colors_line):
    fig.add_trace(go.Box(x=PCE_df_sorted_excCluster4['PCE_before_x'][PCE_df_sorted_excCluster4['PCE_before_ceil_x'] == i],
                            y=PCE_df_sorted_excCluster4['PCE_delta'][PCE_df_sorted_excCluster4['PCE_before_ceil_x'] == i],
                            marker_color = color,
                            opacity = 0.55,
                            line_color = color_line,
                            fillcolor = color,
                            jitter=True,
                            boxmean=True))

fig.update_layout(xaxis_title="Max. PCE group (%)",
                  yaxis_title="Relative change in max. PCE (after 150 hrs.) (%)",
                  boxgap = 0.85,
                  font_family='Arial',
                  showlegend=False)
    
fig.show()

# Save the figure in two sizes
# NOTE(review): scale=22 / scale=25 multiply the layout size, which gives very large
# images -- confirm these export sizes are intended.
pio.write_image(fig, filedirname+'all_data_changedegradation_excCluster4_4.png',
                width=900, height=600, scale=22)

pio.write_image(fig, filedirname+'all_data_changedegradation_excCluster4_3.png',
                width=700, height=450, scale=25)



In [None]:
# Plot the overview and OLS trendline for the table without cluster 4: scatter of the
# remaining data points with an ordinary least-squares fit drawn in red.

fig = px.scatter(PCE_df_sorted_excCluster4, x='PCE_before', y='PCE_delta',
                 trendline='ols', #trendline_scope='overall',
#                  trendline_options=dict(log_x=True),
                 trendline_color_override='red')

fig.update_layout(xaxis_title="Maximum PCE (%)",
                  yaxis_title="Relative change in max. PCE (after 150 hrs.) (%)",
                  font_family='Arial')

# Save the figure as PNG: 800x600 layout exported with scale=5
# (no dpi is set here; the output resolution comes from width/height times scale)
pio.write_image(fig, filedirname+'ols_trendline_excCluster4.png',
                width=800, height=600, scale=5)

# yaxis=dict(range=[-2.1,2.2]),
# fig.update_yaxes(type='log')

# To display the figure in the output screen
fig.show()

# Fetch the fitted model behind the trendline and show its summary
# (only one trendline exists, hence row 0)
results = px.get_trendline_results(fig)
results.px_fit_results.iloc[0].summary()

### 3.6. Look at the Pearson's correlation coefficient

This looks at the correlation between variables.

In [None]:
# Load the .pkl file consisting the whole dataset into `mySeries`.
# `pickle` is the module imported at the top of the notebook (pickle5 as pickle).
# Unpickling executes code stored in the file, so only load files you created yourself.
with open('dataset/pkl_complete/20230303_mySeries.pkl', "rb") as fh:
    mySeries = pickle.load(fh)

mySeries

In [None]:
# Load data for the clusters; the 'Unnamed: 0' column is discarded
# (presumably the index column written by an earlier to_csv -- verify)
cluster_PCE = pd.read_csv(filedirname+'architecture.csv').drop(columns=['Unnamed: 0'])

# Rename 'Series' to 'Filename'
cluster_PCE = cluster_PCE.rename(columns={'Series': 'Filename'})
cluster_PCE

In [None]:
# Merge the whole-series table with the cluster assignments.
# how='outer' keeps the rows of both tables; no `on=` is given, so pandas joins on the
# columns the two frames have in common. indicator=True adds a `_merge` column that tells
# which table(s) each row came from.
PCE_cluster_df = pd.merge(mySeries, cluster_PCE, how='outer', indicator=True)
PCE_cluster_df

In [None]:
# Counting for specific cluster and temperature: number of rows per cluster whose
# average temperature lies below 30 or above 80

print("Counting T < 30C:")
for i in range(4): # clusters
    count = PCE_cluster_df[(PCE_cluster_df['T_avg'] < 30) & (PCE_cluster_df['Cluster'] == i)]
    print('For cluster: ',i, ' and T < 30C ',
          '# data points:', len(count))

print("Counting T > 80C:")
for i in range(4): # clusters
    count = PCE_cluster_df[(PCE_cluster_df['T_avg'] > 80) & (PCE_cluster_df['Cluster'] == i)]
    print('For cluster: ',i, ' and T > 80C ',
          '# data points:', len(count))

In [None]:
# One hot encoding

def onehot(df, column, prefix):
    '''
    Return a new dataframe in which `column` is replaced by its one-hot encoding.

    The indicator columns are created with pd.get_dummies (named `<prefix>_<value>`),
    appended to the right of the existing columns, and the original `column` is removed.
    The input dataframe `df` is not modified.
    '''
    # Indicator columns for every distinct value found in the column
    indicator_columns = pd.get_dummies(df[column], prefix=prefix)

    # Append them on the right, then remove the encoded column
    return pd.concat([df, indicator_columns], axis=1).drop(columns=[column])

In [None]:
# 'Cluster' and 'Filter' are discrete (not continuous) variables, so both are replaced by
# one-hot indicator columns: first 'Cluster', then 'Filter' on the result

PCE_cluster_df_onehot = onehot(onehot(PCE_cluster_df, 'Cluster', 'Cluster'),
                               'Filter', 'Filter')
PCE_cluster_df_onehot

In [None]:
# Look at the columns in the dataframe (as a plain Python list), e.g. to check the names
# of the indicator columns created by the one-hot encoding
PCE_cluster_df_onehot.columns.values.tolist()

In [None]:
# Columns that enter the correlation analysis: the two continuous averages plus the
# one-hot indicator columns for filter and cluster.
# A KeyError is raised here if any of these columns is missing from the dataframe.
selected = PCE_cluster_df_onehot.loc[:, ['T_avg',
                                         'Irr_avg',
                                         'Filter_KFU15',
                                         'Filter_none',
                                         'Cluster_0',
                                         'Cluster_1',
                                         'Cluster_2',
                                         'Cluster_3',
                                         ]]

# Keep a copy of the selection on disk
selected.to_csv(filedirname+'T_avg_dist_cluster_onehot.csv')

In [None]:
# Pearson correlation between all selected columns
cor = selected.corr()

# Mask for the upper triangle (diagonal included): the matrix is symmetric, so only the
# lower half is drawn
mask = np.triu(np.ones_like(cor))

plt.rcParams['font.family']="Arial"

# Draw the annotated heatmap on a white background, colour scale fixed to [-1, 1]
with sns.axes_style("white"):
    f, ax = plt.subplots(figsize=(7,7))
    ax = sns.heatmap(cor, mask=mask, square=True,
                     annot=True, annot_kws={"size": 11}, fmt=".2f",
                     cmap=plt.cm.RdBu_r, linewidths=0, vmin=-1, vmax=1)

plt.savefig(filedirname+'pearsoncorrelation_clusterfilter.png',
            dpi=600)

## 4. Time series k-means clustering

Read more about k-means clustering here: https://en.wikipedia.org/wiki/K-means_clustering.

**input**: clean, pre-processed mySeriesDrop_savgol from above

**process**:

1. do k-means clustering with the same number of clusters as the SOM
2. visualization using PCA: k-means, affinity propagation, and DBSCAN (this one takes too long to run)
3. visualization using t-SNE: k-means, affinity propagation
4. plot the cluster distribution
5. plot elbow method and silhouette method for optimum number of clusters

**output**: 
1. optimum number for clustering
2. 2d map of the data points, and their clusters

### 4.1. Direct k-means clustering

In [None]:
# If mySeriesDrop_savgol is still in memory from the cells above, uncomment to check it:
# mySeriesDrop_savgol

# Otherwise load the saved array from disk; it is the input of every clustering cell in
# this section
mySeriesDrop_savgol=np.load('dataset/pkl_complete/20230303_mySeriesDrop_savgol.npy')

In [None]:
import math

# Using the same number of clusters as the SOM (a som_x x som_y map)
som_x = 2
som_y = 2
cluster_count= som_x*som_y

# K-means clustering of the time series with the DTW metric; `labels` holds one cluster
# index per series.
# NOTE(review): no random_state is passed, so the clusters and their label order can
# presumably differ between runs, while the relabelling cell further down hard-codes a
# label permutation -- confirm against the saved model before re-running this cell.
km = TimeSeriesKMeans(n_clusters=cluster_count, metric="dtw")
labels = km.fit_predict(mySeriesDrop_savgol)

In [None]:
# Saving models as pickle (written with the estimator's own to_pickle method)
km.to_pickle(filedirname+'TimeSeriesKMeans_4clusters_2.pkl')

# Save numpy array as .npy instead of .pkl: the cluster label of every series
np.save(filedirname+'TimeSeriesKMeans_labels_dtw_4clusters_2.npy',labels)

In [None]:
# Set up name to save files

# filedirname = '20230116_run/sigma_0p5_learningrate_0p1/20230116_sigma_0p5_learningrate_0p1_'

# If the clustering cell above was not re-run, restore the model and its labels from disk.
# The file was written with `km.to_pickle(...)`, which stores the estimator's parameters
# rather than the estimator object itself, so it has to be read back with the matching
# `from_pickle` class method; a plain `pickle.load` would not give back a usable model.
km = TimeSeriesKMeans.from_pickle(filedirname+'TimeSeriesKMeans_4clusters_2.pkl')
labels = np.load(filedirname+'TimeSeriesKMeans_labels_dtw_4clusters_2.npy')

In [None]:
# Fixing the labels sequence to follow the SOM results order:
# labels 0, 1 and 2 are shifted up by one, every other label is shifted down by three
# (so 3 becomes 0)

labels_fixed = np.empty_like(labels)
labels_fixed[...] = np.where(np.isin(labels, (0, 1, 2)), labels + 1, labels - 3)

In [None]:
from plotly.subplots import make_subplots
import plotly.io as pio
import colorlover as cl
from plotly.colors import n_colors
import plotly.graph_objects as go
import plotly.express as px

# Plot the k-means result on the smoothed (savgol) series: one subplot per cluster, every
# member series as a faint coloured line and the cluster average as a black line.

# Number of subplot columns. The grid below is created with cols=som_y, so the subplot
# position has to wrap on som_y; wrapping on som_x only works while the grid is square.
plot_count = som_y

# Time axis: 900 samples spanning 0 to 150
time = np.linspace(0,150, 900, endpoint=True)

# Plot figure
fig = make_subplots(
    rows=som_x, cols=som_y,
    shared_xaxes=True,
    shared_yaxes=True,
    vertical_spacing=0.1,
    )

# Colors
opacity = 0.04

colors=[ImageColor.getcolor(px.colors.qualitative.Antique[4],'RGB'),
        ImageColor.getcolor(px.colors.qualitative.Antique[9],'RGB'),
        ImageColor.getcolor(px.colors.qualitative.Antique[6],'RGB'),
        ImageColor.getcolor(px.colors.qualitative.Antique[8],'RGB')]

colors_solid=[ImageColor.getcolor(px.colors.qualitative.Set1[0],'RGB'),
              ImageColor.getcolor(px.colors.qualitative.Set1[1],'RGB'),
              ImageColor.getcolor(px.colors.qualitative.Set1[2],'RGB'),
              ImageColor.getcolor(px.colors.qualitative.Set1[3],'RGB')]

# Turn the (r, g, b) tuples into 'rgba(r, g, b,opacity)' strings: the closing bracket of
# the tuple text is cut off and the opacity is appended as the alpha channel
colors_rgba = ['rgba'+str(rgb)[:-1]+','+str(opacity)+')' for rgb in colors]
colors_solid_rgba = ['rgba'+str(rgb)[:-1]+','+str(opacity)+')' for rgb in colors_solid]

# For each label there is, plot every series with that label.
# np.unique returns the labels sorted, so the subplot order is deterministic (the
# iteration order of a plain set is not guaranteed).
for position, label in enumerate(np.unique(labels_fixed)):
    # Subplot position of this cluster, filled row by row
    row_i, column_j = divmod(position, plot_count)

    # Cluster colors: labels 0-2 use their own palette entry, any other label the last one.
    # The choice depends only on the label, so it is made once per cluster.
    line_color = colors_rgba[int(label) if label in (0, 1, 2) else 3]

    # All series that carry this label
    cluster = [series
               for series, series_label in zip(mySeriesDrop_savgol, labels_fixed)
               if series_label == label]

    # Add one faint trace per member series
    for series in cluster:
        fig.add_trace(go.Scatter(x=time, y=series,
                                 mode='lines',
                                 name=f"Cluster {label}",
                                 line_color=line_color,#'rgba(130,179,196,0.12)',
                                 showlegend=False),
                      row=row_i+1, col=column_j+1)

    # Plot the average within the cluster
    fig.add_trace(go.Scatter(x=time,y=np.average(np.vstack(cluster),axis=0),
                             mode='lines',
                             name=f'Cluster {label}',
                             line_color='black',#'rgb(57,103,119)',
                             showlegend=False),
                  row=row_i+1, col=column_j+1)

# Update the figure
fig.update_yaxes(range=[-0.1,1.1])#, row=x+1, col=y+1)
fig.update_layout(font_family='Arial')

# Save the figure
pio.write_image(fig, filedirname+'TimeSeriesKMeans_4clusters_dtw_2.png', width=1*600, height=600, scale=15)

fig.show()

In [None]:
# Quick check of the container type of the remapped labels
type(labels_fixed)

In [None]:
import plotly.express as px

# Count for each cluster: number of series per label, and a display name per label
cluster_c = [int(np.count_nonzero(labels_fixed == i)) for i in range(cluster_count)]
cluster_n = [f"cluster_{i}" for i in range(cluster_count)]

# Plot the bar plot of cluster distribution
fig = px.bar(x=cluster_n, y=cluster_c, labels={'x': 'Clusters', 'y': 'Count'})

fig.update_layout(font_family='Arial')
fig.update_traces(marker_color='rgba(57,103,119,0.7)')

# Save the figure
pio.write_image(fig, filedirname+'distribution_TimeSeriesKMeans_4clusters_v_2.png',
                width=1*400, height=400, scale=16)
fig.show()

In [None]:
import plotly.express as px

# Plot horizontal distribution: same counts as the vertical bar plot, clusters on the y-axis
fig = go.Figure(
    data=[go.Bar(x=cluster_c,
                 y=cluster_n,
                 orientation='h',
                 marker_color='rgba(57,103,119,0.7)')]
)

fig.update_layout(font_family='Arial',
                  xaxis_title='Count',
                  yaxis_title='Cluster')

# Save the figure
pio.write_image(fig, filedirname+'distribution_TimeSeriesKMeans_4clusters_h_2.png',
                width=1*400, height=1*400, scale=16)

fig.show()

In [None]:
# Save the clustering results in one dataframe: one row per series with its file name,
# pixel, sample number and cluster label (stored as text), ordered by cluster
fancy_names_for_labels = [str(label) for label in labels_fixed]
result = pd.DataFrame(
    zip(mySeries['Filename'], mySeries['Pixel'], mySeries['SampleNumber'], fancy_names_for_labels),
    columns=["Series", 'Pixel', "SampleNumber", "Cluster"],
)
result = result.sort_values(by="Cluster")#.set_index("Series")

# Attach the PCE-group columns; the assignment aligns on the row index
# (assumes PCE_df shares the row index of mySeries -- TODO confirm)
for pce_column in ('PCE_before_x', 'PCE_before_ceil_x',
                   'PCE_before_median_x', 'PCE_before_mean_x'):
    result[pce_column] = PCE_df[pce_column]

# Save the results as csv (back in the original row order) and display them
result.sort_index().to_csv(filedirname+'TimeSeriesKMeans_DTW_4clusters_newdata_2.csv')
result.sort_index()

In [None]:
# Plot the distribution of clusters over the maximum-PCE values

# Colormap cluster: cluster label (as text) -> entry of the qualitative 'Antique' palette
colormap_cluster = {str(label): px.colors.qualitative.Antique[palette_index]
                    for label, palette_index in enumerate((4, 9, 6, 8))}

# Plot the figure: stacked histogram, one colour per cluster
fig = px.histogram(result.sort_values(by=['Cluster','PCE_before_ceil_x']),
                   x="PCE_before_x",
                   color='Cluster',
                   color_discrete_map=colormap_cluster,
                   opacity=0.75)

# fig.update_traces(xbins_size=2)

# barnorm='fraction' normalises every bin so that the clusters add up to 1
fig.update_layout(xaxis_title="Max. PCE Group",#xaxis=dict(range=[9,26]),
                  yaxis_title="Count",
                  font_family='Arial',
                  barnorm='fraction',
                  bargap=0.1)

pio.write_image(fig, filedirname+'TimeSeriesKMeans_4clusters_fraction_3_2.png',
                width=1*400, height=1*400, scale=16)

fig.show()

### 4.2. Visualizing results with PCA/ PCA clustering

Source: https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_digits.html

#### k-means

For every number of clusters from 2 up to n − 1 (one figure per value).

In [None]:
##### import matplotlib.pyplot as plt
from sklearn.cluster import AffinityPropagation
import math

# k-means on the 2-D PCA projection of the series, for every number of clusters from 2 up
# to cluster_count - 1. One decision-region figure is saved and shown per cluster count.
# NOTE: this cell overwrites `cluster_count` and `labels` set by the DTW k-means cells above.
data = mySeriesDrop_savgol
cluster_count = 9

# Data dimensionality reduction
reduced_data = PCA(n_components=2).fit_transform(data)
print("Data dimensionality reduction: done")

# The plot window and the mesh depend only on reduced_data, so they are built once
# instead of once per cluster count.
# Step size of the mesh. Decrease to increase the quality of the VQ.
h = 0.02  # point in the mesh [x_min, x_max]x[y_min, y_max].

# Plot window: data extent padded by 1 on every side
x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
print("creating meshgrid: done")

# Loop over the cluster counts, starting at 2
for cluster in range(2, cluster_count):
    print('Cluster ', cluster,' is running')

    # K-means clustering
    kmeans = KMeans(init="k-means++", n_clusters=cluster, n_init=4)
    print("k-means initialization: done")

    # K-means fitting and predicting
    labels = kmeans.fit_predict(reduced_data)
    print('inertia: ', kmeans.inertia_)
    print("k-means fitting and predicting: done")

    # Obtain labels for each point in mesh. Use last trained model.
    Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])
    print("prediction for the meshgrid: done")

    # Put the result into a color plot: decision regions as background image
    Z = Z.reshape(xx.shape)
    plt.figure(figsize=(5,5),dpi=300)
    plt.imshow(
        Z,
        interpolation="nearest",
        extent=(xx.min(), xx.max(), yy.min(), yy.max()),
        cmap=plt.cm.Paired,
        aspect="auto",
        origin="lower",
    )

    # Data points as small black dots
    plt.plot(reduced_data[:, 0], reduced_data[:, 1], "k.", markersize=2)

    # Plot the centroids as a white X
    centroids = kmeans.cluster_centers_
    plt.scatter(
        centroids[:, 0],
        centroids[:, 1],
        marker="x",
        s=169,
        linewidths=3,
        color="w",
        zorder=10,
    )

    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.xticks(())
    plt.yticks(())

    # Save figure BEFORE showing it: once plt.show() has displayed the figure it may no
    # longer be the current figure, and a savefig call after it can write an empty image.
    plt.savefig(filedirname+'pca_kmeans_dtw_'+str(cluster)+'_cluster.png',
                dpi=200)
    plt.show()

For one specific number of clusters.

In [None]:
##### import matplotlib.pyplot as plt
from sklearn.cluster import AffinityPropagation

# k-means decision regions on the 2-D PCA projection for one fixed number of clusters
data = mySeriesDrop_savgol
cluster_count = 4

# Project the series onto two principal components
reduced_data = PCA(n_components=2).fit_transform(data)
print("data dimensionality reduction: done")

# Set up k-means
kmeans = KMeans(n_clusters=cluster_count, init="k-means++", n_init=4)
print("k-means initialization: done")

# Fit on the projection and label every data point
labels = kmeans.fit_predict(reduced_data)
print('inertia: ', kmeans.inertia_)
print("k-means fitting and predicting: done")

# Mesh step size; a smaller step gives smoother region boundaries
h = 0.02

# Plot window: data extent padded by 1 on every side
x_min = reduced_data[:, 0].min() - 1
x_max = reduced_data[:, 0].max() + 1
y_min = reduced_data[:, 1].min() - 1
y_max = reduced_data[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))
print("creating meshgrid: done")

# Label every mesh node with the fitted model
Z = kmeans.predict(np.column_stack((xx.ravel(), yy.ravel())))
print("prediction for the meshgrid: done")

# Decision regions as background image
Z = Z.reshape(xx.shape)
plt.figure(figsize=(5,5),dpi=300)
plt.clf()
plt.imshow(Z,
           interpolation="nearest",
           extent=(xx.min(), xx.max(), yy.min(), yy.max()),
           cmap=plt.cm.Paired,
           aspect="auto",
           origin="lower")

# Data points as small black dots
plt.plot(reduced_data[:, 0], reduced_data[:, 1], "k.", markersize=2)

# Cluster centres as white crosses on top of everything else
centroids = kmeans.cluster_centers_
plt.scatter(centroids[:, 0], centroids[:, 1],
            marker="x", s=169, linewidths=3, color="w", zorder=10)

# Fix the window and hide the ticks
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.xticks(())
plt.yticks(())
plt.show()

#### Affinity propagation

In [None]:
##### import matplotlib.pyplot as plt
from sklearn.cluster import AffinityPropagation
from sklearn import metrics

# Affinity propagation on the 2-D PCA projection of the series, with the decision regions
# drawn as background. `cluster_count` is set but not used below: the number of clusters
# is read from the fitted model (n_clusters_).
data = mySeriesDrop_savgol
cluster_count = 4

# Data dimensionality reduction
reduced_data = PCA(n_components=2).fit_transform(data)
print("data dimensionality reduction: done")

# Affinity propagation initialization
AP = AffinityPropagation(preference=-50, random_state=0,
                         damping = 0.8, max_iter = 1000)
print("affinity propagation initialization: done")

# Affinity propagation fitting and prediction
labels = AP.fit_predict(reduced_data)
print("affinity propagation fitting and predicting: done")

# Indices of the exemplar points and the label of every data point
# (`labels` is assigned a second time here, from the fitted model's attribute)
cluster_centers_indices = AP.cluster_centers_indices_
labels = AP.labels_

# Number of clusters found = number of exemplars
n_clusters_ = len(cluster_centers_indices)

# Step size of the mesh. Decrease to increase the quality of the VQ.
h = 0.02  # point in the mesh [x_min, x_max]x[y_min, y_max].

# Plot the decision boundary. For that, we will assign a color to each
# mesh point; the plot window is the data extent padded by 1 on every side
x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
print("creating meshgrid: done")

# Obtain labels for each point in mesh. Use last trained model.
Z = AP.predict(np.c_[xx.ravel(), yy.ravel()])
print("prediction for the meshgrid: done")

# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.figure(figsize=(5,5),dpi=300)
plt.clf()
plt.imshow(
    Z,
    interpolation="nearest",
    extent=(xx.min(), xx.max(), yy.min(), yy.max()),
    cmap=plt.cm.Paired,
    aspect="auto",
    origin="lower",
)

# Data points as small black dots
plt.plot(reduced_data[:, 0], reduced_data[:, 1], "k.", markersize=2)

# Plot the centroids as a white X
# centroids = kmeans.cluster_centers_
# plt.scatter(
#     cluster_centers_indices[:, 0],
#     cluster_centers_indices[:, 1],
#     marker="x",
#     s=169,
#     linewidths=3,
#     color="w",
#     zorder=10,
# )

plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.xticks(())
plt.yticks(())
plt.show()

print("Estimated number of clusters: %d" % n_clusters_)
# NOTE(review): silhouette_score presumably raises if fewer than two clusters were found --
# confirm, and guard the call if that can happen with this data.
print(
    "Silhouette Coefficient: %0.3f"
    % metrics.silhouette_score(reduced_data, labels, metric="sqeuclidean")
)

#### DBSCAN

DBSCAN takes a long time to run, and I have not yet managed to run it to completion on my PC (it probably needs a more powerful computer).

In [None]:
##### import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from sklearn import metrics

# DBSCAN on the 2-D PCA projection of the series. The fitted data points are plotted
# coloured by their cluster label; points labelled -1 (noise) are drawn in black.
# `cluster_count` is set but not used: the number of clusters is derived from the labels.
data = mySeriesDrop_savgol
cluster_count = 4

# Data dimensionality reduction
reduced_data = PCA(n_components=2).fit_transform(data)
print("data dimensionality reduction: done")

# DBSCAN initialization
db = DBSCAN(eps=0.5, min_samples=10)
print("dbscan initialization: done")

# DBSCAN fitting and prediction
labels = db.fit_predict(reduced_data)
print("DBSCAN fitting and predicting: done")

# Mask of the core samples, number of clusters (noise label -1 excluded) and noise points
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)

# Plot window: data extent padded by 1 on every side
x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1

# DBSCAN cannot label new points (it has no predict method). Calling fit_predict on a
# dense mesh, as was done before, re-fits the model on the mesh points themselves: that
# is extremely slow for a grid of this size and says nothing about the data. The fitted
# points are therefore coloured by their own labels instead of drawing decision regions.
plt.figure(figsize=(5,5),dpi=300)

is_noise = labels == -1
if (~is_noise).any():
    plt.scatter(reduced_data[~is_noise, 0], reduced_data[~is_noise, 1],
                c=labels[~is_noise], cmap=plt.cm.Paired, s=4)
if is_noise.any():
    plt.scatter(reduced_data[is_noise, 0], reduced_data[is_noise, 1],
                color="k", marker="x", s=4)

plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.xticks(())
plt.yticks(())
plt.show()

print("Estimated number of clusters: %d" % n_clusters_)
print("Estimated number of noise points: %d" % n_noise_)

# The silhouette coefficient needs at least two and fewer than n_samples distinct labels
# (the noise label counts as a label here, as in the unguarded call before)
if 1 < len(set(labels)) < len(labels):
    print(
        "Silhouette Coefficient: %0.3f"
        % metrics.silhouette_score(reduced_data, labels, metric="sqeuclidean")
    )
else:
    print("Silhouette Coefficient: not defined for this labelling")

### 4.3. Visualizing results with t-SNE/ t-SNE clustering 

Source: https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_digits.html

#### k-means

For every number of clusters from 2 up to n − 1.

In [None]:
##### Generate all cluster counts
import math
from sklearn.cluster import AffinityPropagation
from sklearn.manifold import TSNE
from sklearn import metrics

# k-means on the 2-D t-SNE embedding of the series, for every number of clusters from 2 up
# to cluster_count - 1. One decision-region figure is saved and shown per cluster count.
# NOTE: this cell overwrites `cluster_count` and `labels` set by earlier cells.
data = mySeriesDrop_savgol
cluster_count = 9

# Data dimensionality reduction
reduced_data = TSNE(n_components=2).fit_transform(data)
# reduced_data = reduced_data.astype('double')
print("Data dimensionality reduction: done")

# The plot window and the mesh depend only on reduced_data, so they are built once
# instead of once per cluster count.
# Step size of the mesh. Decrease to increase the quality of the VQ.
h = 0.02  # point in the mesh [x_min, x_max]x[y_min, y_max].

# Plot window: data extent padded by 1 on every side
x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
print("creating meshgrid: done")

# Loop over the cluster counts, starting at 2
for cluster in range(2, cluster_count):
    print('Cluster ', cluster,' is running')

    # K-means clustering
    kmeans = KMeans(init="k-means++", n_clusters=cluster, n_init=4)
    print("k-means initialization: done")

    # K-means fitting and predicting
    labels = kmeans.fit_predict(reduced_data)
    print('inertia: ', kmeans.inertia_)
    print("k-means fitting and predicting: done")

    # Obtain labels for each point in mesh. Use last trained model.
    # The mesh is cast to float32 before predicting (presumably to match the dtype of the
    # t-SNE output the model was fitted on -- TODO confirm)
    Z = kmeans.predict((np.c_[xx.ravel(), yy.ravel()]).astype('float32'))
    print("prediction for the meshgrid: done")

    # Put the result into a color plot: decision regions as background image
    Z = Z.reshape(xx.shape)
    plt.figure(figsize=(5,5),dpi=300)
    plt.imshow(
        Z,
        interpolation="nearest",
        extent=(xx.min(), xx.max(), yy.min(), yy.max()),
        cmap=plt.cm.Paired,
        aspect="auto",
        origin="lower",
    )

    # Data points as small black dots
    plt.plot(reduced_data[:, 0], reduced_data[:, 1], "k.", markersize=2)

    # Plot the centroids as a white X
    centroids = kmeans.cluster_centers_
    plt.scatter(
        centroids[:, 0],
        centroids[:, 1],
        marker="x",
        s=169,
        linewidths=3,
        color="w",
        zorder=10,
    )

    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.xticks(())
    plt.yticks(())

    # Save figure BEFORE showing it: once plt.show() has displayed the figure it may no
    # longer be the current figure, and a savefig call after it can write an empty image.
    plt.savefig(filedirname+'tsne_kmeans_dtw_'+str(cluster)+'_cluster.png',
                dpi=200)
    plt.show()

In [None]:
##### Just one cluster_count
from sklearn.cluster import AffinityPropagation
from sklearn.manifold import TSNE
from sklearn import metrics

# k-means decision regions on the 2-D t-SNE embedding for one fixed number of clusters
data = mySeriesDrop_savgol
cluster_count = 4

# Embed the series in two dimensions
reduced_data = TSNE(n_components=2).fit_transform(data)
print("data dimensionality reduction: done")

# Set up k-means
kmeans = KMeans(n_clusters=cluster_count, init="k-means++", n_init=4)
print("k-means initialization: done")

# Fit on the embedding, label every data point and report the quality measures
labels = kmeans.fit_predict(reduced_data)
print('inertia: ', kmeans.inertia_)
print("Silhouette Coefficient: %0.3f"% metrics.silhouette_score(reduced_data, labels, metric="sqeuclidean"))
print("k-means fitting and predicting: done")

# Mesh step size; a smaller step gives smoother region boundaries
h = 0.02

# Plot window: data extent padded by 1 on every side
x_min = reduced_data[:, 0].min() - 1
x_max = reduced_data[:, 0].max() + 1
y_min = reduced_data[:, 1].min() - 1
y_max = reduced_data[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))
print("creating meshgrid: done")

# Label every mesh node with the fitted model; the nodes are cast to float32 first
Z = kmeans.predict(np.column_stack((xx.ravel(), yy.ravel())).astype('float32'))
print("prediction for the meshgrid: done")

# Decision regions as background image
Z = Z.reshape(xx.shape)
plt.figure(figsize=(5,5),dpi=300)
plt.clf()
plt.imshow(Z,
           interpolation="nearest",
           extent=(xx.min(), xx.max(), yy.min(), yy.max()),
           cmap=plt.cm.Paired,
           aspect="auto",
           origin="lower")

# Data points as small black dots
plt.plot(reduced_data[:, 0], reduced_data[:, 1], "k.", markersize=2)

# Cluster centres as white crosses on top of everything else
centroids = kmeans.cluster_centers_
plt.scatter(centroids[:, 0], centroids[:, 1],
            marker="x", s=169, linewidths=3, color="w", zorder=10)

# Fix the window and hide the ticks
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.xticks(())
plt.yticks(())
plt.show()

#### Affinity propagation

In [None]:
##### import matplotlib.pyplot as plt
from sklearn.cluster import AffinityPropagation
from sklearn import metrics

# Affinity propagation on the 2-D t-SNE embedding of the series, with the decision regions
# drawn as background. TSNE is not imported in this cell; it has to be available from an
# earlier cell. `cluster_count` is set but not used below: the number of clusters is read
# from the fitted model (n_clusters_).
data = mySeriesDrop_savgol
cluster_count = 4

# Data dimensionality reduction
reduced_data = TSNE(n_components=2).fit_transform(data)
print("data dimensionality reduction: done")

# Affinity propagation initialization
AP = AffinityPropagation(preference=-50, random_state=0,
                         damping = 0.5, max_iter = 1000)
print("affinity propagation initialization: done")

# Affinity propagation fitting and prediction
labels = AP.fit_predict(reduced_data)
print("affinity propagation fitting and predicting: done")

# Indices of the exemplar points and the label of every data point
# (`labels` is assigned a second time here, from the fitted model's attribute)
cluster_centers_indices = AP.cluster_centers_indices_
labels = AP.labels_

# Number of clusters found = number of exemplars
n_clusters_ = len(cluster_centers_indices)

# Step size of the mesh. Decrease to increase the quality of the VQ.
h = 0.02  # point in the mesh [x_min, x_max]x[y_min, y_max].

# Plot the decision boundary. For that, we will assign a color to each
# mesh point; the plot window is the data extent padded by 1 on every side
x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
print("creating meshgrid: done")

# Obtain labels for each point in mesh. Use last trained model.
# The mesh is cast to float32 before predicting
Z = AP.predict((np.c_[xx.ravel(), yy.ravel()]).astype('float32'))
print("prediction for the meshgrid: done")

# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.figure(figsize=(5,5),dpi=300)
plt.clf()
plt.imshow(
    Z,
    interpolation="nearest",
    extent=(xx.min(), xx.max(), yy.min(), yy.max()),
    cmap= plt.cm.Paired,#plt.cm.Paired,tab20b,Set3,Pastel2
    aspect="auto",
    origin="lower",
)

# Data points as small black dots
plt.plot(reduced_data[:, 0], reduced_data[:, 1], "k.", markersize=2)

plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.xticks(())
plt.yticks(())
plt.show()

print("Estimated number of clusters: %d" % n_clusters_)

# NOTE(review): silhouette_score presumably raises if fewer than two clusters were found --
# confirm, and guard the call if that can happen with this data.
print(
    "Silhouette Coefficient: %0.3f"
    % metrics.silhouette_score(reduced_data, labels, metric="sqeuclidean")
)

#### Visualizing which dataset has which cluster

In [None]:
# Plot the overview: for every series (row) and cluster (column), the number of distinct
# pixels that ended up in that cluster
fig, ax = plt.subplots(figsize=(6,12),dpi=300)

# Count distinct pixels per (series, cluster) pair and turn the group keys back into columns
grouped = result.groupby(['Series','Cluster'])['Pixel'].nunique().reset_index()

# Wide table: one row per series, one column per cluster
piv_grouped = grouped.pivot(index='Series', columns='Cluster', values='Pixel')
ax = sns.heatmap(piv_grouped, annot=True, linewidths=0.5, cmap='viridis')

In [None]:
# Strip plot of the same information: one marker per row of `result`, series on the
# y-axis and cluster on the x-axis, with jitter so that overlapping markers stay visible
fig, ax = plt.subplots(figsize=(4,10),dpi=300)
sns.stripplot(x='Cluster', y='Series', data=result, jitter=True,dodge=True, palette='viridis',orient='h')
# plt.xticks(rotation=9)

### 4.4. Elbow plot

WCSS: https://www.analyticsvidhya.com/blog/2021/01/in-depth-intuition-of-k-means-clustering-algorithm-in-machine-learning/#:~:text=Elbow%20Method,-In%20the%20Elbow&text=WCSS%20is%20the%20sum%20of,is%20largest%20when%20K%20%3D%201.

WCSS (Within-Cluster Sum of Squares) is the sum of the squared distances between each point and the centroid of its cluster. When we plot the WCSS against the number of clusters K, the curve looks like an elbow: as the number of clusters increases, the WCSS decreases. The WCSS is largest when K = 1.

In [None]:
# array_sum = np.sum(mySeriesDrop_np)
# Quick NaN check: a single NaN anywhere in the array makes the total sum NaN
array_sum = mySeriesDrop_savgol.sum()
array_has_nan = np.isnan(array_sum)
print(array_has_nan)

In [None]:
# %%timeit #-r 1

from sklearn.cluster import KMeans

# Going through all the cluster range, train and calculate kmeans inertia
# (DTW time-series k-means with a fixed random_state; the inertia of every fit is
# collected in `wcss`).
# NOTE(review): this range covers 3, 5, 7 and 9 clusters only, while the next cell
# hard-codes results for 2..10 -- presumably the loop was run in several passes; confirm.
wcss = []
for i in range(3, 10, 2): #(2, 11, 2)
    print('Number clusters: ',i)
    kmeans = TimeSeriesKMeans(n_clusters = i, init = 'k-means++', metric='dtw',random_state = 42)
    kmeans.fit_predict(mySeriesDrop_savgol)
    wcss.append(kmeans.inertia_)
    print('kmeans inertia: ', kmeans.inertia_)

In [None]:
# Hard-coded WCSS values (presumably recorded from earlier runs of the loop
# above), one entry per cluster count k = 2, 3, ..., 10.
# The even-k series (2, 4, 6, 8, 10) is entries 0, 2, 4, 6, 8 of this array.
wcss = np.array([
    0.9214629415516136,   # k = 2
    0.6112529441516568,   # k = 3
    0.47160450376150304,  # k = 4
    0.39482188524371586,  # k = 5
    0.21113949891080824,  # k = 6
    0.1905352810535253,   # k = 7
    0.17260246241418095,  # k = 8
    0.1650502694264613,   # k = 9
    0.15017415612876284,  # k = 10
])

In [None]:
import plotly.express as px
import plotly.graph_objs as go
import plotly.io as pio

# Elbow plot: WCSS versus number of clusters (k = 2 ... 10, one per entry
# of `wcss`)
wcss_trace = go.Scatter(
    x=np.arange(2, 11),
    y=wcss,
    mode='lines+markers',
    line={'color': 'rgb(5,112,176)'},
)

fig = go.Figure()
fig.add_trace(wcss_trace)
fig.update_layout(
    font_family='Arial',
    xaxis={'title': 'Number of clusters'},
    yaxis={'title': 'WCSS'},
)

# Export as a square PNG (300 x 300 logical pixels, upscaled 16x)
pio.write_image(
    fig,
    filedirname+'kmeans_wcss_result_cluster_2_10_gap1_mySeriesDrop_savgol.png',
    width=300,
    height=300,
    scale=16,
)

fig.show()

In [None]:
# Persist the WCSS values next to the figure as a NumPy .npy file
np.save(
    filedirname+'kmeans_wcss_result_cluster_2_10_gap1_mySeriesDrop_savgol.npy',
    wcss,
)

### 4.5. Silhouette value method

In [None]:
from sklearn.metrics import silhouette_samples,silhouette_score
from tslearn.metrics import cdist_dtw

clusters_range = range(2,11,2)
results = []

# The clusters below are fitted with DTW, so the silhouette must be scored
# with the same distance: sklearn's default (Euclidean) would judge cluster
# cohesion in a different geometry than the one used to form the clusters.
# The pairwise DTW matrix does not depend on the number of clusters, so it is
# computed once here and reused for every k.
dtw_distances = cdist_dtw(mySeriesDrop_savgol)

# For each candidate cluster count: fit DTW k-means, then compute the mean
# silhouette coefficient of the resulting labels on the DTW distances
for i in clusters_range:
    print('Number clusters: ',i)
    clusterer = TimeSeriesKMeans(n_clusters = i, init = 'k-means++', metric='dtw',random_state = 42)
    cluster_labels = clusterer.fit_predict(mySeriesDrop_savgol)#(mySeriesDrop_np)
    silhouette_avg = silhouette_score(dtw_distances, cluster_labels, metric='precomputed')
    print('kmeans inertia: ', clusterer.inertia_, ', silhouette_avg: ', silhouette_avg)
    results.append([i, silhouette_avg])

# One row per cluster count.
# NOTE(review): `result` is also the name of the cluster-assignment frame
# (columns Series/Cluster/Pixel) used by the overview plots earlier in the
# notebook; after this cell runs, those plotting cells can no longer be
# re-run without recomputing that frame.
result = pd.DataFrame(results, columns=['n_clusters','silhouette_score'])
pivot_km = pd.pivot_table(result, index='n_clusters', values='silhouette_score')

In [None]:
# Heatmap of the mean silhouette score per cluster count
# (alternative colormap tried earlier: sns.cm.rocket_r)
plt.figure()
sns.heatmap(
    pivot_km,
    cmap=sns.cm.mako_r,
    fmt='.3f',
    linewidths=0.5,
    annot=True,
)

In [None]:
# Display the silhouette table (columns: n_clusters, silhouette_score)
result

In [None]:
# Plot the mean silhouette score against the number of clusters
silhouette_trace = go.Scatter(
    x=result['n_clusters'],
    y=result['silhouette_score'],
    mode='lines+markers',
    line={'color': 'rgb(5,112,176)'},
)

fig = go.Figure()
fig.add_trace(silhouette_trace)
fig.update_layout(
    font_family='Arial',
    xaxis={'title': 'Number of clusters'},
    yaxis={'title': 'Silhouette score'},
)

# Export as a square PNG (300 x 300 logical pixels, upscaled 16x)
pio.write_image(
    fig,
    filedirname+'kmeans_silhouette_result_cluster_2_10_mySeriesDrop_savgol.png',
    width=300,
    height=300,
    scale=16,
)

fig.show()

In [None]:
# Persist the silhouette table twice: as a raw NumPy array (.npy, column
# names are not stored) and as a CSV that keeps the column headers
np.save(
    filedirname+'kmeans_silhouette_result_cluster_2_10_mySeriesDrop_savgol.npy',
    result,
)
result.to_csv(
    filedirname+'kmeans_silhouette_result_cluster_2_10_mySeriesDrop_savgol.csv',
    index=False,
)

## 5. Side notes/ figures

### 5.1. The schematic of degradation

**ROUGH SCHEMATIC**

In [None]:
# Rough schematic of a degradation curve: a short rise to a maximum followed
# by a decay. The numbers are hand-picked for illustration, and the tick
# labels are hidden so that only the shape is conveyed.

from scipy.signal import savgol_filter

x = np.array([0, 0.25, 0.5, 1, 1.5, 1.75, 2, 5])
# x_savgol = savgol_filter(x,1,2)

# Earlier draft of the y values: [4, 4.75, 5, 4.75, 4, -6]
schematic_trace = go.Scatter(
    x=x,
    y=np.array([18.7, 19.44, 19.75, 20, 19.75, 19.44, 18.7, 8]),
    mode='lines',
    line={'color': 'rgb(5,112,176)'},
    # marker={'color': 'rgba(57,103,119,0.7)'},
)

fig = go.Figure()
fig.add_trace(schematic_trace)
fig.update_layout(
    font_family='Arial',
    xaxis={'title': 'Degradation time (hours)', 'showticklabels': False},
    yaxis={'title': 'PCE', 'showticklabels': False},
)

# Export as a square PNG (450 x 450 logical pixels, upscaled 15x)
pio.write_image(
    fig,
    filedirname+'schematic.png',
    width=450,
    height=450,
    scale=15,
)

fig.show()

**ACTUAL DATA**

In [None]:
# Look at one specific series: take entry 170 of mySeriesDrop and turn its
# index into an ordinary column (presumably the MPPT_t timestamps, since the
# following cells read an 'MPPT_t' column -- TODO confirm)
MPPTint = mySeriesDrop[170].reset_index()
MPPTint

In [None]:
# Elapsed time since the first sample of the series, first as a timedelta
# and then expressed in (fractional) hours
first_timestamp = MPPTint['MPPT_t'].loc[0]
MPPTint['MPPT_t_delta'] = MPPTint['MPPT_t'] - first_timestamp

one_hour = pd.Timedelta(hours=1)
MPPTint['MPPT_t_delta_hours'] = MPPTint['MPPT_t_delta'] / one_hour

MPPTint

In [None]:
# Measured counterpart of the schematic: efficiency of the selected series
# against elapsed time in hours
# Earlier draft of the y values: [4, 4.75, 5, 4.75, 4, -6]
measured_trace = go.Scatter(
    x=MPPTint['MPPT_t_delta_hours'],
    y=MPPTint['MPPT_EFF'],
    mode='lines',
    line={'color': 'rgb(5,112,176)'},
    # marker={'color': 'rgba(57,103,119,0.7)'},
)

fig = go.Figure()
fig.add_trace(measured_trace)
fig.update_layout(
    font_family='Arial',
    xaxis={'title': 'Degradation time (hours)', 'showticklabels': True},
    # y axis is clipped to the 9-14 window
    yaxis={'title': 'PCE (%)', 'showticklabels': True, 'range': [9, 14]},
)

# Export as a square PNG (400 x 400 logical pixels, upscaled 30x)
pio.write_image(
    fig,
    filedirname+'schematic_2.png',
    width=400,
    height=400,
    scale=30,
)

fig.show()

### 5.2. The date range of degradation

In [None]:
# Load the .pkl file containing the whole dataset.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle
# files that come from a trusted source.
with open('dataset/pkl_complete/20230116_mySeries.pkl', "rb") as fh:
    mySeries = pickle.load(fh)

In [None]:
# Collect the first record of every run's MPPT data, so that the earliest
# and latest start of the dataset can be read off.
# Iterating the column directly avoids building a full row per iteration
# with iterrows() and the second label-based lookup, which returned several
# entries instead of one whenever an index label was duplicated.
dateList = [mppt_data.iloc[0] for mppt_data in mySeries['MPPTdata']]

# Convert the dateList into pandas dataframe
dateListDf = pd.DataFrame(dateList)

# Show the records ordered by index, to read off the first and last start.
# NOTE(review): sort_index() orders by the frame's index, which is only the
# start date if each MPPTdata entry is indexed by its timestamp -- TODO
# confirm; otherwise sort_values() on the date column is needed. The sorted
# frame is only displayed, not stored back into dateListDf.
dateListDf.sort_index()

### 5.3. The number of each cluster for each max. PCE group

In [None]:
# Load the cluster assignment and the PCE grouping of the run of interest
name_int = '20230303_run_revision_excN2/sigma_0p5_learningrate_0p1_150h/20230303_sigma_0p5_learningrate_0p1_whole_'

# Cluster table, without the unnamed first column of the CSV
cluster_PCE_int = pd.read_csv(name_int+'clusters.csv')
cluster_PCE_int = cluster_PCE_int.drop(columns=['Unnamed: 0'])

# PCE grouping table, ordered by its unnamed first column and re-indexed
# from 0 so that rows line up positionally with the cluster table
PCE_df_group_int = pd.read_csv(name_int+'PCE_df_grouping.csv')
PCE_df_group_int_sort = PCE_df_group_int.sort_values(by='Unnamed: 0').reset_index()

# Attach the PCE change to the cluster table (aligned on the 0..n-1 index)
cluster_PCE_int['PCE_delta'] = PCE_df_group_int_sort['PCE_delta']

PCE_df_group_int_sort

In [None]:
# Count the entries in every (cluster, PCE_before_x) combination:
# clusters 0-3 against PCE_before_x groups 1-5

for cluster_id in range(4):
    in_cluster = cluster_PCE_int['Cluster'] == cluster_id
    for pce_group in range(1, 6):
        in_pce_group = cluster_PCE_int['PCE_before_x'] == pce_group
        count = cluster_PCE_int.loc[in_cluster & in_pce_group]
        print('For cluster: ', cluster_id, ' and PCE_before_x: ', pce_group,
              '# data points:', count.shape[0])

### 5.4. Histogram cluster

In [None]:
# PCE_delta values split by cluster id (0-3), in cluster order
hist_data = [
    cluster_PCE_int.loc[cluster_PCE_int['Cluster'] == cluster_id, 'PCE_delta']
    for cluster_id in range(4)
]
cluster_1, cluster_2, cluster_3, cluster_4 = hist_data

# Legend labels are 1-based, while the cluster ids in the data are 0-based
group_labels = ['Cluster 1', 'Cluster 2', 'Cluster 3', 'Cluster 4']

# One colour per cluster, picked from plotly's qualitative "Antique" palette
antique = px.colors.qualitative.Antique
colors = [antique[palette_idx] for palette_idx in (4, 9, 6, 8)]

In [None]:
import plotly.figure_factory as ff

# The same distplot (histogram + density curve + rug, bin size 2.5) is drawn
# three times: once with automatic axes and twice as insets with fixed axis
# ranges. Each variant is exported at two widths (900 and 600).
# Entries: (wide file name, narrow file name, x range, y range);
# a range of None leaves that axis on automatic scaling.
distplot_variants = [
    # Full view
    ('rug_dist_PCE_delta.png',
     'rug_dist_PCE_delta_2.png',
     None, None),
    # Inset (the top): x and y restricted
    ('rug_dist_PCE_delta_inset_top.png',
     'rug_dist_PCE_delta_inset_top_2.png',
     [-2, 90], [-0.005, 0.07]),
    # Inset (the bottom): only x restricted
    ('rug_dist_PCE_delta_inset_bottom.png',
     'rug_dist_PCE_delta_inset_bottom_2.png',
     [-2, 90], None),
]

for wide_name, narrow_name, x_range, y_range in distplot_variants:
    fig = ff.create_distplot(
        hist_data,
        group_labels,
        bin_size=2.5,
        colors=colors,
    )
    fig.update_layout(font_family='Arial')
    if x_range is not None:
        fig.update_xaxes(range=x_range)
    if y_range is not None:
        fig.update_yaxes(range=y_range)

    pio.write_image(fig, name_int+wide_name,
                    width=900, height=400, scale=12)
    pio.write_image(fig, name_int+narrow_name,
                    width=600, height=400, scale=12)
    fig.show()

### 5.5. Quantization error SOM


In [None]:
# Quantization errors recorded by hand while trying out different
# combinations of som_x and som_y in the main part of the code.
# x runs from 2 to 10; where two values were recorded for the same x,
# their mean is used.
x = np.arange(2, 11)

# Values from a previous run, kept for reference:
# y = np.array((4.601026493221584,
#               2.6705675682441763,
#               (2.2654151964607387+2.265415146163151)/2,
#               1.9117405974000221,
#               (1.7968617485228293+1.7968617202716537)/2,
#               1.640786144382946,
#               (1.664956689074108+1.6152290972613579)/2,
#               (1.5372271345790458+1.5415926075899875)/2,
#               (1.483733025928272+1.4750974961167933)/2))

y = np.array([
    3.4653711937400833,                             # x = 2
    2.7734776665323935,                             # x = 3
    (2.4188140391965622 + 2.4188140345317097) / 2,  # x = 4
    1.97200974884117,                               # x = 5
    (1.8233955273321616 + 1.823395449115016) / 2,   # x = 6
    1.791192306527944,                              # x = 7
    (1.6143132145605121 + 1.6141542228347283) / 2,  # x = 8
    (1.5764475628465437 + 1.5582839343215373) / 2,  # x = 9
    (1.4706708588067163 + 1.4986430155342734) / 2,  # x = 10
])

In [None]:
# Line plot of the SOM quantization error (y) over the tried values (x)
quantization_trace = go.Scatter(
    x=x,
    y=y,
    mode='lines+markers',
    line={'color': 'rgb(5,112,176)'},
)

fig = go.Figure()
fig.add_trace(quantization_trace)
fig.update_layout(
    font_family='Arial',
    xaxis={'title': 'Number of clusters'},
    yaxis={'title': 'Quantization error'},
)

# Export as a square PNG (300 x 300 logical pixels, upscaled 16x)
pio.write_image(
    fig,
    filedirname+'som_quantizationError_result_cluster_2_10_mySeriesDrop_savgol.png',
    width=300,
    height=300,
    scale=16,
)

fig.show()