### Python packages used in this code

In [1]:
import platform
import sys
import os
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
from pylab import rcParams
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import seaborn as sns

%matplotlib inline

In [2]:
"""
Environments

--Platform--
OS : Windows-10-10.0.19044-SP0
--Version--
python :  3.9.12 (main, Apr  4 2022, 05:22:27) [MSC v.1916 64 bit (AMD64)]
numpy : 1.23.1
pandas : 1.4.3
"""

# Report the OS and the interpreter / library versions of the running kernel,
# in the same layout as the reference environment recorded in the string above,
# so the two can be compared line by line.
print(
    '--Platform--',
    f'OS : {platform.platform()}',
    '--Version--',
    f'python :  {sys.version}',
    f'numpy : {np.__version__}',
    f'pandas : {pd.__version__}',
    sep='\n',
)

--Platform--
OS : Windows-10-10.0.19044-SP0
--Version--
python :  3.9.12 (main, Apr  4 2022, 05:22:27) [MSC v.1916 64 bit (AMD64)]
numpy : 1.23.1
pandas : 1.4.3


# Preparation

## Create output directories

In [3]:
# Make sure the output folders for plots, CSV files and pickles exist.
# exist_ok=True makes the call a no-op when the folder is already there, which
# removes the separate isdir() check (and the window between the check and the
# creation in which another process could create the folder first).
for out_dir in (
    '../30_Output/20_Plot/100_CheckData',
    '../30_Output/30_csv/100_CheckData',
    '../30_Output/40_pkl/100_CheckData',
):
    os.makedirs(out_dir, exist_ok=True)

# Main code

## Load data

In [4]:
# x : input table; its third CSV column (index_col=2) is used as the row index.
x_all = pd.read_csv(filepath_or_buffer='../10_Data/PI_lib_FFKM.csv', index_col=2)

# y : Cp (PolyInfo) table; the first CSV column is used as the row index.
# The masked file is the one loaded here; the alternative input is kept for reference:
# y_all = pd.read_csv('../10_Data/PI_Cp.csv', index_col=0)
y_all = pd.read_csv(filepath_or_buffer='../10_Data/PI_Cp_masked.csv', index_col=0)

# ys : Cp (MD) table; the first CSV column is used as the row index.
ys_all = pd.read_csv(filepath_or_buffer='../10_Data/MD_PI1070.csv', index_col=0)

## Check data size

In [5]:
# Presence flags: one Series per table, holding 1 for every row label in that table.
idx_x = pd.Series(np.repeat(1, len(x_all.index)), index=x_all.index.values, name='x')
idx_y = pd.Series(np.repeat(1, len(y_all.index)), index=y_all.index.values, name='y')
idx_ys = pd.Series(np.repeat(1, len(ys_all.index)), index=ys_all.index.values, name='ys')

# Flag matrix (0: not exist, 1: exist). The outer join keeps every row label seen
# in any table; a label missing from a table comes out as NaN and is set to 0.
df_idx = pd.concat([idx_x, idx_y, idx_ys], axis=1, join='outer').fillna(0)

# Count the row labels falling into each of the 2 x 2 x 2 presence combinations.
# Each result row is keyed by the three flags written as a string, e.g. '101'.
df_count = pd.DataFrame(columns=['x : Input','y : Cp(PolyInfo)','ys : Cp(MD)','Count'])
for ix in (0, 1):
    for iy in (0, 1):
        for iys in (0, 1):
            in_combo = (df_idx['x'] == ix) & (df_idx['y'] == iy) & (df_idx['ys'] == iys)
            df_count.loc[f'{ix}{iy}{iys}'] = [ix, iy, iys, sum(in_combo)]

print(df_count)

     x : Input  y : Cp(PolyInfo)  ys : Cp(MD)  Count
000          0                 0            0      0
001          0                 0            1      0
010          0                 1            0     32
011          0                 1            1      0
100          1                 0            0     58
101          1                 0            1   1007
110          1                 1            0      2
111          1                 1            1     70


## Save data to be used

In [6]:
# PIDs of the data: keep only the row labels flagged as present in all three
# tables (x, y and ys).
in_all_tables = df_idx['x'].eq(1) & df_idx['y'].eq(1) & df_idx['ys'].eq(1)
PID = df_idx.loc[in_all_tables].index
print(PID)

Index(['P010001', 'P010002', 'P010003', 'P010004', 'P010007', 'P010014',
       'P010070', 'P010071', 'P010072', 'P020001', 'P020041', 'P020061',
       'P020062', 'P020108', 'P020110', 'P020151', 'P030001', 'P030010',
       'P030024', 'P030026', 'P030037', 'P030071', 'P030121', 'P030132',
       'P030177', 'P040002', 'P040003', 'P040006', 'P040007', 'P040017',
       'P040048', 'P040049', 'P040052', 'P040054', 'P040056', 'P040060',
       'P050002', 'P050003', 'P060002', 'P070013', 'P070014', 'P070064',
       'P070076', 'P070079', 'P070369', 'P070466', 'P070557', 'P070561',
       'P080097', 'P090020', 'P090027', 'P090043', 'P090134', 'P100004',
       'P100005', 'P100009', 'P100024', 'P100025', 'P100456', 'P130002',
       'P150011', 'P340017', 'P340018', 'P340070', 'P340071', 'P380140',
       'P392501', 'P412164', 'P460064', 'P462237'],
      dtype='object')


In [7]:
# Inputs, outputs and source features, restricted to the rows labelled by PID
x = x_all.loc[PID].iloc[:, 2:]   # the first two columns of x_all are left out
y = y_all.loc[PID, 'Cp mean']    # single .loc lookup instead of chained indexing
ys = ys_all.loc[PID, 'Cp']

# Create the dictionary: the full tables, the count table, the selected PIDs
# and the three aligned subsets, bundled into one object.
data_dict = {
    'x_all': x_all,
    'y_all': y_all,
    'ys_all': ys_all,
    'idx_count': df_count,
    'PID': PID,
    'x': x,
    'y': y,
    'ys': ys,
}

# Save
# The context manager closes the file even if pickle.dump raises, so a failed
# dump cannot leave an open handle behind.
# Alternative output name kept for reference:
# '../30_Output/40_pkl/100_CheckData/100_Data.pkl'
with open('../30_Output/40_pkl/100_CheckData/100_Data_masked.pkl', 'wb') as f:
    pickle.dump(data_dict, f)

## Check data distribution

### Output

In [None]:
# For paper
# Put the two heat-capacity series side by side, aligned on their shared index.
df_plt = pd.DataFrame({'CP_PI': y, 'CP_MD': ys})

# Correlation shown in the figure, computed from the data actually plotted so
# the annotation cannot drift from the figure when the input file changes.
# NOTE(review): Series.corr defaults to Pearson; the previously hard-coded value
# was 0.7788 -- confirm that Pearson is the coefficient intended for the paper.
corr = df_plt['CP_PI'].corr(df_plt['CP_MD'])

sns.set_style(style='ticks')
g = sns.jointplot(
    x='CP_PI',
    y='CP_MD',
    kind="reg",
    data=df_plt,
    xlim=(500,5000),
    ylim=(500,5000),
    marginal_kws={'bins':20, 'color':'steelblue'},
    # fit_reg=False: draw only the scatter points, no fitted regression line.
    joint_kws={'fit_reg':False, 'scatter_kws':{'s':50, 'edgecolor':'white'}}
)
g.ax_joint.grid(color='gray', linestyle='dotted', linewidth=1.1, alpha=0.4, zorder=1)
# Reference line through (0, 0) and (.1, .1), i.e. the y = x diagonal.
g.ax_joint.axline((0, 0), (.1, .1), linestyle='dotted', linewidth=1.1, alpha=0.4, zorder=2, color='gray')
# NOTE(review): the labels say "log" while the axis limits are 500-5000 on a
# linear scale -- confirm whether "log" belongs in these labels.
g.ax_joint.set_xlabel('Experimentally-observed heat capacity (log J/kg'+r'$\cdot$'+'K)', size=13, labelpad=8)
g.ax_joint.set_ylabel('MD-calculated heat capacity (log J/kg'+r'$\cdot$'+'K)', size=13, labelpad=8)
# The text position (2590, 700) is in data coordinates of the joint axes.
g.ax_joint.text(2590,700,f'Correlation : {corr:.4f}', size=18, backgroundcolor='white')
plt.savefig('../30_Output/20_Plot/100_CheckData/100_CheckData_y_paper_joint.pdf')