### The goal of this notebook is to find material systems with a high melting point and a high ITR. Four different models were used to make the predictions: KRR, XGB, DNN, and an ensemble model.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import time
import random
import torch
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as ConstantKernel
from sklearn.kernel_ridge import KernelRidge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import minmax_scale

# Load data

In [2]:
# Size of the per-model shortlist: how many of the highest-ranked rows each
# ranking cell below keeps.
top = 1000

# Table of material systems; the cells below read its fmelt/smelt columns and
# the per-model *_ITR prediction columns. The path is relative to the
# notebook's working directory.
ms_ITR = pd.read_excel('ms_ITR_600K_20210626.xlsx')

# Material system properties

## High melting point, high ITR

## KRR model

In [3]:
# KRR shortlist: order every material system by its KRR-predicted ITR and keep
# the `top` best ones.
# rank() with default arguments numbers rows in ascending order, so the largest
# krr_ITR receives the largest rank value.
# NOTE(review): confirm that this rank direction is the intended one.
ms_ITR['krr_rank'] = ms_ITR['krr_ITR'].rank()
# In-place sort: ms_ITR itself stays ordered by descending krr_ITR afterwards.
ms_ITR.sort_values(by='krr_ITR', ascending=False, inplace=True)
data1 = ms_ITR.iloc[:top]
print(data1['fmelt'].describe(), data1['smelt'].describe(), sep='\n')

# Keep only the shortlisted systems whose fmelt and smelt are both at least
# 600 K (the bound is inclusive), and save them.
krr_high_mp = data1.loc[data1[['fmelt', 'smelt']].ge(600).all(axis=1)]
krr_high_mp.to_excel('krr_high_mp_600K_20210626.xlsx')
krr_high_mp_index = krr_high_mp.index

# The 100 highest rows under the KRR ordering, without the melting-point
# filter. This is used to compare with literature models.
krr_high_ITR = ms_ITR.iloc[:100]
krr_high_ITR_index = krr_high_ITR.index
krr_high_ITR.to_excel('krr_high_ITR_600K_20210626.xlsx')
print('Done!')


count    1000.000000
mean     1210.541000
std       438.768949
min       523.000000
25%       860.000000
50%      1230.000000
75%      1422.000000
max      3660.000000
Name: fmelt, dtype: float64
count    1000.000000
mean     1734.120500
std       705.636314
min       341.000000
25%      1293.000000
50%      1680.000000
75%      2083.000000
max      4510.000000
Name: smelt, dtype: float64
Done!


## XGB model

In [4]:
# XGB shortlist: order every material system by its XGB-predicted ITR and keep
# the `top` best ones.
# rank() with default arguments numbers rows in ascending order, so the largest
# xgb_ITR receives the largest rank value.
# NOTE(review): confirm that this rank direction is the intended one.
ms_ITR['xgb_rank'] = ms_ITR['xgb_ITR'].rank()
# In-place sort: ms_ITR itself stays ordered by descending xgb_ITR afterwards.
ms_ITR.sort_values(by='xgb_ITR', ascending=False, inplace=True)
data1 = ms_ITR.iloc[:top]
print(data1['fmelt'].describe(), data1['smelt'].describe(), sep='\n')

# Keep only the shortlisted systems whose fmelt and smelt are both at least
# 600 K (the bound is inclusive), and save them.
xgb_high_mp = data1.loc[data1[['fmelt', 'smelt']].ge(600).all(axis=1)]
xgb_high_mp.to_excel('xgb_high_mp_600K_20210626.xlsx')
xgb_high_mp_index = xgb_high_mp.index

# The 100 highest rows under the XGB ordering, without the melting-point
# filter. This is used to compare with literature models.
xgb_high_ITR = ms_ITR.iloc[:100]
xgb_high_ITR_index = xgb_high_ITR.index
xgb_high_ITR.to_excel('xgb_high_ITR_600K_20210626.xlsx')
print('Done!')

count    1000.000000
mean      541.602500
std       302.916832
min       146.000000
25%       414.000000
50%       505.000000
75%       545.000000
max      2943.000000
Name: fmelt, dtype: float64
count    1000.00000
mean     2062.07275
std      1202.38146
min       146.00000
25%      1016.00000
50%      1820.50000
75%      3008.00000
max      4510.00000
Name: smelt, dtype: float64
Done!


## DNN model

In [5]:
# DNN shortlist: order every material system by its DNN-predicted ITR and keep
# the `top` best ones.
# rank() with default arguments numbers rows in ascending order, so the largest
# dnn_ITR receives the largest rank value.
# NOTE(review): confirm that this rank direction is the intended one.
ms_ITR['dnn_rank'] = ms_ITR['dnn_ITR'].rank()
# In-place sort: ms_ITR itself stays ordered by descending dnn_ITR afterwards.
ms_ITR.sort_values(by='dnn_ITR', ascending=False, inplace=True)
data2 = ms_ITR.iloc[:top]
print(data2['fmelt'].describe(), data2['smelt'].describe(), sep='\n')

# Keep only the shortlisted systems whose fmelt and smelt are both at least
# 600 K (the bound is inclusive), and save them.
dnn_high_mp = data2.loc[data2[['fmelt', 'smelt']].ge(600).all(axis=1)]
dnn_high_mp.to_excel('dnn_high_mp_600K_20210626.xlsx')
dnn_high_mp_index = dnn_high_mp.index

# The 100 highest rows under the DNN ordering, without the melting-point
# filter. This is used to compare with literature models.
dnn_high_ITR = ms_ITR.iloc[:100]
dnn_high_ITR_index = dnn_high_ITR.index
dnn_high_ITR.to_excel('dnn_high_ITR_600K_20210626.xlsx')
print('Done!')

count    1000.000000
mean      975.709750
std       355.968032
min       146.000000
25%       723.000000
50%       985.000000
75%      1211.000000
max      2748.000000
Name: fmelt, dtype: float64
count    1000.000000
mean     1151.117000
std       644.443492
min       146.000000
25%       732.000000
50%       935.000000
75%      1390.000000
max      3700.000000
Name: smelt, dtype: float64
Done!


## Ensemble model

In [6]:
# Ensemble shortlist: order every material system by its ensemble-predicted
# ITR (en_ITR) and keep the `top` best ones.
# rank() with default arguments numbers rows in ascending order, so the largest
# en_ITR receives the largest rank value.
# NOTE(review): confirm that this rank direction is the intended one.
ms_ITR['en_rank'] = ms_ITR['en_ITR'].rank()
# In-place sort: ms_ITR itself stays ordered by descending en_ITR afterwards,
# which any later cell that reads ms_ITR in row order will observe.
ms_ITR.sort_values(by='en_ITR', ascending=False, inplace=True)
data1 = ms_ITR.iloc[:top]
print(data1['fmelt'].describe(), data1['smelt'].describe(), sep='\n')

# Keep only the shortlisted systems whose fmelt and smelt are both at least
# 600 K (the bound is inclusive), and save them.
en_high_mp = data1.loc[data1[['fmelt', 'smelt']].ge(600).all(axis=1)]
en_high_mp.to_excel('en_high_mp_600K_20210626.xlsx')
en_high_mp_index = en_high_mp.index

# The 20 highest rows under the ensemble ordering, without the melting-point
# filter. This is used to compare with literature models.
# NOTE(review): the KRR/XGB/DNN cells keep 100 rows here; confirm 20 is intended.
en_high_ITR = ms_ITR.iloc[:20]
en_high_ITR_index = en_high_ITR.index
en_high_ITR.to_excel('en_high_ITR_600K_20210626.xlsx')
print('Done!')

count    1000.000000
mean      783.372250
std       371.784431
min       146.000000
25%       545.000000
50%       553.000000
75%      1126.000000
max      1928.000000
Name: fmelt, dtype: float64
count    1000.000000
mean     1923.819500
std      1062.039851
min       146.000000
25%      1183.000000
50%      1683.000000
75%      2555.000000
max      4510.000000
Name: smelt, dtype: float64
Done!


# Results evaluation

## High mp, high ITR material systems predicted by Ensemble model

In [24]:
# High melting point, high ITR systems predicted by the ensemble model.
# Rows are selected by label with .loc rather than by filtering ms_ITR with
# index.isin(): the result then follows the order of en_high_mp_index and no
# longer depends on however ms_ITR happens to be sorted when this cell runs
# (the ranking cells re-sort ms_ITR in place).
# Assumes ms_ITR's index labels are unique — TODO confirm.
df_en = ms_ITR.loc[
    en_high_mp_index,
    ['fMaterial', 'fFormula', 'fmelt', 'sMaterial', 'sFormula', 'smelt',
     'krr_ITR', 'xgb_ITR', 'dnn_ITR', 'en_ITR'],
]
print(df_en.shape)
# Show only the first 30 rows to keep the rendered output small.
df_en.head(30)

(439, 10)


Unnamed: 0,fMaterial,fFormula,fmelt,sMaterial,sFormula,smelt,krr_ITR,xgb_ITR,dnn_ITR,en_ITR
65055,Platinum sulfide,PtS,1663.0,Diamond,C,3823.0,41.741267,77.455482,32.068253,50.421667
65056,Platinum sulfide,PtS,1663.0,Graphene,gp-C,4510.0,40.398841,72.331673,34.170319,48.966944
65074,Platinum sulfide,PtS,1663.0,graphite,g-C,4260.0,40.555537,72.331673,32.541473,48.476228
67431,Palladium telluride,PdTe,1019.0,Diamond,C,3823.0,49.000974,48.967209,43.478565,47.148915
65946,Zinc sulfide,ZnS,1293.0,Diamond,C,3823.0,45.100245,72.7668,23.00745,46.958164
68025,Platinum telluride,PtTe,1230.0,Diamond,C,3823.0,47.929191,49.67429,37.471508,45.024996
23178,Lead oxide,PbO,758.0,Diamond,C,3823.0,39.671768,73.157845,22.099325,44.976313
73074,Lithium chloride,LiCl,887.0,Diamond,C,3823.0,35.213951,70.105171,28.35734,44.55882
65947,Zinc sulfide,ZnS,1293.0,Graphene,gp-C,4510.0,43.514722,66.290848,23.415897,44.407154
68026,Platinum telluride,PtTe,1230.0,Graphene,gp-C,4510.0,47.652498,43.198326,41.604836,44.151887


## High ITR prediction by Ensemble model

In [23]:
# Highest-ITR systems predicted by the ensemble model (no melting-point filter).
# `indexes` is not read in this cell; it is kept in case other cells use it.
indexes = [en_high_ITR_index]
# Rows are selected by label with .loc rather than by filtering ms_ITR with
# index.isin(): the result then follows the order of en_high_ITR_index and no
# longer depends on however ms_ITR happens to be sorted when this cell runs.
# Assumes ms_ITR's index labels are unique — TODO confirm.
df = ms_ITR.loc[
    en_high_ITR_index,
    ['fMaterial', 'fFormula', 'fmelt', 'sMaterial', 'sFormula', 'smelt', 'en_ITR'],
]
df

Unnamed: 0,fMaterial,fFormula,fmelt,sMaterial,sFormula,smelt,en_ITR
1793,Bismuth,Bi,545.0,Diamond,C,3823.0,65.674138
1812,Bismuth,Bi,545.0,graphite,g-C,4260.0,63.573216
1794,Bismuth,Bi,545.0,Graphene,gp-C,4510.0,63.340331
1833,Bismuth,Bi,545.0,Sulfur,S,393.0,55.655019
1832,Bismuth,Bi,545.0,Phosphorus,P,831.0,55.384094
1831,Bismuth,Bi,545.0,Boron,B,2092.0,51.527788
65055,Platinum sulfide,PtS,1663.0,Diamond,C,3823.0,50.421667
65056,Platinum sulfide,PtS,1663.0,Graphene,gp-C,4510.0,48.966944
1866,Bismuth,Bi,545.0,Mercuryix oxide,HgO,773.0,48.894515
65074,Platinum sulfide,PtS,1663.0,graphite,g-C,4260.0,48.476228
