In [2]:
import pandas as pd
import numpy as np
from pymatgen.core import Composition

# No warnings about setting value on copy of slice
pd.options.mode.chained_assignment = None

# Display up to 60 columns of a dataframe
pd.set_option('display.max_columns', 60)

# Matplotlib visualization
import matplotlib.pyplot as plt
from matplotlib import rcParams
%matplotlib inline

# Internal ipython tool for setting figure size
from IPython.core.pylabtools import figsize

# Seaborn for visualization
import seaborn as sns

# Splitting data into training and testing
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

config = {
    "mathtext.fontset":'stix',
    "font.family":'serif',
    "font.serif": ['Times New Roman'],
    "font.size": 24,
    'axes.unicode_minus': False 
}
rcParams.update(config)
plt.rcParams['axes.unicode_minus'] = False  
large = 22; med = 16; small = 12
params = {'axes.titlesize': large,
          'legend.fontsize': med,
          'figure.figsize': (8, 6),
          'axes.labelsize': med,
          'axes.titlesize': med,
          'xtick.labelsize': med,
          'ytick.labelsize': med,
          'figure.titlesize': large}
plt.rcParams.update(params)
plt.rcParams['figure.dpi'] = 300 

In [3]:
df01= pd.read_csv('./Data/New_Eg_6.6w.csv')
df01

Unnamed: 0,formula,hse_fold0,hse_fold1,hse_fold2,hse_fold3,hse_fold4,mean_hse,mean_eh,A_Density,B_Density,X_Density,B_dipole Polarizability,A_number of Valence Electrons,X_number of Valence Electrons,B_number,A_number of s+p Electrons,B_number of s+p Electrons,X_number of s+p Electrons,A_number of d Electrons,X_number of d Electrons,avg ionic char,B_Electronegativity,mean
0,Cs0.47Rb0.53Ca1Cl3,5.809389,5.248012,6.039049,5.751761,5.965189,5.762680,0.107727,1.6898,1.5400,0.00963,160.80,1.0,21.0,20.00,1.0,2.00,21.0,0.0,0.0,0.172973,1.0000,5.891347
1,Cs0.47Rb0.53Sr1Cl3,5.700661,5.061179,5.825304,5.644604,5.909904,5.628331,0.057384,1.6898,2.6400,0.00963,197.20,1.0,21.0,38.00,1.0,2.00,21.0,0.0,0.0,0.174792,0.9500,5.770118
2,Cs0.47Rb0.53Mn1Cl3,2.738756,2.699255,2.715833,2.640517,2.822403,2.723353,0.013267,1.6898,7.4400,0.00963,68.00,1.0,21.0,25.00,1.0,2.00,21.0,0.0,0.0,0.152377,1.5500,2.698590
3,Cs0.47Rb0.53Fe1Cl3,2.821851,2.349560,3.023580,2.866029,0.000000,2.212204,0.080834,1.6898,7.8700,0.00963,62.00,1.0,21.0,26.00,1.0,2.00,21.0,0.0,0.0,0.142088,1.8300,2.765255
4,Cs0.47Rb0.53Pd1Cl3,0.000000,2.958490,0.000000,3.009616,3.122243,1.818070,0.034107,1.6898,12.0000,0.00963,26.14,1.0,21.0,46.00,1.0,0.00,21.0,0.0,0.0,0.130062,2.2000,3.030116
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65956,Rb1V0.49Sn0.51Br3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.070500,1.5300,6.7118,9.36000,69.66,1.0,21.0,36.77,1.0,3.02,21.0,0.0,0.0,0.125254,1.7983,0.000000
65957,Rb1V0.49Pb0.51Br3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.063449,1.5300,8.7569,9.36000,66.60,1.0,21.0,53.09,1.0,3.02,21.0,0.0,0.0,0.121586,1.9870,0.000000
65958,Rb1Eu0.51V0.49Br3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.064295,1.5300,5.6663,9.36000,136.47,1.0,21.0,43.40,1.0,2.00,21.0,0.0,0.0,0.139952,1.4107,0.000000
65959,Rb1Tm0.51V0.49Br3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.036594,1.5300,7.7471,9.36000,116.07,1.0,21.0,46.46,1.0,2.00,21.0,0.0,0.0,0.138803,1.4362,0.000000


In [4]:
df01['Metal_or_not'] = df01['mean'].apply(lambda x: 0 if x <= 0.1 else 1)
df01

Unnamed: 0,formula,hse_fold0,hse_fold1,hse_fold2,hse_fold3,hse_fold4,mean_hse,mean_eh,A_Density,B_Density,X_Density,B_dipole Polarizability,A_number of Valence Electrons,X_number of Valence Electrons,B_number,A_number of s+p Electrons,B_number of s+p Electrons,X_number of s+p Electrons,A_number of d Electrons,X_number of d Electrons,avg ionic char,B_Electronegativity,mean,Metal_or_not
0,Cs0.47Rb0.53Ca1Cl3,5.809389,5.248012,6.039049,5.751761,5.965189,5.762680,0.107727,1.6898,1.5400,0.00963,160.80,1.0,21.0,20.00,1.0,2.00,21.0,0.0,0.0,0.172973,1.0000,5.891347,1
1,Cs0.47Rb0.53Sr1Cl3,5.700661,5.061179,5.825304,5.644604,5.909904,5.628331,0.057384,1.6898,2.6400,0.00963,197.20,1.0,21.0,38.00,1.0,2.00,21.0,0.0,0.0,0.174792,0.9500,5.770118,1
2,Cs0.47Rb0.53Mn1Cl3,2.738756,2.699255,2.715833,2.640517,2.822403,2.723353,0.013267,1.6898,7.4400,0.00963,68.00,1.0,21.0,25.00,1.0,2.00,21.0,0.0,0.0,0.152377,1.5500,2.698590,1
3,Cs0.47Rb0.53Fe1Cl3,2.821851,2.349560,3.023580,2.866029,0.000000,2.212204,0.080834,1.6898,7.8700,0.00963,62.00,1.0,21.0,26.00,1.0,2.00,21.0,0.0,0.0,0.142088,1.8300,2.765255,1
4,Cs0.47Rb0.53Pd1Cl3,0.000000,2.958490,0.000000,3.009616,3.122243,1.818070,0.034107,1.6898,12.0000,0.00963,26.14,1.0,21.0,46.00,1.0,0.00,21.0,0.0,0.0,0.130062,2.2000,3.030116,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65956,Rb1V0.49Sn0.51Br3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.070500,1.5300,6.7118,9.36000,69.66,1.0,21.0,36.77,1.0,3.02,21.0,0.0,0.0,0.125254,1.7983,0.000000,0
65957,Rb1V0.49Pb0.51Br3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.063449,1.5300,8.7569,9.36000,66.60,1.0,21.0,53.09,1.0,3.02,21.0,0.0,0.0,0.121586,1.9870,0.000000,0
65958,Rb1Eu0.51V0.49Br3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.064295,1.5300,5.6663,9.36000,136.47,1.0,21.0,43.40,1.0,2.00,21.0,0.0,0.0,0.139952,1.4107,0.000000,0
65959,Rb1Tm0.51V0.49Br3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.036594,1.5300,7.7471,9.36000,116.07,1.0,21.0,46.46,1.0,2.00,21.0,0.0,0.0,0.138803,1.4362,0.000000,0


In [5]:
R_Eg = df01[df01['Metal_or_not'] != 0].reset_index(drop=True)
R_Eg

Unnamed: 0,formula,hse_fold0,hse_fold1,hse_fold2,hse_fold3,hse_fold4,mean_hse,mean_eh,A_Density,B_Density,X_Density,B_dipole Polarizability,A_number of Valence Electrons,X_number of Valence Electrons,B_number,A_number of s+p Electrons,B_number of s+p Electrons,X_number of s+p Electrons,A_number of d Electrons,X_number of d Electrons,avg ionic char,B_Electronegativity,mean,Metal_or_not
0,Cs0.47Rb0.53Ca1Cl3,5.809389,5.248012,6.039049,5.751761,5.965189,5.762680,0.107727,1.6898,1.5400,0.00963,160.8000,1.0,21.0,20.00,1.0,2.00,21.0,0.0,0.0,0.172973,1.0000,5.891347,1
1,Cs0.47Rb0.53Sr1Cl3,5.700661,5.061179,5.825304,5.644604,5.909904,5.628331,0.057384,1.6898,2.6400,0.00963,197.2000,1.0,21.0,38.00,1.0,2.00,21.0,0.0,0.0,0.174792,0.9500,5.770118,1
2,Cs0.47Rb0.53Mn1Cl3,2.738756,2.699255,2.715833,2.640517,2.822403,2.723353,0.013267,1.6898,7.4400,0.00963,68.0000,1.0,21.0,25.00,1.0,2.00,21.0,0.0,0.0,0.152377,1.5500,2.698590,1
3,Cs0.47Rb0.53Fe1Cl3,2.821851,2.349560,3.023580,2.866029,0.000000,2.212204,0.080834,1.6898,7.8700,0.00963,62.0000,1.0,21.0,26.00,1.0,2.00,21.0,0.0,0.0,0.142088,1.8300,2.765255,1
4,Cs0.47Rb0.53Pd1Cl3,0.000000,2.958490,0.000000,3.009616,3.122243,1.818070,0.034107,1.6898,12.0000,0.00963,26.1400,1.0,21.0,46.00,1.0,0.00,21.0,0.0,0.0,0.130062,2.2000,3.030116,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49856,Rb1Yb0.49Zn0.51Br3,4.006095,3.842813,4.002011,3.841367,4.344862,4.007430,0.056400,1.5300,7.0516,9.36000,87.8317,1.0,21.0,49.60,1.0,2.00,21.0,0.0,0.0,0.141535,1.3805,3.923072,1
49857,Rb1Yb0.49Cd0.51Br3,3.893540,3.384940,2.059169,3.612357,4.300837,3.450169,0.055445,1.5300,7.8472,9.36000,91.5700,1.0,21.0,58.78,1.0,2.00,21.0,0.0,0.0,0.140888,1.4009,3.797919,1
49858,Rb1Yb0.49Ge0.51Br3,3.376031,3.720644,3.780582,3.486729,3.735521,3.619901,0.088085,1.5300,6.1285,9.36000,88.5100,1.0,21.0,50.62,1.0,3.02,21.0,0.0,0.0,0.136542,1.5641,3.745582,1
49859,Rb1Yb0.49Sn0.51Br3,3.120219,3.471664,3.428249,3.293875,3.088541,3.280510,0.054852,1.5300,7.1332,9.36000,95.1400,1.0,21.0,59.80,1.0,3.02,21.0,0.0,0.0,0.137113,1.5386,3.397929,1


In [6]:
from sklearn.model_selection import train_test_split
X = R_Eg.iloc[:, 8:-2]
y=R_Eg['mean']
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,shuffle=True,random_state=0)
print(X_train.shape,X_test.shape)

(39888, 14) (9973, 14)


In [8]:
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor
from tqdm import tqdm
XGBR = XGBRegressor(n_jobs=-1,random_state=42)
XGBR.fit(X_train, y_train)
feature_importance=pd.DataFrame()
feature_importance['features'] = X.columns
feature_importance['feature_importances'] = XGBR.feature_importances_
threshold = np.sort(feature_importance['feature_importances'].values)
score = []
n_features = []
cols = []
for i in tqdm(threshold):
    selector = SelectFromModel(XGBR,threshold=i)
    X_embedded = selector.fit_transform(X_train, y_train)
    once = cross_val_score(XGBR,X_embedded,y_train,cv=5,scoring="r2",n_jobs=-1).mean()#交叉验证
    score.append(once)
    n_features.append(X_embedded.shape[1])
    cols.append(X.columns[selector.get_support()])
result = pd.DataFrame({"col":cols,"score":score,"n_features":n_features})
result

100%|██████████| 14/14 [00:22<00:00,  1.58s/it]


Unnamed: 0,col,score,n_features
0,"Index(['A_Density', 'B_Density', 'X_Density', ...",0.803473,14
1,"Index(['A_Density', 'B_Density', 'X_Density', ...",0.803473,14
2,"Index(['A_Density', 'B_Density', 'X_Density', ...",0.803473,14
3,"Index(['A_Density', 'B_Density', 'X_Density', ...",0.803473,14
4,"Index(['A_Density', 'B_Density', 'X_Density', ...",0.803473,14
5,"Index(['A_Density', 'B_Density', 'X_Density', ...",0.803473,14
6,"Index(['A_Density', 'B_Density', 'X_Density', ...",0.803473,8
7,"Index(['B_Density', 'X_Density', 'B_dipole Pol...",0.799507,7
8,"Index(['B_Density', 'B_dipole Polarizability',...",0.799719,6
9,"Index(['B_Density', 'B_dipole Polarizability',...",0.782835,5


In [9]:
to_select = result.sort_values(by='score',ascending=False)
feature_labels = to_select[to_select['n_features']==6]['col'].values[0]
X_train = X_train[feature_labels]
X_test = X_test[feature_labels]
X_train.shape, X_test.shape

((39888, 6), (9973, 6))

In [10]:
print(feature_labels)

Index(['B_Density', 'B_dipole Polarizability', 'B_number',
       'B_number of s+p Electrons', 'avg ionic char', 'B_Electronegativity'],
      dtype='object')


In [11]:
XGBR.fit(X_train,y_train)
y_pred = XGBR.predict(X_test)

In [12]:
from sklearn.metrics import *
from math import sqrt
print(f'r2:{r2_score(y_test,y_pred)}')
print(f'mae:{mean_absolute_error(y_test,y_pred)}')
print(f'rmse: {sqrt(mean_squared_error(y_test, y_pred))}')

r2:0.8139058750055591
mae:0.4955671969621889
rmse: 0.7005543748825268


# 下面跑LGBMR：

In [13]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,shuffle=True,random_state=0)
print(X_train.shape,X_test.shape)

(39888, 14) (9973, 14)


In [14]:
from lightgbm import LGBMRegressor
LGBMR = LGBMRegressor(n_jobs=-1, random_state=42)
LGBMR.fit(X_train, y_train)
feature_importance=pd.DataFrame()
feature_importance['features'] = X.columns
feature_importance['feature_importances'] = LGBMR.feature_importances_
threshold = np.sort(feature_importance['feature_importances'].values)
score = []
n_features = []
cols = []
for i in tqdm(threshold):
    selector = SelectFromModel(LGBMR,threshold=i)
    X_embedded = selector.fit_transform(X_train, y_train)
    once = cross_val_score(LGBMR,X_embedded,y_train,cv=5,scoring="r2",n_jobs=-1).mean()#交叉验证
    score.append(once)
    n_features.append(X_embedded.shape[1])
    cols.append(X.columns[selector.get_support()])
result = pd.DataFrame({"col":cols,"score":score,"n_features":n_features})
result

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.041865 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1563
[LightGBM] [Info] Number of data points in the train set: 39888, number of used features: 9
[LightGBM] [Info] Start training from score 2.530425



  0%|          | 0/14 [00:00<?, ?it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001440 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1563
[LightGBM] [Info] Number of data points in the train set: 39888, number of used features: 9
[LightGBM] [Info] Start training from score 2.530425



  7%|▋         | 1/14 [04:06<53:19, 246.10s/it]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001890 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1563
[LightGBM] [Info] Number of data points in the train set: 39888, number of used features: 9
[LightGBM] [Info] Start training from score 2.530425



 14%|█▍        | 2/14 [08:12<49:12, 246.05s/it]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.029705 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1563
[LightGBM] [Info] Number of data points in the train set: 39888, number of used features: 9
[LightGBM] [Info] Start training from score 2.530425



 21%|██▏       | 3/14 [12:17<45:01, 245.56s/it]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001622 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1563
[LightGBM] [Info] Number of data points in the train set: 39888, number of used features: 9
[LightGBM] [Info] Start training from score 2.530425



 29%|██▊       | 4/14 [16:21<40:52, 245.24s/it]

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000826 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1563
[LightGBM] [Info] Number of data points in the train set: 39888, number of used features: 9
[LightGBM] [Info] Start training from score 2.530425



 36%|███▌      | 5/14 [21:44<40:58, 273.12s/it]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.031013 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1563
[LightGBM] [Info] Number of data points in the train set: 39888, number of used features: 9
[LightGBM] [Info] Start training from score 2.530425



 43%|████▎     | 6/14 [25:50<35:10, 263.85s/it]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001400 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1563
[LightGBM] [Info] Number of data points in the train set: 39888, number of used features: 9
[LightGBM] [Info] Start training from score 2.530425



 50%|█████     | 7/14 [30:21<31:03, 266.22s/it]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.054134 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1563
[LightGBM] [Info] Number of data points in the train set: 39888, number of used features: 9
[LightGBM] [Info] Start training from score 2.530425



 57%|█████▋    | 8/14 [36:32<29:57, 299.62s/it]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.056163 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1563
[LightGBM] [Info] Number of data points in the train set: 39888, number of used features: 9
[LightGBM] [Info] Start training from score 2.530425



 64%|██████▍   | 9/14 [42:26<26:23, 316.66s/it]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.051221 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1563
[LightGBM] [Info] Number of data points in the train set: 39888, number of used features: 9
[LightGBM] [Info] Start training from score 2.530425



 71%|███████▏  | 10/14 [48:18<21:50, 327.57s/it]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.057633 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1563
[LightGBM] [Info] Number of data points in the train set: 39888, number of used features: 9
[LightGBM] [Info] Start training from score 2.530425



 79%|███████▊  | 11/14 [54:11<16:46, 335.45s/it]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.054391 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1563
[LightGBM] [Info] Number of data points in the train set: 39888, number of used features: 9
[LightGBM] [Info] Start training from score 2.530425



 86%|████████▌ | 12/14 [1:00:10<11:25, 342.59s/it]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.065273 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1563
[LightGBM] [Info] Number of data points in the train set: 39888, number of used features: 9
[LightGBM] [Info] Start training from score 2.530425



 93%|█████████▎| 13/14 [1:05:03<05:27, 327.45s/it]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002225 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1563
[LightGBM] [Info] Number of data points in the train set: 39888, number of used features: 9
[LightGBM] [Info] Start training from score 2.530425


100%|██████████| 14/14 [1:09:36<00:00, 298.35s/it]


Unnamed: 0,col,score,n_features
0,"Index(['A_Density', 'B_Density', 'X_Density', ...",0.738173,14
1,"Index(['A_Density', 'B_Density', 'X_Density', ...",0.738173,14
2,"Index(['A_Density', 'B_Density', 'X_Density', ...",0.738173,14
3,"Index(['A_Density', 'B_Density', 'X_Density', ...",0.738173,14
4,"Index(['A_Density', 'B_Density', 'X_Density', ...",0.738173,14
5,"Index(['A_Density', 'B_Density', 'X_Density', ...",0.738173,14
6,"Index(['A_Density', 'B_Density', 'X_Density', ...",0.738173,8
7,"Index(['A_Density', 'B_Density', 'B_dipole Pol...",0.737561,7
8,"Index(['B_Density', 'B_dipole Polarizability',...",0.736147,6
9,"Index(['B_Density', 'B_dipole Polarizability',...",0.663208,5


In [15]:
to_select = result.sort_values(by='score',ascending=False)
feature_labels = to_select[to_select['n_features']==6]['col'].values[0]
X_train = X_train[feature_labels]
X_test = X_test[feature_labels]
X_train.shape, X_test.shape

((39888, 6), (9973, 6))

In [16]:
print(feature_labels)

Index(['B_Density', 'B_dipole Polarizability', 'B_number',
       'B_number of s+p Electrons', 'avg ionic char', 'B_Electronegativity'],
      dtype='object')


In [17]:
LGBMR.fit(X_train,y_train)
y_pred = LGBMR.predict(X_test)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.023753 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1455
[LightGBM] [Info] Number of data points in the train set: 39888, number of used features: 6
[LightGBM] [Info] Start training from score 2.530425


In [18]:
print(f'r2:{r2_score(y_test,y_pred)}')
print(f'mae:{mean_absolute_error(y_test,y_pred)}')
print(f'rmse: {sqrt(mean_squared_error(y_test, y_pred))}')

r2:0.7470124678604537
mae:0.6024880457588514
rmse: 0.8168175822612217
