In [148]:
#import library

import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from collections import defaultdict
import math
import os

In this research we use the ionic conductivity dataset from https://github.com/FALL-ML/materials-discovery. The formulas in the dataset are written in inconsistent formats, so we converted them all to a single format: the stoichiometric coefficients of repeated elements are combined, and every coefficient is written to the right of its element. For example, 0.7Li2S-0.3P2S5 becomes Li1.4S2.2P0.6.

In [149]:
# Project root. Defaults to the original author's checkout; set the
# ML_SSE_BASE_DIR environment variable to run the notebook from another location.
base_dir = os.environ.get('ML_SSE_BASE_DIR', 'D:/GITHUB/ML_SSE')

# Path of the raw CSV file (despite the name, this is a file path, not a directory)
dataset_dir = os.path.join(base_dir, 'data/dataset.csv')

#read dataset
dataset = pd.read_csv(dataset_dir, encoding='unicode_escape')

# Copy of the table exactly as loaded; `dataset` itself is filtered and extended below
dataset_org = dataset.copy()

In [150]:
# Inspect the column names of the loaded table
dataset.columns

Index(['id', 'formula', 'target', 'balanced_charge', 'Ea (eV)', 'space group',
       'ICSD Entry', 'formula_preprocessed'],
      dtype='object')

Discard entries whose ionic conductivity is uncertain, for example '<-10'. Such values are stored as strings, unlike the rest of the column, which is numeric.

In [151]:
#drop target with string type
# Values that cannot be parsed as numbers become NaN under errors='coerce' and are filtered out.
numeric_target = pd.to_numeric(dataset['target'], errors='coerce')
# .copy() makes the filtered frame independent, so the column assignment below
# does not write into a slice of the original (avoids SettingWithCopyWarning).
dataset = dataset[numeric_target.notnull()].copy()
dataset['target'] = dataset['target'].astype(float)

#drop formula with unbalanced stoichiometry
# .copy() again so that later cells can add columns to `dataset` safely.
dataset = dataset[dataset['balanced_charge']==1].copy()

In [152]:
# Display the table after filtering
dataset

Unnamed: 0,id,formula,target,balanced_charge,Ea (eV),space group,ICSD Entry,formula_preprocessed
0,1,LiAlSi3O8,1.300000e-10,1,,2.0,81980.0,LiAlSi3O8
1,2,LiSn2(PO4)3,2.040000e-09,1,,2.0,83832.0,LiSn2(PO4)3
2,3,Li7BiO6,8.800000e-07,1,0.580,2.0,155950.0,Li7BiO6
3,4,Li7SbO6,6.700000e-08,1,0.700,2.0,413370.0,Li7SbO6
4,5,Li7P3S11,1.700000e-02,1,0.170,2.0,157654.0,Li7P3S11
...,...,...,...,...,...,...,...,...
1341,1342,Li6.6La2.5Y0.5Zr1.6Ta0.4O12,2.260000e-04,1,0.390,,,Li6.6La2.5Y0.5Zr1.6Ta0.4O12
1342,1343,Li2ZrS3,7.300000e-06,1,,,,Li2ZrS3
1343,1344,Li2.2Zn0.1Zr0.9S3,1.200000e-04,1,,,,Li2.2Zn0.1Zr0.9S3
1344,1345,0.7Li2S-0.3P2S5,8.100000e-05,1,0.425,,,Li1.4S2.2P0.6


In [153]:
#convert target to log scale

def calc_log(list_val):
    """Return the base-10 logarithm of every value in ``list_val``.

    Parameters
    ----------
    list_val : iterable of numbers
        Values to transform; each one is passed to ``math.log10``.

    Returns
    -------
    list of float
        The logarithms, in the same order as the input values.

    Raises
    ------
    ValueError
        Propagated from ``math.log10`` when a value is zero or negative.
    """
    return [math.log10(value) for value in list_val]

# log10 of every conductivity value, in row order
list_log = calc_log(dataset['target'])

# assign() returns a new frame instead of writing a column into `dataset` in place;
# `dataset` is the result of boolean filtering, so an in-place write can raise
# SettingWithCopyWarning.
dataset = dataset.assign(target_log=list_log)
dataset

Unnamed: 0,id,formula,target,balanced_charge,Ea (eV),space group,ICSD Entry,formula_preprocessed,target_log
0,1,LiAlSi3O8,1.300000e-10,1,,2.0,81980.0,LiAlSi3O8,-9.886057
1,2,LiSn2(PO4)3,2.040000e-09,1,,2.0,83832.0,LiSn2(PO4)3,-8.690370
2,3,Li7BiO6,8.800000e-07,1,0.580,2.0,155950.0,Li7BiO6,-6.055517
3,4,Li7SbO6,6.700000e-08,1,0.700,2.0,413370.0,Li7SbO6,-7.173925
4,5,Li7P3S11,1.700000e-02,1,0.170,2.0,157654.0,Li7P3S11,-1.769551
...,...,...,...,...,...,...,...,...,...
1341,1342,Li6.6La2.5Y0.5Zr1.6Ta0.4O12,2.260000e-04,1,0.390,,,Li6.6La2.5Y0.5Zr1.6Ta0.4O12,-3.645892
1342,1343,Li2ZrS3,7.300000e-06,1,,,,Li2ZrS3,-5.136677
1343,1344,Li2.2Zn0.1Zr0.9S3,1.200000e-04,1,,,,Li2.2Zn0.1Zr0.9S3,-3.920819
1344,1345,0.7Li2S-0.3P2S5,8.100000e-05,1,0.425,,,Li1.4S2.2P0.6,-4.091515


Expand bracketed groups of elements. For example, LiSn2(PO4)3 becomes LiSn2P3O12.


In [154]:
import importlib
# NOTE(review): the wildcard import is kept because other names from open_bracket
# may be used outside this view; prefer `from open_bracket import open_bracket_multi`.
from open_bracket import *
# importlib.reload(open_bracket)

# Apply open_bracket_multi to every preprocessed formula (per the displayed output,
# e.g. LiSn2(PO4)3 -> LiSn2P3O12). The function is passed directly rather than
# wrapped in a lambda, and assign() returns a new frame instead of writing into
# the filtered `dataset` in place.
dataset = dataset.assign(
    formula_preprocessed2=dataset["formula_preprocessed"].apply(open_bracket_multi)
)
dataset
# open_bracket_multi('As200(PO4)0.5((PO2)0.2Be0.5(CO)0.2)2')

Unnamed: 0,id,formula,target,balanced_charge,Ea (eV),space group,ICSD Entry,formula_preprocessed,target_log,formula_preprocessed2
0,1,LiAlSi3O8,1.300000e-10,1,,2.0,81980.0,LiAlSi3O8,-9.886057,LiAlSi3O8
1,2,LiSn2(PO4)3,2.040000e-09,1,,2.0,83832.0,LiSn2(PO4)3,-8.690370,LiSn2P3O12
2,3,Li7BiO6,8.800000e-07,1,0.580,2.0,155950.0,Li7BiO6,-6.055517,Li7BiO6
3,4,Li7SbO6,6.700000e-08,1,0.700,2.0,413370.0,Li7SbO6,-7.173925,Li7SbO6
4,5,Li7P3S11,1.700000e-02,1,0.170,2.0,157654.0,Li7P3S11,-1.769551,Li7P3S11
...,...,...,...,...,...,...,...,...,...,...
1341,1342,Li6.6La2.5Y0.5Zr1.6Ta0.4O12,2.260000e-04,1,0.390,,,Li6.6La2.5Y0.5Zr1.6Ta0.4O12,-3.645892,Li6.6La2.5Y0.5Zr1.6Ta0.4O12
1342,1343,Li2ZrS3,7.300000e-06,1,,,,Li2ZrS3,-5.136677,Li2ZrS3
1343,1344,Li2.2Zn0.1Zr0.9S3,1.200000e-04,1,,,,Li2.2Zn0.1Zr0.9S3,-3.920819,Li2.2Zn0.1Zr0.9S3
1344,1345,0.7Li2S-0.3P2S5,8.100000e-05,1,0.425,,,Li1.4S2.2P0.6,-4.091515,Li1.4S2.2P0.6


There are hundreds of duplicate entries, i.e. rows that share the same expanded formula. Duplicates are combined by averaging their log-scale conductivity (`target_log`).



In [155]:
# True for every row whose expanded formula occurs more than once in the dataset
# (keep=False marks all occurrences, not only the repeats).
is_duplicated = dataset.duplicated(['formula_preprocessed2'], keep=False)

duplicate_data = dataset[is_duplicated]

# The complement of the same mask gives the rows with a unique formula. This replaces
# matching whole rows through a MultiIndex built from all columns, which depended on
# how NaN-containing rows compare and was needlessly indirect.
non_duplicate_data = dataset[~is_duplicated]

In [156]:
# Keep only the grouping key and the value that will be averaged
duplicate_data_ = duplicate_data[['formula_preprocessed2', 'target_log']]
duplicate_data_

Unnamed: 0,formula_preprocessed2,target_log
4,Li7P3S11,-1.769551
5,Li7P3S11,-2.494850
14,Li7P3S11,-2.065502
16,Li3.8Ge0.8P0.2S4,-5.749580
17,Li3.6Ge0.6P0.4S4,-3.756962
...,...,...
1330,Li1.5Sc0.5Ti1.5P3O12,-6.266001
1336,LiTi2P3O12,-5.143876
1338,Li6.6La3Zr1.6Ta0.4O12,-3.504456
1344,Li1.4S2.2P0.6,-4.091515


In [157]:
# Average target_log within each formula, then move the group key from the index
# back into a regular column (column order: target_log, formula_preprocessed2).
duplicate_data_combined = (
    duplicate_data_
    .groupby(['formula_preprocessed2'])
    .mean()
    .reset_index()
    .loc[:, ['target_log', 'formula_preprocessed2']]
)
duplicate_data_combined

Unnamed: 0,target_log,formula_preprocessed2
0,-2.778603,I0.3Li1.24P0.18S1.2Si0.14
1,-3.577451,La0.51Li0.34TiO2.94
2,-3.202646,La0.52Li0.45TiO3
3,-2.529914,La0.54Li0.36TiO3
4,-3.044754,La0.54Li0.39TiO3
...,...,...
129,-4.453637,LiS1.5Ge0.5
130,-5.060051,LiS1.5Si0.5
131,-4.972964,LiTi2P3O12
132,-7.230193,LiZr2P3O12


In [158]:
# Keep only the first row of each duplicated formula; its remaining columns
# (original formula, space group, ...) are the ones carried into the merged table.
is_repeat = duplicate_data.duplicated(['formula_preprocessed2'], keep='first')
duplicate_data = duplicate_data[~is_repeat]

# Attach those columns to the averaged values. Both frames contain target_log,
# so the merge suffixes them: target_log_x is the group mean, target_log_y the
# value of the first occurrence.
processed_duplicate_data = duplicate_data_combined.merge(
    duplicate_data,
    on='formula_preprocessed2',
    how='left',
)
processed_duplicate_data

Unnamed: 0,target_log_x,formula_preprocessed2,id,formula,target,balanced_charge,Ea (eV),space group,ICSD Entry,formula_preprocessed,target_log_y
0,-2.778603,I0.3Li1.24P0.18S1.2Si0.14,867,0.14 SiS2-0.09 P2S5-0.47 Li2S-0.30 LiI,1.320000e-03,1,0.34,,,I0.3Li1.24P0.18S1.2Si0.14,-2.879426
1,-3.577451,La0.51Li0.34TiO2.94,564,La0.51Li0.34TiO2.94,7.000000e-05,1,0.36,221.0,,La0.51Li0.34TiO2.94,-4.154902
2,-3.202646,La0.52Li0.45TiO3,200,La0.52Li0.45TiO3,5.010000e-04,1,,123.0,50434.0,La0.52Li0.45TiO3,-3.300162
3,-2.529914,La0.54Li0.36TiO3,550,La0.54Li0.36TiO3,9.790000e-03,1,,221.0,,La0.54Li0.36TiO3,-2.009217
4,-3.044754,La0.54Li0.39TiO3,199,La0.54Li0.39TiO3,6.510000e-04,1,,123.0,,La0.54Li0.39TiO3,-3.186419
...,...,...,...,...,...,...,...,...,...,...,...
129,-4.453637,LiS1.5Ge0.5,791,0.5(Li2S) 0.5(GeS2),3.300000e-05,1,0.35,,,Li1.0S1.5Ge0.5,-4.481486
130,-5.060051,LiS1.5Si0.5,1016,0.5Li2S-0.5SiS2,1.580000e-06,1,,,,Li1.0S1.5Si0.5,-5.801343
131,-4.972964,LiTi2P3O12,297,LiTi2(PO4)3,1.610000e-04,1,0.21,148.0,,LiTi2(PO4)3,-3.793174
132,-7.230193,LiZr2P3O12,310,LiZr2(PO4)3,2.960000e-10,1,,167.0,201935.0,LiZr2(PO4)3,-9.528708


In [159]:
# Common column layout of both final tables
final_columns = ['formula_unedited', 'formula', 'space group', 'target']

# Duplicated formulas: the target is the averaged log conductivity (target_log_x)
final_duplicate_data = processed_duplicate_data[
    ['formula', 'formula_preprocessed2', 'space group', 'target_log_x']
]
final_duplicate_data.columns = final_columns

# Unique formulas: the target is the row's own log conductivity
final_nonduplicate_data = non_duplicate_data[
    ['formula', 'formula_preprocessed2', 'space group', 'target_log']
]
final_nonduplicate_data.columns = final_columns

In [160]:
# Stack the averaged duplicates on top of the unique rows.
# NOTE(review): the index is not reset, so labels from the two frames can overlap
# (the first frame is numbered from 0, the second keeps its original row labels).
final_data = pd.concat([final_duplicate_data, final_nonduplicate_data])
final_data

Unnamed: 0,formula_unedited,formula,space group,target
0,0.14 SiS2-0.09 P2S5-0.47 Li2S-0.30 LiI,I0.3Li1.24P0.18S1.2Si0.14,,-2.778603
1,La0.51Li0.34TiO2.94,La0.51Li0.34TiO2.94,221.0,-3.577451
2,La0.52Li0.45TiO3,La0.52Li0.45TiO3,123.0,-3.202646
3,La0.54Li0.36TiO3,La0.54Li0.36TiO3,221.0,-2.529914
4,La0.54Li0.39TiO3,La0.54Li0.39TiO3,123.0,-3.044754
...,...,...,...,...
1339,Li6.6La2.875Y0.125Zr1.6Ta0.4O12,Li6.6La2.875Y0.125Zr1.6Ta0.4O12,,-3.498941
1340,Li6.6La2.75Y0.25Zr1.6Ta0.4O12,Li6.6La2.75Y0.25Zr1.6Ta0.4O12,,-3.360514
1341,Li6.6La2.5Y0.5Zr1.6Ta0.4O12,Li6.6La2.5Y0.5Zr1.6Ta0.4O12,,-3.645892
1342,Li2ZrS3,Li2ZrS3,,-5.136677


### Split Dataset 5 Fold CV

In [174]:
random_state = 1

def create_k_fold(data, url):
    """Hold out a test set, then write 5-fold train/validation splits to disk.

    ``data`` is first split 80/20 into a training pool and a test set. The
    training pool is then divided with a shuffled 5-fold ``KFold``. For fold
    ``i`` (numbered from 1) the files ``train.csv``, ``val.csv`` and
    ``test.csv`` are written to ``<url>/<i>/``; the same test set is written
    into every fold directory. Both splits are seeded with the module-level
    ``random_state``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to split (must support ``.iloc`` and ``.to_csv``).
    url : str or path-like
        Directory under which the numbered fold directories are created.

    Returns
    -------
    None
        The function only writes files.
    """
    kf = KFold(n_splits=5, shuffle=True, random_state=random_state)

    train, test = train_test_split(data, test_size=0.2, shuffle=True, random_state=random_state)

    # enumerate(start=1) numbers the fold directories 1..5 without shadowing the builtin `id`
    for fold_number, (train_index, val_index) in enumerate(kf.split(train), start=1):
        fold_dir = os.path.join(url, str(fold_number))
        # exist_ok=True replaces the check-then-create sequence and keeps re-runs safe
        os.makedirs(fold_dir, exist_ok=True)
        train.iloc[train_index].to_csv(os.path.join(fold_dir, 'train.csv'))
        train.iloc[val_index].to_csv(os.path.join(fold_dir, 'val.csv'))
        test.to_csv(os.path.join(fold_dir, 'test.csv'))

In [175]:
# create folder for 5 fold cv

# Path
path = os.path.join(base_dir, 'data/preprocessed_dataset/5fold_cv')

# Create the directory
# exist_ok=True tolerates a directory that already exists, while real failures
# (for example a permission error) are raised instead of being silently swallowed
# by a bare `except`.
os.makedirs(path, exist_ok=True)

In [176]:
# Write the train/val/test CSV files for every fold under `path`
create_k_fold(final_data, path)