In [6]:
import re
from sklearn.ensemble import RandomForestRegressor
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

class RFLR:
    """Random-forest regressor fed with per-element amounts parsed from composition strings.

    Each composition string (e.g. ``"Fe0.62C0.000953Mn0.000521"``) is turned
    into one row of a numeric table with one column per entry of
    ``ELEMENTS``; that table is what the sklearn forest is fitted on.
    """

    # Column order of the feature table. Symbols that are not listed here
    # are ignored when a composition string is parsed.
    ELEMENTS = ('C', 'Mn', 'Si', 'Cr', 'Ni', 'Mo', 'V', 'N', 'Nb', 'Co', 'W', 'Al', 'Ti')

    # One match is an element symbol immediately followed by its amount.
    # Capturing both in the same match keeps symbol and amount paired by
    # construction. The amount may carry an exponent ("1e-05"); the symbol
    # is one capital plus at most one lowercase letter, so an element that
    # has no explicit amount ("Fe" in "FeC0.5") cannot swallow the next one.
    _TERM = re.compile(r"([A-Z][a-z]?)((?:\d*\.\d+|\d+)(?:[eE][-+]?\d+)?)")

    def __init__(self):
        # use the random forest from sklearn
        self.rf = RandomForestRegressor(n_estimators=30, random_state=1)

    def convertfeatures(self, inseries):
        """Convert composition strings into a numeric feature table.

        Args:
            inseries: iterable of composition strings (list, numpy array, ...).

        Returns:
            ``numpy.ndarray`` of shape ``(len(inseries), len(ELEMENTS))``.
            Entry ``[i, j]`` is the amount given for ``ELEMENTS[j]`` in the
            i-th string, or 0 when that element does not occur with an
            explicit amount. If an element is listed more than once, the
            last amount wins.
        """
        column_of = {symbol: col for col, symbol in enumerate(self.ELEMENTS)}
        formulas = list(inseries)
        # Start from zeros so that absent elements need no special handling.
        X = np.zeros((len(formulas), len(self.ELEMENTS)))
        for row, formula in enumerate(formulas):
            for symbol, amount in self._TERM.findall(formula):
                col = column_of.get(symbol)
                if col is not None:
                    X[row, col] = float(amount)
        return X

    def train_and_validate(self, train_inputs, train_outputs):
        """Fit the forest and print its R2 and MAE on the training data.

        No separate validation split is made here; both printed scores are
        computed on the same data the forest was fitted on.

        Args:
            train_inputs: object with ``to_numpy()`` yielding composition strings.
            train_outputs: object with ``to_numpy()`` yielding the targets.
        """
        # NOTE(review): looks like a leftover debug marker; kept so the
        # console output is unchanged.
        print("hello")
        y = train_outputs.to_numpy()
        X = self.convertfeatures(train_inputs.to_numpy())
        self.rf.fit(X, y)
        print('training R2 = ' + str(round(self.rf.score(X, y), 3)))
        print('training MAE = %.3f' % mean_absolute_error(y_true=y, y_pred=self.rf.predict(X)))

    def predict(self, test_input, test_outputs=None):
        """Predict targets for composition strings.

        Args:
            test_input: object with ``to_numpy()`` yielding composition strings.
            test_outputs: optional object with ``to_numpy()`` yielding the
                true targets. When given, the test MAE is printed.

        Returns:
            The forest's predictions, one per input string.
        """
        Xtest = self.convertfeatures(test_input.to_numpy())
        # Run the forest once and reuse the result for both the metric and
        # the return value.
        predictions = self.rf.predict(Xtest)
        if test_outputs is not None:
            y = test_outputs.to_numpy()
            print('test MAE = %.3f' % mean_absolute_error(y_true=y, y_pred=predictions))
        return predictions
             
from matbench.bench import MatbenchBenchmark

# A single RFLR instance is shared by all folds; train_and_validate calls
# fit on it again for each fold.
my_model = RFLR()

# Restrict the benchmark to the steels task. With autoload=False the data
# is fetched explicitly through task.load() below.
mb = MatbenchBenchmark(autoload=False, subset=['matbench_steels'])

for task in mb.tasks:
    task.load()
    for fold in task.folds:
        # Fit on this fold's combined training + validation data.
        train_inputs, train_outputs = task.get_train_and_val_data(fold)
        my_model.train_and_validate(train_inputs, train_outputs)

        # Request the held-out targets as well, because RFLR.predict
        # prints the test MAE from them.
        test_inputs, test_outputs = task.get_test_data(fold, include_target=True)
        predictions = my_model.predict(test_inputs, test_outputs)

        # Hand this fold's predictions back to the benchmark.
        task.record(fold, predictions)

# Write everything that was recorded to disk.
mb.to_file("results.json.gz")

2022-06-28 21:36:17 INFO     Initialized benchmark 'matbench_v0.1' with 1 tasks: 
['matbench_steels']
2022-06-28 21:36:17 INFO     Loading dataset 'matbench_steels'...
2022-06-28 21:36:17 INFO     Dataset 'matbench_steels loaded.
hello
training R2 = 0.975
training MAE = 32.522
test MAE = 97.540
2022-06-28 21:36:17 INFO     Recorded fold matbench_steels-0 successfully.
hello
training R2 = 0.973
training MAE = 34.101
test MAE = 86.279
2022-06-28 21:36:18 INFO     Recorded fold matbench_steels-1 successfully.
hello
training R2 = 0.967
training MAE = 37.004
test MAE = 79.510
2022-06-28 21:36:18 INFO     Recorded fold matbench_steels-2 successfully.
hello
training R2 = 0.964
training MAE = 37.342
test MAE = 94.582
2022-06-28 21:36:18 INFO     Recorded fold matbench_steels-3 successfully.
hello
training R2 = 0.973
training MAE = 35.942
test MAE = 95.037
2022-06-28 21:36:19 INFO     Recorded fold matbench_steels-4 successfully.
2022-06-28 21:36:19 INFO     Successfully wrote MatbenchBenchmark