In [2]:
# NOTE: this notebook must be run on Python 3.7.

from mastml.mastml import Mastml

from mastml.datasets import SklearnDatasets
from mastml.datasets import LocalDatasets
from mastml.datasets import MatminerDatasets
from mastml.datasets import FigshareDatasets
from mastml.datasets import FoundryDatasets
from mastml.data_cleaning import DataCleaning
import numpy as np
import pandas as pd
from copy import copy
import mastml
import os 
from mastml.feature_selectors import SklearnFeatureSelector, EnsembleModelFeatureSelector
from mastml.feature_generators import ElementalFeatureGenerator, OneHotGroupGenerator
from mastml.learning_curve import LearningCurve
from mastml.models import SklearnModel, EnsembleModel
from mastml.preprocessing import SklearnPreprocessor
from mastml.data_splitters import SklearnDataSplitter, NoSplit, LeaveOutPercent
from mastml.hyper_opt import GridSearch, RandomizedSearch, BayesianSearch
import pymatgen as pmg

ModuleNotFoundError: No module named 'mastml'

In [2]:
# Location passed to Mastml as its `savepath`.
SAVEPATH = 'MastML/MastML'

# NOTE(review): this rebinds the name `mastml`, which the import cell also binds to the
# `mastml` package (`import mastml`). From here on `mastml` is the Mastml instance,
# not the module.
mastml = Mastml(savepath=SAVEPATH)
# Value handed to the later `.evaluate(..., savepath=savepath)` calls.
# NOTE(review): `get_savepath` is read without parentheses -- presumably a property; confirm.
savepath = mastml.get_savepath

In [3]:
# Name of the target column; passed as `target=` when the dataset is loaded.
target = 'Ctemp'

# Columns passed as `extra_columns=` when the dataset is loaded; 'Composition' is
# later read back from `X_extra`.
extra_columns = ['Composition']

In [4]:
# Load the DS1 compounds table from disk. 'Composition' is supplied both through
# `extra_columns` and as the group column.
d = LocalDatasets(
    file_path="Data/DS1-Compounds.csv",
    target=target,
    extra_columns=extra_columns,
    group_column='Composition',
    testdata_columns=None,
    as_frame=True,
)
data_dict = d.load_data()

# Pull out the four entries the rest of the notebook works with.
X, y = data_dict['X'], data_dict['y']
X_extra, groups = data_dict['X_extra'], data_dict['groups']



In [5]:
# Give each working name its own copy of X. A chained assignment
# (`X1 = X2 = ... = copy(X)`) evaluates `copy(X)` once and binds all five names
# to that single object, so an in-place change through one name would show up
# under the other four as well.
X1, X2, X3, X4, X5 = (copy(X) for _ in range(5))

In [6]:
# Configure the elemental feature generator on the 'Composition' column, then run it.
# X and y are replaced by the values the generator returns.
elemental_generator = ElementalFeatureGenerator(
    composition_df=X_extra['Composition'],
    feature_types=['composition_avg', 'arithmetic_avg', 'max', 'min', 'difference'],
    remove_constant_columns=True,
)
X, y = elemental_generator.evaluate(
    X=X,
    y=y,
    savepath=savepath,
    make_new_dir=True,
)

Dropping 5/440 generated columns due to missing values


In [None]:
# Run the one-hot group generator on the 'Composition' column.
# X and y are replaced by the values it returns.
one_hot_generator = OneHotGroupGenerator(groups=X_extra['Composition'])
X, y = one_hot_generator.evaluate(
    X=X,
    y=y,
    savepath=savepath,
    make_new_dir=True,
)

In [None]:
# Preprocessor configured with scikit-learn's StandardScaler; `as_frame=True` is requested.
preprocessor = SklearnPreprocessor(
    preprocessor='StandardScaler',
    as_frame=True,
)

In [None]:
# Apply the preprocessor; X is replaced by the value it returns.
X = preprocessor.evaluate(
    X=X,
    y=y,
    savepath=savepath,
    make_new_dir=True,
)
# Keep a copy of the full feature set (before feature selection) for later use.
Xcopy = copy(X)

In [None]:
# Fixed seed so the random-forest-driven feature selection is repeatable across
# kernel restarts; without it the selected feature set can differ from run to run.
RANDOM_SEED = 42

# NOTE(review): relies on SklearnModel forwarding extra keyword arguments to the
# underlying scikit-learn estimator -- confirm against the installed MAST-ML version.
model = SklearnModel(model='RandomForestRegressor', random_state=RANDOM_SEED)
# Feature selector driven by the model above, asked for 20 features.
selector = EnsembleModelFeatureSelector(model=model,
                                        n_features_to_select=20)
# X is replaced by the value the selector returns.
X = selector.evaluate(X=X, y=y, savepath=savepath, make_new_dir=True)

In [None]:
# Write the selected feature table to a CSV file (relative path, so it lands in the
# current working directory).
features_csv_name = 'DS1-MASTML-Features.csv'
X.to_csv(features_csv_name)

Unnamed: 0,HeatFusion_composition_average,GSmagmom_composition_average,ElasticModulus_composition_average,n_ws^third_composition_average,AtomicVolume_composition_average,BCCmagmom_composition_average,MeltingT_arithmetic_average,CovalentRadius_composition_average,CovalentRadii_composition_average,GSenergy_pa_arithmetic_average,groups
0,0.062372,-0.292148,1.067695,0.624249,-0.687965,-0.376795,0.713109,-0.506542,-0.732983,-0.430734,"[Element Zr, Element Ti, Element Be, Element C..."
1,0.083469,-0.292153,1.256598,0.649470,-0.662435,-0.376795,0.713109,-0.569760,-0.792458,-0.430734,"[Element Zr, Element Ti, Element Be, Element C..."
2,0.068809,-0.323694,1.181025,0.615602,-0.665584,-0.380165,0.713109,-0.545518,-0.773703,-0.430734,"[Element Zr, Element Ti, Element Be, Element C..."
3,0.267366,-0.292160,0.483118,0.336694,-0.436257,-0.376795,0.004676,-0.268842,-0.346026,0.230114,"[Element Zr, Element Be, Element Al, Element C..."
4,0.203736,-0.292160,0.606394,0.381303,-0.506860,-0.376795,0.004676,-0.402021,-0.481757,0.230114,"[Element Zr, Element Be, Element Al, Element C..."
...,...,...,...,...,...,...,...,...,...,...,...
384,0.299513,-0.517462,-0.135403,0.024432,-0.481902,-0.400867,-0.235684,0.002575,-0.123427,0.009275,"[Element Zr, Element Al, Element Cu]"
385,1.974500,1.172306,3.115498,2.028397,-1.685062,-0.220323,1.339063,-2.427701,-2.236884,-0.449051,"[Element Si, Element Ni, Element B]"
386,0.549614,-0.472402,-0.541022,-0.119002,-0.535399,-0.396053,-0.651283,-2.197418,-1.469386,0.378830,"[Element Ce, Element Al, Element Si, Element Ni]"
387,0.610257,-0.359750,-0.382074,-0.002333,-0.571093,-0.384016,-0.651283,-2.159318,-1.460996,0.378830,"[Element Ce, Element Al, Element Si, Element Ni]"


In [None]:
# Run a learning curve using the same model and selector objects as the feature
# selection step.
learningcurve = LearningCurve()
learningcurve.evaluate(
    model=model,
    X=X,
    y=y,
    savepath=savepath,
    selector=selector,
    make_new_dir=True,
)
# Snapshot of the feature set as it stands after selection.
Xreduced = copy(X)