# We'll make a binary classifier to try to predict the structure of a Heusler alloy

In [1]:
import re
import numpy as np
import os
import random

import pandas as pd
from collections import defaultdict

from sklearn import model_selection, preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.preprocessing import StandardScaler
from tpot import TPOTClassifier

Load in the data

In [2]:
# Read every entry of the data set; the CSV is expected in the current working directory.
df=pd.read_csv(filepath_or_buffer=os.path.join(os.getcwd(),'ml_225_216git.csv'))

# Group the (space group, formation enthalpy) pairs by compound so that all
# entries describing the same compound end up in one list.
data_dict=defaultdict(list)
# Zipping the three columns avoids building a Series per row as iterrows() does.
for compound, space_group, enthalpy in zip(df['Compound'],df['Space group'],df['Formation Enthalpy']):
    data_dict[compound].append((space_group,enthalpy))

Notes: <br>
Formation enthalpy: In essence, a measure of stability. The lower this number, the more stable a compound is. <br>
Space group: A way of categorizing the configuration of atoms; space groups are numbered from 1 to 230.<br> Here we just consider two, 225 and 216. These are two of the most common Heusler alloy arrangements.

In [3]:
# Number of distinct compounds in the CSV (data_dict has one key per 'Compound' value)
len(data_dict)

60677

In [4]:
# Keep only the compounds that have exactly two entries in data_dict.
# Compounds with any other number of entries (eg. two 225 structs plus one 216)
# are skipped, so some data is lost here. Note that only the number of entries
# is checked, not whether their space groups differ.
new_dict_only_two=defaultdict(list)
for compou, entries in data_dict.items():
    if len(entries)!=2:
        continue
    new_dict_only_two[compou].extend((entry[0],entry[1]) for entry in entries)

In [5]:
# roughly 6% of the compounds are dropped by the step above (60677 -> 56877 in the recorded outputs)
len(new_dict_only_two)

56877

Now we will make the two space group labels binary numbers, 0 or 1

In [6]:
# Fraction of the 225-labelled compounds that is kept, to even out the class labels
KEEP_225_FRACTION=0.58
# Dedicated, seeded generator so the down-sampling (and every model trained on
# the result) is reproducible after Restart & Run All
label_rng=random.Random(0)

# compound -> class label: 1 for space group 225, 0 otherwise (i.e. 216).
# A plain dict is used because the values are int labels, not lists.
# NOTE(review): the label is taken from the FIRST entry stored for the compound;
# presumably the CSV lists the energetically favourable structure first, but no
# enthalpy comparison is made here -- confirm against the data file.
binary_dict={}
for compou,list_obj in new_dict_only_two.items():
    if(list_obj[0][0]==225):
        #reduce amount of 225 to even out the class labels
        if(label_rng.random()<KEEP_225_FRACTION):
            binary_dict[compou]=1
    else:
        binary_dict[compou]=0

In [7]:
#Collect the element symbol that precedes each count in every compound name,
#so we can check what elements are in the dictionary

element_list=[]
for formula in binary_dict:
    symbol=''
    for ch in formula:
        if not ch.isnumeric():
            #still inside an element symbol
            symbol+=ch
            continue
        #a numeric character closes the symbol read so far (if there is one)
        if(symbol):
            element_list.append(symbol)
        symbol=''

In [8]:
# Distinct element symbols occurring in the retained compounds (np.unique sorts and de-duplicates)
np.unique(element_list)

array(['Ag', 'Al', 'As', 'Au', 'B', 'Ba', 'Be', 'Bi', 'Br', 'Ca', 'Cd',
       'Cl', 'Co', 'Cr', 'Cu', 'Fe', 'Ga', 'Ge', 'Hf', 'Hg', 'In', 'Ir',
       'K', 'La', 'Li', 'Mg', 'Mn', 'Mo', 'Na', 'Nb', 'Ni', 'Os', 'P',
       'Pb', 'Pd', 'Pt', 'Re', 'Rh', 'Ru', 'Sb', 'Sc', 'Se', 'Si', 'Sn',
       'Sr', 'Ta', 'Tc', 'Te', 'Ti', 'Tl', 'V', 'W', 'Y', 'Zn', 'Zr'],
      dtype='<U2')

# Now we create dictionaries which map element symbols to chemical data that I've guessed to be important for stability!

In [9]:
# NOTE! The data has been scaled (probably poorly) so these values are not in fact exact

# Valence electron count per element symbol, one periodic-table group per line.
# (A stray empty-string literal that used to follow "Br" has been removed; it only
# worked because Python concatenated it with the "Fe" key on the next line.)
element_to_valence={"Li":1,"Na":1,"K":1,
                    "Be":2,"Mg":2,"Ca":2,"Sr":2,"Ba":2,
                    "B":3,"Al":3,"Sc":3,"Y":3,"La":3,"Ga":3,"In":3,"Tl":3,
                    "Ti":4,"Zr":4,"Hf":4,"Si":4,"Ge":4,"Sn":4,"Pb":4,
                    "V":5,"Nb":5,"Ta":5,"P":5,"As":5,"Sb":5,"Bi":5,
                    "Cr":6,"Mo":6,"W":6,"Se":6,"Te":6,
                    "Mn":7,"Tc":7,"Re":7,"Cl":7,"Br":7,
                    "Fe":8,"Ru":8,"Os":8,
                    "Co":9,"Rh":9,"Ir":9,
                    "Ni":10,"Pd":10,"Pt":10,
                    "Cu":11,"Ag":11,"Au":11,
                    "Zn":12,"Cd":12,"Hg":12}

# s-orbital feature per element symbol: 1 for Li, Na and K, 2 for every other element.
element_to_s_orb=dict.fromkeys(["Li","Na","K"],1)
element_to_s_orb.update(dict.fromkeys(
    ["Be","Mg","Ca","Sr","Ba",
     "B","Al","Sc","Y","La","Ga","In","Tl",
     "Ti","Zr","Hf","Si","Ge","Sn","Pb",
     "V","Nb","Ta","P","As","Sb","Bi",
     "Cr","Mo","W","Se","Te",
     "Mn","Tc","Re","Cl","Br",
     "Fe","Ru","Os",
     "Co","Rh","Ir",
     "Ni","Pd","Pt",
     "Cu","Ag","Au",
     "Zn","Cd","Hg"],2))

# d-electron count per element symbol, one periodic-table group per line.
# Nb is 3 (was 2): this matches V and Ta in the same group and keeps
# valence (5) == s (2) + d (3), as for the other transition metals in these tables.
element_to_d_orb={"Li":0,"Na":0,"K":0,
                    "Be":0,"Mg":0,"Ca":0,"Sr":0,"Ba":0,
                    "B":0,"Al":0,"Sc":1,"Y":1,"La":1,"Ga":10,"In":10,"Tl":10,
                    "Ti":2,"Zr":2,"Hf":2,"Si":0,"Ge":10,"Sn":10,"Pb":10,
                    "V":3,"Nb":3,"Ta":3,"P":0,"As":10,"Sb":10,"Bi":10,
                    "Cr":4,"Mo":4,"W":4,"Se":10,"Te":10,
                    "Mn":5,"Tc":5,"Re":5,"Cl":0,"Br":10,
                    "Fe":6,"Ru":6,"Os":6,
                    "Co":7,"Rh":7,"Ir":7,
                    "Ni":8,"Pd":8,"Pt":8,
                    "Cu":9,"Ag":9,"Au":9,
                    "Zn":10,"Cd":10,"Hg":10}

# p-electron feature per element symbol.
# Start with 0 for every element (listed one periodic-table group per line) ...
element_to_p_orb=dict.fromkeys(
    ["Li","Na","K",
     "Be","Mg","Ca","Sr","Ba",
     "B","Al","Sc","Y","La","Ga","In","Tl",
     "Ti","Zr","Hf","Si","Ge","Sn","Pb",
     "V","Nb","Ta","P","As","Sb","Bi",
     "Cr","Mo","W","Se","Te",
     "Mn","Tc","Re","Cl","Br",
     "Fe","Ru","Os",
     "Co","Rh","Ir",
     "Ni","Pd","Pt",
     "Cu","Ag","Au",
     "Zn","Cd","Hg"],0)
# ... then overwrite the elements that carry a non-zero value.
element_to_p_orb.update({"B":1,"Al":1,"Ga":1,"In":1,"Tl":1,
                         "Si":2,"Ge":2,"Sn":2,"Pb":2,
                         "P":3,"As":3,"Sb":3,"Bi":3,
                         "Se":4,"Te":4,
                         "Cl":5,"Br":5})
                    
# Electronegativity value per element symbol, one periodic-table group per line.
element_to_electroneg=dict(
    Li=0.98, Na=0.93, K=0.82,
    Be=1.57, Mg=1.31, Ca=1.0, Sr=0.95, Ba=0.89,
    B=2.04, Al=1.61, Sc=1.36, Y=1.22, La=1.1, Ga=1.81, In=1.78, Tl=1.62,
    Ti=1.54, Zr=1.33, Hf=1.3, Si=1.9, Ge=2.01, Sn=1.96, Pb=1.87,
    V=1.63, Nb=1.6, Ta=1.5, P=2.19, As=2.18, Sb=2.05, Bi=2.02,
    Cr=1.66, Mo=2.16, W=2.36, Se=2.55, Te=2.1,
    Mn=1.55, Tc=1.9, Re=1.9, Cl=3.16, Br=2.96,
    Fe=1.83, Ru=2.2, Os=2.2,
    Co=1.9, Rh=2.28, Ir=2.2,
    Ni=1.91, Pd=2.2, Pt=2.28,
    Cu=1.9, Ag=1.93, Au=2.54,
    Zn=1.65, Cd=1.69, Hg=2.00,
)

# Ionization-energy feature per element symbol, one periodic-table group per line.
element_to_ionization_ene=dict(
    Li=5.2, Na=4.95, K=4.19,
    Be=9.0, Mg=7.37, Ca=5.6, Sr=5.5, Ba=5.03,
    B=8.0, Al=5.78, Sc=6.33, Y=6.0, La=5.38, Ga=5.79, In=5.58, Tl=5.89,
    Ti=6.59, Zr=6.4, Hf=6.59, Si=7.87, Ge=7.62, Sn=7.1, Pb=7.2,
    V=6.5, Nb=6.5, Ta=7.6, P=10.1, As=9.5, Sb=8.3, Bi=7.0,
    Cr=6.5, Mo=6.8, W=7.7, Se=9.4, Te=8.7,
    Mn=7.2, Tc=7.0, Re=7.6, Cl=12.5, Br=11.4,
    Fe=7.6, Ru=7.1, Os=8.4,
    Co=7.6, Rh=7.2, Ir=8.8,
    Ni=7.4, Pd=8.0, Pt=8.7,
    Cu=7.5, Ag=7.3, Au=8.9,
    Zn=9.1, Cd=8.7, Hg=10.0,
)

# Second ionization-energy feature per element symbol, one periodic-table group per line.
# NOTE(review): the values look like second ionisation energies in units of
# 10 eV (e.g. Li 7.5, Be 1.8) -- confirm the intended scaling.
# Bi is 1.67 (was 7.0, an apparent copy of its entry in element_to_ionization_ene),
# in line with its group neighbours As (1.86) and Sb (1.65).
# NOTE(review): Ta, W, Re, Os and Ir are 0, which looks like missing data rather
# than real values; they are left untouched here and should be filled in.
element_to_ionization_ene2={"Li":7.5,"Na":4.7,"K":3.16,
                    "Be":1.8,"Mg":1.5,"Ca":1.2,"Sr":1.1,"Ba":1.0,
                    "B":2.5,"Al":1.9,"Sc":1.3,"Y":1.2,"La":1.1,"Ga":2.0,"In":1.9,"Tl":2.04,
                    "Ti":1.4,"Zr":1.3,"Hf":1.5,"Si":1.6,"Ge":1.6,"Sn":1.46,"Pb":1.5,
                    "V":1.465,"Nb":1.4,"Ta":0.0,"P":1.98,"As":1.86,"Sb":1.65,"Bi":1.67,
                    "Cr":1.65,"Mo":1.6,"W":0.0,"Se":2.1,"Te":1.86,
                    "Mn":1.564,"Tc":1.5,"Re":0.0,"Cl":2.38,"Br":2.18,
                    "Fe":1.618,"Ru":1.676,"Os":0,
                    "Co":1.7,"Rh":1.8,"Ir":0,
                    "Ni":1.8,"Pd":1.9,"Pt":1.8563,
                    "Cu":2.0,"Ag":2.149,"Au":2.05,
                    "Zn":1.8,"Cd":1.69,"Hg":1.88}

# Atomic number divided by 10, per element symbol (e.g. Li, Z=3 -> 0.3).
# B, Si, Ge, Sn and Pb previously held their valence numbers (3, 4, 4, 4, 4)
# instead of Z/10; they now follow the same scaling as every other entry.
element_to_atomic_number={"Li":0.3,"Na":1.1,"K":1.9,
                    "Be":0.4,"Mg":1.2,"Ca":2.0,"Sr":3.8,"Ba":5.6,
                    "B":0.5,"Al":1.3,"Sc":2.1,"Y":3.9,"La":5.7,"Ga":3.1,"In":4.9,"Tl":8.1,
                    "Ti":2.2,"Zr":4.0,"Hf":7.2,"Si":1.4,"Ge":3.2,"Sn":5.0,"Pb":8.2,
                    "V":2.3,"Nb":4.1,"Ta":7.3,"P":1.5,"As":3.3,"Sb":5.1,"Bi":8.3,
                    "Cr":2.4,"Mo":4.2,"W":7.4,"Se":3.4,"Te":5.2,
                    "Mn":2.5,"Tc":4.3,"Re":7.5,"Cl":1.7,"Br":3.5,
                    "Fe":2.6,"Ru":4.4,"Os":7.6,
                    "Co":2.7,"Rh":4.5,"Ir":7.7,
                    "Ni":2.8,"Pd":4.6,"Pt":7.8,
                    "Cu":2.9,"Ag":4.7,"Au":7.9,
                    "Zn":3.0,"Cd":4.8,"Hg":8.0}

# Atomic-radius feature per element symbol, one periodic-table group per line.
element_to_atomic_radius=dict(
    Li=1.67, Na=1.9, K=2.43,
    Be=1.12, Mg=1.45, Ca=1.94, Sr=2, Ba=2,
    B=0.87, Al=1.18, Sc=1.84, Y=2.12, La=2.17, Ga=1.36, In=1.56, Tl=1.56,
    Ti=1.8, Zr=2.06, Hf=2.08, Si=1.11, Ge=1.25, Sn=1.45, Pb=1.54,
    V=1.7, Nb=1.98, Ta=2.0, P=0.98, As=1.14, Sb=1.33, Bi=1.43,
    Cr=1.66, Mo=1.9, W=1.93, Se=1.03, Te=1.23,
    Mn=1.61, Tc=1.83, Re=1.88, Cl=0.79, Br=0.94,
    Fe=1.56, Ru=1.78, Os=1.85,
    Co=1.52, Rh=1.73, Ir=1.8,
    Ni=1.49, Pd=1.69, Pt=1.77,
    Cu=1.45, Ag=1.65, Au=1.74,
    Zn=1.42, Cd=1.61, Hg=1.71,
)

The functions below just create feature arrays with the above dictionaries, keeping the element with 2 atoms at the front of this feature array.
So for example, Al2Cu1Mn1, fed through compound_to_atomic_radius_list() will return
[1.18, 1.45, 1.61]

This is a pretty hacky way to do things, should probably make a full dict of all values above then specify how many values you want to model in one function but hey, this was faster!

In [48]:
def _compound_features(compound, property_tables, raise_on_bad_format=False):
    """Turn a formula such as 'Al2Cu1Mn1' into a flat list of per-element features.

    The formula is read left to right; each element symbol must be followed by
    its count. For every element, one value is looked up in each table of
    ``property_tables`` (in the given order). The values of an element with
    count 2 are placed at the front of the result, the values of an element
    with count 1 are appended at the end.

    Parameters
    ----------
    compound : str
        Formula with a count after every element, e.g. 'Al2Cu1Mn1'.
    property_tables : list of dict
        Element-symbol -> value tables to read the features from.
    raise_on_bad_format : bool, optional
        When True, the two whole-string format checks raise ValueError after
        printing their message (needed by compound_to_ionization_list).

    Returns
    -------
    list or None
        The feature list, or None when the input was rejected (an
        "ERROR: ..." message is printed in that case).
    """
    format_error = None
    if re.search(r'\d', compound) is None:
        format_error = ("ERROR: Compound given must contain numbers after each element eg; Al2Cu1Mn1",
                        'Compound given must contain numbers after each element')
    elif re.match(r'^[\w-]+$', compound) is None:
        format_error = ("ERROR: BAD FORMAT! Can only take alphanumeric inputs",
                        'Compound can only contain alphanumeric characters')
    if format_error is not None:
        printed_message, raised_message = format_error
        print(printed_message)
        if raise_on_bad_format:
            raise ValueError(raised_message)
        return None

    features = []
    symbol = ''
    for char in compound:
        # isdecimal() matches the same characters as the \d check above and, unlike
        # isnumeric(), never accepts a character that int() cannot convert.
        if not char.isdecimal():
            symbol += char
            continue
        if symbol:
            # Checking the lookup tables directly is O(1) per element and also
            # guarantees that the lookups below cannot raise KeyError.
            if any(symbol not in table for table in property_tables):
                print("ERROR: You did not give an appropriate compound. Check possible input elements.")
                return None
            count = int(char)
            if count not in (1, 2):
                print("ERROR: The compound must have a full-Heusler structure, so the numbers after elements should be 1 or 2")
                return None
            values = [table[symbol] for table in property_tables]
            if count == 2:
                # We want the element with 2 atoms in the alloy to be in the same (first) position
                features[:0] = values
            else:
                features.extend(values)
        symbol = ''

    return features


def compound_to_valence_list(compound):
    """Return the valence of each element of ``compound``, doubled element first.

    Unknown elements and counts other than 1 or 2 are now rejected for every
    element (previously they were only checked for the element with count 2).
    Returns None after printing an error when the input is rejected.
    """
    return _compound_features(compound, [element_to_valence])


def compound_to_electroneg_list(compound):
    """Return the electronegativity of each element of ``compound``, doubled element first.

    Returns None after printing an error when the input is rejected.
    """
    return _compound_features(compound, [element_to_electroneg])


def compound_to_atomic_no_list(compound):
    """Return the element_to_atomic_number value of each element, doubled element first.

    Returns None after printing an error when the input is rejected.
    """
    return _compound_features(compound, [element_to_atomic_number])


def compound_to_atomic_radius_list(compound):
    """Return the atomic radius of each element of ``compound``, doubled element first.

    Returns None after printing an error when the input is rejected.
    """
    return _compound_features(compound, [element_to_atomic_radius])


def compound_to_best_list(compound):
    """Return (atomic radius, ionization energy, valence) per element, doubled element first.

    Returns None after printing an error when the input is rejected.
    """
    return _compound_features(
        compound,
        [element_to_atomic_radius, element_to_ionization_ene, element_to_valence])


def compound_to_ionization_list(compound):
    """Return the ionization energy of each element of ``compound``, doubled element first.

    Raises
    ------
    ValueError
        If ``compound`` contains no digit or contains characters other than
        word characters and '-'.

    An unknown element or a count other than 1 or 2 prints an error and
    returns None instead.
    """
    return _compound_features(compound, [element_to_ionization_ene], raise_on_bad_format=True)


def compound_to_info_list(compound):
    """Return all eight features per element of ``compound``, doubled element first.

    Per element the order is: electronegativity, ionization energy, second
    ionization energy, atomic number, atomic radius, valence, d-orbital and
    p-orbital value. After the doubled element, the remaining two elements are
    ordered by decreasing electronegativity.

    Returns None after printing an error when the input is rejected.
    """
    tables = [element_to_electroneg,
              element_to_ionization_ene,
              element_to_ionization_ene2,
              element_to_atomic_number,
              element_to_atomic_radius,
              element_to_valence,
              #element_to_s_orb,
              element_to_d_orb,
              element_to_p_orb]
    features = _compound_features(compound, tables)
    if features is None:
        return None

    #order based on electroneg
    #I hypothesize it is important!
    width = len(tables)
    second = slice(width, 2 * width)
    third = slice(2 * width, 3 * width)
    # Electronegativity is the first value of each element's block. The length
    # guard avoids an IndexError for formulas with fewer than three elements.
    if len(features) >= 3 * width and features[width] < features[2 * width]:
        features[second], features[third] = features[third], features[second]

    return features



# BUILD SOME MODELS!

I like this method for quickly testing. Since train_test_split() shuffles the data, creating these methods allows us a nice quick way to test consistency of the model on random test data.
As part of my project I set out to test the varying efficacies of different models based on different feature arrays, as a way to quantify the effects of specific chemical attributes of the compound

In [49]:
def _build_dataset(featurizer, n_features):
    """Build the feature matrix and label vector for every compound in binary_dict.

    ``featurizer`` is one of the compound_to_*_list functions and must return
    ``n_features`` values per compound. Returns (feature_array, bit_array).
    """
    n_samples = len(binary_dict)
    feature_array = np.empty((n_samples, n_features))
    bit_array = np.empty(n_samples)
    for row, (compou, bit) in enumerate(binary_dict.items()):
        bit_array[row] = bit
        feature_array[row] = featurizer(compou)
    return feature_array, bit_array


def _fit_and_report(model, feature_array, bit_array, label, scale=False):
    """Fit ``model`` on a random 90% split and print its accuracy on the other 10%.

    With ``scale=True`` the features are standardised. The scaler is fitted on
    the training split only and then applied to both splits, so the test data
    never influences the transformation.
    """
    X_train, X_test, Y_train, Y_test = model_selection.train_test_split(feature_array, bit_array, test_size=0.1)
    if scale:
        scaler = StandardScaler().fit(X_train)
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)

    model.fit(X_train, Y_train)
    accuracy = model.score(X_test, Y_test)
    print("{0} can predict with approx. {1:.2f}% accuracy whether 225 or 216 structure is energetically favourable".format(label, 100*accuracy))


def valence(model):
    """Train and score ``model`` on the valence of the three elements."""
    feature_array, bit_array = _build_dataset(compound_to_valence_list, 3)
    _fit_and_report(model, feature_array, bit_array, "Valence model")


def electroneg(model):
    """Train and score ``model`` on the electronegativity of the three elements.

    Before training, prints how many compounds labelled 0 have a first feature
    (the doubled element) that is smaller than both other features.
    """
    feature_array, bit_array = _build_dataset(compound_to_electroneg_list, 3)
    emp_err = int(np.sum((bit_array == 0)
                         & (feature_array[:, 0] < feature_array[:, 1])
                         & (feature_array[:, 0] < feature_array[:, 2])))
    print(emp_err)
    _fit_and_report(model, feature_array, bit_array, "Electroneg model")


def atomic_radius(model):
    """Train and score ``model`` on the atomic radius of the three elements.

    The passed-in model is used; previously an unimported ``svm.SVC()`` was
    hard-coded here and the argument was ignored.
    """
    feature_array, bit_array = _build_dataset(compound_to_atomic_radius_list, 3)
    _fit_and_report(model, feature_array, bit_array, "Atomic radius model")


def atomic_no(model):
    """Train and score ``model`` on the atomic-number feature of the three elements."""
    feature_array, bit_array = _build_dataset(compound_to_atomic_no_list, 3)
    _fit_and_report(model, feature_array, bit_array, "Atomic number model")


def ionization(moel):
    """Train and score the given model on the ionization energy of the three elements.

    The parameter keeps its original (misspelt) name ``moel`` so existing
    keyword calls keep working. The passed-in model is used; previously an
    unimported ``svm.SVC()`` was hard-coded here and the argument was ignored.
    """
    feature_array, bit_array = _build_dataset(compound_to_ionization_list, 3)
    _fit_and_report(moel, feature_array, bit_array, "Ionization energy model")


def best_indiv(model):
    """Train and score ``model`` on atomic radius, ionization energy and valence (9 features).

    The passed-in model is used; previously an unimported ``svm.SVC()`` was
    hard-coded here and the argument was ignored.
    """
    feature_array, bit_array = _build_dataset(compound_to_best_list, 9)
    _fit_and_report(model, feature_array, bit_array, "Combo model")


def FullModel(model):
    """Train and score ``model`` on all 24 features from compound_to_info_list."""
    feature_array, bit_array = _build_dataset(compound_to_info_list, 24)
    _fit_and_report(model, feature_array, bit_array, "Full model")


def scaled_FullModel(model):
    """Like FullModel, but with standardised features.

    The scaler is fitted on the training split only; the test split is
    transformed with that same scaler instead of being rescaled independently.
    """
    feature_array, bit_array = _build_dataset(compound_to_info_list, 24)
    _fit_and_report(model, feature_array, bit_array, "Scaled full model", scale=True)


In [50]:
#just allows us to pass in a compound as string
def predict_Heusler(compound, model=None):
    """Print the predicted structure class of ``compound`` (0 is 216, 1 is 225).

    Parameters
    ----------
    compound : str
        Formula with a count after each element, e.g. 'Al2Cu1Mn1'.
    model : fitted classifier, optional
        Object with a ``predict`` method, trained on the features produced by
        compound_to_info_list. When omitted, a global named ``clf`` is used;
        none of the cells shown here defines one, so passing the model
        explicitly is recommended.

    NOTE(review): a model fitted through scaled_FullModel was trained on
    standardised features, whereas the raw features are passed here -- apply
    the same scaling before relying on this prediction.
    """
    compound_info=compound_to_info_list(compound)
    if compound_info is None:
        # compound_to_info_list() has already printed why the input was rejected
        return

    # One row holding every feature. The old code allocated 21 columns for 24
    # features and wrote each value over the whole row.
    info_array=np.asarray(compound_info,dtype=float).reshape(1,-1)
    classifier=clf if model is None else model
    print("Model predicts {} to be {}".format(compound,classifier.predict(info_array)))
    print("0 is 216, 1 is 225")

I used TPOTClassifier (basically cheating for ML) to run overnight and it converged on a fairly simple pipeline which had a slightly better accuracy than my manual model making (I had settled on using a BaggingClassifier) 

In [None]:
# Evaluate the full scaled model with the ExtraTrees configuration below
scaled_FullModel(
    ExtraTreesClassifier(
        bootstrap=False,
        criterion='entropy',
        max_features=0.45,
        min_samples_leaf=2,
        min_samples_split=9,
        n_estimators=100,
    )
)

The above can achieve a little over 90% accuracy on this real-world data. So that's pretty cool; I'm not aware of anyone else who has done something like this :)