# 06-post-analysis

How can we verify if the trained models are correct in their predictions? 

One way is to verify against known candidates. 

In [1]:
import sys
sys.path.insert(0, "../")

import pandas as pd
from pathlib import Path

# Both folders live two levels above the directory the notebook is run from.
repo_root = Path.cwd().parent.parent
data_dir = repo_root / "data"
models_dir = repo_root / "models"

print(f"Current data directory {data_dir}")

Current data directory /home/oliver/Dokumenter/masterprosjekt/predicting-solid-state-qubit-candidates/data


In [2]:
# Formulas of known candidate materials; matched against the `pretty_formula`
# column of the training set and of the prediction summary further down.
known_candidates = [
    "C", "Si", "SiGe", "AlN", "GaN",
    "AlP", "GaP", "AlAs", "ZnO", "ZnS", "ZnSe", "ZnTe", "CdS",
]

# Additional candidates, tagged with the labels used in this notebook.
QD_2D_candidates = [
    "BN", "MoS2", "WSe2",  # 2D
    # QD -- previously spelled "LnAs"; "Ln" is not an element symbol, so that
    # entry could never match a formula. Indium arsenide is "InAs".
    "InAs",
]

In [3]:
# Name of the approach to analyse. It is used below as the sub-directory
# under both `data_dir` (training set) and `models_dir` (prediction summary).
InsertApproach = "02-determined-approach"

## Training set
How many entries in our training set consist of known candidates?

In [4]:
# Load the processed training set of the selected approach.
# NOTE: unpickling executes arbitrary code -- only read pickles produced by this project.
trainingSet = pd.read_pickle(
    data_dir.joinpath(InsertApproach, "processed", "trainingSet.pkl")
)

In [5]:
# Training-set rows whose reduced formula is one of the known candidates,
# together with the label they were given.
trainingSet.loc[
    trainingSet["pretty_formula"].isin(known_candidates),
    ["full_formula", "pretty_formula", "candidate"],
]

Unnamed: 0,full_formula,pretty_formula,candidate
3,C8,C,1.0
5,C4,C,1.0
6,C2,C,1.0
13,Si2,Si,1.0
16,Si4,Si,1.0
17,C2,C,1.0
47,Al2N2,AlN,0.0
61,Ga2N2,GaN,0.0
62,Ga1N1,GaN,1.0
79,Zn1Se1,ZnSe,1.0


In [6]:
# Training-set rows whose reduced formula is one of the 2D / QD candidates,
# together with the label they were given.
trainingSet.loc[
    trainingSet["pretty_formula"].isin(QD_2D_candidates),
    ["full_formula", "pretty_formula", "candidate"],
]

Unnamed: 0,full_formula,pretty_formula,candidate
27,B4N4,BN,0.0
69,B2N2,BN,1.0
112,B4N4,BN,0.0
115,B1N1,BN,1.0
124,W2Se4,WSe2,1.0
180,B2N2,BN,0.0
190,Mo2S4,MoS2,1.0
482,B2N2,BN,1.0
657,B4N4,BN,1.0
1678,B4N4,BN,1.0


## Test set
How many entries in our test set consist of known candidates?

In [7]:
# Load the prediction summary stored for the selected approach.
# NOTE: unpickling executes arbitrary code -- only read pickles produced by this project.
Summary = pd.read_pickle(
    models_dir.joinpath(InsertApproach, "summary", "summary.pkl")
)
Summary.shape

(22573, 9)

In [8]:
# Summary rows whose reduced formula is one of the known candidates.
Summary.loc[Summary["pretty_formula"].isin(known_candidates)]

Unnamed: 0,material_id,full_formula,pretty_formula,LOG,LOG Prob,RF,RF Prob,GB,GB Prob
2731,mp-370,Cd1S1,CdS,1.0,0.996326,1.0,0.816531,1.0,0.853464
2733,mp-380,Zn2Se2,ZnSe,1.0,0.926522,1.0,0.876370,1.0,0.970442
2760,mp-672,Cd2S2,CdS,0.0,0.012748,1.0,0.813226,1.0,0.696672
2891,mp-2133,Zn2O2,ZnO,1.0,0.990415,1.0,0.888682,1.0,0.978306
2922,mp-2469,Cd1S1,CdS,1.0,0.640652,1.0,0.763356,1.0,0.718430
...,...,...,...,...,...,...,...,...,...
24282,mp-1201781,Zn18S18,ZnS,1.0,0.995256,1.0,0.905312,1.0,0.983550
24344,mp-1202023,Zn18S18,ZnS,1.0,0.994891,1.0,0.891981,1.0,0.985025
24385,mp-1202182,Zn18S18,ZnS,1.0,0.995689,1.0,0.902722,1.0,0.984070
24587,mp-1202959,Zn18S18,ZnS,1.0,0.995763,1.0,0.902722,1.0,0.984070


In [9]:
# Summary rows whose reduced formula is one of the 2D / QD candidates.
Summary.loc[Summary["pretty_formula"].isin(QD_2D_candidates)]

Unnamed: 0,material_id,full_formula,pretty_formula,LOG,LOG Prob,RF,RF Prob,GB,GB Prob
2836,mp-1434,Mo1S2,MoS2,1.0,0.997653,1.0,0.860107,1.0,0.923513


## How many entries do the models agree on?

In [16]:
# Prediction columns that must all equal 1 for an entry to count as agreed upon.
agreeing_models = ["LOG", "RF", "GB"]

# Resolve the columns with surrounding whitespace stripped from the labels, so the
# selection works whether the summary stores them as "RF" or as "RF " (the label
# previously hard-coded here had a trailing space and could raise KeyError).
predictions = Summary.rename(columns=str.strip)[agreeing_models]

# Keep the rows of the original frame where every listed model predicts 1.
Summary[(predictions == 1).all(axis=1)]

Unnamed: 0,material_id,full_formula,pretty_formula,LOG,LOG Prob,RF,RF Prob,GB,GB Prob
2703,mvc-12905,Fe4O8,FeO2,1.0,0.999986,1.0,0.922369,1.0,0.976536
2705,mp-157,P4,P,1.0,0.997874,1.0,0.889742,1.0,0.929037
2706,mp-181,K3Ga9,KGa3,1.0,0.987275,1.0,0.808180,1.0,0.859897
2707,mp-189,Si4Ru4,SiRu,1.0,0.999853,1.0,0.849502,1.0,0.836964
2708,mp-200,La4P28,LaP7,1.0,0.578428,1.0,0.861876,1.0,0.652245
...,...,...,...,...,...,...,...,...,...
25270,mp-1304797,Sr12In4Ni4O24,Sr3InNiO6,1.0,0.999998,1.0,0.899608,1.0,0.987114
25271,mp-1539137,Rb1Cr5S8,RbCr5S8,1.0,0.877739,1.0,0.841059,1.0,0.817910
25272,mp-1541522,Bi2P2O8,BiPO4,1.0,0.999996,1.0,0.919829,1.0,0.916250
25274,mp-1542038,Cs2Sn2Se6,CsSnSe3,1.0,0.986675,1.0,0.874979,1.0,0.949795
