# 06-post-analysis

How can we verify whether the trained models' predictions are correct?

One way is to verify against known candidates. 

In [1]:
import sys
sys.path.insert(0, "../")

import pandas as pd
from pathlib import Path

# Both directories live three levels above the current working directory,
# so the resolved paths depend on where the kernel was started.
project_root = Path.cwd().parent.parent.parent  # presumably the repository root — confirm
data_dir = project_root / "data"
models_dir = project_root / "models"

print("Current data directory {}".format(data_dir))

Current data directory /home/oliver/Dokumenter/masterprosjekt/predicting-solid-state-qubit-candidates/data


In [2]:
# Formulas of the known candidates; compared against the "pretty_formula"
# column of the training set and of the model summary.
known_candidates = ["C", "Si", "SiGe", "AlN", "GaN",
                   "AlP", "GaP", "AlAs", "ZnO", "ZnS", "ZnSe", "ZnTe", "CdS"]
# Two-dimensional (2D) and quantum-dot (QD) materials, checked separately.
QD_2D_candidates = ["BN", "MoS2", "WSe2",  # 2D
                    "InAs"  # QD; was "LnAs" (capital L typed for capital I), which is not a formula and can never match
                   ]

In [3]:
# Number of principal components; used to build the summary file name.
numberOfPrincipalComponents = 176
# Sub-directory name shared by the data and model paths that are loaded later.
InsertApproach = "02-determined-approach"

## Training set
How many entries in our training set are known candidates?

In [4]:
# Load the pickled training set for the selected approach.
# NOTE: unpickling can execute arbitrary code, so only read files you trust.
training_set_path = data_dir / InsertApproach / "processed" / "trainingSet.pkl"
trainingSet = pd.read_pickle(training_set_path)

In [5]:
# Training-set rows whose formula is one of the known candidates
is_known_candidate = trainingSet["pretty_formula"].isin(known_candidates)
trainingSet.loc[is_known_candidate, ["full_formula", "pretty_formula", "candidate"]]

Unnamed: 0,full_formula,pretty_formula,candidate
3,C8,C,1.0
5,C4,C,1.0
6,C2,C,1.0
13,Si2,Si,1.0
16,Si4,Si,1.0
17,C2,C,1.0
47,Al2N2,AlN,0.0
61,Ga2N2,GaN,0.0
62,Ga1N1,GaN,1.0
79,Zn1Se1,ZnSe,1.0


In [6]:
# Training-set rows whose formula is one of the 2D / quantum-dot candidates
is_qd_or_2d_candidate = trainingSet["pretty_formula"].isin(QD_2D_candidates)
trainingSet.loc[is_qd_or_2d_candidate, ["full_formula", "pretty_formula", "candidate"]]

Unnamed: 0,full_formula,pretty_formula,candidate
27,B4N4,BN,0.0
69,B2N2,BN,1.0
112,B4N4,BN,0.0
115,B1N1,BN,1.0
124,W2Se4,WSe2,1.0
180,B2N2,BN,0.0
190,Mo2S4,MoS2,1.0
482,B2N2,BN,1.0
657,B4N4,BN,1.0
1678,B4N4,BN,1.0


## Test set
How many entries in our test set are known candidates?

In [7]:
# The summary file name encodes the number of principal components.
# NOTE: unpickling can execute arbitrary code, so only read files you trust.
summary_filename = "PCA-" + str(numberOfPrincipalComponents) + "-" + "summary.pkl"
summary_path = models_dir / InsertApproach / "summary" / summary_filename
Summary = pd.read_pickle(summary_path)
Summary.shape

(22576, 9)

In [8]:
# Summary rows whose formula is one of the known candidates
in_known_candidates = Summary["pretty_formula"].isin(known_candidates)
Summary.loc[in_known_candidates]

Unnamed: 0,material_id,full_formula,pretty_formula,LOG,LOG Prob,RF,RF Prob,GB,GB Prob
2728,mp-370,Cd1S1,CdS,1.0,0.999245,1.0,0.852893,1.0,0.962888
2730,mp-380,Zn2Se2,ZnSe,0.0,0.021612,1.0,0.896475,1.0,0.726363
2757,mp-672,Cd2S2,CdS,0.0,0.008079,1.0,0.885310,1.0,0.960781
2888,mp-2133,Zn2O2,ZnO,1.0,0.799581,1.0,0.914802,1.0,0.981526
2919,mp-2469,Cd1S1,CdS,1.0,0.883961,1.0,0.861728,1.0,0.912534
...,...,...,...,...,...,...,...,...,...
24282,mp-1201781,Zn18S18,ZnS,1.0,0.987491,1.0,0.905885,1.0,0.983043
24344,mp-1202023,Zn18S18,ZnS,1.0,0.991973,1.0,0.911646,1.0,0.984216
24385,mp-1202182,Zn18S18,ZnS,1.0,0.987980,1.0,0.911646,1.0,0.984216
24587,mp-1202959,Zn18S18,ZnS,1.0,0.987653,1.0,0.911646,1.0,0.984216


In [9]:
# Summary rows whose formula is one of the 2D / quantum-dot candidates
in_qd_or_2d_candidates = Summary["pretty_formula"].isin(QD_2D_candidates)
Summary.loc[in_qd_or_2d_candidates]

Unnamed: 0,material_id,full_formula,pretty_formula,LOG,LOG Prob,RF,RF Prob,GB,GB Prob
2833,mp-1434,Mo1S2,MoS2,1.0,0.992707,1.0,0.882377,1.0,0.899007


## How many entries do the models agree on?

In [10]:
# Keep only the rows where all three model columns equal 1.
# The labels end with a space on purpose: that is the exact spelling this lookup uses
# (presumably how the columns were named when the summary was written — confirm).
model_columns = ["RF ", "GB ", "LOG "]
Summary[Summary[model_columns].eq(1).all(axis=1)]

Unnamed: 0,material_id,full_formula,pretty_formula,LOG,LOG Prob,RF,RF Prob,GB,GB Prob
2700,mvc-12905,Fe4O8,FeO2,1.0,0.999997,1.0,0.908749,1.0,0.986294
2702,mp-157,P4,P,1.0,0.999707,1.0,0.897220,1.0,0.982504
2704,mp-189,Si4Ru4,SiRu,1.0,0.999755,1.0,0.906675,1.0,0.988104
2705,mp-200,La4P28,LaP7,1.0,0.994301,1.0,0.905346,1.0,0.957942
2706,mp-209,Er8Se12,Er2Se3,1.0,1.000000,1.0,0.860066,1.0,0.924752
...,...,...,...,...,...,...,...,...,...
25269,mp-1299874,Li4La16Co4O32,LiLa4CoO8,1.0,0.910680,1.0,0.870438,1.0,0.840089
25270,mp-1304797,Sr12In4Ni4O24,Sr3InNiO6,1.0,1.000000,1.0,0.909219,1.0,0.974638
25272,mp-1541522,Bi2P2O8,BiPO4,1.0,1.000000,1.0,0.908359,1.0,0.960784
25274,mp-1542038,Cs2Sn2Se6,CsSnSe3,1.0,0.549143,1.0,0.900470,1.0,0.957414
