# 06-post-analysis

How can we verify if the trained models are correct in their predictions? 

One way is to verify against known candidates. 

In [1]:
import sys
sys.path.insert(0, "../")

import pandas as pd
from pathlib import Path

# Both folders live three levels above the current working directory, so
# resolve that common base once and derive the two paths from it.
base_dir = Path.cwd().parent.parent.parent
data_dir = base_dir / "data"
models_dir = base_dir / "models"

print(f"Current data directory {data_dir}")

Current data directory /home/oliver/Dokumenter/masterprosjekt/predicting-solid-state-qubit-candidates/data


In [2]:
# Formulas of known candidate materials; the cells below match these strings
# against the `pretty_formula` column of the training and prediction tables.
known_candidates = ["C", "Si", "SiGe", "AlN", "GaN",
                    "AlP", "GaP", "AlAs", "ZnO", "ZnS", "ZnSe", "ZnTe", "CdS"]

# Additional candidates, grouped by the kind of system they are used in.
QD_2D_candidates = ["BN", "MoS2", "WSe2",  # 2D materials
                    # Quantum dots. Previously spelled "LnAs"; "Ln" is not an
                    # element symbol, so that entry could never match a formula.
                    "InAs",
                   ]

In [3]:
# Number of principal components; selects which "PCA-<n>-summary.pkl" file
# is read from the models folder.
numberOfPrincipalComponents = 2

# Sub-folder name of the approach, used under both the data and the models
# directory when loading files.
InsertApproach = "03-brute-approach"

## Training set
How many entries in our training set are known candidates?

In [4]:
# Read the pickled training set from the selected approach's "processed" folder.
training_set_path = data_dir / InsertApproach / "processed" / "trainingSet.pkl"
trainingSet = pd.read_pickle(training_set_path)

In [5]:
# Training-set rows whose pretty_formula is one of the known candidates,
# restricted to the formula and label columns.
is_known_candidate = trainingSet["pretty_formula"].isin(known_candidates)
trainingSet.loc[is_known_candidate, ["full_formula", "pretty_formula", "candidate"]]

Unnamed: 0,full_formula,pretty_formula,candidate
1,C8,C,1.0
2,C4,C,1.0
3,C2,C,1.0
4,Si2,Si,1.0
5,Si4,Si,1.0
...,...,...,...
324,C48,C,1.0
325,Si68,Si,1.0
327,Si106,Si,1.0
331,Si46,Si,1.0


In [6]:
# Training-set rows whose pretty_formula is one of the 2D / quantum-dot
# candidates, restricted to the formula and label columns.
is_qd_2d_candidate = trainingSet["pretty_formula"].isin(QD_2D_candidates)
trainingSet.loc[is_qd_2d_candidate, ["full_formula", "pretty_formula", "candidate"]]

Unnamed: 0,full_formula,pretty_formula,candidate


## Test set
How many entries in our test set are known candidates?

In [7]:
# Read the pickled prediction summary for the configured number of principal
# components, then show its (rows, columns) shape.
summary_file = f"PCA-{numberOfPrincipalComponents}-summary.pkl"
Summary = pd.read_pickle(models_dir / InsertApproach / "summary" / summary_file)
Summary.shape

(24871, 9)

In [8]:
# Summary rows whose pretty_formula is one of the known candidates.
is_known_candidate = Summary["pretty_formula"].isin(known_candidates)
Summary.loc[is_known_candidate]

Unnamed: 0,material_id,full_formula,pretty_formula,LOG,LOG Prob,RF,RF Prob,GB,GB Prob


In [9]:
# Summary rows whose pretty_formula is one of the 2D / quantum-dot candidates.
is_qd_2d_candidate = Summary["pretty_formula"].isin(QD_2D_candidates)
Summary.loc[is_qd_2d_candidate]

Unnamed: 0,material_id,full_formula,pretty_formula,LOG,LOG Prob,RF,RF Prob,GB,GB Prob
378,mp-344,B4N4,BN,0.0,0.365134,0.0,0.314532,0.0,0.03366
477,mp-984,B2N2,BN,0.0,0.39471,0.0,0.314532,0.0,0.053649
546,mp-1434,Mo1S2,MoS2,1.0,0.548509,0.0,0.464836,0.0,0.351214
574,mp-1599,B4N4,BN,0.0,0.063371,0.0,0.314532,0.0,0.109615
577,mp-1639,B1N1,BN,1.0,0.830858,1.0,0.628596,1.0,0.983385
604,mp-1821,W2Se4,WSe2,1.0,0.607618,0.0,0.332671,0.0,0.06518
723,mp-2653,B2N2,BN,1.0,0.838327,1.0,0.628596,1.0,0.983385
747,mp-2815,Mo2S4,MoS2,0.0,0.465134,0.0,0.414694,1.0,0.514803
2312,mp-7991,B2N2,BN,0.0,0.298398,0.0,0.314532,0.0,0.080576
3428,mp-13151,B4N4,BN,1.0,0.714732,1.0,0.628596,1.0,0.990231


## How many entries do the models agree on?

In [10]:
# Keep only the rows where every model's predicted label equals 1.
# NOTE: the label column names end with a trailing space ("RF ", not "RF").
label_columns = ["RF ", "GB ", "LOG "]
all_models_positive = Summary[label_columns].eq(1).all(axis=1)
Summary[all_models_positive]

Unnamed: 0,material_id,full_formula,pretty_formula,LOG,LOG Prob,RF,RF Prob,GB,GB Prob
334,mp-14,Se3,Se,1.0,0.739504,1.0,0.549479,1.0,0.974128
340,mp-111,Ne1,Ne,1.0,0.841726,1.0,0.833336,1.0,0.978457
342,mp-137,Ge12,Ge,1.0,0.876169,1.0,0.793089,1.0,0.979288
345,mp-157,P4,P,1.0,0.742847,1.0,0.715649,1.0,0.886274
346,mp-160,B12,B,1.0,0.577683,1.0,0.548921,1.0,0.867509
...,...,...,...,...,...,...,...,...,...
25181,mp-1244594,La8Mn2S12O2,La4MnS6O,1.0,0.681421,1.0,0.697035,1.0,0.886274
25187,mp-1277483,Li4La16Co4O32,LiLa4CoO8,1.0,0.579077,1.0,0.653459,1.0,0.704526
25189,mp-1288517,La4Fe4As4O4,LaFeAsO,1.0,0.746467,1.0,0.824277,1.0,0.927284
25191,mp-1289745,Ba4Fe2Mo2O12,Ba2FeMoO6,1.0,0.796140,1.0,0.679065,1.0,0.579124
