# 5b. Parameter estimation with missing data & data imputation
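This notebook shows how Thomas estimates the parameters of a Bayesian network from a dataset that contains missing values, and how the estimated network can then be used to impute those missing values.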

In [1]:
# Execute the shared preamble notebook. Per its printed output below, it loads
# the autoreload extension, makes `os`, `logging`, `pd` and `np` available and
# sets up logging to a file.
%run '_preamble.ipynb'

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload

available imports:
  import os
  import logging
  import pandas as pd
  import numpy as np

connect to this kernel with:
  jupyter console --existing 286bdf7b-d823-46d7-b3e2-728944488d17

Logging to: "/Users/melle/software-development/thomas-master/logs/5b. Training network parameters.log"
Current date/time: 08-07-2020, 00:13
Current working directory: "/Users/melle/software-development/thomas-master/notebooks"


In [2]:
from thomas.core import examples
from thomas.core import BayesianNetwork, Factor, CPT, JPT
from thomas.core.bayesian_network import DiscreteNetworkNode

# The widget is optional. Tolerate only a failed import (not arbitrary errors,
# KeyboardInterrupt, ...); when the import fails the name stays undefined and
# the display cells further down fall back to showing plain marginals.
try:
    from thomas.jupyter import BayesianNetworkWidget
except ImportError:
    pass

from IPython.display import display, HTML

## Example 17.2

### Create the structure

In [3]:
# Node name -> [x, y] position passed to each node.
node_positions = {
    'H': [165, 29],
    'S': [66, 141],
    'E': [288, 154],
}

# Every variable is binary with states 'T' and 'F'; each node gets its own
# fresh list of states.
nodes = [
    DiscreteNetworkNode(name, states=['T', 'F'], position=xy)
    for name, xy in node_positions.items()
]
H, S, E = nodes

# H is the parent of both S and E.
edges = [('H', child) for child in ('S', 'E')]

bn = BayesianNetwork('Example 17.2', nodes, edges)

# Reset every node in the network.
# NOTE(review): the widget output below shows 0.5/0.5 marginals, so reset()
# presumably initialises the CPTs to uniform — confirm.
for node in bn.nodes.values():
    node.reset()
    
    

In [3]:
# Show the network before any parameters have been estimated.
# `BayesianNetworkWidget` only exists when the optional import at the top of the
# notebook succeeded; if it did not, looking the name up raises NameError and we
# fall back to displaying the marginals. Any other error is a real problem and
# is allowed to surface instead of being silently swallowed.
try:
    view = BayesianNetworkWidget(bn, height=250)
    display(view)

except NameError:
    display(bn.get_marginals())

BayesianNetworkWidget(height=250, marginals_and_evidence={'marginals': {'H': {'T': 0.5, 'F': 0.5}, 'S': {'T': …

### Load data to learn parameters with

In [4]:
# The `from thomas.core import ...` lines at the top of this notebook do not bind
# the name `thomas` itself, so import it explicitly here instead of relying on
# the preamble (or leftover kernel state) to have done so.
import thomas.core

# Locate the example dataset through the package helper and read it
# (the file is ';'-separated).
filename = thomas.core.get_pkg_filename('dataset_17_2.csv')
df = pd.read_csv(filename, sep=';')

# Keep only the columns that correspond to the network's variables.
df = df[['H', 'S', 'E']]

print(f'df.shape: {df.shape[0]} rows x {df.shape[1]} cols')
df.head(3)

df.shape: 16 rows x 3 cols


Unnamed: 0,H,S,E
0,T,F,T
1,T,F,T
2,F,T,F


In [5]:
# Sanity check: the dataset as loaded is complete (no missing values yet).
na_mask = df.isna()
na_total = na_mask.sum().sum()
print(f'This dataset has {na_total} NAs')

This dataset has 0 NAs


In [6]:
# Blank out a handful of cells on a copy, so the complete `df` stays untouched.
df_na = df.copy()

# (row label, column) of every value to replace with NaN.
# Row 10 loses both S and E, so only H remains observed there.
cells_to_blank = [
    (0, 'H'),
    (1, 'E'),
    (2, 'S'),
    (3, 'S'),
    (10, 'E'),
    (10, 'S'),
    (14, 'E'),
    (15, 'E'),
]

for row_label, column in cells_to_blank:
    df_na.loc[row_label, column] = np.nan

print(f'And now we have {df_na.isna().sum().sum()} NAs')

df_na

And now we have 8 NAs


Unnamed: 0,H,S,E
0,,F,T
1,T,F,
2,F,,F
3,F,,T
4,T,F,F
5,T,F,T
6,F,F,F
7,T,F,T
8,T,F,T
9,F,F,T


In [7]:
# Initialize the probabilities of the BN using ML estimation (which drops any NAs in the dataset).
bn.ML_estimation(df_na)

# Show the network with the estimated parameters.
# `BayesianNetworkWidget` only exists when the optional import at the top of the
# notebook succeeded; if it did not, looking the name up raises NameError and we
# fall back to displaying the marginals. Any other error is a real problem and
# is allowed to surface instead of being silently swallowed.
try:
    view = BayesianNetworkWidget(bn, height=250)
    display(view)

except NameError:
    display(bn.get_marginals())

BayesianNetworkWidget(marginals_and_evidence={'marginals': {'H': {'F': 0.2222222222222222, 'T': 0.777777777777…

### Data imputation

In [8]:
# The estimated probabilities can be used to fill in missing values.
# Take row 10 as the example: S and E were blanked out there, H is observed.
incomplete_row = 10
case = df_na.iloc[incomplete_row]
case

H      T
S    NaN
E    NaN
Name: 10, dtype: object

In [9]:
# Split the case into the variables without a value ('missing', the query)
# and the observed variable/value pairs ('evidence').
missing = [var for var in case.index if pd.isna(case[var])]
evidence = {var: value for var, value in case.items() if var not in missing}

query = BayesianNetwork.create_query_string(qd=missing, ev=evidence)

print('missing: ', missing)
print('evidence:', evidence)
print(f'  --> compute P({query})')

missing:  ['S', 'E']
evidence: {'H': 'T'}
  --> compute P(S,E|H=T)


In [10]:
# complete_case() enumerates every combination of values for the missing
# variables of the case. Called like this, each combination is returned together
# with its probability (the 'weight' column in the output below).
bn.complete_case(case)

Unnamed: 0,H,S,E,weight
0,T,F,F,0.102041
1,T,F,T,0.612245
2,T,T,F,0.040816
3,T,T,T,0.244898


In [11]:
# With include_weights=False only the most likely completion is returned; it
# corresponds to row '1' of the table above (the row with the largest weight).
bn.complete_case(case, include_weights=False)

H    T
S    F
E    T
Name: 1, dtype: object