In [90]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from pgmpy.estimators import BDeuScore, K2Score, BicScore
from pgmpy.models import BayesianModel
from pgmpy.estimators import ExhaustiveSearch
from pgmpy.estimators import HillClimbSearch

## Data import and merge

In [91]:
df_g = pd.read_csv("geosmin.csv")
df_m = pd.read_csv("microcystin.csv")
df_c = pd.read_csv("cyanobacteria_abundance.csv")
var = pd.read_csv("variable_table.csv")
df_sp = pd.read_csv("solar&prec.csv").drop(["LAT","LON","YEAR","MO","DY"], axis=1)
df_geo = df_g.merge(df_sp,on = 'Date',how = 'inner')
df_mc = df_m.merge(df_sp,on = 'Date',how = 'inner')
df_cyano = df_c.merge(df_sp,on = 'Date',how = 'inner')
df_cyano.head()

Unnamed: 0,Date,sin,cos,DO,elev,pH,Spc,Temp,Turb,Bicarb,...,TN,FC,Chl-a,Fe,SSC,TNTP,NONH,Cyano,PRECTOT,ALLSKY_SFC_SW_DWN
0,2002-08-07,-0.59,-0.81,5.91,1419.99,8.42,852.0,27.0,11.6,200.0,...,0.62,1.0,3.1,358,13,5.64,0.38,48,0.0,22.07
1,2002-09-04,-0.9,-0.44,6.7,1420.36,8.49,831.0,25.9,12.4,182.0,...,0.626,3.0,5.2,125,11,5.69,1.75,2278,0.02,22.86
2,2003-02-10,0.65,0.76,14.42,1421.65,8.0,891.0,1.2,1.6,198.0,...,0.566,1.0,8.7,95,4,37.73,0.67,2228,0.18,13.21
3,2003-06-20,0.2,-0.98,6.16,1421.69,8.28,807.0,22.3,4.8,198.0,...,0.789,1.0,2.0,534,3,15.78,4.25,2934,0.96,15.84
4,2003-07-07,-0.09,-1.0,7.98,1421.35,8.42,809.0,25.6,24.1,191.0,...,0.816,107.0,12.3,351,18,6.53,2.0,5326,0.36,22.03


## Predict the missing data
Predicts states of all the missing variables.

## Split the data into training/test set
75% of the response variable data were used in training. The data was selected at random.

In [92]:
y_geo = df_geo.loc[:, 'Geo']
x_geo = df_geo.loc[:, 'Date':'ALLSKY_SFC_SW_DWN']
y_mc = df_mc.loc[:, 'MC']
x_mc = df_mc.loc[:, 'Date':'ALLSKY_SFC_SW_DWN']
y_cyano = df_cyano.loc[:, 'Cyano']
x_cyano = df_cyano.loc[:, 'Date':'ALLSKY_SFC_SW_DWN']
x_geo_train, x_geo_test, y_geo_train, y_geo_test = train_test_split(x_geo, y_geo, train_size=0.75)
x_mc_train, x_mc_test, y_mc_train, y_mc_test = train_test_split(x_mc, y_mc, train_size=0.75)
x_cyano_train, x_cyano_test, y_cyano_train, y_cyano_test = train_test_split(x_cyano, y_cyano, train_size=0.75)
geo_train = pd.concat([x_geo_train, y_geo_train], axis = 1)
mc_train = pd.concat([x_mc_train, y_mc_train], axis = 1)
cyano_train = pd.concat([x_cyano_train, y_cyano_train], axis = 1)
mc_train

Unnamed: 0,Date,sin,cos,DO,elev,pH,Spc,Temp,Turb,Bicarb,...,FC,Chl-a,Fe,SSC,TNTP,NONH,MC,PRECTOT,ALLSKY_SFC_SW_DWN,MC.1
169,2014-11-20,-0.65,0.76,12.65,1419.98,8.68,929,4.1,5.5,205.0,...,3.0,13.4,280.0,9.0,11.58,0.13,0.20,0.00,11.37,0.20
171,2015-01-13,0.22,0.98,14.64,1420.04,8.68,981,0.5,4.4,216.0,...,1.0,13.6,210.0,3.0,16.25,0.33,0.05,0.00,10.37,0.05
101,2010-05-18,0.69,-0.72,8.04,1421.93,8.22,814,17.6,21.0,180.0,...,0.5,1.2,1210.0,18.0,10.62,1.00,0.05,1.89,23.73,0.05
69,2009-01-06,0.10,0.99,13.19,1421.62,8.25,842,0.8,7.3,204.0,...,1.0,21.1,380.0,8.0,13.43,31.33,0.05,0.00,5.79,0.05
76,2009-03-16,0.96,0.28,11.47,1421.87,8.62,847,6.4,9.3,193.0,...,1.0,27.5,490.0,11.0,27.70,0.33,0.05,0.00,20.69,0.05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62,2008-08-18,-0.74,-0.67,5.36,1421.32,8.09,818,24.6,14.5,189.0,...,1.0,7.2,730.0,14.0,7.67,6.00,0.19,0.29,12.10,0.19
90,2009-10-19,-0.95,0.31,9.58,1421.86,8.26,752,12.2,14.8,176.0,...,140.0,2.5,850.0,14.0,11.11,10.50,0.05,0.02,15.62,0.05
157,2013-11-18,-0.67,0.74,10.68,1421.48,8.26,791,9.3,20.5,205.0,...,7.0,6.4,1140.0,33.0,11.50,3.19,0.12,0.00,12.05,0.12
74,2009-03-03,0.88,0.48,12.27,1421.61,8.84,845,4.4,7.7,195.0,...,4.0,24.9,380.0,10.0,13.88,4.00,0.05,0.00,12.16,0.05


## Structure learning
- Score based structure learning using BDeu, K2 and BIC. It computes a score to measure how well the given Bayesian model fits to the data set
- Searching methods include Exhaustive Search(Searches all possible graph with a given set of nodes) and Hill Climb Search(Starts at model start_dag and proceeds by step-by-step network modifications until a local maximum is reached) according to the scorng method supplied.

Given n nodes, 2^(n*(n-1)) graphs need to be searched in Exhaustive Search. It is likely not feasible for n>6. 

In [96]:
#BIC scoring
hc1 = HillClimbSearch(df_mc, scoring_method=BicScore(df_mc))
best_model1 = hc1.estimate()
print(best_model1.edges())
# BDeu scoring
hc2 = HillClimbSearch(df_mc, scoring_method=BDeuScore(df_mc))
best_model2 = hc2.estimate()
print(best_model2.edges())
# K2 scoring
hc3 = HillClimbSearch(df_mc, scoring_method=K2Score(df_mc))
best_model3 = hc3.estimate()
print(best_model3.edges())

  0%|          | 24/1000000 [04:40<3249:56:03, 11.70s/it]


KeyboardInterrupt: 

In [93]:
#Use BIC scoring method for microcystin, cyanobacteria abundance and geosmin model
hc_mc = HillClimbSearch(df_mc, scoring_method=BicScore(df_mc))
best_model_mc = hc_mc.estimate()
hc_geo = HillClimbSearch(df_geo, scoring_method=BicScore(df_geo))
best_model_geo = hc_geo.estimate()
hc_cyano = HillClimbSearch(df_cyano, scoring_method=BicScore(df_cyano))
best_model_cyano = hc_cyano.estimate()

AttributeError: 'DataFrame' object has no attribute 'unique'

## Define a Bayesian model
- 

  0%|          | 22/1000000 [01:10<893:06:43,  3.22s/it]


KeyboardInterrupt: 

## Parameter learning

## Max-Sum algorithm for MAP inference

## Model Validation