In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from pgmpy.estimators import BDeuScore, K2Score, BicScore, ExhaustiveSearch, HillClimbSearch
from pgmpy.models import BayesianModel

## Data import and variable selection
- Import geosmin, microcystin and cyanobacteria abundance datasets
- Select columns according to importance ranking<br>
Geosmin:<br>
![image](var_geo.gif)<br>
Cyanobacteria:<br>
![image](var_cyano.gif)<br>
Microcystin:<br>
![image](var_mc.gif)<br>

In [3]:
df_g = pd.read_csv("geosmin.csv")[['Date','SSC','NO','cos','DO','Si','TKN','Geo']]
df_m = pd.read_csv("microcystin.csv")[['Date','Temp','sin','DO','Chl-a','TP','Fe','MC']]
df_c = pd.read_csv("cyanobacteria_abundance.csv")[['Date','Chl-a','sin','TKN','DP','Si','Temp','Cyano']]
df_sp = pd.read_csv("solar&prec.csv").drop(["LAT","LON","YEAR","MO","DY"], axis=1)

## Data merge

In [11]:
df_geo = df_sp.merge(df_g,on = 'Date',how = 'inner').drop(['Date'],axis=1)
df_mc = df_sp.merge(df_m,on = 'Date',how = 'inner').drop(['Date'],axis=1)
df_cyano = df_sp.merge(df_c,on = 'Date',how = 'inner').drop(['Date'],axis=1)
df_geo

Unnamed: 0,PRECTOT,ALLSKY_SFC_SW_DWN,SSC,NO,cos,DO,Si,TKN,Geo
0,0.00,22.07,12.8,0.030,-0.81,5.91,1.44,0.590,3.7
1,0.02,22.86,10.9,0.070,-0.44,6.70,7.04,0.556,4.8
2,0.18,13.21,3.8,0.010,0.76,14.42,8.55,0.556,22.0
3,0.96,15.84,2.6,0.170,-0.98,6.16,5.95,0.619,63.0
4,0.36,22.03,18.1,0.030,-1.00,7.98,7.56,0.786,7.0
...,...,...,...,...,...,...,...,...,...
179,0.00,10.37,3.0,0.005,0.98,14.64,162.00,0.469,7.6
180,0.00,14.24,4.0,0.005,0.76,13.65,0.71,0.697,5.0
181,0.02,19.74,3.0,0.005,0.37,13.57,0.71,0.633,2.7
182,0.04,12.85,40.0,0.034,-0.23,8.30,10.14,0.738,0.5


## Predict the missing data
Predicts states of all the missing variables.

## Split the data into training/test set
75% of the response variable data were used in training. The data was selected at random.

In [12]:
y_geo = df_g.loc[:, 'Geo']
x_geo = df_geo.loc[:, 'PRECTOT':'TKN']
y_mc = df_m.loc[:, 'MC']
x_mc = df_mc.loc[:, 'PRECTOT':'Fe']
y_cyano = df_c.loc[:, 'Cyano']
x_cyano = df_cyano.loc[:, 'PRECTOT':'Temp']
x_geo_train, x_geo_test, y_geo_train, y_geo_test = train_test_split(x_geo, y_geo, train_size=0.75)
x_mc_train, x_mc_test, y_mc_train, y_mc_test = train_test_split(x_mc, y_mc, train_size=0.75)
x_cyano_train, x_cyano_test, y_cyano_train, y_cyano_test = train_test_split(x_cyano, y_cyano, train_size=0.75)
geo_train = pd.concat([x_geo_train, y_geo_train], axis = 1)
mc_train = pd.concat([x_mc_train, y_mc_train], axis = 1)
cyano_train = pd.concat([x_cyano_train, y_cyano_train], axis = 1)
mc_train

Unnamed: 0,PRECTOT,ALLSKY_SFC_SW_DWN,Temp,sin,DO,Chl-a,TP,Fe,MC
145,1.79,3.13,4.7,0.67,11.96,13.6,0.060,280.0,0.05
76,0.00,20.69,6.4,0.96,11.47,27.5,0.030,490.0,0.05
107,0.00,25.03,26.0,-0.82,6.47,3.8,0.190,1810.0,0.05
168,0.00,11.80,16.8,-0.89,8.68,15.5,0.070,670.0,0.19
20,0.02,13.10,7.7,1.00,12.70,19.2,0.065,590.0,0.05
...,...,...,...,...,...,...,...,...,...
132,0.03,19.97,8.4,0.95,11.60,32.3,0.110,690.0,0.05
79,10.72,11.71,14.9,0.89,7.96,2.2,0.080,1230.0,0.05
136,0.00,28.68,25.4,0.06,6.92,4.9,0.120,1100.0,0.39
139,0.01,24.57,23.4,-0.77,6.64,9.1,0.150,1040.0,0.19


## Structure learning
- Score based structure learning using BDeu, K2 and BIC. It computes a score to measure how well the given Bayesian model fits to the data set
- Searching methods include Exhaustive Search(Searches all possible graph with a given set of nodes) and Hill Climb Search(Starts at model start_dag and proceeds by step-by-step network modifications until a local maximum is reached) according to the scorng method supplied.

Given n nodes, 2^(n*(n-1)) graphs need to be searched in Exhaustive Search. It is likely not feasible for n>6. 

In [13]:
#Ramdon sample 20% rows
df_mc_sl = df_mc.sample(frac=0.2)
df_geo_sl = df_geo.sample(frac=0.2)
df_cyano_sl = df_cyano.sample(frac=0.2)

In [14]:
#Use BIC scoring method and HillClimbSearch for microcystin, cyanobacteria abundance and geosmin model
hc_mc = HillClimbSearch(df_mc_sl, scoring_method=BicScore(df_mc))
best_model_mc = hc_mc.estimate()
print(best_model_mc.edges())
hc_geo = HillClimbSearch(df_geo_sl, scoring_method=BicScore(df_geo))
best_model_geo = hc_geo.estimate()
print(best_model_geo.edges())
hc_cyano = HillClimbSearch(df_cyano_sl, scoring_method=BicScore(df_cyano))
best_model_cyano = hc_cyano.estimate()
print(best_model_cyano.edges())

  0%|          | 15/1000000 [00:02<50:44:00,  5.48it/s]
  0%|          | 0/1000000 [00:00<?, ?it/s]

[('PRECTOT', 'TP'), ('PRECTOT', 'Fe'), ('ALLSKY_SFC_SW_DWN', 'DO'), ('Temp', 'Chl-a'), ('Temp', 'ALLSKY_SFC_SW_DWN'), ('sin', 'Temp'), ('Chl-a', 'ALLSKY_SFC_SW_DWN'), ('TP', 'Temp'), ('Fe', 'sin'), ('Fe', 'Chl-a'), ('Fe', 'TP'), ('Fe', 'DO'), ('MC', 'TP'), ('MC', 'Fe')]


  0%|          | 12/1000000 [00:02<55:43:19,  4.99it/s]
  0%|          | 0/1000000 [00:00<?, ?it/s]

[('PRECTOT', 'NO'), ('PRECTOT', 'cos'), ('ALLSKY_SFC_SW_DWN', 'DO'), ('ALLSKY_SFC_SW_DWN', 'Geo'), ('SSC', 'cos'), ('NO', 'cos'), ('NO', 'SSC'), ('NO', 'TKN'), ('cos', 'TKN'), ('cos', 'ALLSKY_SFC_SW_DWN'), ('DO', 'Si'), ('TKN', 'ALLSKY_SFC_SW_DWN')]


  0%|          | 11/1000000 [00:01<28:17:03,  9.82it/s]

[('PRECTOT', 'DP'), ('PRECTOT', 'Cyano'), ('ALLSKY_SFC_SW_DWN', 'Chl-a'), ('ALLSKY_SFC_SW_DWN', 'Si'), ('Chl-a', 'Temp'), ('Chl-a', 'sin'), ('TKN', 'ALLSKY_SFC_SW_DWN'), ('DP', 'Cyano'), ('DP', 'TKN'), ('Cyano', 'TKN'), ('Cyano', 'ALLSKY_SFC_SW_DWN')]





## Define the Bayesian model

## Parameter learning

## Max-Sum algorithm for MAP inference

## Model Validation