In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from pgmpy.estimators import BDeuScore, K2Score, BicScore, ExhaustiveSearch, HillClimbSearch, BayesianEstimator, MaximumLikelihoodEstimator
from pgmpy.models import BayesianModel
from pgmpy.inference import BeliefPropagation

## Data import and variable selection
- Import the geosmin, microcystin, and cyanobacteria abundance datasets, plus the solar/precipitation/wind dataset
- Select columns according to importance ranking

In [13]:
# Columns kept from the geosmin and microcystin files, in the order the
# later label-based column slices rely on.
geo_columns = ['Date', 'SSC', 'NO', 'cos', 'DO', 'Si', 'TKN', 'Geo']
mc_columns = ['Date', 'Temp', 'sin', 'DO', 'TP', 'Fe', 'MC']
# Columns removed from the solar/precipitation/wind file.
spw_dropped_columns = ["LAT", "LON", "YEAR", "MO", "DY"]

geosmin_raw = pd.read_csv("geosmin.csv")
df_g = geosmin_raw[geo_columns]

microcystin_raw = pd.read_csv("microcystin.csv")
df_m = microcystin_raw[mc_columns]

# The cyanobacteria abundance file is used with all of its columns.
df_c = pd.read_csv("cyanobacteria_abundance.csv")

spw_raw = pd.read_csv("solar&prec&wind.csv")
df_spw = spw_raw.drop(columns=spw_dropped_columns)

## Data merge

In [14]:
# Join df_spw (left) with each response dataset (right) on 'Date'.
# The inner join keeps only the dates that appear in both frames.
df_geo, df_mc, df_cyano = (
    pd.merge(df_spw, response_frame, how='inner', on='Date')
    for response_frame in (df_g, df_m, df_c)
)

In [15]:
# Write the merged cyanobacteria frame to disk. The index is written too
# (to_csv default), so it becomes the first column of the CSV file.
df_cyano.to_csv(path_or_buf='cyano&spw.csv')

## Predict the missing data
Predict the states of all missing variables.

## Split the data into training/test set
For each response variable, 75% of the data is used for training and the remaining 25% is held out for testing. The rows are selected at random.

In [12]:
# Split each merged dataset into a training and a test set and save both.
#
# TRAIN_FRACTION of the rows go to training; the rest are held out for testing.
# SPLIT_SEED fixes the shuffling so that re-running the notebook reproduces
# the same partitions and therefore the same CSV files.
TRAIN_FRACTION = 0.75
SPLIT_SEED = 42

# Responses are taken from the *merged* frames, i.e. the same frames the
# predictors are sliced from. The inner join on 'Date' builds a new frame with
# its own row index, so a response read from the un-merged frame is only
# aligned with the predictors if the join happened to keep every row in the
# same order; otherwise train_test_split raises on inconsistent lengths or,
# worse, silently pairs predictors with the wrong response value.
#
# Predictor slices are label-based and inclusive of both end columns, so they
# contain every column located between the two labels.
y_geo = df_geo.loc[:, 'Geo']
x_geo = df_geo.loc[:, 'PRECTOT':'TKN']
y_mc = df_mc.loc[:, 'MC']
x_mc = df_mc.loc[:, 'PRECTOT':'Fe']
y_cyano = df_cyano.loc[:, 'Cyano']
x_cyano = df_cyano.loc[:, 'PRECTOT':'NONH']

x_geo_train, x_geo_test, y_geo_train, y_geo_test = train_test_split(
    x_geo, y_geo, train_size=TRAIN_FRACTION, random_state=SPLIT_SEED)
x_mc_train, x_mc_test, y_mc_train, y_mc_test = train_test_split(
    x_mc, y_mc, train_size=TRAIN_FRACTION, random_state=SPLIT_SEED)
x_cyano_train, x_cyano_test, y_cyano_train, y_cyano_test = train_test_split(
    x_cyano, y_cyano, train_size=TRAIN_FRACTION, random_state=SPLIT_SEED)

# Re-attach each response as the last column of its predictor frame.
# reset_index(drop=True) discards the shuffled row labels and renumbers the
# rows from 0 without creating a throw-away 'index' column.
geo_train = pd.concat([x_geo_train, y_geo_train], axis=1).reset_index(drop=True)
mc_train = pd.concat([x_mc_train, y_mc_train], axis=1).reset_index(drop=True)
cyano_train = pd.concat([x_cyano_train, y_cyano_train], axis=1).reset_index(drop=True)
geo_test = pd.concat([x_geo_test, y_geo_test], axis=1).reset_index(drop=True)
mc_test = pd.concat([x_mc_test, y_mc_test], axis=1).reset_index(drop=True)
cyano_test = pd.concat([x_cyano_test, y_cyano_test], axis=1).reset_index(drop=True)

# Save the partitions. The row index is written as well (to_csv default), so
# each file starts with an unnamed index column.
geo_train.to_csv('geo_train.csv')
mc_train.to_csv('mc_train.csv')
cyano_train.to_csv('cyano_train.csv')
geo_test.to_csv('geo_test.csv')
mc_test.to_csv('mc_test.csv')
cyano_test.to_csv('cyano_test.csv')

In [11]:
# Display the cyanobacteria training set built by the train/test split cell.
# NOTE(review): this cell's execution count (11) is lower than that of the
# split cell (12) that defines cyano_train, so the output below may come from
# an earlier kernel state -- re-run the notebook top to bottom to confirm.
cyano_train

Unnamed: 0,PRECTOT,ALLSKY_SFC_SW_DWN,Date,sin,cos,DO,elev,pH,Spc,Temp,...,DP,TP,TN,FC,Chl-a,Fe,SSC,TNTP,NONH,Cyano
0,0.00,16.81,2014-09-16,-0.97,-0.25,7.96,1420.94,8.19,896.0,20.7,...,0.060,0.080,0.701,2.0,20.7,600,41,8.76,0.81,12966
1,0.24,27.43,2007-08-07,-0.59,-0.81,6.08,1422.54,8.45,755.0,27.1,...,0.129,0.186,0.795,1.0,18.8,890,16,4.27,1.40,1127
2,0.08,28.01,2009-06-23,0.15,-0.99,7.47,1421.98,8.35,763.0,25.4,...,0.110,0.160,1.096,1.0,29.1,900,18,6.85,26.67,1437
3,0.03,11.02,2006-04-25,0.92,-0.40,8.48,1421.55,8.33,862.0,17.1,...,0.040,0.090,1.039,4.0,11.1,200,34,11.54,1.25,4196
4,0.03,19.97,2012-03-12,0.95,0.33,11.60,1418.89,8.76,915.0,8.4,...,0.060,0.110,0.775,4.0,32.3,690,23,7.05,2.10,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133,1.10,20.14,2011-08-08,-0.60,-0.80,6.60,1419.22,8.95,848.0,27.7,...,0.160,0.220,0.601,45.0,33.5,1130,23,2.73,0.67,40298
134,0.01,25.31,2010-08-12,-0.66,-0.76,5.74,1421.46,8.07,639.0,28.6,...,0.150,0.200,1.017,60.0,5.1,1680,20,5.09,29.33,133
135,0.00,8.14,2012-01-18,0.30,0.95,12.82,1417.24,8.46,952.0,2.1,...,0.210,0.220,0.853,2.0,5.2,1430,34,3.88,6.13,1
136,0.00,28.34,2012-05-16,0.71,-0.71,7.82,1419.65,8.31,958.0,20.5,...,0.090,0.100,0.903,1.0,6.0,1130,24,9.03,12.35,1
