In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from pgmpy.estimators import BDeuScore, K2Score, BicScore, ExhaustiveSearch, HillClimbSearch, BayesianEstimator, MaximumLikelihoodEstimator
from pgmpy.models import BayesianModel
from pgmpy.inference import BeliefPropagation

## Data import and variable selection
- Import the geosmin, microcystin, and cyanobacteria abundance datasets, plus the solar/precipitation dataset (`solar&prec.csv`)
- Select columns according to importance ranking<br>

In [2]:
# Load each water-quality dataset, keeping only the columns picked by the
# importance ranking: the sampling date, the chosen predictors, and the
# response column (Geo / MC / Cyano) last.
df_g = pd.read_csv("geosmin.csv")[
    ['Date', 'SSC', 'NO', 'cos', 'DO', 'Si', 'TKN', 'Geo']
]
df_m = pd.read_csv("microcystin.csv")[
    ['Date', 'Temp', 'sin', 'DO', 'TP', 'Fe', 'MC']
]
df_c = pd.read_csv("cyanobacteria_abundance.csv")[
    ['Date', 'pH', 'DP', 'TKN', 'DO', 'Turb', 'Temp', 'Cyano']
]

# The solar/precipitation table keeps every column except the location and
# calendar-component fields listed here.
df_sp = pd.read_csv("solar&prec.csv").drop(
    columns=["LAT", "LON", "YEAR", "MO", "DY"]
)

## Data merge

In [3]:
# Inner-join the solar/precipitation table (left) with each water-quality
# table (right) on the shared 'Date' column, so only dates present in both
# tables survive.
df_geo = pd.merge(df_sp, df_g, how='inner', on='Date')
df_mc = pd.merge(df_sp, df_m, how='inner', on='Date')
df_cyano = pd.merge(df_sp, df_c, how='inner', on='Date')

## Predict the missing data
Predicts states of all the missing variables.

## Split the data into training/test set
Each merged dataset is split at random: 75% of the rows are used for training and the remaining 25% for testing.

In [4]:
# Seed and training fraction shared by every split below. A fixed seed makes
# the partition (and the CSV files written at the end of this cell)
# reproducible across kernel restarts.
SPLIT_SEED = 42
TRAIN_FRACTION = 0.75

# Predictors AND response are both sliced from the merged frames. Taking the
# response from the pre-merge frames (df_g / df_m / df_c) is unsafe: after the
# inner join on 'Date' the merged frames are not guaranteed to have the same
# number or order of rows, so train_test_split would either raise on
# inconsistent lengths or silently pair a response value with another date's
# predictors.
x_geo = df_geo.loc[:, 'PRECTOT':'TKN']
y_geo = df_geo.loc[:, 'Geo']
x_mc = df_mc.loc[:, 'PRECTOT':'Fe']
y_mc = df_mc.loc[:, 'MC']
x_cyano = df_cyano.loc[:, 'PRECTOT':'Temp']
y_cyano = df_cyano.loc[:, 'Cyano']

# Random 75/25 split of each dataset; x and y are split with the same row
# selection so they stay paired.
x_geo_train, x_geo_test, y_geo_train, y_geo_test = train_test_split(
    x_geo, y_geo, train_size=TRAIN_FRACTION, random_state=SPLIT_SEED
)
x_mc_train, x_mc_test, y_mc_train, y_mc_test = train_test_split(
    x_mc, y_mc, train_size=TRAIN_FRACTION, random_state=SPLIT_SEED
)
x_cyano_train, x_cyano_test, y_cyano_train, y_cyano_test = train_test_split(
    x_cyano, y_cyano, train_size=TRAIN_FRACTION, random_state=SPLIT_SEED
)

# Re-attach each response to its predictors (concat aligns on the shared row
# index) and renumber the rows from 0, discarding the shuffled original index.
geo_train = pd.concat([x_geo_train, y_geo_train], axis=1).reset_index(drop=True)
mc_train = pd.concat([x_mc_train, y_mc_train], axis=1).reset_index(drop=True)
cyano_train = pd.concat([x_cyano_train, y_cyano_train], axis=1).reset_index(drop=True)
geo_test = pd.concat([x_geo_test, y_geo_test], axis=1).reset_index(drop=True)
mc_test = pd.concat([x_mc_test, y_mc_test], axis=1).reset_index(drop=True)
cyano_test = pd.concat([x_cyano_test, y_cyano_test], axis=1).reset_index(drop=True)

# Persist the splits. The default index column is still written so the file
# layout is unchanged for anything that reads these CSVs.
geo_train.to_csv('geo_train.csv')
mc_train.to_csv('mc_train.csv')
cyano_train.to_csv('cyano_train.csv')
geo_test.to_csv('geo_test.csv')
mc_test.to_csv('mc_test.csv')
cyano_test.to_csv('cyano_test.csv')

In [5]:
# Display the cyanobacteria training frame built in the split cell above
# (predictor columns followed by the 'Cyano' response).
cyano_train

Unnamed: 0,PRECTOT,ALLSKY_SFC_SW_DWN,Date,pH,DP,TKN,DO,Turb,Temp,Cyano
0,0.02,27.20,2010-06-17,8.25,0.090,0.424,6.75,16.5,24.8,166
1,0.00,11.20,2013-01-23,8.66,0.015,0.556,13.13,3.6,2.1,24537
2,0.00,11.01,2013-01-15,8.66,0.015,0.496,13.24,4.0,1.0,32993
3,0.84,29.70,2006-07-13,8.39,0.103,0.861,7.65,10.5,25.4,3509
4,1.32,23.14,2009-08-05,8.13,0.080,0.591,5.77,14.5,25.1,5542
...,...,...,...,...,...,...,...,...,...,...
133,0.00,11.80,2014-10-28,8.21,0.040,0.553,8.68,13.6,16.8,11406
134,0.00,19.14,2008-10-01,8.39,0.060,0.650,8.26,15.4,21.6,25661
135,0.29,12.10,2008-08-18,8.09,0.090,0.604,5.36,14.5,24.6,12420
136,0.18,13.21,2003-02-10,8.00,0.015,0.556,14.42,1.6,1.2,2228
