In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from pgmpy.estimators import BDeuScore, K2Score, BicScore, ExhaustiveSearch, HillClimbSearch, BayesianEstimator, MaximumLikelihoodEstimator
from pgmpy.models import BayesianModel
from pgmpy.inference import BeliefPropagation

## Data import and variable selection
- Import the geosmin, microcystin and cyanobacteria abundance datasets, together with the solar/precipitation data
- Select columns according to their importance ranking

In [4]:
# Columns retained from each response dataset: the 'Date' merge key, the
# selected predictor columns, and the response column in last position.
geosmin_columns = ['Date', 'SSC', 'NO', 'cos', 'DO', 'Si', 'TKN', 'Geo']
microcystin_columns = ['Date', 'Temp', 'sin', 'DO', 'TP', 'Fe', 'MC']
cyano_columns = ['Date', 'DO', 'TKN', 'DP', 'Turb', 'Temp', 'Cyano']
# Columns of the solar/precipitation file that are discarded after loading.
unused_sp_columns = ["LAT", "LON", "YEAR", "MO", "DY"]

geosmin_raw = pd.read_csv("geosmin.csv")
df_g = geosmin_raw[geosmin_columns]

microcystin_raw = pd.read_csv("microcystin.csv")
df_m = microcystin_raw[microcystin_columns]

cyano_raw = pd.read_csv("cyanobacteria_abundance.csv")
df_c = cyano_raw[cyano_columns]

solar_prec_raw = pd.read_csv("solar&prec.csv")
df_sp = solar_prec_raw.drop(columns=unused_sp_columns)

## Data merge

In [5]:
# Inner-join each response dataset onto df_sp (loaded from solar&prec.csv)
# using 'Date' as the key; only dates present in both frames are kept.
df_geo = pd.merge(df_sp, df_g, how='inner', on='Date')
df_mc = pd.merge(df_sp, df_m, how='inner', on='Date')
df_cyano = pd.merge(df_sp, df_c, how='inner', on='Date')

## Predict the missing data
Predicts states of all the missing variables.

## Split the data into training/test set
For each response variable, 75% of the rows are used for training and the remaining 25% for testing. Rows are assigned to the two sets at random.

In [6]:
# Fixed seed so that "Restart Kernel -> Run All" reproduces the same random
# partition (and therefore the same CSV files) on every run.
SPLIT_SEED = 42
# Fraction of rows assigned to the training set; the rest form the test set.
TRAIN_FRACTION = 0.75


def _join_reindexed(features, response):
    """Place predictors and response side by side and renumber rows from 0.

    ``pd.concat(axis=1)`` aligns on the row index, so both inputs must carry
    the same row labels (as the paired outputs of one ``train_test_split``
    call do). The original row labels are discarded.
    """
    return pd.concat([features, response], axis=1).reset_index(drop=True)


# Predictors and response are both taken from the *merged* frames. The
# pre-merge frames (df_g, df_m, df_c) are not guaranteed to share the row
# count, row order or index of the inner-merged frames; using them for the
# response would either make train_test_split raise on inconsistent lengths
# or silently pair predictors with the wrong response value.
y_geo = df_geo.loc[:, 'Geo']
x_geo = df_geo.loc[:, 'PRECTOT':'TKN']
y_mc = df_mc.loc[:, 'MC']
x_mc = df_mc.loc[:, 'PRECTOT':'Fe']
y_cyano = df_cyano.loc[:, 'Cyano']
x_cyano = df_cyano.loc[:, 'PRECTOT':'Temp']

# Random 75/25 split of each dataset, seeded for reproducibility.
x_geo_train, x_geo_test, y_geo_train, y_geo_test = train_test_split(
    x_geo, y_geo, train_size=TRAIN_FRACTION, random_state=SPLIT_SEED)
x_mc_train, x_mc_test, y_mc_train, y_mc_test = train_test_split(
    x_mc, y_mc, train_size=TRAIN_FRACTION, random_state=SPLIT_SEED)
x_cyano_train, x_cyano_test, y_cyano_train, y_cyano_test = train_test_split(
    x_cyano, y_cyano, train_size=TRAIN_FRACTION, random_state=SPLIT_SEED)

# Re-attach the response to its predictors for each partition.
geo_train = _join_reindexed(x_geo_train, y_geo_train)
mc_train = _join_reindexed(x_mc_train, y_mc_train)
cyano_train = _join_reindexed(x_cyano_train, y_cyano_train)
geo_test = _join_reindexed(x_geo_test, y_geo_test)
mc_test = _join_reindexed(x_mc_test, y_mc_test)
cyano_test = _join_reindexed(x_cyano_test, y_cyano_test)

# Persist every partition. to_csv keeps its default behaviour of writing the
# (renumbered) index as the first column, so the file layout is unchanged for
# anything that already reads these files.
for csv_path, partition in (
    ('geo_train.csv', geo_train),
    ('mc_train.csv', mc_train),
    ('cyano_train.csv', cyano_train),
    ('geo_test.csv', geo_test),
    ('mc_test.csv', mc_test),
    ('cyano_test.csv', cyano_test),
):
    partition.to_csv(csv_path)