In [53]:
import pandas as pd
import pgmpy
from pgmpy.models import BayesianNetwork
from pgmpy.factors.discrete import TabularCPD

In [61]:
# Rows with an hourly count at or above this value are excluded below
# (presumably to trim extreme outliers -- confirm the intended cut-off).
CNT_UPPER_BOUND = 5000

data = pd.read_csv("./london_merged.csv")
# data["date"] = pd.to_datetime(data["timestamp"]).dt.date.astype(str)
# Hour of day taken directly from the datetime accessor instead of
# round-tripping through a "HH:MM:SS" string and slicing its first two chars.
data["time"] = pd.to_datetime(data["timestamp"]).dt.hour
data = data.drop(columns="timestamp")

# Explicit copy so that later column assignments operate on an independent
# frame rather than on a filtered selection of the loaded one.
data = data[data["cnt"] < CNT_UPPER_BOUND].copy()

data.head()


Unnamed: 0,cnt,t1,t2,hum,wind_speed,weather_code,is_holiday,is_weekend,season,time
0,182,3.0,2.0,93.0,6.0,3.0,0.0,1.0,3.0,0
1,138,3.0,2.5,93.0,5.0,1.0,0.0,1.0,3.0,1
2,134,2.5,2.5,96.5,0.0,1.0,0.0,1.0,3.0,2
3,72,2.0,2.0,100.0,0.0,1.0,0.0,1.0,3.0,3
4,47,2.0,0.0,93.0,6.5,1.0,0.0,1.0,3.0,4


In [62]:

# Equal-width discretisation: column name -> number of bins.
# Each bin is labelled with its ordinal position 0..n-1.
equal_width_bins = {"t1": 4, "t2": 4, "time": 6, "hum": 4, "wind_speed": 3}
for column, n_bins in equal_width_bins.items():
    data[column] = pd.cut(data[column], bins=n_bins, labels=list(range(n_bins)))

# The count column is split by quantiles (8 buckets) rather than by
# equal-width intervals, again labelled 0..7.
data["cnt"] = pd.qcut(data["cnt"], q=8, labels=list(range(8)))

In [63]:
# Display the `cnt` column to inspect the result of the quantile binning.
# NOTE(review): the saved output lists interval categories such as
# (102.0, 256.0], whereas the qcut call passes labels 0-7 -- the output looks
# like it came from an earlier version of the binning cell; re-run to confirm.
data['cnt']

0         (102.0, 256.0]
1         (102.0, 256.0]
2         (102.0, 256.0]
3        (-0.001, 102.0]
4        (-0.001, 102.0]
              ...       
17409    (841.0, 1193.0]
17410     (256.0, 547.0]
17411     (256.0, 547.0]
17412     (102.0, 256.0]
17413     (102.0, 256.0]
Name: cnt, Length: 17379, dtype: category
Categories (8, interval[float64, right]): [(-0.001, 102.0] < (102.0, 256.0] < (256.0, 547.0] < (547.0, 841.0] < (841.0, 1193.0] < (1193.0, 1661.0] < (1661.0, 2429.0] < (2429.0, 4992.0]]

In [21]:
# Hand-specified structure: every variable is registered up front, then the
# directed edges (parent, child) are added in one call.
# NOTE(review): "date" is a node here, but the line deriving a `date` column
# from the timestamp is commented out in the loading cell -- confirm before
# fitting this network to `data`.
node_names = [
    "cnt", "t1", "t2", "hum", "wind_speed", "weather_code",
    "is_holiday", "is_weekend", "season", "date", "time",
]
directed_edges = [
    # calendar information derived from the date
    ("date", "is_holiday"),
    ("date", "is_weekend"),
    ("date", "season"),
    ("is_holiday", "cnt"),
    ("is_weekend", "cnt"),
    # weather code depends on season and time, and drives the measurements
    ("season", "weather_code"),
    ("time", "weather_code"),
    ("time", "cnt"),
    ("weather_code", "t1"),
    ("weather_code", "hum"),
    ("weather_code", "wind_speed"),
    # t2 depends on t1, humidity and wind speed, and feeds the count
    ("t1", "t2"),
    ("hum", "t2"),
    ("wind_speed", "t2"),
    ("t2", "cnt"),
]

network = BayesianNetwork()
network.add_nodes_from(node_names)
network.add_edges_from(directed_edges)

NodeView(('cnt', 't1', 't2', 'hum', 'wind_speed', 'weather_code', 'is_holiday', 'is_weekend', 'season', 'date', 'time'))

In [64]:
from pgmpy.estimators import HillClimbSearch
from pgmpy.estimators import BicScore
from pgmpy.models import BayesianModel
from pgmpy.estimators import MaximumLikelihoodEstimator

# Structure learning: hill-climbing search over DAGs on the discretised data,
# using the estimator's default scoring method.
hc = HillClimbSearch(data)
best_model = hc.estimate()

# Rebuild the learned structure as a BayesianNetwork (the class this notebook
# already imports at the top) instead of the deprecated `BayesianModel` alias.
b_model = BayesianNetwork(best_model.edges())
# A model built from edges alone contains only variables that appear in at
# least one edge; re-add every node of the learned DAG so variables the search
# left unconnected still receive a (marginal) CPD when fitting.
b_model.add_nodes_from(best_model.nodes())

# Parameter learning: maximum-likelihood estimates of every CPD.
b_model.fit(data, estimator=MaximumLikelihoodEstimator)
for cpd in b_model.get_cpds():
    print(cpd)

  0%|          | 23/1000000 [00:05<61:32:03,  4.51it/s] 


+-----------------------+-----+-----------------+
| is_holiday            | ... | is_holiday(1.0) |
+-----------------------+-----+-----------------+
| is_weekend            | ... | is_weekend(1.0) |
+-----------------------+-----+-----------------+
| t1                    | ... | t1(3)           |
+-----------------------+-----+-----------------+
| time                  | ... | time(5)         |
+-----------------------+-----+-----------------+
| cnt((-0.001, 102.0])  | ... | 0.125           |
+-----------------------+-----+-----------------+
| cnt((102.0, 256.0])   | ... | 0.125           |
+-----------------------+-----+-----------------+
| cnt((256.0, 547.0])   | ... | 0.125           |
+-----------------------+-----+-----------------+
| cnt((547.0, 841.0])   | ... | 0.125           |
+-----------------------+-----+-----------------+
| cnt((841.0, 1193.0])  | ... | 0.125           |
+-----------------------+-----+-----------------+
| cnt((1193.0, 1661.0]) | ... | 0.125           |
