In [98]:
import pandas as pd
import pgmpy
from pgmpy.models import BayesianNetwork
from pgmpy.factors.discrete import TabularCPD
from pgmpy.inference import VariableElimination
from matplotlib.pyplot import hist
import itertools
import numpy as np

In [99]:
data = pd.read_csv("./london_merged.csv")
data["time"] = pd.to_datetime(data["timestamp"]).dt.time.astype(str).str.slice(0,2).astype(int)
data = data.drop("timestamp", axis = 1)

# data = data[data['cnt'] < 5000]
data = data[data['time'] > 6]

data['is_weekend'] = data['is_weekend'].map({1: "yes", 0: "no"})
data['is_holiday'] = data['is_holiday'].map({1: "yes", 0: "no"})
data['weather'] = data['weather_code'].map({
    1: "clear", 
    2: "few_clouds", 
    3: "broken_clouds", 
    4: "cloudy", 
    7: "rain", 
    10: "thunderstorm",
    26: "snowfall",
    94: "freezing_fog"
})
data['season'] = data['season'].map({
    0: "spring",
    1: "summer",
    2: "fall",
    3: "winter"
})
data["wind"] = data['wind_speed']
data = data.drop('wind_speed', axis = 1)
data["temperature"] = data["t1"]
data = data.drop("t1", axis = 1)
data["temperature_feels"] = data["t2"]
data = data.drop("t2", axis = 1)
data = data.drop("weather_code", axis = 1)

data.head()


Unnamed: 0,cnt,hum,is_holiday,is_weekend,season,time,weather,wind,temperature,temperature_feels
7,75,100.0,no,yes,winter,7,cloudy,7.0,1.0,-1.0
8,131,96.5,no,yes,winter,8,cloudy,8.0,1.5,-1.0
9,301,100.0,no,yes,winter,9,broken_clouds,9.0,2.0,-0.5
10,528,93.0,no,yes,winter,10,broken_clouds,12.0,3.0,-0.5
11,727,100.0,no,yes,winter,11,broken_clouds,12.0,2.0,-1.5


In [100]:

data['temperature'] = pd.cut(data['temperature'], bins=4, labels=[0, 1, 2, 3])
data['temperature_feels'] = pd.cut(data['temperature_feels'], bins=4, labels=[0, 1, 2, 3])
data['time'] = pd.cut(data['time'], bins=4, labels=["morning", "afternoon", "evening", "night"])
data['hum'] = pd.cut(data['hum'], bins=4, labels=[0, 1, 2, 3])
data['wind'] = pd.cut(data['wind'], bins=2, labels=["yes", "no"])
data['cnt'] = pd.qcut(data['cnt'], q=8, labels=[0, 1, 2, 3, 4, 5, 6, 7])

In [101]:
data.head()

Unnamed: 0,cnt,hum,is_holiday,is_weekend,season,time,weather,wind,temperature,temperature_feels
7,0,3,no,yes,winter,morning,cloudy,yes,0,0
8,0,3,no,yes,winter,morning,cloudy,yes,0,0
9,0,3,no,yes,winter,morning,broken_clouds,yes,0,0
10,1,3,no,yes,winter,morning,broken_clouds,yes,0,0
11,1,3,no,yes,winter,morning,broken_clouds,yes,0,0


In [102]:
network = BayesianNetwork()
network.add_nodes_from(["cnt", "temperature", "wind", "weather", "is_weekend", "season", "time", "temperature_feels", "hum", "is_holiday"])

network.add_edge("season", "weather")
network.add_edge("time", "temperature")
network.add_edge("weather", "temperature")
network.add_edge("weather", "hum")
network.add_edge("weather", "wind")
network.add_edge("temperature", "temperature_feels")
network.add_edge("hum", "temperature_feels")
network.add_edge("wind", "temperature_feels")
network.add_edge("is_holiday", "cnt")
network.add_edge("is_weekend", "cnt")
network.add_edge("time", "cnt")
network.add_edge("temperature_feels", "cnt")

In [103]:
def createCPD(name, parents):
    domains = [sorted(data[p].unique()) for p in parents]
    combinations = list(itertools.product(*tuple(domains)))
    cpd = []
    for v in sorted(data[name].unique()):
        row = []
        for c in combinations:
            numerator = data[name] == v
            denominator = [True for i in range(data.shape[0])]
            for i, p in enumerate(parents):
                numerator = (numerator) & (data[p] == c[i])
                denominator = (denominator) & (data[p] == c[i])
            if data[denominator].shape[0] != 0:
                row.append(data[numerator].shape[0] / data[denominator].shape[0])
            else:
                row.append(data[data[name] == v].shape[0] / data.shape[0])

        cpd.append(row)
    return TabularCPD(name, len(data[name].unique()), cpd, evidence = parents, evidence_card = [len(data[p].unique()) for p in parents])

In [104]:
cpd_tables = {}
for node in network.nodes():
    in_edges = [edge[0] for edge in network.edges() if edge[1] == node]
    cpd_tables[node] = createCPD(node, in_edges)
network.add_cpds(*tuple(cpd_tables.values()))

In [105]:
# Variable Elimination
infer_non_adjust = VariableElimination(network)
print(infer_non_adjust.query(variables=["cnt"], evidence={"temperature_feels": 3, "is_holiday": "yes"}))

ValueError: Node T not in not in graph