# Байесовская сеть

In [None]:
import numpy as np
import pandas as pd
import scipy

import plotly.express as px
import seaborn as sns
%matplotlib notebook

## Подготовка данных

In [None]:
# Load the training data: every column except 'floor' is read,
# and 'date' is parsed into datetimes.
df = pd.read_csv(
    'data/raifhack_train.csv',
    usecols=lambda column_name: column_name != 'floor',
    parse_dates=['date'],
)

In [None]:
# Keep only the rows whose price_type equals 1
# (judging by the mask name these are expert valuations -- TODO confirm).
is_expert = df['price_type'].eq(1)
df = df.loc[is_expert]

In [None]:
# Rows dated in July or August form the test split; everything else is training data.
for_test = df['date'].dt.month.isin((7, 8))
df_test = df.loc[for_test]
df_train = df.loc[~for_test]

In [None]:
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler

# Columns whose names contain a literal dot are the PCA inputs.
pca_columns = df_train.columns[df_train.columns.str.contains('.', regex=False)]

# `pca` is later applied to *raw* columns via `pca.transform(df[pca_columns])`.
# Previously the PCA was fitted on log1p-transformed, standardized and imputed
# data while being applied to untransformed data, so the projection used at
# feature-building time did not match the fitted one. Bundling the
# preprocessing into a pipeline guarantees that fit and transform always see
# the same steps:
#   1. log1p of every column,
#   2. standardization (NaNs are ignored when fitting and kept when transforming),
#   3. mean imputation of the remaining gaps (mean of the standardized training data),
#   4. PCA.
# Note: StandardScaler uses the population std (ddof=0), pandas' .std() used
# ddof=1; the difference is negligible for any realistic sample size.
# The bare PCA estimator is available as `pca.named_steps['pca']`.
pca = make_pipeline(
    FunctionTransformer(np.log1p),
    StandardScaler(),
    SimpleImputer(strategy='mean'),
    PCA(),
)

# Principal-component scores of the training split (NumPy array).
df_pca = pca.fit_transform(df_train[pca_columns])

In [None]:
def get_features(df):
    """Build the feature table used by the model from a raw data frame.

    Relies on the module-level ``pca`` and ``pca_columns``.

    The returned frame has a fresh positional index (all columns are assigned
    from bare arrays) and contains, in this order:

    * ``log_<name>`` -- ``log1p`` of each column listed in ``log_features``
      (the target ``per_square_meter_price`` is among them);
    * ``pca_1`` -- the first output column of ``pca.transform`` applied to
      ``df[pca_columns]``, divided by 100;
    * ``big_city`` -- 1 when ``region`` equals one of the two city names
      checked below, otherwise 0.
    """
    log_features = (
        'osm_city_closest_dist',
        'osm_crossing_closest_dist',
        'osm_subway_closest_dist',
        # 'osm_train_stop_closest_dist' is currently disabled
        'osm_transport_stop_closest_dist',
        'reform_mean_floor_count_1000',
        'total_square',
        # the target variable is log-transformed as well
        'per_square_meter_price',
    )

    columns = {
        f'log_{name}': df[name].apply(np.log1p).values
        for name in log_features
    }

    # Only the first component is kept; it is rescaled by a constant factor.
    columns['pca_1'] = pca.transform(df[pca_columns]).T[0] / 100

    # A single binary region indicator: per the original author's note the
    # model turns out better this way than with all regions included.
    columns['big_city'] = (
        df['region']
        .isin(['Санкт-Петербург', 'Москва'])
        .astype(int)
        .values
    )

    return pd.DataFrame(columns)

# Turn both raw splits into model features.
df_train = df_train.pipe(get_features)
df_test = df_test.pipe(get_features)

# Impute gaps with statistics estimated on the training split only.
# Filling the test split with its own means made a test row's features depend
# on which other rows happen to be in the test batch, and differed from the
# preprocessing the model is fitted with.
train_feature_means = df_train.mean()
df_train = df_train.fillna(train_feature_means)
df_test = df_test.fillna(train_feature_means)

In [None]:
# Column the network is asked to predict.
target_name = 'log_per_square_meter_price'

# Separate the target from the predictors in each split.
y_train = df_train[target_name]
X_train = df_train.drop(columns=target_name)

y_test = df_test[target_name]
X_test = df_test.drop(columns=target_name)

## Байесовская сеть

In [None]:
import bamt
from bamt.Networks import HybridBN
from bamt.Preprocessors import Preprocessor
from pgmpy.estimators import K2Score, BicScore, BDeuScore

from sklearn.preprocessing import KBinsDiscretizer, LabelEncoder
import networkx as nx

**Дискретизация признаков**

In [None]:
# Discretizer settings: five ordinal bins, k-means strategy, fixed seed.
discretizer = KBinsDiscretizer(
    strategy='kmeans',
    encode='ordinal',
    n_bins=5,
    random_state=0,
)
encoder = LabelEncoder()

# The preprocessor receives the named steps in this order: encoder, then discretizer.
prepro = Preprocessor(
    [('encoder', encoder), ('discretizer', discretizer)]
)

# Only the first of the two returned values is used.
df_train_discr, _ = prepro.apply(df_train)

In [None]:
# Metadata collected by the preprocessor; it is passed to network.add_nodes later on.
info = prepro.info
info

## Обучаем сеть

In [None]:
# Hybrid Bayesian network with the has_logit and use_mixture options switched on.
network = HybridBN(has_logit=True, use_mixture=True)

In [None]:
# Register the nodes described by the preprocessor metadata.
network.add_nodes(info)

In [None]:
# Add edges from the discretized training data, using the K2 score.
network.add_edges(df_train_discr, scoring_function=('K2', K2Score))
# Alternative kept for reference: the same call with the BIC score.
# network.add_edges(df_train_discr, scoring_function=('BIC', BicScore))

**Граф**

In [None]:
# Compact two-line labels for the nodes of the drawn graph.
shorter_feature_names = {
    'log_per_square_meter_price': 'price',
    'log_total_square': 'total\narea',
    'log_reform_mean_floor_count_1000': 'mean\nfloor',
    'log_osm_city_closest_dist': 'city\ndist',
    'log_osm_crossing_closest_dist': 'cross\ndist',
    'log_osm_subway_closest_dist': 'subway\ndist',
    'log_osm_transport_stop_closest_dist': 'stop\ndist',
    'pca_1': 'pc\n1',
    'big_city': 'big\ncity',
}

# Directed graph with one edge per network edge, nodes relabelled with the short names.
graph = nx.DiGraph()
graph.add_edges_from(
    (shorter_feature_names[source], shorter_feature_names[target])
    for source, target in network.edges
)

# Nodes are placed on a circle.
draw_options = dict(
    with_labels=True,
    font_size=10,
    node_size=2500,
    node_color='white',
    edge_color='lightgrey',
)
nx.draw(graph, pos=nx.circular_layout(graph), **draw_options)

In [None]:
%%time
# Fit the network parameters on the (non-discretized) training features; %%time reports the cost.
network.fit_parameters(df_train)

## Качество предсказаний

In [None]:
# In-sample predictions for the target, as a NumPy array.
# NOTE(review): the second positional argument (5) is presumably the number of
# parallel workers -- confirm against the bamt `predict` signature.
bn_insample_pred = np.array(network.predict(X_train, 5)[target_name])

In [None]:
# Predicted vs. true target on the training split, with a light-grey OLS trendline.
fig = px.scatter(
    x=bn_insample_pred,
    y=y_train,
    template='plotly_white',
    trendline='ols',
    trendline_color_override='lightgrey',
).update_layout(xaxis_title='y_pred', yaxis_title='y_true')

# Shrink the scatter markers (trace 0 is the point cloud).
fig.data[0].marker.size = 3

fig.show()

In [None]:
# Predictions for the test features; the result is indexed by column name further down.
bn_pred = network.predict(X_test, 5)

In [None]:
# Predicted vs. true target on the test split, with a light-grey OLS trendline.
fig = px.scatter(
    x=bn_pred['log_per_square_meter_price'],
    y=y_test,
    template='plotly_white',
    trendline='ols',
    trendline_color_override='lightgrey',
).update_layout(xaxis_title='y_pred', yaxis_title='y_true')

# Shrink the scatter markers (trace 0 is the point cloud).
fig.data[0].marker.size = 3

fig.show()

## Доверительный интервал

**Считаем число параметров**

In [None]:
def _get_num_params(a):
    return (
        np.array(a['covars']).size +
        np.array(a['mean']).size + 
        np.array(a['coef']).size
    )

# Total over all entries of the target node's 'hybcprob' table.
n_net_params = sum(map(
    _get_num_params,
    network.distributions[target_name]['hybcprob'].values(),
))
n_net_params

# a = network.distributions[target_name]
# n_net_params = _get_num_params(a)
# n_net_params

In [None]:
def get_error(x_new, alpha=0.05):
    """Half-width of a regression-style confidence interval at ``x_new``.

    Evaluates ``t * s * sqrt(x_new' (X'X)^-1 x_new)`` where

    * ``X`` is the module-level ``X_train``;
    * ``s`` is the in-sample RMSE (``bn_insample_pred`` vs. ``y_train``)
      multiplied by ``sqrt(n / (n - k))``;
    * ``n`` is the number of training rows and ``k`` is ``n_net_params``;
    * ``t`` is the two-sided Student quantile with ``n - k`` degrees of freedom.

    Parameters
    ----------
    x_new : 1-D array-like
        Feature vector with the same column order as ``X_train``.
    alpha : float, default 0.05
        Significance level; the interval has nominal coverage ``1 - alpha``.

    Returns
    -------
    float
        Amount to add to / subtract from the prediction (error-bar size).

    Raises
    ------
    ZeroDivisionError
        If ``n == k``.
    numpy.linalg.LinAlgError
        If ``X'X`` is singular.
    """
    # Imported explicitly: a bare `import scipy` is not guaranteed to expose
    # the `scipy.stats` submodule on every SciPy version.
    from scipy import stats

    n = len(X_train)            # number of observations
    k = int(n_net_params)       # number of fitted parameters
    # The same degrees of freedom are used for the scale correction and for
    # the t quantile (the quantile previously used n - 2, which is only right
    # for a two-parameter model).
    dof = n - k

    design = X_train.values
    x_new = np.asarray(x_new, dtype=float)
    # x_new' (X'X)^-1 x_new, via a linear solve instead of an explicit inverse.
    gram = design.T @ design
    leverage = x_new @ np.linalg.solve(gram, x_new)

    # Root-mean-square in-sample error, corrected for the number of parameters.
    rmse = np.sqrt(np.square(bn_insample_pred - y_train).mean())
    residual_scale = rmse * np.sqrt(n / dof)

    # Upper alpha/2 quantile of Student's t (positive for alpha < 1).
    t_quantile = stats.t.ppf(1 - alpha / 2, dof)

    return np.sqrt(leverage) * residual_scale * t_quantile

In [None]:
# Test-set table: true value, prediction, and the error-bar size returned by
# get_error for every test row (used as `error_y` in the plot that follows).
df_confidence_interval = (
    pd.DataFrame({'y_true': y_test})
    .assign(
        y_pred=bn_pred['log_per_square_meter_price'],
        error=[get_error(x_new) for x_new in X_test.values],
    )
)

**Графики доверительных интервалов**

In [None]:
# Feature shown on the x axis of the error-bar plot.
x_var_error_bar_plot = 'log_total_square'

# Sort by the x variable and keep every 10th row so the points are spread along the axis.
error_bar_points = (
    df_confidence_interval
    .assign(x=X_test[x_var_error_bar_plot])
    .sort_values('x')
    .iloc[::10]
)

fig = px.scatter(
    error_bar_points,
    x='x',
    y='y_pred',
    error_y='error',
    template='plotly_white',
).update_layout(xaxis_title=x_var_error_bar_plot)

# Tiny markers so that the error bars dominate the picture.
fig.data[0].marker.size = 1
fig.show()

In [None]:
# Show the column names (and their order) of the test feature matrix.
X_test.columns