# Иерархия моделей

1. Посчитать модель линейной регрессии по первым 100 зданиям.
2. Найти точность, используя только дни недели и праздники, примемяя fit_intercept=False и логарифмируя целевой показатель.

Для вычисления отсутствующих или некорректных данных построить модели по всем зданиям одного типа в одном и во всех городах.

## Подключение библиотек

In [1]:
import pandas as pd
import numpy as np

from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
from scipy.interpolate import interp1d
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

from core.reduce_mem_usage import reduce_mem_usage

## Загрузка данных

In [2]:
buildings = pd.read_csv("../data/buildings.csv")
weather = pd.read_csv("../data/weather.csv")
energy = pd.read_csv("../data/train.0.zip")

## Отбор 100 зданий, объединение и оптимизация

In [3]:
energy = energy[energy["building_id"]<100]
energy = pd.merge(
    left=energy,
    right=buildings,
    how="left",
    left_on="building_id",
    right_on="building_id",
)
energy = energy.set_index(["timestamp", "site_id"])
weather = weather.set_index(["timestamp", "site_id"])
energy = pd.merge(
    left=energy,
    right=weather,
    how="left",
    left_index=True,
    right_index=True,
)
energy.reset_index(inplace=True)
energy = energy.drop(
    columns=[
        "meter",
        "year_built",
        "square_feet",
        "floor_count",
    ],
    axis=1
)

del buildings
del weather

energy = reduce_mem_usage(energy)
print(energy.info())

Потребление памяти меньше на - 56.89 Мб (минус 71.9%)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 864557 entries, 0 to 864556
Data columns (total 12 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   timestamp           864557 non-null  datetime64[ns]
 1   site_id             864557 non-null  int8          
 2   building_id         864557 non-null  int8          
 3   meter_reading       864557 non-null  float16       
 4   primary_use         864557 non-null  category      
 5   air_temperature     864263 non-null  float16       
 6   cloud_coverage      487693 non-null  float16       
 7   dew_temperature     864263 non-null  float16       
 8   precip_depth_1_hr   864459 non-null  float16       
 9   sea_level_pressure  856210 non-null  float16       
 10  wind_direction      839970 non-null  float16       
 11  wind_speed          864557 non-null  float16       
dtypes: category(1), datetime64[ns](1

## Обогащение данных: час, дни недели, праздники

In [4]:
energy["hour"] = energy["timestamp"].dt.hour.astype("int8")
energy["weekday"] = energy["timestamp"].dt.weekday.astype("int8")

for weekday in range(0, 7):
    energy[f"is_wday {str(weekday)}"] = energy["weekday"].isin([weekday]).astype("int8")

energy["date"] = pd.to_datetime(energy["timestamp"].dt.date)
dates_range = pd.date_range(start="2015-12-31", end="2017-01-01")
us_holidays = calendar().holidays(start=dates_range.min(), end=dates_range.max())
energy["is_holiday"] = energy["date"].isin(us_holidays).astype("int8")

## Логарифмирование данных

In [5]:
energy["meter_reading_log"] = np.log(energy["meter_reading"] + 1)

## Разделение данных

In [6]:
energy_train, energy_test = train_test_split(
    energy[energy["meter_reading"] > 0],
    test_size=0.2
)
print(energy_train.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 427872 entries, 359787 to 602715
Data columns (total 24 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   timestamp           427872 non-null  datetime64[ns]
 1   site_id             427872 non-null  int8          
 2   building_id         427872 non-null  int8          
 3   meter_reading       427872 non-null  float16       
 4   primary_use         427872 non-null  category      
 5   air_temperature     427866 non-null  float16       
 6   cloud_coverage      248776 non-null  float16       
 7   dew_temperature     427866 non-null  float16       
 8   precip_depth_1_hr   427869 non-null  float16       
 9   sea_level_pressure  425641 non-null  float16       
 10  wind_direction      414100 non-null  float16       
 11  wind_speed          427872 non-null  float16       
 12  hour                427872 non-null  int8          
 13  weekday             4278

## Линейная регрессия:

1. по часам

In [7]:
hours = range(0, 24)
buildings = range(0, energy_train["building_id"].max() + 1)
lr_columns = [
    "meter_reading_log",
    "hour",
    "building_id",
    "is_holiday",
]

for wday in range(0, 7):
    lr_columns.append(f"is_wday {str(wday)}")

energy_train_lr = pd.DataFrame(energy_train, columns=lr_columns)
energy_lr = [[] for _ in range(len(buildings))]

for building in buildings:
    energy_lr[building] = [[] for _ in range(len(hours))]
    energy_train_b = energy_train_lr[energy_train_lr["building_id"] == building]
    for hour in hours:
        energy_lr[building].append([0 for _ in range(len(lr_columns) - 3)])
        energy_train_bh = pd.DataFrame(energy_train_b[energy_train_b["hour"] == hour])
        y = energy_train_bh["meter_reading_log"]
        if len(y) > 0:
            x = energy_train_bh.drop(labels=[
                    "meter_reading_log",
                    "hour",
                    "building_id",
                ],
                axis=1
            )
            model = LinearRegression(fit_intercept=False).fit(x, y)
            energy_lr[building][hour] = model.coef_
            energy_lr[building][hour] = np.append(energy_lr[building][hour], model.intercept_)

print(energy_lr[0])

[array([-0.02073569,  5.42426818,  5.46289062,  5.48467548,  5.44838571,
        5.49167799,  5.43475116,  5.44936343,  0.        ]), array([-0.09741782,  5.44723582,  5.44756611,  5.48152043,  5.46063932,
        5.41911733,  5.43648098,  5.44168527,  0.        ]), array([-0.08259412,  5.46897379,  5.46959918,  5.49189815,  5.46832093,
        5.42054078,  5.43945312,  5.41140625,  0.        ]), array([-0.14213745,  5.49013858,  5.4578683 ,  5.48578125,  5.46398271,
        5.44356779,  5.45814732,  5.44806134,  0.        ]), array([-0.16580383,  5.46061677,  5.44112723,  5.50796875,  5.48111195,
        5.43686892,  5.43945312,  5.45632102,  0.        ]), array([-0.04483696,  5.44718071,  5.45431386,  5.50014468,  5.49550781,
        5.46394231,  5.45862269,  5.44726563,  0.        ]), array([-0.04152366,  5.45463783,  5.41168478,  5.47515625,  5.43594202,
        5.454375  ,  5.4780971 ,  5.46213942,  0.        ]), array([-0.06003494,  5.43950699,  5.46905048,  5.52103365,  5.487678

2. по типам зданий

In [8]:
sites = range(0, energy["site_id"].max() + 1)
primary_uses = energy["primary_use"].unique()
lr_columns_use = [
    "meter_reading_log",
    "hour",
    "building_id",
    "is_holiday",
    "primary_use",
    "site_id",
]

for wday in range(0, 7):
    lr_columns_use.append(f"is_wday {str(wday)}")

energy_lr_use = {}
energy_lr_use_site = {}
energy_train_lr = pd.DataFrame(energy_train, columns=lr_columns_use)

for primary_use in primary_uses:
    energy_train_u = energy_train_lr[energy_train_lr["primary_use"] == primary_use]
    if len(energy_train_u) > 0:
        energy_lr_use_site[primary_use] = [[] for _ in range(len(sites))]
        for site in sites:
            energy_lr_use_site[primary_use][site] = [[] for _ in range(len(hours))]
            energy_train_us = energy_train_u[energy_train_u["site_id"] == site]
            if len(energy_train_us) > 0:
                for hour in hours:
                    energy_train_uth = energy_train_us[energy_train_us["hour"] == hour]
                    y = energy_train_uth["meter_reading_log"]
                    if len(y) > 0:
                        x = energy_train_uth.drop(
                            labels=[
                                "meter_reading_log",
                                "hour",
                                "building_id",
                                "site_id",
                                "primary_use",
                            ],
                            axis=1
                        )
                        model = LinearRegression(fit_intercept=False).fit(x, y)
                        energy_lr_use_site[primary_use][site][hour] = model.coef_
                        energy_lr_use_site[primary_use][site][hour] = np.append(energy_lr_use_site[primary_use][site][hour], model.intercept_)
        energy_lr_use[primary_use] = [[] for _ in range(len(hours))]
        for hour in hours:
            energy_train_th = energy_train_u[energy_train_u["hour"] == hour]
            y = energy_train_th["meter_reading_log"]
            if len(y) > 0:
                x = energy_train_th.drop(
                    labels=[
                        "meter_reading_log",
                        "hour",
                        "building_id",
                        "site_id",
                        "primary_use",
                    ],
                    axis=1
                )
                model = LinearRegression(fit_intercept=False).fit(x, y)
                energy_lr_use[primary_use][hour] = model.coef_
                energy_lr_use[primary_use][hour] = np.append(energy_lr_use[primary_use][hour], model.intercept_)

print(energy_lr_use_site)

{'Education': [[array([-0.07321045,  5.64856254,  5.66784766,  5.6565429 ,  5.66536488,
        5.66359488,  5.66370941,  5.61093041,  0.        ]), array([-0.06052893,  5.59132192,  5.67887125,  5.64310726,  5.66139172,
        5.67337836,  5.62337539,  5.62096643,  0.        ]), array([-0.09194434,  5.61595462,  5.62223673,  5.65423387,  5.62736344,
        5.62451854,  5.58828289,  5.57212243,  0.        ]), array([-0.14494986,  5.61236473,  5.62571047,  5.60453761,  5.58839737,
        5.6250229 ,  5.61702013,  5.57133382,  0.        ]), array([-0.03200458,  5.57240715,  5.609375  ,  5.61042788,  5.64249077,
        5.62533513,  5.60171751,  5.58851665,  0.        ]), array([-0.09895333,  5.64241595,  5.65745762,  5.66244274,  5.65862364,
        5.6039439 ,  5.63732538,  5.61254317,  0.        ]), array([-0.18675801,  5.8095843 ,  5.71758717,  5.75048895,  5.75385764,
        5.71757272,  5.70760309,  5.70792963,  0.        ]), array([-0.14182286,  5.84476617,  5.8367765 ,  5.8799

## Расчет качества

In [11]:
def calculate_model(x):
    lr = -1
    model = energy_lr[x.building_id][x.hour]
    if len(model) == 0:
        model = energy_lr_use_site[x.primary_use][x.site_id][x.hour]
    if len(model) == 0:
        model = energy_lr_use[x.primary_use][x.hour]
    if len(model) > 0:
        lr = np.sum([x[col] * model[i] for i, col in enumerate(lr_columns[3:])])
        lr += model[len(lr_columns) - 3]
        lr = np.exp(lr)
    if lr < 0:
        lr = 0
    
    x["meter_reading_lr_q"] = (np.log(x.meter_reading + 1) - np.log(1 + lr))**2

    return x

In [12]:
energy_test = energy_test.apply(calculate_model, axis=1, result_type="expand")
energy_test_lr_rmsle = np.sqrt(energy_test["meter_reading_lr_q"].sum() / len(energy_test))
print(f"Качество линейной регрессии, 20 зданий: {energy_test_lr_rmsle}, (округленно): {round(energy_test_lr_rmsle, 1)}")

Качество линейной регрессии, 20 зданий: 0.34534975565402587, (округленно): 0.3
