# Оптимизация регрессии

Несколько моделей линейной регрессии, что бы найти более оптимальную для первых 20 зданий

In [1]:
import pandas as pd
import numpy as np

from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
from scipy.interpolate import interp1d
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet, BayesianRidge

from core.reduce_mem_usage import reduce_mem_usage

## Загрузка данных

In [2]:
buildings = pd.read_csv("../data/buildings.csv")
weather = pd.read_csv("../data/weather.csv")
energy = pd.read_csv("../data/train.0.zip")

## Взять 20 зданий, объединить, оптимизировать

In [3]:
energy = energy[energy["building_id"] < 20]
energy = pd.merge(
    left=energy,
    right=buildings,
    how="left",
    left_on="building_id",
    right_on="building_id",
)
energy = energy.set_index(["timestamp", "site_id"])
weather =weather.set_index(["timestamp", "site_id"])
energy = pd.merge(
    left=energy,
    right=weather,
    how="left",
    left_index=True,
    right_index=True,
)
energy.reset_index(inplace=True)
energy = energy.drop(
    columns=[
        "meter",
        "site_id",
        "year_built",
        "square_feet",
        "floor_count",
    ],
    axis=1
)

del buildings
del weather

energy = reduce_mem_usage(energy)
print(energy.info())

Потребление памяти меньше на - 10.39 Мб (минус 70.5%)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175680 entries, 0 to 175679
Data columns (total 11 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   timestamp           175680 non-null  datetime64[ns]
 1   building_id         175680 non-null  int8          
 2   meter_reading       175680 non-null  float16       
 3   primary_use         175680 non-null  category      
 4   air_temperature     175620 non-null  float16       
 5   cloud_coverage      99080 non-null   float16       
 6   dew_temperature     175620 non-null  float16       
 7   precip_depth_1_hr   175660 non-null  float16       
 8   sea_level_pressure  173980 non-null  float16       
 9   wind_direction      170680 non-null  float16       
 10  wind_speed          175680 non-null  float16       
dtypes: category(1), datetime64[ns](1), float16(8), int8(1)
memory usage: 4.4 MB
None


## Обогащение данных: час, дни недели, праздники

In [4]:
energy["hour"] = energy["timestamp"].dt.hour.astype("int8")
energy["weekday"] = energy["timestamp"].dt.weekday.astype("int8")

for weekday in range(0, 7):
    energy[f"is_wday {str(weekday)}"] = energy["weekday"].isin([weekday]).astype("int8")

energy["date"] = pd.to_datetime(energy["timestamp"].dt.date)
dates_range = pd.date_range(start="2015-12-31", end="2017-01-01")
us_holidays = calendar().holidays(start=dates_range.min(), end=dates_range.max())
energy["is_holiday"] = energy["date"].isin(us_holidays).astype("int8")

## Логарифм

In [5]:
energy["meter_reading_log"] = np.log(energy["meter_reading"] + 1)

## Разделение данных

In [6]:
energy_train, energy_test = train_test_split(
    energy[energy["meter_reading"] > 0],
    test_size=0.2
)
print(energy_train.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 86858 entries, 134711 to 105045
Data columns (total 23 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   timestamp           86858 non-null  datetime64[ns]
 1   building_id         86858 non-null  int8          
 2   meter_reading       86858 non-null  float16       
 3   primary_use         86858 non-null  category      
 4   air_temperature     86858 non-null  float16       
 5   cloud_coverage      50534 non-null  float16       
 6   dew_temperature     86858 non-null  float16       
 7   precip_depth_1_hr   86858 non-null  float16       
 8   sea_level_pressure  86410 non-null  float16       
 9   wind_direction      84002 non-null  float16       
 10  wind_speed          86858 non-null  float16       
 11  hour                86858 non-null  int8          
 12  weekday             86858 non-null  int8          
 13  is_wday 0           86858 non-null  int8

In [7]:
energy_train.head()

Unnamed: 0,timestamp,building_id,meter_reading,primary_use,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,...,is_wday 0,is_wday 1,is_wday 2,is_wday 3,is_wday 4,is_wday 5,is_wday 6,date,is_holiday,meter_reading_log
134711,2016-10-07 15:00:00,11,421.75,Education,25.0,,22.203125,3.0,994.0,270.0,...,0,0,0,0,1,0,0,2016-10-07,0,6.046875
123515,2016-09-14 07:00:00,15,271.0,Office,25.0,,23.90625,0.0,1016.0,180.0,...,0,0,1,0,0,0,0,2016-09-14,0,5.605469
113609,2016-08-24 16:00:00,9,155.25,Office,26.0,,24.0,112.0,1021.0,50.0,...,0,0,1,0,0,0,0,2016-08-24,0,5.050781
73749,2016-06-02 15:00:00,9,138.375,Office,30.0,4.0,21.703125,0.0,1018.5,,...,0,0,0,1,0,0,0,2016-06-02,0,4.9375
162167,2016-12-03 20:00:00,7,571.5,Education,25.0,,13.296875,0.0,1018.5,60.0,...,0,0,0,0,0,1,0,2016-12-03,0,6.351562


## Линейная регрессия: по часам

In [8]:
from sklearn.metrics import r2_score

In [9]:
hours = range(0, 24)
buildings = range(0, energy_train["building_id"].max() + 1)
lr_columns = [
    "meter_reading_log",
    "hour",
    "building_id",
    "is_holiday",
]

for wday in range(0, 7):
    lr_columns.append(f"is_wday {str(wday)}")

lr_models = {
    "LinearRegression": LinearRegression,
    "Lasso-0.01": Lasso,
    "Lasso-0.1": Lasso,
    "Lasso-1.0": Lasso,
    "Ridge-0.01": Ridge,
    "Ridge-0.1": Ridge,
    "Ridge-1.0": Ridge,
    "ElasticNet-1-1": ElasticNet,
    "ElasticNet-0.1-1": ElasticNet,
    "ElasticNet-1-0.1": ElasticNet,
    "ElasticNet-0.1-0.1": ElasticNet,
    "BayesianRidge": BayesianRidge,
}

energy_train_lr = pd.DataFrame(energy_train, columns=lr_columns)

Линейная регрессия:
$$\LARGE z = Ax + By + C, \lvert z + z_0 \rvert^2 \to min$$
Лассо + LARS Лассо:
$$\LARGE \dfrac{1}{2n} \lvert z - z_0 \rvert^2 + a(\lvert A \rvert + \lvert B \rvert) \to min$$
Гребная регрессия:
$$\LARGE \lvert z - z_0 \rvert^2 + a(A^2 + B^2) \to min$$
ElasticNet: Лассо + Гребневая регрессия:
$$\LARGE \dfrac{1}{2n} \lvert z - z_0 \rvert^2 + ap \lvert A^2 + B^2\rvert + (a - p)(\lvert A \rvert + \lvert B \rvert)/2 \to min$$

In [10]:
lr_models_scores = {}

for i in lr_models:
    lr_model = lr_models[i]
    energy_lr_scores = [[] for _ in range(len(buildings))]

    for building in buildings:
        energy_lr_scores[building] = [0 for _ in range(len(hours))]
        energy_train_b = energy_train_lr[energy_train_lr["building_id"] == building]
        
        for hour in hours:
            energy_train_bh = energy_train_b[energy_train_b["hour"] == hour]
            y = energy_train_bh["meter_reading_log"]
            x = energy_train_bh.drop(
                labels=[
                    "meter_reading_log",
                    "hour",
                    "building_id",
                ],
                axis=1
            )
            if i in ["Ridge-0.1", "Lasso-0.1"]:
                model = lr_model(alpha=.1, fit_intercept=False).fit(x, y)
            elif i in ["Ridge-0.01", "Lasso-0.01"]:
                model = lr_model(alpha=.01, fit_intercept=False).fit(x, y)
            elif i == "ElasticNet-1-1":
                model = lr_model(alpha=1, l1_ratio=1, fit_intercept=False).fit(x, y)
            elif i == "ElasticNet-1-0.1":
                model = lr_model(alpha=1, l1_ratio=.1, fit_intercept=False).fit(x, y)
            elif i == "ElasticNet-0.1-1":
                model = lr_model(alpha=.1, l1_ratio=1, fit_intercept=False).fit(x, y)
            elif i == "ElasticNet-0.1-0.1":
                model = lr_model(alpha=.1, l1_ratio=.05, fit_intercept=False).fit(x, y)
            else:
                model = lr_model(fit_intercept=False).fit(x, y)
            
            energy_lr_scores[building][hour] = r2_score(y, model.predict(x))
        
    lr_models_scores[i] = np.mean(energy_lr_scores)

print(lr_models_scores)

{'LinearRegression': 0.13219319608619842, 'Lasso-0.01': -0.19903255435074385, 'Lasso-0.1': -31.955333115521974, 'Lasso-1.0': -2512.909424694678, 'Ridge-0.01': 0.13176737415979103, 'Ridge-0.1': 0.08997546053081722, 'Ridge-1.0': -3.761224218669234, 'ElasticNet-1-1': -2512.909424694678, 'ElasticNet-0.1-1': -31.955333115521974, 'ElasticNet-1-0.1': -2090.178247550908, 'ElasticNet-0.1-0.1': -438.5904600336049, 'BayesianRidge': 0.13218336384464255}


## Проверим модели: LinearRegression, Lasso, BayesianRidge

In [12]:
energy_lr = [[] for _ in range(len(buildings))]
energy_lasso = [[] for _ in range(len(buildings))]
energy_br = [[] for _ in range(len(buildings))]

for building in buildings:
    energy_train_b = energy_train_lr[energy_train_lr["building_id"] == building]
    energy_lr[building] = [[] for _ in range(len(hours))]
    energy_lasso[building] = [[] for _ in range(len(hours))]
    energy_br[building] = [[] for _ in range(len(hours))]

    for hour in hours:
        energy_train_bh = pd.DataFrame(energy_train_b[energy_train_b["hour"] == hour])
        y = energy_train_bh["meter_reading_log"]
        if len(y) > 0:
            x = energy_train_bh.drop(
                labels=[
                    "meter_reading_log",
                    "hour",
                    "building_id",
                ],
                axis=1
            )
        model = LinearRegression(fit_intercept=False).fit(x, y)
        energy_lr[building][hour] = np.append([model.coef_], model.intercept_)
        
        model = Lasso(fit_intercept=False, alpha=.01).fit(x, y)
        energy_lasso[building][hour] = np.append([model.coef_], model.intercept_)
        
        model = BayesianRidge(fit_intercept=False).fit(x, y)
        energy_br[building][hour] = np.append([model.coef_], model.intercept_)

print(energy_lr[0][0])
print(energy_lasso[0][0])
print(energy_br[0][0])
        

[-0.03284475  5.42302179  5.47375     5.50878906  5.43307411  5.4615625
  5.51256793  5.41983696  0.        ]
[0.         5.35143029 5.40455    5.43670573 5.36778356 5.3923625
 5.43735054 5.34461957 0.        ]
[-0.03226968  5.42244496  5.47323735  5.50825163  5.43258166  5.46105099
  5.51200676  5.41928522  0.        ]
