# Обогащение данных

1. Заполнить отсутствующие значения по погоде интерполяционными данными.
2. Для точки росы вычесть температуру воздуха.
3. Напрвыление ветра разложить на синус и косинус.
4. Для тем. воздуха вычеслить 1-ю и 2-ю производные.
5. Вывести параметры по праздничным дням, дням недели, месяцам и неделям года.
6. Посчитать модель линейной регрессии по первым 20 зданиям и найти точность.

In [29]:
import pandas as pd
import numpy as np

from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
from scipy.interpolate import interp1d
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

from core.reduce_mem_usage import reduce_mem_usage

## Загрузка данных

In [2]:
buildings = pd.read_csv("../data/buildings.csv")
weather = pd.read_csv("../data/weather.csv")
energy = pd.read_csv("../data/train.0.zip")

## Отобрать 20 зданий и объединить

In [7]:
energy = energy[energy["building_id"] < 20]
energy = pd.merge(
    left=energy,
    right=buildings,
    how="left",
    left_on="building_id",
    right_on="building_id",
)
del buildings
print(energy.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 175680 entries, 0 to 175679
Data columns (total 9 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   building_id    175680 non-null  int64  
 1   meter          175680 non-null  int64  
 2   timestamp      175680 non-null  object 
 3   meter_reading  175680 non-null  float64
 4   site_id        175680 non-null  int64  
 5   primary_use    175680 non-null  object 
 6   square_feet    175680 non-null  int64  
 7   year_built     175680 non-null  float64
 8   floor_count    0 non-null       float64
dtypes: float64(3), int64(4), object(2)
memory usage: 13.4+ MB
None


## Интерполяция значений для weather

In [9]:
weather = weather[weather["site_id"] == 0]
weather["precip_depth_1_hr"] = weather["precip_depth_1_hr"].apply(lambda x: x if x > 0 else 0)
interpolate_columns = [
    "air_temperature",
    "dew_temperature",
    "cloud_coverage",
    "wind_speed",
    "wind_direction",
    "precip_depth_1_hr",
    "sea_level_pressure",
]

for col in interpolate_columns:
    weather[col] = weather[col].interpolate(limit_direction="both", kind="cubic")

## Обогащение данных: погода

In [11]:
weather["wind_direction_rad"] = weather["wind_direction"] * np.pi/180
weather["wind_direction_sin"] = np.sin(weather["wind_direction_rad"])
weather["wind_direction_cos"] = np.cos(weather["wind_direction_rad"])
weather["air_temperature_diff_1"] = weather["air_temperature"].diff()
weather.at[0, "air_temperature_diff_1"] = weather.at[1, "air_temperature_diff_1"]
weather["air_temperature_diff_2"] = weather["air_temperature_diff_1"].diff()
weather.at[0, "air_temperature_diff_2"] = weather.at[1, "air_temperature_diff_2"]

## Объединить погодные данные

In [30]:
energy = energy.set_index(["timestamp", "site_id"])
weather = weather.set_index(["timestamp", "site_id"])
energy = pd.merge(
    left=energy,
    right=weather,
    how="left",
    left_index=True,
    right_index=True,
)
energy.reset_index(inplace=True)
energy = energy.drop(
    columns=[
        "meter",
        "site_id",
        "year_built",
        "square_feet",
        "floor_count",
    ], 
    axis=1
)
del weather
energy = reduce_mem_usage(energy)
print(energy.info())

Потребление памяти меньше на - 15.41 Мб (минус 71.9%)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175680 entries, 0 to 175679
Data columns (total 16 columns):
 #   Column                  Non-Null Count   Dtype         
---  ------                  --------------   -----         
 0   timestamp               175680 non-null  datetime64[ns]
 1   building_id             175680 non-null  int8          
 2   meter_reading           175680 non-null  float16       
 3   primary_use             175680 non-null  category      
 4   air_temperature         175680 non-null  float16       
 5   cloud_coverage          175680 non-null  float16       
 6   dew_temperature         175680 non-null  float16       
 7   precip_depth_1_hr       175680 non-null  float16       
 8   sea_level_pressure      175680 non-null  float16       
 9   wind_direction          175680 non-null  float16       
 10  wind_speed              175680 non-null  float16       
 11  wind_direction_rad      175680 non-nu

## Обогащение данных: дата

In [32]:
energy["hour"] = energy["timestamp"].dt.hour.astype("int8")
energy["weekday"] = energy["timestamp"].dt.weekday.astype("int8")
energy["week"] = energy["timestamp"].dt.isocalendar().week.astype("int8")
energy["month"] = energy["timestamp"].dt.month.astype("int8")
energy["date"] = pd.to_datetime(energy["timestamp"].dt.date)

dates_range = pd.date_range(start="2015-12-31", end="2017-01-01")
us_holidays = calendar().holidays(start=dates_range.min(), end=dates_range.max())
energy["is_holiday"] = energy["date"].isin(us_holidays).astype("int8")

for weekday in range(0, 7):
    energy[f"is_wday {str(weekday)}"] = energy["weekday"].isin([weekday]).astype("int8")
for week in range(1, 54):
    energy[f"is_w {str(week)}"] = energy["week"].isin([week]).astype("int8")
for month in range(1, 13):
    energy[f"is_m {str(month)}"] = energy["month"].isin([month]).astype("int8")

## Логарифмирование данных

In [42]:
energy["meter_reading_log"] = np.log(energy["meter_reading"] + 1)

## Разделение данных

In [43]:
energy_train, energy_test = train_test_split(
    energy[energy["meter_reading"] > 0],
    test_size=0.2,
)
print(energy_train.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 86858 entries, 69792 to 106572
Data columns (total 95 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   timestamp               86858 non-null  datetime64[ns]
 1   building_id             86858 non-null  int8          
 2   meter_reading           86858 non-null  float16       
 3   primary_use             86858 non-null  category      
 4   air_temperature         86858 non-null  float16       
 5   cloud_coverage          86858 non-null  float16       
 6   dew_temperature         86858 non-null  float16       
 7   precip_depth_1_hr       86858 non-null  float16       
 8   sea_level_pressure      86858 non-null  float16       
 9   wind_direction          86858 non-null  float16       
 10  wind_speed              86858 non-null  float16       
 11  wind_direction_rad      86858 non-null  float16       
 12  wind_direction_sin      86858 non-null  f

## Линейная регрессия

In [50]:
hours = range(0, 24)
buildings = range(0, energy_train["building_id"].max() + 1)
lr_columns = [
    "meter_reading_log",
    "hour",
    "building_id",
    "air_temperature",
    "dew_temperature",
    "sea_level_pressure",
    "wind_speed",
    "cloud_coverage",
    "air_temperature_diff_1",
    "air_temperature_diff_2",
    "is_holiday",
]

for wday in range(0, 7):
    lr_columns.append(f"is_wday {str(wday)}")
for week in range(1, 54):
    lr_columns.append(f"is_w {str(week)}")
for month in range(1, 13):
    lr_columns.append(f"is_m {str(month)}")

energy_train_lr = pd.DataFrame(energy_train, columns=lr_columns)
energy_lr = [[] for _ in range(len(buildings))]

for building in buildings:
    energy_lr[building] = [[] for _ in range(len(hours))]
    energy_train_b = energy_train_lr[energy_train_lr["building_id"] == building]

    for hour in hours:
        energy_train_bh = energy_train_b[energy_train_b["hour"] == hour]
        y = energy_train_bh["meter_reading_log"]
        x = energy_train_bh.drop(
            labels=["meter_reading_log",
                    "hour",
                    "building_id"],
            axis=1
        )
        model = LinearRegression(fit_intercept=False).fit(x, y)
        energy_lr[building][hour] = model.coef_
        energy_lr[building][hour] = np.append(
            energy_lr[building][hour],
            model.intercept_
        )

print(energy_lr[0])

[array([-3.00725894e-03,  1.14332667e-02, -1.66343543e-02, -6.04894678e-03,
       -1.80400278e-02, -5.02870105e-03, -7.44427216e-03, -7.94178338e-02,
        1.06918487e+01,  1.06867872e+01,  1.07466308e+01,  1.06658131e+01,
        1.07066007e+01,  1.06813383e+01,  1.06750287e+01,  4.46087611e-13,
        1.59205982e-13,  9.74775816e-14,  5.64215341e-13, -1.59872116e-13,
        1.56763491e-13,  9.12603326e-14,  4.41202630e-13,  4.74287276e-13,
       -2.14939178e-13,  9.94759830e-14,  1.68753900e-13,  3.28626015e-13,
       -2.62900812e-13,  4.44089210e-13,  7.81597009e-14, -1.45661261e-13,
       -1.06581410e-14, -1.08357767e-13,  2.73112653e+00,  2.54400218e+00,
        2.42966884e+00,  2.43636105e+00,  2.55520712e+00,  2.39318115e+00,
        2.38936315e+00,  2.40995307e+00,  2.58233592e+00,  2.55038880e+00,
        2.47897471e+00,  2.39158756e+00,  2.41567785e+00,  2.46360218e+00,
        2.22292826e+00,  2.14437569e+00,  2.18251163e+00,  2.12305045e+00,
        2.06908240e+00, 

## Качество расчета

In [52]:
def calculate_model(x):
    lr = -1
    model = energy_lr[x.building_id][x.hour]
    if len(model) > 0:
        lr = np.sum([x[col] * model[i] for i, col in enumerate(lr_columns[3:])])
        lr += model[len(lr_columns)-3]
        lr = np.exp(lr)
    if lr < 0:
        lr = 0
    x["meter_reading_lr_q"] = (np.log(x.meter_reading + 1) - np.log(1 + lr))**2

    return x

In [53]:
energy_test = energy_test.apply(calculate_model, axis=1, result_type="expand")
energy_test_lr_rmsle = np.sqrt(energy_test["meter_reading_lr_q"].sum() / len(energy_test))

print(f"Качество линейной регрессии, 20 зданий: {energy_test_lr_rmsle}, (округленно): {round(energy_test_lr_rmsle, 1)}")

Качество линейной регрессии, 20 зданий: 0.2021599071898263, (округленно): 0.2
