# Заполнение пропусков в данных
1. Заполнить пропущенные значения погодных данных с помощью интерполяции.
2. Построить модель линейной регрессии для первого здания (building_id = 0) и оценить её точность.

In [4]:
import pandas as pd
import numpy as np
from scipy.interpolate import interp1d
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [5]:
# Read the three source tables (building metadata, weather observations and
# the zipped meter readings) in one pass, in the same order as before.
buildings, weather, energy = map(
    pd.read_csv,
    ("../data/buildings.csv", "../data/weather.csv", "../data/train.0.zip"),
)

## Данные по 0-му зданию

In [6]:
# Keep only the meter readings that belong to building 0.
energy = energy.loc[energy["building_id"] == 0]

# Sanity check: the only remaining id should be 0.
print(energy["building_id"].unique())
# info() writes its report itself and returns None, so "None" is printed too.
print(energy.info())

[0]
<class 'pandas.core.frame.DataFrame'>
Int64Index: 8784 entries, 0 to 12059506
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   building_id    8784 non-null   int64  
 1   meter          8784 non-null   int64  
 2   timestamp      8784 non-null   object 
 3   meter_reading  8784 non-null   float64
dtypes: float64(1), int64(2), object(1)
memory usage: 343.1+ KB
None


## Объединение данных

In [7]:
# Step 1: attach the building attributes to every reading (left join keeps
# all readings even if a building had no metadata row).
energy = energy.merge(buildings, how="left", on="building_id")

# Step 2: attach the weather observations. Both frames are indexed by the
# (timestamp, site_id) pair and joined index-to-index; the index is then
# turned back into columns and the columns no longer needed are dropped.
energy = (
    energy.set_index(["timestamp", "site_id"])
    .merge(
        weather.set_index(["timestamp", "site_id"]),
        how="left",
        left_index=True,
        right_index=True,
    )
    .reset_index()
    .drop(columns=["meter", "site_id"])
)

# The source tables are not used after the merge; release them.
del buildings, weather
# info() prints the summary itself and returns None, hence the trailing "None".
print(energy.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8784 entries, 0 to 8783
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   timestamp           8784 non-null   object 
 1   building_id         8784 non-null   int64  
 2   meter_reading       8784 non-null   float64
 3   primary_use         8784 non-null   object 
 4   square_feet         8784 non-null   int64  
 5   year_built          8784 non-null   float64
 6   floor_count         0 non-null      float64
 7   air_temperature     8781 non-null   float64
 8   cloud_coverage      4954 non-null   float64
 9   dew_temperature     8781 non-null   float64
 10  precip_depth_1_hr   8783 non-null   float64
 11  sea_level_pressure  8699 non-null   float64
 12  wind_direction      8534 non-null   float64
 13  wind_speed          8784 non-null   float64
dtypes: float64(10), int64(2), object(2)
memory usage: 960.9+ KB
None


## Оптимизация памяти

In [8]:
def _narrowest_dtype(series, candidates, limits, inclusive):
    """Return the first dtype in *candidates* whose range holds *series*, else None.

    *limits* is ``np.finfo`` or ``np.iinfo``. An empty or all-NaN series has
    NaN extremes, fails every comparison and therefore yields None.
    """
    lowest, highest = series.min(), series.max()
    for candidate in candidates:
        bounds = limits(candidate)
        if inclusive:
            fits = lowest >= bounds.min and highest <= bounds.max
        else:
            fits = lowest > bounds.min and highest < bounds.max
        if fits:
            return candidate
    return None


def reduce_mem_usage(df: pd.DataFrame) -> pd.DataFrame:
    """Shrink the columns of *df* to smaller dtypes and print the memory saved.

    The frame is modified in place and also returned.

    Per-column rules, checked in this order:

    * float columns   -> float16 or float32 when the value range fits strictly
      inside that type, otherwise float64. Only the range is checked, so
      values are rounded to the precision of the chosen type.
    * signed ints     -> the narrowest of int8/int16/int32/int64 that holds
      every value (bounds inclusive, the conversion is lossless).
    * unsigned ints   -> the narrowest of uint8/uint16/uint32/uint64.
    * a column named ``"timestamp"`` -> parsed with ``pd.to_datetime``.
    * bool and datetime columns -> left untouched.
    * everything else -> ``category``.

    Args:
        df: Frame to optimise.

    Returns:
        The same ``df`` object with converted columns.
    """
    start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        dtype_name = str(df[col].dtypes)

        if dtype_name.startswith("float"):
            target = _narrowest_dtype(
                df[col], (np.float16, np.float32), np.finfo, inclusive=False
            )
            # No narrow type fits (or the column is all-NaN): use float64.
            df[col] = df[col].astype(np.float64 if target is None else target)

        elif dtype_name.startswith("int"):
            target = _narrowest_dtype(
                df[col], (np.int8, np.int16, np.int32, np.int64), np.iinfo, inclusive=True
            )
            if target is not None:
                df[col] = df[col].astype(target)

        elif dtype_name.startswith("uint"):
            # Unsigned columns get their own ladder instead of falling through
            # to the categorical conversion below.
            target = _narrowest_dtype(
                df[col], (np.uint8, np.uint16, np.uint32, np.uint64), np.iinfo, inclusive=True
            )
            if target is not None:
                df[col] = df[col].astype(target)

        elif col == "timestamp":
            df[col] = pd.to_datetime(df[col])

        elif dtype_name != "bool" and not dtype_name.startswith("datetime"):
            # Remaining (typically text) columns are stored as categories;
            # bool is excluded because it is already one byte per value.
            df[col] = df[col].astype("category")

    end_mem = df.memory_usage().sum() / 1024**2
    print(f"Потребление памяти меньше на - {round(start_mem - end_mem, 2)} Мб (минус {round(100*(start_mem - end_mem)/start_mem, 1)}%)")
    return df

In [9]:
# Downcast the merged frame's columns (the function converts them in place and
# returns the same frame) and print how much memory was saved.
energy = reduce_mem_usage(energy)

Потребление памяти меньше на - 0.62 Мб (минус 66.1%)


## Интерполяция данных

In [12]:
# Non-positive and missing precipitation values both become 0: the comparison
# is False for NaN as well, so this column has no gaps left afterwards.
energy["precip_depth_1_hr"] = energy["precip_depth_1_hr"].where(
    energy["precip_depth_1_hr"] > 0, 0
)

# Weather columns whose gaps are filled from neighbouring observations.
interpolate_columns = [
    "air_temperature",
    "dew_temperature",
    "cloud_coverage",
    "wind_speed",
    "precip_depth_1_hr",
    "sea_level_pressure",
]
for col in interpolate_columns:
    # Series.interpolate selects its algorithm through `method` (a scipy-style
    # `kind=` keyword is not one of its parameters), so linear filling is
    # requested explicitly. limit_direction="both" also fills gaps at the
    # very start and end of the series.
    energy[col] = energy[col].interpolate(method="linear", limit_direction="both")

## Проверка качества интерполяции

In [13]:
# Count the entries that are still NaN or +/-inf in every interpolated column.
# The check is done explicitly with np.isfinite rather than by switching on
# the deprecated global "use_inf_as_na" option, which would also change how
# every later isna() call in the session behaves.
for col in interpolate_columns:
    print(f"{col} Inf+Nan: {(~np.isfinite(energy[col])).sum()}")

air_temperature Inf+Nan: 0
dew_temperature Inf+Nan: 0
cloud_coverage Inf+Nan: 0
wind_speed Inf+Nan: 0
precip_depth_1_hr Inf+Nan: 0
sea_level_pressure Inf+Nan: 0


## Разделить данные

In [14]:
# Only rows with a positive meter reading take part in modelling. The fixed
# seed makes the 80/20 split, and every metric derived from it, reproducible
# between runs.
energy_train, energy_test = train_test_split(
    energy[energy["meter_reading"] > 0],
    test_size=0.2,
    random_state=42,
)
print(energy_train.head())

               timestamp  building_id  meter_reading primary_use  square_feet  \
5172 2016-08-03 12:00:00            0        299.750   Education         7432   
4633 2016-07-12 01:00:00            0        303.000   Education         7432   
5336 2016-08-10 08:00:00            0        293.500   Education         7432   
4506 2016-07-06 18:00:00            0        249.875   Education         7432   
3709 2016-06-03 13:00:00            0        255.250   Education         7432   

      year_built  floor_count  air_temperature  cloud_coverage  \
5172      2008.0          NaN        25.593750        4.000000   
4633      2008.0          NaN        27.796875        6.332031   
5336      2008.0          NaN        23.906250        6.398438   
4506      2008.0          NaN        35.000000        4.000000   
3709      2008.0          NaN        27.796875        2.000000   

      dew_temperature  precip_depth_1_hr  sea_level_pressure  wind_direction  \
5172        22.796875               

## Линейная регрессия

In [17]:
# Target column first, followed by the weather features; the feature order
# here is the order of the fitted coefficients.
regression_columns = [
    "meter_reading",
    "air_temperature",
    "dew_temperature",
    "cloud_coverage",
    "wind_speed",
    "precip_depth_1_hr",
    "sea_level_pressure",
]

# Restrict the training frame to the modelling columns, then separate the
# target (first entry) from the features (all remaining entries).
energy_train_lr = pd.DataFrame(energy_train, columns=regression_columns)
y = energy_train_lr[regression_columns[0]]
x = energy_train_lr[regression_columns[1:]]

# Ordinary least squares fit of the meter reading on the weather features.
model = LinearRegression()
model.fit(x, y)
print(model.coef_, model.intercept_)

[ 2.53377807  3.73983481 -2.41972715 -1.90860465  0.13703707 -1.00194708] 1137.5690576330478


## Предсказание и оценка модели

In [19]:
def calculate_model(x):
    """Add the squared log error of the linear-regression prediction to *x*.

    Uses the module-level ``model`` and ``regression_columns`` (target first,
    features after it).

    Args:
        x: Either a whole ``pd.DataFrame`` or a single row as a ``pd.Series``
            (the form ``DataFrame.apply(axis=1)`` passes). It must contain
            ``meter_reading`` and every feature column.

    Returns:
        A copy of *x* with an extra ``meter_reading_lr_q`` column/entry equal
        to ``(log(1 + actual) - log(1 + predicted)) ** 2``.
    """
    is_row = isinstance(x, pd.Series)
    frame = x.to_frame().T if is_row else x

    # Cast to float64 so neither the prediction nor the logarithms are
    # evaluated in a narrower dtype the columns may be stored in.
    features = frame[regression_columns[1:]].astype("float64")
    actual = frame["meter_reading"].astype("float64").to_numpy()

    # model.predict evaluates intercept + features @ coefficients.
    predicted = model.predict(features)
    squared_log_error = (np.log1p(actual) - np.log1p(predicted)) ** 2

    result = x.copy()
    result["meter_reading_lr_q"] = squared_log_error[0] if is_row else squared_log_error
    return result


# Score the whole hold-out set in one vectorised call instead of row by row.
energy_test = calculate_model(energy_test)
# RMSLE: square root of the mean squared log error over the test rows.
energy_test_lr_rmsle = np.sqrt(energy_test["meter_reading_lr_q"].sum() / len(energy_test))
print(f"Качество линейной регрессии: {energy_test_lr_rmsle} (округленно) --- {round(energy_test_lr_rmsle, 1)}")

Качество линейной регрессии: 0.21175880589094184 (округленно) --- 0.2
