# Предобработка данных

- Удаляем данные *первого года январь-октябрь* и *последнего года ноябрь-декабрь*
- Удаляем лишние признаки: координаты, пути к изображениям, максимальная и минимальные температуры
- Соединяем признаки с таргетами по (`target_year`, `fips`) и (`year`, `target`) соответственно
- Обработка `NaN значений
- Разделяем на `train`, `test`. В качестве `test` возьмем последний год

In [1]:
import numpy as np
import pandas as pd
from pathlib import Path

PATH_INTERIM = Path("../../data/interim")
PATH_PROCESSED = Path("../../data/processed")

In [2]:
X = pd.read_csv(PATH_INTERIM / "X.csv")
y = pd.read_csv(PATH_INTERIM / "y.csv")

## Удаляем данные *первого года январь-октябрь* и *последнего года ноябрь-декабрь*

In [3]:
min_year, max_year = X["year"].min(), X["year"].max()
X = X[
    ~((X["year"] == min_year) & (X["month"] < 11))
    | ((X["year"] == max_year) & (X["month"] > 10))
]

## Удаляем лишние признаки

In [4]:
X.drop(
    [
        "lat_lower_left",
        "lon_lower_left",
        "lat_upper_right",
        "lon_upper_right",
        "temperature_max",
        "temperature_avg",
        "images",
    ],
    axis=1,
    inplace=True,
)

In [5]:
display(X.shape)
X.head(5)

(22156, 18)

Unnamed: 0,year,month,day,fips,temperature_min,precipitation,humidity_relative,wind_gust,wind_speed,wind_u_component,wind_v_component,solar_radiation_downward,vapor_pressure_deficit,skin_reservoir_content,soil_water_vol_layer1,soil_water_vol_layer2,soil_water_vol_layer3,target_year
3860,2017,11,1,17001,273.936458,0.187208,71.783333,7.191125,3.372792,-2.079917,2.556042,444.233333,0.222625,4.736086e-07,0.310632,0.317329,0.258307,2018
3861,2017,11,1,17003,278.042667,18.158111,90.355556,4.224,2.226778,-1.714222,0.091889,61.733333,0.099333,0.0003250581,0.383857,0.324059,0.241359,2018
3862,2017,11,1,17005,275.822167,2.791333,88.3,5.595833,3.043833,-1.9655,1.833667,111.766667,0.095833,0.0001284828,0.451369,0.439113,0.399522,2018
3863,2017,11,1,17007,273.233,6.926667,78.4,5.291667,2.995,-1.246667,2.581333,358.5,0.154,3.930181e-07,0.339598,0.347218,0.311294,2018
3864,2017,11,1,17009,274.035,0.12625,73.925,6.281875,3.22925,-2.013,2.42825,375.025,0.20425,8.822729e-07,0.299224,0.304276,0.238118,2018


## Соединяем признаки с таргетами

In [6]:
X["target_year"] = X["target_year"].astype(y["year"].dtype)
data = pd.merge(
    X.drop("year", axis=1),
    y,
    how="inner",
    left_on=["target_year", "fips"],
    right_on=["year", "fips"],
)
display(data.shape)
data.head()

(21020, 19)

Unnamed: 0,month,day,fips,temperature_min,precipitation,humidity_relative,wind_gust,wind_speed,wind_u_component,wind_v_component,solar_radiation_downward,vapor_pressure_deficit,skin_reservoir_content,soil_water_vol_layer1,soil_water_vol_layer2,soil_water_vol_layer3,target_year,year,yield_bu_per_acre
0,11,1,17001,273.936458,0.187208,71.783333,7.191125,3.372792,-2.079917,2.556042,444.233333,0.222625,4.736086e-07,0.310632,0.317329,0.258307,2018,2018,197.8
1,11,1,17003,278.042667,18.158111,90.355556,4.224,2.226778,-1.714222,0.091889,61.733333,0.099333,0.0003250581,0.383857,0.324059,0.241359,2018,2018,188.4
2,11,1,17007,273.233,6.926667,78.4,5.291667,2.995,-1.246667,2.581333,358.5,0.154,3.930181e-07,0.339598,0.347218,0.311294,2018,2018,194.1
3,11,1,17009,274.035,0.12625,73.925,6.281875,3.22925,-2.013,2.42825,375.025,0.20425,8.822729e-07,0.299224,0.304276,0.238118,2018,2018,187.9
4,11,1,17011,273.145593,4.49337,76.974074,5.554111,3.273741,-1.988667,2.418407,234.255556,0.165778,5.04085e-07,0.338929,0.348143,0.320493,2018,2018,214.1


## Обработка `NaN` значений

In [7]:
data[data.isna().any(axis=1)]

Unnamed: 0,month,day,fips,temperature_min,precipitation,humidity_relative,wind_gust,wind_speed,wind_u_component,wind_v_component,solar_radiation_downward,vapor_pressure_deficit,skin_reservoir_content,soil_water_vol_layer1,soil_water_vol_layer2,soil_water_vol_layer3,target_year,year,yield_bu_per_acre
17219,1,1,17043,270.2395,7.914,92.6,10.668167,5.038333,-1.216167,-4.8475,36.55,0.048,,,,,2022,2022,184.7
17410,1,15,17043,266.716667,1.2985,82.633333,9.850833,5.719,-4.128,-3.938667,1762.183333,0.075667,,,,,2022,2022,184.7
17601,2,1,17043,271.4915,4.346333,70.5,11.133,4.246667,-0.064333,3.682833,1647.45,0.210167,,,,,2022,2022,184.7
17792,2,15,17043,279.375,0.004333,64.0,0.0,3.984,1.909,3.496,0.0,0.342,,,,,2022,2022,184.7
17983,3,1,17043,276.278333,0.0,65.916667,5.327667,2.171,0.941833,0.530167,1811.216667,0.337333,,,,,2022,2022,184.7
18174,3,15,17043,274.858667,0.0,72.766667,4.0525,1.726167,-0.952167,-0.813833,4740.033333,0.330333,,,,,2022,2022,184.7
18365,4,1,17043,271.952833,1.117,77.183333,6.752833,3.056833,2.441667,-1.619833,3681.15,0.177,,,,,2022,2022,184.7
18556,4,15,17043,277.195667,0.4,44.516667,10.693167,4.495167,3.973833,1.9185,3418.65,0.553833,,,,,2022,2022,184.7
18747,5,1,17043,282.533,9.0225,79.35,10.789,4.672667,2.9435,3.176167,839.616667,0.2915,,,,,2022,2022,184.7
18938,5,15,17043,291.173833,0.319167,56.6,5.326667,2.180333,0.5085,-0.1955,3551.866667,1.134833,,,,,2022,2022,184.7


Не хватает данных для одного округа с `fips` 7043 о влажности почвы. Можем их удалить, т.к. это всего ли -1 объект (объектом здесь понимается временной ряд длиною в год для округа)

In [8]:
data.dropna(inplace=True)

## Разделяем на обучающую и тестовую выборку

In [9]:
data.drop("target_year", axis=1, inplace=True)

In [10]:
mask = data["year"] == data["year"].max()
data_train = data[~mask]
data_test = data[mask]

X_train = data_train.drop("yield_bu_per_acre", axis=1)
y_train = data_train["yield_bu_per_acre"]
X_test = data_test.drop("yield_bu_per_acre", axis=1)
y_test = data_test["yield_bu_per_acre"]

In [11]:
print(f"X_train: {X_train.shape[0]}")
print(f"y_train: {y_train.shape[0]}")
print(f"X_test: {X_test.shape[0]}")
print(f"y_test: {y_test.shape[0]}")
print(f"(test size)/(data size): {X_test.shape[0] / data.shape[0]}")

X_train: 16564
y_train: 16564
X_test: 4436
y_test: 4436
(test size)/(data size): 0.21123809523809522


In [12]:
columns_order = ["year", "month", "day", "fips"] + np.sort(
    X_train.columns.drop(["year", "month", "day", "fips"])
).tolist()
X_train = X_train[columns_order]
X_test = X_test[columns_order]

In [13]:
X_train.head(1)

Unnamed: 0,year,month,day,fips,humidity_relative,precipitation,skin_reservoir_content,soil_water_vol_layer1,soil_water_vol_layer2,soil_water_vol_layer3,solar_radiation_downward,temperature_min,vapor_pressure_deficit,wind_gust,wind_speed,wind_u_component,wind_v_component
0,2018,11,1,17001,71.783333,0.187208,4.736086e-07,0.310632,0.317329,0.258307,444.233333,273.936458,0.222625,7.191125,3.372792,-2.079917,2.556042


## Сохраняем

In [14]:
if not PATH_PROCESSED.exists():
    PATH_PROCESSED.mkdir()
X_train.to_csv(PATH_PROCESSED / "X_train.csv", index=False)
y_train.to_csv(PATH_PROCESSED / "y_train.csv", index=False)
X_test.to_csv(PATH_PROCESSED / "X_test.csv", index=False)
y_test.to_csv(PATH_PROCESSED / "y_test.csv", index=False)