# Предобработка данных

- Добавим столбец `target_year`, который ставит в соответствие объект и таргет. В частности, для периода месяцев *ноябрь-декабрь* `target_year = year + 1`, т.к. сбор урожая кукурузы происходит в *сентябре-октябре*.
- Удаляем данные *первого года январь-октябрь* и *последнего года ноябрь-декабрь*
- Удаляем лишние признаки: координаты, пути к изображениям, максимальная и минимальные температуры
- Соединяем признаки с таргетами по (`target_year`, `fips`) и (`year`, `target`) соответственно
- Обработка `NaN` значений
- Разделяем на `train`, `test`. В качестве `test` возьмем последний год

In [1]:
import numpy as np
import pandas as pd
from pathlib import Path

PATH_INTERIM = Path("../../data/interim")
PATH_PROCESSED = Path("../../data/processed")

In [2]:
X = pd.read_csv(PATH_INTERIM / "X.csv")
y = pd.read_csv(PATH_INTERIM / "y.csv")

## Добавим столбец `target_year`

In [3]:
X["target_year"] = np.where(X["month"] >= 11, X["year"] + 1, X["year"])

## Удаляем данные *первого года январь-октябрь* и *последнего года ноябрь-декабрь*

In [4]:
min_year, max_year = X["year"].min(), X["year"].max()
X = X[
    ~(
        ((X["year"] == min_year) & (X["month"] < 11))
        | ((X["year"] == max_year) & (X["month"] > 10))
    )
]

## Удаляем лишние признаки

In [5]:
X.drop(
    [
        "lat_lower_left",
        "lon_lower_left",
        "lat_upper_right",
        "lon_upper_right",
        "temperature_max",
        "temperature_avg",
    ],
    axis=1,
    inplace=True,
)

In [6]:
display(X.shape)
X.head(5)

(24000, 19)

Unnamed: 0,year,month,day,fips,temperature_min,precipitation,humidity_relative,wind_gust,wind_speed,wind_u_component,wind_v_component,solar_radiation_downward,vapor_pressure_deficit,skin_reservoir_content,soil_water_vol_layer1,soil_water_vol_layer2,soil_water_vol_layer3,images,target_year
20,2017,11,1,17001,273.936458,0.187208,71.783333,7.191125,3.372792,-2.079917,2.556042,444.233333,0.222625,4.736086e-07,0.310632,0.317329,0.258307,17001-2017-11-01,2018
21,2017,11,15,17001,281.599,5.130583,66.95,8.53725,4.71175,2.010292,0.6845,2356.704167,0.475833,0.0002974148,0.411714,0.312134,0.264578,17001-2017-11-15,2018
22,2017,12,1,17001,270.082875,0.0,54.183333,3.181625,2.111375,0.135792,1.666417,2724.8375,0.460125,5.151347e-06,0.313707,0.323166,0.294289,17001-2017-12-01,2018
23,2017,12,15,17001,268.693333,0.196958,69.0875,5.463792,3.034458,2.569542,0.937,1957.1875,0.215583,4.038836e-07,0.308603,0.316075,0.296473,17001-2017-12-15,2018
44,2017,11,1,17003,278.042667,18.158111,90.355556,4.224,2.226778,-1.714222,0.091889,61.733333,0.099333,0.0003250581,0.383857,0.324059,0.241359,17003-2017-11-01,2018


## Соединяем признаки с таргетами

In [7]:
y["year"] = y["year"].astype(X["target_year"].dtype)
data = pd.merge(
    X.drop("year", axis=1),
    y,
    how="left",
    left_on=["target_year", "fips"],
    right_on=["year", "fips"],
)
# target_year не имеет NaN, year может иметь
data.drop("year", axis=1, inplace=True)
data.rename({"target_year": "year"}, inplace=True, axis=1)
display(data.shape)
data.head()

(24000, 19)

Unnamed: 0,month,day,fips,temperature_min,precipitation,humidity_relative,wind_gust,wind_speed,wind_u_component,wind_v_component,solar_radiation_downward,vapor_pressure_deficit,skin_reservoir_content,soil_water_vol_layer1,soil_water_vol_layer2,soil_water_vol_layer3,images,year,yield_bu_per_acre
0,11,1,17001,273.936458,0.187208,71.783333,7.191125,3.372792,-2.079917,2.556042,444.233333,0.222625,4.736086e-07,0.310632,0.317329,0.258307,17001-2017-11-01,2018,197.8
1,11,15,17001,281.599,5.130583,66.95,8.53725,4.71175,2.010292,0.6845,2356.704167,0.475833,0.0002974148,0.411714,0.312134,0.264578,17001-2017-11-15,2018,197.8
2,12,1,17001,270.082875,0.0,54.183333,3.181625,2.111375,0.135792,1.666417,2724.8375,0.460125,5.151347e-06,0.313707,0.323166,0.294289,17001-2017-12-01,2018,197.8
3,12,15,17001,268.693333,0.196958,69.0875,5.463792,3.034458,2.569542,0.937,1957.1875,0.215583,4.038836e-07,0.308603,0.316075,0.296473,17001-2017-12-15,2018,197.8
4,11,1,17003,278.042667,18.158111,90.355556,4.224,2.226778,-1.714222,0.091889,61.733333,0.099333,0.0003250581,0.383857,0.324059,0.241359,17003-2017-11-01,2018,188.4


## Сортировка

Приводим строки в правильный порядок. Т.к. считаем урожайность на данных ноябрь-август, то первыми должны идти данные за ноябрь-декабрь предыдущего года (в текущем `data` все ноябри-декабри относятся к "будущему году". Напр., у фактического ноября 2017-ого в `year` будет стоять 2018. Так сделано для удобства)

In [8]:
data["month_priority"] = np.where(data["month"] < 11, True, False)

data.sort_values(
    ["year", "fips", "month_priority", "month", "day"], inplace=True
)
data.drop("month_priority", axis=1, inplace=True)
data.head(25)

Unnamed: 0,month,day,fips,temperature_min,precipitation,humidity_relative,wind_gust,wind_speed,wind_u_component,wind_v_component,solar_radiation_downward,vapor_pressure_deficit,skin_reservoir_content,soil_water_vol_layer1,soil_water_vol_layer2,soil_water_vol_layer3,images,year,yield_bu_per_acre
0,11,1,17001,273.936458,0.187208,71.783333,7.191125,3.372792,-2.079917,2.556042,444.233333,0.222625,4.736086e-07,0.310632,0.317329,0.258307,17001-2017-11-01,2018,197.8
1,11,15,17001,281.599,5.130583,66.95,8.53725,4.71175,2.010292,0.6845,2356.704167,0.475833,0.0002974148,0.411714,0.312134,0.264578,17001-2017-11-15,2018,197.8
2,12,1,17001,270.082875,0.0,54.183333,3.181625,2.111375,0.135792,1.666417,2724.8375,0.460125,5.151347e-06,0.313707,0.323166,0.294289,17001-2017-12-01,2018,197.8
3,12,15,17001,268.693333,0.196958,69.0875,5.463792,3.034458,2.569542,0.937,1957.1875,0.215583,4.038836e-07,0.308603,0.316075,0.296473,17001-2017-12-15,2018,197.8
800,1,1,17001,245.96475,0.000292,68.516667,6.68175,8.900583,6.154292,-6.210375,2779.654167,0.036583,1.365202e-07,0.295965,0.29904,0.296177,17001-2018-01-01,2018,197.8
801,1,15,17001,259.711208,3.069542,79.375,10.654,7.62975,4.853792,-0.511083,2340.725,0.063917,8.59797e-07,0.366685,0.354921,0.302343,17001-2018-01-15,2018,197.8
802,2,1,17001,266.12275,0.000667,58.933333,8.36,5.538167,3.720167,-3.772458,3289.820833,0.216667,1.536061e-07,0.345383,0.352441,0.326802,17001-2018-02-01,2018,197.8
803,2,15,17001,278.780125,1.607292,90.7125,5.790583,3.133458,0.791167,1.074125,1149.2875,0.131958,0.0002810833,0.391254,0.355097,0.329683,17001-2018-02-15,2018,197.8
804,3,1,17001,277.554917,5.299417,84.329167,8.421125,5.573625,2.801,-4.140708,3445.05,0.196417,0.000288564,0.412397,0.404713,0.40034,17001-2018-03-01,2018,197.8
805,3,15,17001,275.964875,4.2e-05,43.4375,5.680458,2.870417,0.963125,-0.001708,5511.870833,0.720917,4.718701e-07,0.359893,0.36417,0.385584,17001-2018-03-15,2018,197.8


## Обработка `NaN` значений

### 1. `fips`, `year`, `yield_bu_per_acre`

- Мы сделали `left join`. Не для каждого `year`, `fips` мы будем иметь значение `yield_bu_per_acre`. Нужно от них избавиться, учитывая условие ниже;
- для `year`, `fips` в текущего года, может не быть значения `yield_bu_per_acre` прошлого года, но для *ноябрь-сентябрь* предыдущего года нам нужны значения текущего.

In [9]:
# избавимся от значений, для которых нет данных в y.
# Я как будто олимпиадные задачи решаю
data = data[~((data["month"] < 11) & (data["yield_bu_per_acre"].isna()))]
data.head(3)

Unnamed: 0,month,day,fips,temperature_min,precipitation,humidity_relative,wind_gust,wind_speed,wind_u_component,wind_v_component,solar_radiation_downward,vapor_pressure_deficit,skin_reservoir_content,soil_water_vol_layer1,soil_water_vol_layer2,soil_water_vol_layer3,images,year,yield_bu_per_acre
0,11,1,17001,273.936458,0.187208,71.783333,7.191125,3.372792,-2.079917,2.556042,444.233333,0.222625,4.736086e-07,0.310632,0.317329,0.258307,17001-2017-11-01,2018,197.8
1,11,15,17001,281.599,5.130583,66.95,8.53725,4.71175,2.010292,0.6845,2356.704167,0.475833,0.0002974148,0.411714,0.312134,0.264578,17001-2017-11-15,2018,197.8
2,12,1,17001,270.082875,0.0,54.183333,3.181625,2.111375,0.135792,1.666417,2724.8375,0.460125,5.151347e-06,0.313707,0.323166,0.294289,17001-2017-12-01,2018,197.8


In [10]:
# Значение отсортированы по ["year", "fips", "month_priority", "month", "day"]
# Значит ноябрь-декабрь идут до январь-август
data.loc[:, ["year", "yield_bu_per_acre"]] = data[
    ["year", "yield_bu_per_acre"]
].bfill()

In [11]:
data[data.isna().any(axis=1)]

Unnamed: 0,month,day,fips,temperature_min,precipitation,humidity_relative,wind_gust,wind_speed,wind_u_component,wind_v_component,solar_radiation_downward,vapor_pressure_deficit,skin_reservoir_content,soil_water_vol_layer1,soil_water_vol_layer2,soil_water_vol_layer3,images,year,yield_bu_per_acre


Остались такие, для которых не было данных за *январь-август*

In [12]:
data = data[data.groupby(["year", "fips"])["month"].transform("nunique") == 12]

## Удаляем *сентябрь-октябрь*

*Сентябрь-октябрь* - время сбора урожая. Прогнозирование урожайности в этот период не имеет смысла. К тому же мы не знаем точного месяца сбора.

In [13]:
data = data[(data["month"] != 9) & (data["month"] != 10)]

## Разделяем на обучающую и тестовую выборку

In [14]:
mask = data["year"] == data["year"].max()
data_train = data[~mask]
data_test = data[mask]

X_train = data_train.drop("yield_bu_per_acre", axis=1)
y_train = data_train["yield_bu_per_acre"]
X_test = data_test.drop("yield_bu_per_acre", axis=1)
y_test = data_test["yield_bu_per_acre"]

In [15]:
print(f"X_train: {X_train.shape[0]}")
print(f"y_train: {y_train.shape[0]}")
print(f"X_test: {X_test.shape[0]}")
print(f"y_test: {y_test.shape[0]}")
print(f"(test size)/(data size): {X_test.shape[0] / data.shape[0]}")

X_train: 14000
y_train: 14000
X_test: 3800
y_test: 3800
(test size)/(data size): 0.21348314606741572


In [16]:
columns_order = ["year", "fips", "month", "day"] + np.sort(
    X_train.columns.drop(["year", "fips", "month", "day"])
).tolist()
X_train = X_train[columns_order]
X_test = X_test[columns_order]

In [17]:
X_train.head(25)

Unnamed: 0,year,fips,month,day,humidity_relative,images,precipitation,skin_reservoir_content,soil_water_vol_layer1,soil_water_vol_layer2,soil_water_vol_layer3,solar_radiation_downward,temperature_min,vapor_pressure_deficit,wind_gust,wind_speed,wind_u_component,wind_v_component
0,2018,17001,11,1,71.783333,17001-2017-11-01,0.187208,4.736086e-07,0.310632,0.317329,0.258307,444.233333,273.936458,0.222625,7.191125,3.372792,-2.079917,2.556042
1,2018,17001,11,15,66.95,17001-2017-11-15,5.130583,0.0002974148,0.411714,0.312134,0.264578,2356.704167,281.599,0.475833,8.53725,4.71175,2.010292,0.6845
2,2018,17001,12,1,54.183333,17001-2017-12-01,0.0,5.151347e-06,0.313707,0.323166,0.294289,2724.8375,270.082875,0.460125,3.181625,2.111375,0.135792,1.666417
3,2018,17001,12,15,69.0875,17001-2017-12-15,0.196958,4.038836e-07,0.308603,0.316075,0.296473,1957.1875,268.693333,0.215583,5.463792,3.034458,2.569542,0.937
800,2018,17001,1,1,68.516667,17001-2018-01-01,0.000292,1.365202e-07,0.295965,0.29904,0.296177,2779.654167,245.96475,0.036583,6.68175,8.900583,6.154292,-6.210375
801,2018,17001,1,15,79.375,17001-2018-01-15,3.069542,8.59797e-07,0.366685,0.354921,0.302343,2340.725,259.711208,0.063917,10.654,7.62975,4.853792,-0.511083
802,2018,17001,2,1,58.933333,17001-2018-02-01,0.000667,1.536061e-07,0.345383,0.352441,0.326802,3289.820833,266.12275,0.216667,8.36,5.538167,3.720167,-3.772458
803,2018,17001,2,15,90.7125,17001-2018-02-15,1.607292,0.0002810833,0.391254,0.355097,0.329683,1149.2875,278.780125,0.131958,5.790583,3.133458,0.791167,1.074125
804,2018,17001,3,1,84.329167,17001-2018-03-01,5.299417,0.000288564,0.412397,0.404713,0.40034,3445.05,277.554917,0.196417,8.421125,5.573625,2.801,-4.140708
805,2018,17001,3,15,43.4375,17001-2018-03-15,4.2e-05,4.718701e-07,0.359893,0.36417,0.385584,5511.870833,275.964875,0.720917,5.680458,2.870417,0.963125,-0.001708


## Проверка

- Соединяем X и y
- Проверяем, что для пары (year, fips) существует только одно единственное значение `yield_bu_per_acre`
- Проверяем, имеют ли объекты одинаковую длину (12 месяцев по 2 дня = 20)
- Разделяем X и y

In [18]:
concat = lambda X, y: pd.concat([X, y], axis=1)

data_train = concat(X_train, y_train)
data_test = concat(X_test, y_test)

for X_y in [data_train, data_test]:
    X_y_grouped = X_y.groupby(["year", "fips"])
    assert (X_y_grouped["yield_bu_per_acre"].nunique() == 1).all()
    assert (X_y_grouped["yield_bu_per_acre"].count() == 20).all()

## Сохраняем

In [19]:
if not PATH_PROCESSED.exists():
    PATH_PROCESSED.mkdir()
X_train.to_csv(PATH_PROCESSED / "X_train.csv", index=False)
y_train.to_csv(PATH_PROCESSED / "y_train.csv", index=False)
X_test.to_csv(PATH_PROCESSED / "X_test.csv", index=False)
y_test.to_csv(PATH_PROCESSED / "y_test.csv", index=False)

In [20]:
print(
    "X_train",
    X_train_size := X_train[["year", "fips"]].drop_duplicates().shape[0],
)
print(
    "X_test",
    X_test_size := X_test[["year", "fips"]].drop_duplicates().shape[0],
)
print("X_test/X", X_test_size / (X_test_size + X_train_size))

X_train 700
X_test 190
X_test/X 0.21348314606741572
