In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [3]:
data_dir = Path.cwd().parent / "data"
features_dir = data_dir / "features"

In [4]:
train_features_sj = pd.read_csv(features_dir / "sj_train.csv")
train_features_iq = pd.read_csv(features_dir / "iq_train.csv")
test_features_sj = pd.read_csv(features_dir / "sj_test.csv")
test_features_iq = pd.read_csv(features_dir / "iq_test.csv")

In [5]:
def create_lag_features(data, n_in=1, n_out=1, dropna=True):
    n_vars = 1 if isinstance(data, list) else data.shape[1]

    df = pd.DataFrame(data)
    cols = []
    col_names = []
    for i in reversed(range(n_in)):
        cols.append(df.shift(i))
        col_names.extend(
            [f"var{j + 1}(t)" for j in range(n_vars)]
            if i == 0 else
            [f"var{j + 1}(t-{i})" for j in range(n_vars)]
        )
    for i in range (1, n_out):
        cols.append(df.shift(-i))
        col_names.extend(f"var{j + 1}(t+{i})" for j in range(n_vars))
    lag_features = pd.concat(cols, axis=1)
    lag_features.columns = col_names
    if dropna:
        lag_features.dropna(inplace=True)
    return lag_features

In [6]:
n_lag = 52

# SJ

In [7]:
X_train_sj_wide = create_lag_features(
    train_features_sj.drop("total_cases", axis=1),
    n_lag,
)
X_train_sj_wide.drop(
    ["var1(t)", "var2(t)", "var3(t)", "var4(t)", "var5(t)"],
    axis=1,
    inplace=True,
)
X_train_sj_wide["total_cases"] = train_features_sj["total_cases"]
X_train_sj_wide.shape

(885, 1088)

In [8]:
X_test_sj_wide = train_features_sj.drop(
    ["total_cases"],
    axis=1,
).iloc[-n_lag :, :].append(
    test_features_sj.drop(
        ["city", "weekofyear", "year"],
        axis=1,
    )
).reset_index(drop=True)
X_test_sj_wide = create_lag_features(X_test_sj_wide, n_lag)
X_test_sj_wide.drop(
    ["var1(t)", "var2(t)", "var3(t)", "var4(t)", "var5(t)"],
    axis=1,
    inplace=True,
)
X_test_sj_wide["city"] = "sj"
X_test_sj_wide["year"] = test_features_sj["year"]
X_test_sj_wide["weekofyear"] = test_features_sj["weekofyear"]
X_test_sj_wide.shape

(261, 1090)

# IQ

In [9]:
X_train_iq_wide = create_lag_features(
    train_features_iq.drop("total_cases", axis=1),
    n_lag,
)
X_train_iq_wide.drop(
    ["var1(t)", "var2(t)", "var3(t)", "var4(t)", "var5(t)"],
    axis=1,
    inplace=True,
)
X_train_iq_wide["total_cases"] = train_features_iq["total_cases"]
X_train_iq_wide.shape

(469, 1088)

In [10]:
X_test_iq_wide = train_features_iq.drop(
    ["total_cases"],
    axis=1,
).iloc[-n_lag :, :].append(
    test_features_iq.drop(
        ["city", "weekofyear", "year"],
        axis=1,
    )
).reset_index(drop=True)
X_test_iq_wide = create_lag_features(X_test_iq_wide, n_lag)
X_test_iq_wide.drop(
    ["var1(t)", "var2(t)", "var3(t)", "var4(t)", "var5(t)"],
    axis=1,
    inplace=True,
)
X_test_iq_wide["city"] = "iq"
X_test_iq_wide["year"] = test_features_iq["year"]
X_test_iq_wide["weekofyear"] = test_features_iq["weekofyear"]
X_test_iq_wide.shape

(157, 1090)

In [11]:
X_train_sj_wide.to_csv(
    features_dir / "sj_train_lag_features.csv",
    index=False,
)
X_train_iq_wide.to_csv(
    features_dir / "iq_train_lag_features.csv",
    index=False,
)
X_test_sj_wide.to_csv(
    features_dir / "sj_test_lag_features.csv",
    index=False,
)
X_test_iq_wide.to_csv(
    features_dir / "iq_test_lag_features.csv",
    index=False,
)