### Python Code

In [None]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random

In [None]:
# load the example dataset, keep only the columns used below, and parse their types
csv_path = "../posts/2024-10-02-ts-fundamentals-whats-a-time-series/example_ts_data.csv"

data_raw = (
    pd.read_csv(csv_path)
    # keep the series identifiers, the timestamp, and the target
    .loc[:, ["Country", "Product", "Date", "Revenue"]]
    # parse Date as datetime and Revenue as a number
    .assign(
        Date=lambda frame: pd.to_datetime(frame["Date"]),
        Revenue=lambda frame: pd.to_numeric(frame["Revenue"]),
    )
)

# preview the first rows of the typed frame
print(data_raw.head())

In [None]:
# filter on a single series (United States / Cookies) and index it by date.
# The filter and the re-index are chained so that a new frame is returned;
# calling set_index(..., inplace=True) on the result of a boolean filter
# mutates a derived frame, which pandas discourages.
is_us_cookies = (data_raw["Country"] == "United States") & (data_raw["Product"] == "Cookies")
us_ck_raw = data_raw[is_us_cookies].set_index("Date")

print(us_ck_raw.head())

# plot revenue over time for the selected series
plt.figure(figsize=(10, 6))
plt.plot(us_ck_raw.index, us_ck_raw["Revenue"], label="Cookies Revenue")
plt.xlabel("Date")
plt.ylabel("Revenue")
plt.title("Cookies Revenue in the United States")
plt.legend()
# render the figure explicitly so the cell does not end on the Legend object's repr
plt.show()

In [None]:
# Set random seeds for reproducibility.
# Two separate generators are used in this cell, so both must be seeded:
#   - Python's `random` module drives xreg2 (random.sample)
#   - NumPy's global generator drives the noise behind xreg1 (np.random.randn)
# Seeding only `random` leaves xreg1 different on every run.
random.seed(15)
np.random.seed(15)

# work on a copy so the filtered series itself is left untouched
us_ck_corr = us_ck_raw.copy()

# Generate a new variable with strong correlation
correlation_target = 0.9
n = us_ck_corr.shape[0]
noise = np.random.randn(n)

# Orthogonalize the noise against Revenue by subtracting its projection onto the Revenue vector
noise = noise - np.dot(noise, us_ck_corr["Revenue"]) / np.dot(us_ck_corr["Revenue"], us_ck_corr["Revenue"]) * us_ck_corr["Revenue"]

# Scale the orthogonalized noise to match the desired correlation
noise = noise / np.linalg.norm(noise) * np.sqrt(1 - correlation_target**2) * np.linalg.norm(us_ck_corr["Revenue"])

# Create the new variable
us_ck_corr["xreg1"] = correlation_target * us_ck_corr["Revenue"] + noise

# Verify the correlation
# NOTE(review): the projection and scaling above operate on the raw (uncentered) vectors,
# whereas Series.corr computes a Pearson correlation on mean-centered data, so the printed
# value is not guaranteed to equal correlation_target exactly.
correlation = us_ck_corr["Revenue"].corr(us_ck_corr["xreg1"])
print(f"Correlation between Original and New_Var: {correlation:.4f}")


# create a variable that has a weak correlation to the revenue column: n distinct random
# integers between 1 and 99 (range(1, 100) excludes 100). random.sample draws without
# replacement, so this raises ValueError if the series has more than 99 rows.
us_ck_corr["xreg2"] = random.sample(range(1, 100), us_ck_corr.shape[0])

# print the first few rows, formatting numbers to 2 decimal places
print(us_ck_corr.head(10).round(2))

# plot the data with the new variables as dotted lines
plt.figure(figsize=(10, 6))
plt.plot(us_ck_corr.index, us_ck_corr["Revenue"], label="Cookies Revenue")
plt.plot(us_ck_corr.index, us_ck_corr["xreg1"], label="xreg1", linestyle = "dotted")
plt.plot(us_ck_corr.index, us_ck_corr["xreg2"], label="xreg2", linestyle = "dotted")
plt.xlabel("Date")
plt.ylabel("Revenue")
plt.title("Cookies Revenue in the United States")
plt.legend()

# save the plot (optional; keep this before plt.show() if enabled)
# plt.savefig("chart1", dpi = 300, bbox_inches = "tight")

# render the figure explicitly so the cell does not end on the Legend object's repr
plt.show()

In [None]:
# pairwise correlation matrix of the remaining columns once the Country and Product
# label columns are removed
correlation = us_ck_corr.drop(columns=["Country", "Product"]).corr()

# reshape the matrix into long form: one row per (variable, variable) pair
correlation_table = correlation.stack().reset_index()
correlation_table.columns = ["Variable 1", "Variable 2", "Correlation"]

# keep only the rows that compare Revenue against a different variable
correlation_table = correlation_table.loc[
    lambda pairs: (pairs["Variable 1"] == "Revenue") & (pairs["Variable 2"] != "Revenue")
]

# show the table with values rounded to 2 decimal places
print(correlation_table.round(2))

In [None]:
# add lagged copies of each external regressor: for every lag k, the column
# "<name>_lag_<k>" holds the regressor shifted down by k rows
lag_steps = [1, 2, 3, 6, 9, 12]
for xreg in ["xreg1", "xreg2"]:
    for lag in lag_steps:
        us_ck_corr[f"{xreg}_lag_{lag}"] = us_ck_corr[xreg].shift(lag)

# pairwise correlation matrix of the remaining columns once the Country and Product
# label columns are removed
relation = us_ck_corr.drop(columns=["Country", "Product"]).corr()

# reshape the matrix into long form: one row per (variable, variable) pair
lag_table = relation.stack().reset_index()
lag_table.columns = ["Variable 1", "Variable 2", "Correlation"]

# keep only the rows that compare Revenue against a different variable
lag_table = lag_table.loc[
    lambda pairs: (pairs["Variable 1"] == "Revenue") & (pairs["Variable 2"] != "Revenue")
]

# show the table with values rounded to 2 decimal places
print(lag_table.round(2))