### Python Code

In [None]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random

In [None]:
# load the example time series that ships with the companion post
data_raw = pd.read_csv("../posts/2024-10-02-ts-fundamentals-whats-a-time-series/example_ts_data.csv")

# keep the four columns used below, then parse Date as datetimes and Revenue as numbers
data_raw = data_raw.loc[:, ["Country", "Product", "Date", "Revenue"]].assign(
    Date=lambda frame: pd.to_datetime(frame["Date"]),
    Revenue=lambda frame: pd.to_numeric(frame["Revenue"]),
)

# preview the first few rows
print(data_raw.head())

In [None]:
# isolate the United States / Cookies series and index it by date
us_ck_raw = (
    data_raw
    .query('Country == "United States" and Product == "Cookies"')
    .set_index("Date")
)

print(us_ck_raw.head())

# line chart of revenue over time
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(us_ck_raw.index, us_ck_raw["Revenue"], label="Cookies Revenue")
ax.set_xlabel("Date")
ax.set_ylabel("Revenue")
ax.set_title("Cookies Revenue in the United States")
ax.legend()

# save the plot
# plt.savefig("chart1", dpi = 300, bbox_inches = "tight")

In [None]:
# Seed BOTH random number generators used in this cell for reproducibility:
# the stdlib `random` module drives xreg2, NumPy's global generator drives the
# noise behind xreg1 (seeding `random` alone leaves the NumPy draws unseeded).
random.seed(15)
np.random.seed(15)

us_ck_corr = us_ck_raw.copy()

# Generate a new variable whose correlation with Revenue is close to correlation_target
n = us_ck_corr.shape[0]
correlation_target = 0.7
noise = np.random.randn(n)  # Standard normal noise
# Scale the noise by Revenue's standard deviation: the sqrt(1 - r**2) weighting
# only produces a correlation of r when the noise has the same spread as the
# signal it is mixed with. Unit-variance noise added to unscaled revenue would
# not hit the target.
scale_noise = np.sqrt(1 - correlation_target**2) * us_ck_corr["Revenue"].std()

us_ck_corr["xreg1"] = (correlation_target * us_ck_corr["Revenue"]) + (scale_noise * noise)

# create a variable that is unrelated to the revenue column: random integers
# between 1 and 100 (inclusive), drawn with replacement so the cell also works
# when the series has more rows than there are distinct values
us_ck_corr["xreg2"] = random.choices(range(1, 101), k=n)

# print the first few rows, formatting numbers to 2 decimal places
print(us_ck_corr.head(10).round(2))

# plot the data with the new variables
plt.figure(figsize=(10, 6))
plt.plot(us_ck_corr.index, us_ck_corr["Revenue"], label="Cookies Revenue")
plt.plot(us_ck_corr.index, us_ck_corr["xreg1"], label="Strong Correlation")
plt.plot(us_ck_corr.index, us_ck_corr["xreg2"], label="Weak Correlation")
plt.xlabel("Date")
plt.ylabel("Revenue")
plt.title("Cookies Revenue in the United States")
plt.legend()

# save the plot
# plt.savefig("chart2", dpi = 300, bbox_inches = "tight")

In [None]:
# correlation between revenue and the strongly correlated regressor
us_ck_corr.loc[:, "Revenue"].corr(us_ck_corr.loc[:, "xreg1"])

In [None]:
# pairwise correlations of the remaining columns once the Country and Product
# label columns are removed
correlation = us_ck_corr.drop(columns=["Country", "Product"]).corr()

# reshape the matrix into one row per variable pair and keep only the rows that
# compare Revenue against each of the other variables
correlation_table = (
    correlation
    .stack()
    .rename_axis(["Variable 1", "Variable 2"])
    .rename("Correlation")
    .reset_index()
    .loc[lambda pairs: (pairs["Variable 1"] == "Revenue") & (pairs["Variable 2"] != "Revenue")]
)

# show the table with values rounded to 2 decimal places
print(correlation_table.round(2))

In [None]:
# preview the raw data rather than dumping the whole frame into the output
data_raw.head()

In [None]:
import numpy as np
import pandas as pd

# Create a random dataset
# NOTE(review): this cell rebinds `data_raw`, `n`, `correlation_target`, `noise`
# and `correlation`, which earlier cells also define. Re-run the notebook from
# the top before re-executing any cell above this one.
np.random.seed(42)  # For reproducibility
n = 100  # Number of samples
data_raw = pd.DataFrame({
    "Original": np.random.randn(n)  # Original variable
})

# Generate a new variable with a target correlation
correlation_target = 0.8

# Pearson correlation is computed on mean-centred data, so the construction
# below is done on centred vectors. Orthogonalising against the uncentred
# column only approximates the target.
original_centered = data_raw["Original"] - data_raw["Original"].mean()

# Generate random noise and centre it as well
noise = np.random.randn(n)
noise = noise - noise.mean()

# Orthogonalize noise to the centred original variable so that their sample covariance is exactly zero
noise = noise - np.dot(noise, original_centered) / np.dot(original_centered, original_centered) * original_centered

# Scale the orthogonalized noise so that the mix below has exactly the desired correlation
noise = noise / np.linalg.norm(noise) * np.sqrt(1 - correlation_target**2) * np.linalg.norm(original_centered)

# Create the new variable
data_raw["New_Var"] = correlation_target * data_raw["Original"] + noise

# Verify the correlation (matches correlation_target up to floating-point error)
correlation = data_raw["Original"].corr(data_raw["New_Var"])
print(f"Correlation between Original and New_Var: {correlation:.4f}")
