In [None]:
# ============================================================
# Notebook setup: run this before everything
# ============================================================
# -- Copied from lecture
# Load IPython's autoreload extension so edited modules can be re-imported
# without restarting the kernel.
%load_ext autoreload
# Turn on greedy tab completion in the IPython completer.
%config IPCompleter.greedy=True
# Autoreload mode 1: before a cell runs, reload only the modules that were
# registered with %aimport.
%autoreload 1
# Register `util` for autoreload.
# NOTE(review): only the top-level name `util` is registered here, while the
# import cell below pulls in `util.util` — verify that edits to that submodule
# are actually picked up, otherwise register it explicitly.
%aimport util
import logging

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from util import util

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Control figure size
interactive_figures = False
if interactive_figures:
    # Normal behavior
    %matplotlib widget
    figsize=(9, 3)
else:
    # PDF export behavior
    figsize=(14, 4)

# Location of the training CSV
data_folder = '../resources/dataset'
file_name = '7_gecco2019_train_water_quality.csv'
data_path = f'{data_folder}/{file_name}'

# Read the CSV, parse the 'Time' column into datetimes, use it as the index,
# and discard the column that holds the previously saved positional index.
raw_data = (
    pd.read_csv(data_path)
    .assign(Time=lambda frame: pd.to_datetime(frame['Time']))
    .set_index('Time')
    .drop(columns=["Unnamed: 0"])
)

# Fill gaps with the project helper (its strategy is defined in util, not here)
raw_data = util.impute_missing_values(raw_data)

# Names of the columns the project helper reports as features
feature_columns = util.get_feature_columns(raw_data)

In [None]:
# One 100-bin histogram per feature of the raw data, each in its own figure
for col in feature_columns:
    fig, ax = plt.subplots(figsize=(8, 4))
    # Only observed (non-NaN) values are binned
    ax.hist(raw_data[col].dropna(), bins=100, alpha=0.7, edgecolor='black')
    ax.set(title=f'Distribution of {col}', xlabel=col, ylabel='Frequency')
    ax.grid(True)
    plt.show()

In [None]:
# Centered rolling mean and variance over 15 samples for every feature,
# computed on a linearly interpolated copy of the raw data.
# NOTE(review): the plot titles call this a 15-minute window, which holds only
# if the data is sampled once per minute — confirm.
df = raw_data.interpolate(method='linear', axis=0)
rolling_stats = (
    df[feature_columns]
    .rolling(window=15, min_periods=1, center=True)
    .agg(['mean', 'var'])
)

# One figure per feature: rolling mean as a line, mean ± sqrt(variance) as a band
for col in feature_columns:
    # rolling_stats columns are (feature, statistic) pairs
    mean_values = rolling_stats.loc[:, (col, 'mean')]
    std_values = np.sqrt(rolling_stats.loc[:, (col, 'var')])

    fig, ax = plt.subplots(figsize=(12, 4))
    ax.plot(rolling_stats.index, mean_values, label='Mean')
    ax.fill_between(
        rolling_stats.index,
        mean_values - std_values,
        mean_values + std_values,
        color='gray',
        alpha=0.3,
        label='±1 Std Dev',
    )
    ax.set(
        title=f'Rolling 15-Minute Mean and ± Std Dev for {col}',
        xlabel='Time',
        ylabel=col,
    )
    ax.legend()
    fig.tight_layout()
    plt.show()

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

# Linear detrending of the 'SAC' column of raw_data (datetime-indexed).

# Step 1: interpolated working copy, with time expressed as seconds since the
# first sample so it can be used as a regressor
df = raw_data.interpolate(method='linear', axis=0)
df['time_num'] = (df.index - df.index[0]).total_seconds()

# Step 2: least-squares straight line SAC ~ time_num
X = df[['time_num']]
y = df['SAC']
lr = LinearRegression().fit(X, y)

# Step 3: fitted trend and the residual left after removing it
df['trend'] = lr.predict(X)
df['SAC_detrended'] = df['SAC'] - df['trend']

# Step 4: two stacked panels — data with fitted line on top, residual below
fig, (ax_fit, ax_resid) = plt.subplots(2, 1, figsize=(12, 8))

ax_fit.plot(df.index, df['SAC'], label='Original SAC')
ax_fit.plot(df.index, df['trend'], label='Fitted Linear Trend', linestyle='--')
ax_fit.set(title='Original SAC with Fitted Linear Trend', xlabel='Time', ylabel='SAC')
ax_fit.legend()

ax_resid.plot(df.index, df['SAC_detrended'], label='Detrended SAC', color='green')
ax_resid.set(title='Detrended SAC', xlabel='Time', ylabel='SAC (Detrended)')
ax_resid.legend()

fig.tight_layout()
plt.show()

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

# Log-linear detrending of the 'Tp' column of raw_data (datetime-indexed):
# a straight line is fitted to log(Tp), which corresponds to an exponential
# trend in Tp itself.

# Step 1: interpolated working copy with time in seconds since the first sample
df = raw_data.interpolate(method='linear', axis=0)
df['time_num'] = (df.index - df.index[0]).total_seconds()

# Step 2: log transform; epsilon keeps log() finite when Tp is exactly zero.
# NOTE(review): values with Tp + epsilon <= 0 would still produce NaN/-inf and
# make the fit below fail — confirm Tp is non-negative in this dataset.
epsilon = 1e-6
df['log_Tp'] = np.log(df['Tp'] + epsilon)

# Step 3: least-squares straight line log_Tp ~ time_num
X = df[['time_num']]
y = df['log_Tp']
lr = LinearRegression().fit(X, y)
df['log_trend'] = lr.predict(X)

# Step 4: residual in log scale
df['log_Tp_detrended'] = df['log_Tp'] - df['log_trend']

# Back-transform of the residual: exp(log_Tp - log_trend), i.e. the ratio of
# (Tp + epsilon) to the fitted exponential trend
df['Tp_detrended'] = np.exp(df['log_Tp_detrended'])

# Step 5: three stacked panels — fit, log-scale residual, back-transformed residual
fig, (ax_fit, ax_log_resid, ax_resid) = plt.subplots(3, 1, figsize=(12, 12))

ax_fit.plot(df.index, df['log_Tp'], label='Log(Tp)')
ax_fit.plot(df.index, df['log_trend'], label='Fitted Linear Trend', linestyle='--')
ax_fit.set(title='Log(Tp) with Fitted Linear Trend', xlabel='Time', ylabel='log(Tp)')
ax_fit.legend()

ax_log_resid.plot(df.index, df['log_Tp_detrended'], label='Detrended log(Tp)', color='green')
ax_log_resid.set(title='Detrended log(Tp)', xlabel='Time', ylabel='log(Tp) Detrended')
ax_log_resid.legend()

ax_resid.plot(df.index, df['Tp_detrended'], label='Detrended Tp (Original Scale)', color='red')
ax_resid.set(title='Detrended Tp in Original Scale', xlabel='Time', ylabel='Tp Detrended')
ax_resid.legend()

fig.tight_layout()
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Quadratic detrending of the 'PFM' column of raw_data (datetime-indexed).

# Step 1: interpolated working copy with time in seconds since the first sample
df = raw_data.interpolate(method='linear', axis=0)
df['time_num'] = (df.index - df.index[0]).total_seconds()

# Step 2: least-squares degree-2 polynomial PFM ~ time_num.
# coeffs is ordered from the highest power down, as np.poly1d expects.
# NOTE(review): time_num is in seconds, so the squared term can become very
# large; consider rescaling time if the fit is reported as poorly conditioned.
coeffs = np.polyfit(df['time_num'], df['PFM'], deg=2)
poly_trend = np.poly1d(coeffs)

# Step 3: evaluate the fitted polynomial and subtract it from the data
df['PFM_trend'] = poly_trend(df['time_num'])
df['PFM_detrended'] = df['PFM'] - df['PFM_trend']

# Step 4: two stacked panels — data with fitted curve on top, residual below
fig, (ax_fit, ax_resid) = plt.subplots(2, 1, figsize=(12, 8))

ax_fit.plot(df.index, df['PFM'], label='Original PFM')
ax_fit.plot(df.index, df['PFM_trend'], label='Fitted Quadratic Trend', linestyle='--')
ax_fit.set(title='Original PFM with Fitted Quadratic Trend', xlabel='Time', ylabel='PFM')
ax_fit.legend()

ax_resid.plot(df.index, df['PFM_detrended'], label='Detrended PFM', color='green')
ax_resid.set(title='Detrended PFM', xlabel='Time', ylabel='PFM (Detrended)')
ax_resid.legend()

fig.tight_layout()
plt.show()

In [None]:
# Detrend every feature of raw_data by subtracting a centered rolling mean.

# Columns to plot.
# NOTE(review): this list is hard-coded while detrended_df is built from
# feature_columns — confirm the two agree, otherwise a plot may be skipped or
# a column lookup may fail.
columns = ['Tp', 'pH', 'Cond', 'Turb', 'SAC', 'PFM']

# Window length in samples: 2 h * 60 samples/h
# (assumes one sample per minute — TODO confirm)
window_size = 120

# Residual = value minus centered rolling mean; rows still containing NaN are dropped
detrended_df = (
    raw_data[feature_columns]
    .sub(
        raw_data[feature_columns]
        .rolling(window=window_size, min_periods=1, center=True)
        .mean()
    )
    .dropna()
)

# Overlay the raw and the detrended series, one figure per column
for col in columns:
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.plot(raw_data[col], label='Raw Data', color='blue', alpha=0.7)
    ax.plot(detrended_df[col], label='Detrended Data', color='red', alpha=0.7)
    ax.set(title=f'Raw vs Detrended Data for {col}', xlabel='Index', ylabel=col)
    ax.legend()
    ax.grid(True)
    plt.show()

In [None]:
# One 100-bin histogram per feature of the detrended data, each in its own figure
for col in feature_columns:
    fig, ax = plt.subplots(figsize=(8, 4))
    # detrended_df is binned as-is; no missing-value filtering happens here
    ax.hist(detrended_df[col], bins=100, alpha=0.7, edgecolor='black')
    ax.set(title=f'Distribution of {col}', xlabel=col, ylabel='Frequency')
    ax.grid(True)
    plt.show()

In [None]:
%%capture --no-display expensive_output
from datetime import timedelta

sample_delta = (detrended_df.index[1] - detrended_df.index[0])

filled_without_event = [detrended_df[c] for c in feature_columns]
print("Autocorrelation across 30 days:")
util.plot_multiple_autocorrelations(filled_without_event, timedelta(days=1) / sample_delta, figsize=(figsize[0], 6), ylim=(-0.02, 0.02))