In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import plotly.graph_objs as go
from plotly.subplots import make_subplots

In [None]:
# Load the raw turbine dataset; the path is relative to the notebook's working directory.
df = pd.read_parquet('../data/01_raw/Turbine.parquet')

In [None]:
# (rows, columns) of the frame as loaded.
df.shape

In [None]:
# Keep the positional row window 70_000..249_999, then only rows whose 'Power' exceeds 10.
df = df.iloc[70_000:250_000]
df = df.loc[df['Power'] > 10]  # upper bound `& (df['Power'] < 2000)` is currently disabled
# df.shape

In [None]:
# First three rows of the current frame.
df.head(3)

In [None]:
# Renumber the rows 0..n-1 and discard the old index (drop=True); df is modified in place.
df.reset_index(inplace=True, drop=True)

In [None]:
# Rebuild 'Timestamps' as an evenly spaced 10-minute grid that starts at the
# first timestamp in the frame and has exactly one entry per row.
# .iloc[0] is positional, so this works whatever the index labels are
# (a plain [0] is a label lookup and needs a fresh 0-based index).
start_time = pd.Timestamp(df['Timestamps'].iloc[0])
periods = len(df)  # one timestamp per row
freq = '10min'     # pandas offset alias for a 10-minute step

# Create datetime series
timestamps = pd.date_range(start=start_time, periods=periods, freq=freq)

# Overwrite the column; the original timestamps are discarded.
df['Timestamps'] = timestamps

In [None]:
# Last three rows of the current frame.
df.tail(3)

In [None]:
# Column names available at this point.
df.columns

# Adding noise to the data

In [None]:
# Per-column summary statistics, taken before the noise cells below modify the signals.
df.describe()

# Deteriorating raw signals

In [None]:
def inject_random_jumps(
    s: pd.Series,
    error_pct: float = 0.02,
    n_events: int = 5,
    event_size_range: tuple = (5, 20),
    mode: str = "mixed",  # "mixed", "positive", "negative"
    seed: int = None
) -> pd.Series:
    """
    Deteriorate a time series by injecting small random level jumps.

    Each event picks a contiguous run of samples and adds one constant offset
    to all of them. Events may overlap, in which case their offsets add up.

    Parameters
    ----------
    s : pd.Series
        Input time-series signal. It is not modified.
    error_pct : float
        Maximum jump size as a fraction of the absolute series mean
        (e.g. 0.02 = at most 2% of |mean|). The actual size of every jump is
        drawn uniformly between 0 and this maximum.
    n_events : int
        Number of random jump events to inject.
    event_size_range : tuple(int, int)
        Inclusive (min, max) number of contiguous points modified per event.
        An event at least as long as the series covers the whole series.
    mode : str
        "mixed"   → positive or negative jumps (random)
        "positive" → only positive jumps
        "negative" → only negative jumps
    seed : int
        Optional random seed. When given, it re-seeds NumPy's global generator.

    Returns
    -------
    pd.Series
        New degraded signal (float dtype, same index as ``s``).

    Raises
    ------
    ValueError
        If ``mode`` is not one of the three supported values.
    """
    # Validate up front so a bad mode is reported even when n_events == 0.
    if mode not in ("mixed", "positive", "negative"):
        raise ValueError("mode must be 'mixed', 'positive', or 'negative'")

    if seed is not None:
        np.random.seed(seed)

    degraded = s.copy().astype(float)
    n = len(degraded)
    if n == 0:
        # Nothing to perturb, and the mean of an empty series is undefined.
        return degraded

    # Use |mean| so that "positive"/"negative" keep their meaning for series
    # whose mean is below zero.
    jump_mag = error_pct * abs(degraded.mean())

    for _ in range(n_events):
        # length of this jump event
        size = np.random.randint(event_size_range[0], event_size_range[1] + 1)

        if size >= n:
            # The event is as long as (or longer than) the series: shift all of it.
            start, end = 0, n
        else:
            # NOTE: randint's upper bound is exclusive, so the very last sample
            # is never part of a jump. Kept as is so that seeded runs reproduce
            # earlier outputs.
            start = np.random.randint(0, n - size)
            end = start + size

        if mode == "mixed":
            jump = jump_mag * np.random.uniform(-1, 1)
        elif mode == "positive":
            jump = jump_mag * np.random.uniform(0, 1)
        else:  # "negative"
            jump = -jump_mag * np.random.uniform(0, 1)

        # apply jump to a chunk
        degraded.iloc[start:end] += jump

    return degraded

In [None]:
# y = df["Power"].copy()

# df["Power"] = inject_random_jumps(
#     s=y,
#     error_pct=0.10,           # jumps of up to ±10% of the mean
#     n_events=20000,               # 20,000 events
#     event_size_range=(10,40), # each jump lasts 10–40 points
#     mode="mixed",
#     seed=42
# )

In [None]:
# # Compare df["Power"] with df["Power_new"] (assumes a "Power_new" column has been created first)
# fig = go.Figure()

# # Add actual values
# fig.add_trace(go.Scatter(
#     x=df.index, y=df["Power"],
#     mode='lines',
#     name='Actual',
#     line=dict(width=2)
# ))

# # Add predicted values
# fig.add_trace(go.Scatter(
#     x=df.index, y=df["Power_new"],
#     mode='lines',
#     name='Predicted',
#     line=dict(width=2, dash='dot')
# ))

# # Customize layout
# fig.update_layout(
#     title='Actual vs Predicted Values - Validation Set',
#     xaxis_title='Date',
#     yaxis_title='Value',
#     legend=dict(x=0, y=1),
#     height=500
# )

# fig.show()

In [None]:
# np.random.seed(42)
# cos_apml = 0 * np.cos(np.arange(0, df.shape[0]))
# white_noise = np.random.normal(0, 8, size=df.shape[0])
# noise = cos_apml + white_noise
# df['Power'] = df['Power'] + noise
# print(df['Power'].std())

In [None]:
# Add zero-mean Gaussian noise to the raw sensor channels.
# Each entry is (columns sharing one noise vector, noise standard deviation).
# The order matters: draws come from NumPy's global generator one after the
# other, so keeping this order with seed 42 reproduces the same noise as the
# former one-cell-per-column version. The former `0 * cos(...)` term was
# always zero and has been dropped.
# NOTE: this overwrites the columns in df, so re-running the cell stacks
# another layer of noise on top; re-run the notebook from the load cell instead.
NOISE_SPECS = [
    (('WindSpeed',), 3),
    (('StdDevWindSpeed',), 1),
    (('RotorRPM',), 6),
    (('Pitch',), 3),
    (('GenRPM',), 100),
    (('GenPh1Temp', 'GenPh2Temp', 'GenPh3Temp'), 10),  # the three phases share one noise vector
    (('EnvirTemp',), 5),
    (('NacelTemp',), 10),
    (('GearOilTemp',), 6),
    (('GearBearTemp',), 10),
]

np.random.seed(42)
for columns, sigma in NOISE_SPECS:
    white_noise = np.random.normal(0, sigma, size=df.shape[0])
    for column in columns:
        df[column] = df[column] + white_noise
        # Standard deviation of the channel after noise was added.
        print(column, df[column].std())

In [None]:
# Column names after the noise cells (no columns are added or removed by them).
df.columns

In [None]:
# Per-column summary statistics after noise was added.
df.describe()

In [None]:
# Histogram of the (noisy) wind speed values.
sns.histplot(df['WindSpeed'])

In [None]:
def plot_relationships(x, y, x_label='X', y_label='Y'):
    """
    Show three side-by-side views of how ``y`` relates to ``x``.

    Panels, left to right: a scatter plot with a red regression line
    (seaborn), a filled 2-D kernel density estimate (seaborn) and a hexbin
    count plot (matplotlib). The figure is displayed; nothing is returned.

    Parameters
    ----------
    x, y : array-like
        Paired observations of equal length.
    x_label, y_label : str
        Axis labels applied to all three panels.
    """
    figure, (reg_ax, kde_ax, hex_ax) = plt.subplots(1, 3, figsize=(18, 5))

    # Draw the three views first ...
    sns.regplot(x=x, y=y, ax=reg_ax, scatter_kws={'s': 20}, line_kws={'color': 'red'})
    sns.kdeplot(x=x, y=y, fill=True, cmap="mako", ax=kde_ax, thresh=0.01)
    hex_ax.hexbin(x, y, gridsize=30, cmap='viridis', mincnt=1)

    # ... then label them, so the labels set here win over any that seaborn
    # derived from the input series names.
    panel_titles = ('Regression Plot', 'KDE Plot', 'Hexbin Plot')
    for panel, title in zip((reg_ax, kde_ax, hex_ax), panel_titles):
        panel.set_title(title)
        panel.set_xlabel(x_label)
        panel.set_ylabel(y_label)

    # Leave the top 5% of the figure free.
    plt.tight_layout(rect=[0, 0, 1, 0.95])
    plt.show()

In [None]:
# Wind speed vs. power: plot every 20th sample, then print the correlation
# matrix computed on the full (non-subsampled) columns.
v1 = 'WindSpeed'
v2 = 'Power'
n = 20
plot_relationships(df[v1][::n], df[v2][::n], v1, v2)
print(df[[v1, v2]].corr())

In [None]:
# Rotor RPM vs. power: plot every 20th sample, then print the correlation
# matrix computed on the full (non-subsampled) columns.
v1 = 'RotorRPM'
v2 = 'Power'
n = 20
plot_relationships(df[v1][::n], df[v2][::n], v1, v2)
print(df[[v1, v2]].corr())

# Adding degradation

In [None]:
def apply_degradation(df, idx, severity=0.8, fluctuation=0.1, seed=42):
    """
    Apply gradual degradation with fluctuations to the Power column.

    For every window the power is multiplied by a factor that falls linearly
    from 1.0 to ``1.0 - severity`` across the window, perturbed by Gaussian
    noise and clipped to [0, 1]. The result is written to a new column
    'Power_degr'; 'Power' and the input frame are left unchanged.

    Parameters:
    - df: DataFrame with a 'Power' column
    - idx: iterable of [start, end] pairs of row labels. Both ends are
      included (label slicing with .loc), so a window covers
      end - start + 1 rows and requires an index with consecutive integer
      labels in that range (e.g. the default RangeIndex)
    - severity: how much to degrade by the end of a window
      (0=no degradation, 1=complete failure)
    - fluctuation: standard deviation of the noise added to the multiplier
    - seed: random seed for reproducibility (re-seeds NumPy's global generator)

    Returns:
    - a copy of df with the extra float column 'Power_degr'

    Raises:
    - ValueError: if a window does not select exactly end - start + 1 rows
    """
    np.random.seed(seed)

    df_degraded = df.copy()
    # Work in float so the fractional products can be stored even when
    # 'Power' is an integer column.
    df_degraded['Power_degr'] = df_degraded['Power'].astype(float)

    for index in idx:
        start, end = index[0], index[1]
        length = end - start + 1

        window = df_degraded.loc[start:end, 'Power_degr']
        if len(window) != length:
            raise ValueError(
                f"window [{start}, {end}] selects {len(window)} rows, expected {length}"
            )

        # Linear ramp from "healthy" (1.0) down to the final degradation level.
        base_degradation = np.linspace(1.0, 1.0 - severity, length)

        # Add fluctuations; the clip keeps the factor a valid fraction.
        noise = np.random.normal(0, fluctuation, length)
        multiplier = np.clip(base_degradation + noise, 0, 1)

        df_degraded.loc[start:end, 'Power_degr'] = window.to_numpy() * multiplier

    return df_degraded

In [None]:
# Downtime
# Number of 10-minute samples in 7 weeks: 6 per hour * 24 hours * 7 days * (4 + 3) weeks = 7056.
6 * 24 * 7 * (4 + 3) # 4 weeks logistics + 3 weeks repair

In [None]:
# [start, end] row-label windows handed to apply_degradation. That function
# slices with .loc, so both ends are included (5001 rows per window).
anomalies = [
    [40000, 45000],
    [90000, 95000],
]

In [None]:
# Inspect rows at positions 40_000..44_999 (iloc is end-exclusive, so row 45_000 is not shown).
df.iloc[40_000:45_000]

In [None]:
# Add a 'Power_degr' column in which power ramps down to roughly 30% of its
# value (severity=0.7, plus noise) across each window in `anomalies`; 'Power' is left as is.
df = apply_degradation(df, idx=anomalies, severity=0.7, fluctuation=0.1)

# Adding mean to power

In [None]:
# Add the same constant offset of 500 to the clean and the degraded power signal.
df['Power'] += 500 # 1000
df['Power_degr'] += 500 # 1000

In [None]:
# Overlay clean and degraded power; the degraded trace is drawn semi-transparent.
plt.plot(df['Power'])
plt.plot(df['Power_degr'], alpha=0.5)

In [None]:
# Power lost to degradation, with the y-axis limited to [0, 1].
plt.plot(df['Power'] - df['Power_degr'])
plt.ylim(0, 1)

In [None]:
# Initialise the label column to 0; the following cell overwrites it from the power difference.
df['Anomaly'] = 0

In [None]:
# Label a row as anomalous (1) when degradation removed more than 1 unit of power, else 0.
df['diff'] = df['Power'].sub(df['Power_degr'])
df['Anomaly'] = df['diff'].gt(1).astype('int64')

# Dropping columns

In [None]:
# Column names before the clean-up below.
df.columns

In [None]:
# From here on 'Power' carries the degraded signal; the helper columns and
# the channels listed below are removed from df in place.
df['Power'] = df['Power_degr']
df.drop(
    columns=[
        'Power_degr',
        'diff',
        'GenPh2Temp',
        'GenPh3Temp',
        'MaxPower',
        'MinPower',
        'StdDevPower',
        'AvgRPow',
        'GenTemp',
        'StdDevWindSpeed',
    ],
    inplace=True,
)

In [None]:
# Column names that remain after the drop.
df.columns

# Adding Anomalies

In [None]:
def add_anomalies(df, anomaly_frac=0.01, std_min=3, std_max=5, seed=42):
    """
    Overwrite a random fraction of every numeric column with high outliers.

    For each numeric (non-boolean) column, ``k`` row positions are drawn
    without replacement and their values replaced by
    ``mean + u * std`` with ``u`` uniform in [std_min, std_max], where mean
    and std are those of the column before modification. Positions are drawn
    independently per column.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame. It is not modified.
    anomaly_frac : float
        Fraction of rows to overwrite per column. At least one row and at
        most all rows are affected.
    std_min, std_max : float
        Range of the outlier distance from the mean, in standard deviations.
    seed : int
        Seed for NumPy's global generator (defaults to the previously
        hard-coded 42).

    Returns
    -------
    pd.DataFrame
        Copy of ``df`` with outliers injected. Integer columns come back as
        float because the injected values are fractional.
    """
    df = df.copy()
    np.random.seed(seed)

    n = len(df)
    if n == 0:
        # No rows to perturb; drawing positions from an empty range would fail.
        return df

    # number of anomalies per column, never more than there are rows
    k = min(n, max(1, int(n * anomaly_frac)))

    for col_pos, col in enumerate(df.columns):
        dtype = df[col].dtype
        # pandas' dtype helpers also cope with extension dtypes (strings,
        # tz-aware datetimes); booleans are skipped on purpose.
        if not pd.api.types.is_numeric_dtype(dtype) or pd.api.types.is_bool_dtype(dtype):
            continue

        # random anomaly positions (row offsets, not index labels)
        anomaly_positions = np.random.choice(n, k, replace=False)

        mean_val = df[col].mean()
        std_val = df[col].std()

        # random distances between std_min and std_max standard deviations
        std_multipliers = np.random.uniform(std_min, std_max, size=k)

        if pd.api.types.is_integer_dtype(dtype):
            # Fractional outliers cannot be stored in an integer column.
            df[col] = df[col].astype(float)

        # Positional assignment: correct for any index, not only 0..n-1.
        df.iloc[anomaly_positions, col_pos] = mean_val + std_multipliers * std_val

    return df

In [None]:
# Replace 0.5% of the values in every numeric column with outliers.
# NOTE(review): 'Anomaly' is still present and numeric at this point, so the
# label column receives outliers as well; it is dropped before export further down.
df_with_anoms = add_anomalies(df, anomaly_frac=0.005)

In [None]:
# First 1000 wind speed samples after outlier injection.
plt.plot(df_with_anoms['WindSpeed'][:1000])

In [None]:
# Continue with an independent copy of the outlier-injected frame as df.
df = df_with_anoms.copy()

In [None]:
n_zeros = 7_000

# Block of all-zero rows (every column, including 'Timestamps') that stands
# in for a period with the turbine offline.
zero_rows = pd.DataFrame(np.zeros((n_zeros, df.shape[1])), columns=df.columns)

# The degradation windows end at row labels 45_000 and 95_000 *inclusive*
# (apply_degradation slices with .loc). Each downtime block is therefore
# inserted after that last degraded row, i.e. at position end + 1. Splitting
# at `end` itself would leave the most degraded sample after the downtime.

# Adding downtime in training
train_cut = 45_000 + 1
df = pd.concat([df.iloc[:train_cut], zero_rows, df.iloc[train_cut:]], ignore_index=True)

# Adding downtime in prod (shifted by the n_zeros rows inserted above)
prod_cut = 95_000 + 1 + n_zeros
df = pd.concat([df.iloc[:prod_cut], zero_rows, df.iloc[prod_cut:]], ignore_index=True)

# ignore_index=True already yields a fresh 0..n-1 index, so no reset_index is needed.

In [None]:
# Rebuild 'Timestamps' once more so the inserted downtime rows also get
# evenly spaced 10-minute timestamps. The grid starts at the first timestamp
# in the frame and has exactly one entry per row.
# .iloc[0] is positional, so this works whatever the index labels are
# (a plain [0] is a label lookup and needs a fresh 0-based index).
start_time = pd.Timestamp(df['Timestamps'].iloc[0])
periods = len(df)  # one timestamp per row
freq = '10min'     # pandas offset alias for a 10-minute step

# Create datetime series
timestamps = pd.date_range(start=start_time, periods=periods, freq=freq)

# Overwrite the column, replacing the zeros in the downtime rows too.
df['Timestamps'] = timestamps

In [None]:
# First 1000 samples of the final power signal.
plt.plot(df['Power'][:1000])

In [None]:
# Histogram of the final power signal, downtime zeros included.
sns.histplot(df['Power'])

In [None]:
# Column means over rows with Power > 20, which leaves out the all-zero downtime rows.
df[df['Power'] > 20].mean()

In [None]:
# First 1000 samples of generator RPM.
plt.plot(df['GenRPM'][:1000])

In [None]:
# Remove the label column in place, so the files written below carry no anomaly labels.
df.drop(columns=['Anomaly'], inplace=True)

In [None]:
# Persist the complete modified frame next to the raw input.
df.to_parquet('../data/01_raw/df_modified.parquet')

# Splitting to train_test AND prod

In [None]:
# Positional split: the first 70_000 rows are for training/testing, the rest is "production".
df_train_test = df.iloc[:70_000]
df_prod = df.iloc[70_000:]

In [None]:
# Write the two splits as separate parquet files.
df_train_test.to_parquet('../data/01_raw/df_train_test.parquet')
df_prod.to_parquet('../data/01_raw/df_prod.parquet')