In [99]:
import pandas as pd
# Folder holding the input CSV files, relative to the notebook's working directory.
path_to_data = "..//data//"

# pm_data.csv is loaded as `df`; sensors.csv as `sensors_df` (later cells read its 'id' column).
df = pd.read_csv(path_to_data+"pm_data.csv")
sensors_df = pd.read_csv(path_to_data+"sensors.csv")

In [100]:
# Keep only the columns whose share of missing values is at most `threshold` percent.
threshold = 95
nan_percentage = df.isna().mean().mul(100)

filtered_df = df.loc[:, nan_percentage.le(threshold)]

In [101]:
# Candidate sensor columns: everything except the first two and the last column of filtered_df
# (presumably those are non-sensor columns such as dates -- TODO confirm against pm_data.csv).
filtered_sensors = filtered_df.columns[2:-1]
# Sensor ids are cast to strings because they are matched against column labels.
filtered_sensors_df = sensors_df.loc[sensors_df['id'].astype('str').isin(filtered_sensors)]

In [102]:
# Raw training table; the data-completeness cell below looks up sensor ids among its column names.
train_df = pd.read_csv(path_to_data+"raw_train.csv")

## clustering by hand

In [103]:
# data_completeness = share of non-missing rows in the sensor's train_df column.
# Sensors that have no column in train_df keep the initial None.
sensors_df['data_completeness'] = None

for sens in sensors_df['id'].values:
    if str(sens) not in train_df.columns:
        continue
    sensors_df.loc[sensors_df['id']==sens, "data_completeness"] = train_df[str(sens)].count() / len(train_df)

In [181]:
# Cluster definition: cluster id -> list of sensor ids whose series are averaged together.
# NOTE: this first dict is overwritten by the last assignment in this cell.
clusters_0 = {
    1: [1, 4372603],
    2: [2],
    5: [5], 
    6: [6],
    # 12: [12],
}

# Alternative single-sensor configurations, kept for reference.
# clusters_0 = {8: [8],} # trash
# clusters_0 = {14: [14],} # kamenka
# clusters_0 = {16: [16],} # tec3
clusters_0 = {104: [104],} # little data ("malo dannyh")

In [182]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class SensorClusterTransformer(BaseEstimator, TransformerMixin):
    """Collapse groups of sensor columns into one mean time series per cluster.

    The transformer is stateless: ``fit`` does nothing and ``transform`` only
    reads the ``clusters`` and ``date_column`` settings given at construction.
    """

    def __init__(self, clusters, date_column="full_date"):
        """
        Parameters:
        - clusters: dict, mapping cluster IDs to lists of sensor IDs.
        - date_column: str, name of the column containing date information.
        """
        self.clusters = clusters
        self.date_column = date_column

    def fit(self, X, y=None):
        """
        No fitting required for this transformer.
        """
        return self

    def transform(self, X):
        """
        Transforms the input DataFrame to compute mean time series for each cluster.

        Parameters:
        - X: pd.DataFrame, the input time series data with a date column and sensor
          columns. Sensor IDs are looked up as strings among the column labels.

        Returns:
        - pd.DataFrame with the date column followed by one column per cluster ID
          (row-wise mean of the cluster's sensors; ``mean`` skips NaN by default).
          With an empty ``clusters`` mapping only the date column is returned.

        Raises:
        - ValueError if the date column or any listed sensor column is missing.
        """
        # Check if the date column exists
        if self.date_column not in X.columns:
            raise ValueError(f"The specified date column '{self.date_column}' does not exist in the DataFrame.")

        # Index by date so every per-cluster mean carries the dates along
        indexed = X.set_index(self.date_column)

        # Mean time series for each cluster, keyed by cluster ID
        cluster_means = {}
        for cluster_id, sensor_ids in self.clusters.items():
            # Column labels are strings, sensor IDs may be given as integers
            sensor_ids = [str(s) for s in sensor_ids]
            missing_sensors = [sensor for sensor in sensor_ids if sensor not in indexed.columns]
            if missing_sensors:
                raise ValueError(f"The following sensor IDs are missing from the DataFrame: {missing_sensors}")

            cluster_means[cluster_id] = indexed[sensor_ids].mean(axis=1)

        if cluster_means:
            cluster_means_df = pd.DataFrame(cluster_means)
        else:
            # An empty dict would yield a frame without the date index; keep the
            # dates so the output always contains the date column.
            cluster_means_df = pd.DataFrame(index=indexed.index)

        # Turn the date index back into a regular column
        return cluster_means_df.reset_index()

In [183]:
# Quick check of the transformer on the raw frame (no outlier cleaning or date re-indexing applied yet).
transformer = SensorClusterTransformer(clusters=clusters_0)
cluster_means_df = transformer.transform(df)

### filling nan values

Мы будем заполнять только промежутки в каждом сенсоре

In [184]:
# Sensor ids from sensors_df that actually exist as columns of df, as column-label strings.
sensors_list = [str(sensor_id) for sensor_id in sensors_df['id'].values if str(sensor_id) in df.columns]

In [185]:
import numpy as np
def rolling_zscore_outlier_removal(series, window=7, z_threshold=3.0):
    """
    Blank out points that deviate strongly from their local neighbourhood.

    Every value is compared with the mean and standard deviation of a centred
    rolling window around it (partial windows at the edges are allowed through
    ``min_periods=1``). Values whose absolute z-score exceeds ``z_threshold``
    become NaN; all other values are returned unchanged.

    - series: pd.Series to clean
    - window: rolling window size
    - z_threshold: absolute z-score above which a point counts as an outlier
    Returns: a new Series; the input is not modified.

    NOTE(review): the tested point is itself part of its window, and with the
    sample standard deviation one point can reach at most (n - 1) / sqrt(n) in
    a window of n values (about 2.27 for n = 7), so the default window/threshold
    pair looks unable to flag anything -- confirm the intended defaults.
    """
    local_window = series.rolling(window=window, center=True, min_periods=1)
    local_mean = local_window.mean()
    local_std = local_window.std()

    # A NaN or zero std produces a NaN score, which compares False below,
    # so such points are never treated as outliers.
    abs_score = (series - local_mean).div(local_std).abs()
    is_outlier = abs_score > z_threshold

    return series.mask(is_outlier, np.nan)

In [186]:
def divide_series_into_chunks(series):
    """
    Split a Series into its maximal runs of consecutive non-NaN values.

    Parameters:
    - series: pd.Series, possibly containing NaN gaps.

    Returns:
    - list of pd.Series slices (original index labels preserved), in order of
      appearance. An empty or all-NaN input gives an empty list.

    Slicing is done with ``.iloc`` so the split is positional for every index
    type; plain ``series[a:b]`` is label-based on e.g. a float index.
    """
    missing_flags = series.isna().to_numpy()
    chunks = []
    run_start = None  # position where the current non-NaN run began, if any

    for position, is_missing in enumerate(missing_flags):
        if is_missing:
            if run_start is not None:
                # A NaN closes the run that was open
                chunks.append(series.iloc[run_start:position])
                run_start = None
        elif run_start is None:
            run_start = position

    # The series may end while a run is still open
    if run_start is not None:
        chunks.append(series.iloc[run_start:])

    return chunks

from sktime.forecasting.arima import AutoARIMA
from sktime.forecasting.base import ForecastingHorizon
from sktime.forecasting.exp_smoothing import ExponentialSmoothing
from sktime.forecasting.ets import AutoETS  

def fill_nans_with_arima(series, sp=12):
    """
    Fill interior NaN gaps by forecasting forward from the preceding run of data.

    Despite the name, the forecaster is additive Holt-Winters
    (``ExponentialSmoothing`` with additive trend and seasonality), not ARIMA.
    For every gap that lies between two runs of observed values, a model is
    fitted on the run just before the gap and its forecast is written into it.
    Leading and trailing NaNs are left untouched.

    Parameters:
    - series: pd.Series with NaN gaps. Its index must be one the forecaster
      accepts (presumably a regular integer or datetime index -- verify).
    - sp: seasonal period passed to the forecaster. The previous code ignored
      this argument and always used 6.

    Returns:
    - a copy of ``series`` with the interior gaps filled.

    NOTE(review): fitting presumably fails on runs too short for the chosen
    seasonal period; that error is not caught here.
    """
    # Locate runs of observed values by position, independent of index labels.
    observed = series.notna().to_numpy()
    padded = np.concatenate(([False], observed, [False]))
    edges = np.flatnonzero(padded[1:] != padded[:-1])
    run_starts, run_stops = edges[::2], edges[1::2]

    filled_series = series.copy()

    for k in range(len(run_starts) - 1):
        gap_start, gap_stop = run_stops[k], run_starts[k + 1]
        y_train = series.iloc[run_starts[k]:gap_start]

        # Integer horizons are treated as steps relative to the end of the
        # training data, so the gap is simply steps 1..gap_length.
        fh = np.arange(1, gap_stop - gap_start + 1)

        forecaster = ExponentialSmoothing(trend="add", seasonal="additive", sp=sp)
        forecaster.fit(y_train)
        y_pred = forecaster.predict(fh)

        # Positional write; the forecast's own index is irrelevant here.
        filled_series.iloc[gap_start:gap_stop] = np.asarray(y_pred).ravel()

    return filled_series


In [187]:
def fill_missing_with_same_day_values(df, date_col, value_col, direction="both"):
    """
    Fills missing values in a time series by using values from the same day in the next or previous year.

    The neighbouring date is the same calendar date one year away (Feb 29 maps
    to Feb 28). The previous day-of-year arithmetic drifted by one day around
    leap years and sent day 366 to Jan 1 two years later.

    Parameters:
        df (pd.DataFrame): The dataframe containing the time series. It is not modified.
        date_col (str): Column name representing the date.
        value_col (str): Column name representing the values with potential missing data.
        direction (str): Direction to search for values ("next", "previous", or "both"). Default is "both".
            With "both" the next year is tried first. Any other string disables filling.

    Returns:
        pd.DataFrame: A new dataframe with the date column first (as datetime)
        and missing values filled where a donor value was found.
    """
    # Work on a copy so the caller's frame is never left half-modified
    # (e.g. indexed by date) if something fails midway.
    result = df.copy()
    result[date_col] = pd.to_datetime(result[date_col])
    result = result.set_index(date_col)

    one_year = pd.DateOffset(years=1)
    use_next = direction in ("next", "both")
    use_previous = direction in ("previous", "both")

    missing_dates = result.index[result[value_col].isna().to_numpy()]
    for idx in missing_dates:
        value_to_fill = None

        if use_next:
            next_year_date = idx + one_year
            if next_year_date in result.index:
                value_to_fill = result.at[next_year_date, value_col]

        # Fall back to the previous year only if the next year gave nothing
        if use_previous and pd.isna(value_to_fill):
            previous_year_date = idx - one_year
            if previous_year_date in result.index:
                value_to_fill = result.at[previous_year_date, value_col]

        # Values filled earlier in this loop can serve as donors for later dates
        if not pd.isna(value_to_fill):
            result.at[idx, value_col] = value_to_fill

    return result.reset_index()

# full pipeline

In [188]:
# Step 1: Clean all sensors
def clean_sensors(dataframe, sensors_list, window=15, z_threshold=3):
    """
    Apply rolling z-score outlier removal to every listed sensor column.

    Returns a copy of ``dataframe`` in which each column named in
    ``sensors_list`` is replaced by its cleaned version; other columns and the
    input frame itself are left as they are.
    """
    cleaned_columns = {
        name: rolling_zscore_outlier_removal(dataframe[name], window=window, z_threshold=z_threshold)
        for name in sensors_list
    }

    result = dataframe.copy()
    for name, cleaned in cleaned_columns.items():
        result[name] = cleaned
    return result

# Step 2: Apply sensor cluster transformer
def transform_clusters(cleaned_df, clusters, date_column="full_date"):
    """
    Compute the per-cluster mean series with SensorClusterTransformer.

    Parameters:
    - cleaned_df: pd.DataFrame holding the date column and the sensor columns.
    - clusters: dict mapping cluster IDs to lists of sensor IDs.
    - date_column: name of the date column; defaults to "full_date", the value
      that was previously used implicitly.

    Returns the frame produced by ``SensorClusterTransformer.transform``.
    """
    transformer = SensorClusterTransformer(clusters=clusters, date_column=date_column)
    return transformer.transform(cleaned_df)

# Step 3: Fill missing values for all columns in cluster means
def fill_missing_for_all_columns(cluster_means_df, date_col, direction="both"):
    """
    Run fill_missing_with_same_day_values on every column except the date column.

    The input frame is copied first; the filled copy is returned.
    """
    result = cluster_means_df.copy()
    value_columns = [name for name in cluster_means_df.columns if name != date_col]
    for name in value_columns:
        result = fill_missing_with_same_day_values(
            result, date_col=date_col, value_col=name, direction=direction
        )
    return result

def process_time_series(dataframe, sensors_list, clusters, date_col, window=15, z_threshold=3, direction="both"):
    """
    Full pipeline: clean sensors, average them per cluster, fill remaining gaps.

    Parameters:
    - dataframe: pd.DataFrame with a date column and one column per sensor.
    - sensors_list: column labels to run outlier removal on.
    - clusters: dict mapping cluster IDs to lists of sensor IDs.
    - date_col: name of the date column. It is now also passed to the
      clustering step, which previously always used "full_date".
    - window, z_threshold: settings for the rolling z-score outlier removal.
    - direction: passed to the same-day fill ("next", "previous" or "both").

    Returns the frame of cluster means after gap filling.
    """
    # Step 1: Clean all sensors
    cleaned_df = clean_sensors(dataframe, sensors_list, window=window, z_threshold=z_threshold)

    # Step 2: Calculate cluster means, using the caller's date column
    cluster_transformer = SensorClusterTransformer(clusters=clusters, date_column=date_col)
    cluster_means_df = cluster_transformer.transform(cleaned_df)

    # Step 3: Fill missing values
    return fill_missing_for_all_columns(cluster_means_df, date_col=date_col, direction=direction)

In [189]:
def input_missing_dates(df, date_col="date_time"):
    """
    Re-index a frame onto a gap-free daily calendar.

    The ``date_col`` column becomes a datetime index that spans every day from
    the earliest to the latest date present; days that were absent appear as
    all-NaN rows. The returned frame is indexed by date (index named
    ``date_col``); the input frame is not modified.
    """
    by_date = df.set_index(date_col)
    by_date.index = pd.to_datetime(by_date.index)

    first_day, last_day = by_date.index.min(), by_date.index.max()
    calendar = pd.date_range(start=first_day, end=last_day, freq='D')

    completed = by_date.reindex(calendar)
    completed.index.name = date_col
    return completed

# Put df on a gap-free daily calendar, then turn the date index back into a 'full_date' column.
df_dates_filled = input_missing_dates(df, "full_date").reset_index()

In [190]:
# Run the full pipeline for the selected cluster(s).
# direction="none" matches none of "next" / "previous" / "both", so the same-day fill step
# leaves all values untouched in this run.
processed_df = process_time_series(
    dataframe=df_dates_filled,
    sensors_list=sensors_list,
    clusters=clusters_0,
    date_col="full_date",
    window=60,
    z_threshold=2,
    # direction="both"
    direction="none"
)

In [191]:
def cut_date_to_distr(df, date_col='full_date', value_col=8):
    """
    Trim a frame to the date span in which ``value_col`` has data.

    Keeps every row whose date lies between the first and the last date with a
    non-missing ``value_col`` (both ends included); rows inside that span are
    kept even if their value is missing.
    """
    observed_dates = df.dropna(subset=[value_col])[date_col]
    first_seen, last_seen = observed_dates.min(), observed_dates.max()

    inside_span = df[date_col].between(first_seen, last_seen)
    return df.loc[inside_span]

def cut_slack_dates(df, start_date, end_date, date_col='full_date'):
    """Return the rows whose ``date_col`` lies in [start_date, end_date], both ends included."""
    dates = df[date_col]
    not_too_early = dates.ge(start_date)
    not_too_late = dates.le(end_date)
    return df.loc[not_too_early & not_too_late]

In [192]:
# Trim to the span between the first and last non-missing value of cluster column 104.
processed_df = cut_date_to_distr(processed_df, value_col=104)

In [195]:
# Sanity check of row count, dtypes and non-null counts.
# NOTE(review): this cell's execution count (195) is higher than the imputation cell's (194) further down,
# so the saved output presumably reflects the frame after imputation -- re-run top to bottom to confirm.
processed_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 892 entries, 0 to 891
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   full_date  892 non-null    datetime64[ns]
 1   104        892 non-null    float64       
dtypes: datetime64[ns](1), float64(1)
memory usage: 14.1 KB


In [None]:
# processed_df = cut_slack_dates(processed_df, '2017-07-20', '2019-09-15')

In [194]:
from functions import holt_winters_imputation_and_expand

# processed_df[8] = holt_winters_imputation_and_expand(processed_df[2], 365)
# processed_df[5] = holt_winters_imputation_and_expand(processed_df[5], 365)

# First fill gaps from the same day of an adjacent year (default direction="both"), then impute
# column 104 with holt_winters_imputation_and_expand from the project's `functions` module
# (365 is presumably the seasonal period in days -- TODO confirm against that helper).
processed_df = fill_missing_for_all_columns(processed_df, 'full_date')
processed_df[104] = holt_winters_imputation_and_expand(processed_df[104], 365)

In [197]:
# Make every column label a string (cluster ids are integers up to this point).
processed_df.rename(columns=str, inplace=True)

In [198]:
# Persist the processed series.
# NOTE(review): index=False is not passed, so the DataFrame index is written as an extra unnamed
# first column -- confirm downstream readers expect that.
processed_df.to_csv("../data/decomp/kusok_5.csv")