# 3.0 - Data Transformation
This notebook creates one or more datasets with different feature subsets and transformations.

## Imports and loading
Import necessary packages and load the preprocessed data.

In [None]:
import sys
# Only run the setup when the 'google.colab' module has been loaded,
# i.e. (presumably) when the notebook is executed inside Google Colab.
if 'google.colab' in sys.modules:
    # Clone the project so that the relative 'wind-speed-analysis/...' paths used in this notebook exist.
    ! git clone https://github.com/nischa564/wind-speed-analysis.git # clone repository for colab
    # List the contents of the working directory (e.g. to check that the clone is there).
    ! ls

In [None]:
from pathlib import Path

import pandas as pd
import pykalman
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [None]:
# Read the preprocessed dataset from the cloned repository into a dataframe
df = pd.read_csv(filepath_or_buffer='wind-speed-analysis/data/processed/processed.csv')

## Delete Unwanted Features
Often only a subset of the features is required, so the remaining features can be deleted here.

In [None]:
# Templates for removing features from df: uncomment the variant you need and adapt it.

# Drop columns by names
#columns_to_drop = ['<Column1>', '<Column3>']
#df = df.drop(columns=columns_to_drop, axis=1)

# Drop a single column by its positional index
#index = 0
#df = df.drop(df.columns[index], axis=1)

# Drop columns by index range (start_index and end_index are both included)
#start_index = 0
#end_index = 1
#df = df.drop(df.columns[start_index:end_index + 1], axis=1)

## Apply Transformations

### Normalization
Normalization scales data to a standard range, usually between 0 and 1. It is useful when the features have different scales and ensures that all features contribute equally to the analysis.

In [None]:
# Columns to normalize. The default is every column; use an explicit list for a subset.
#cols = ['<Column 1>', '<Column 3>']
cols = list(df.columns)

# Min-max scale the values of the selected columns
scaler = MinMaxScaler()
data_normalized = scaler.fit_transform(df.loc[:, cols].to_numpy())

# Wrap the scaled array in a dataframe that keeps the original index and column names
df_normalized = pd.DataFrame(data=data_normalized, columns=cols, index=df.index)

# Re-attach the columns that were not selected (they are placed first)
df_untransformed = df.drop(columns=cols)
df = pd.concat((df_untransformed, df_normalized), axis=1)

### Standardization
Standardization transforms data to have a mean of 0 and a standard deviation of 1. It is effective when features have different scales and you can assume a normal distribution.

In [None]:
# Columns to standardize. The default is every column; use an explicit list for a subset.
#cols = ['<Column 1>', '<Column 3>']
cols = list(df.columns)

# Fit a StandardScaler on the selected columns and transform them in one step
scaler = StandardScaler()
data_standardized = scaler.fit_transform(df.loc[:, cols].to_numpy())

# Wrap the scaled array in a dataframe that keeps the original index and column names
df_standardized = pd.DataFrame(data=data_standardized, columns=cols, index=df.index)

# Re-attach the columns that were not selected (they are placed first)
df_untransformed = df.drop(columns=cols)
df = pd.concat((df_untransformed, df_standardized), axis=1)

### PCA
PCA is a dimensionality reduction technique that transforms data into a new set of uncorrelated variables (principal components). It is used to capture the most significant variability in the data while reducing its dimensionality.

In [None]:
# Columns fed into the PCA. The default is every column; use an explicit list for a subset.
#cols = ['<Column 1>', '<Column 3>']
cols = list(df.columns)

# Reduce the selected columns to two principal components
pca = PCA(n_components=2)
data_pca = pca.fit_transform(df[cols])

# Prefix used to name the generated component columns
feature_name = 'pca_feature'

# Name the components <feature_name>_1, <feature_name>_2, ... and wrap them in a dataframe
new_cols = [f'{feature_name}_{number}' for number in range(1, data_pca.shape[1] + 1)]
df_pca = pd.DataFrame(data=data_pca, index=df.index, columns=new_cols)

# The selected columns are replaced by the components; unselected columns are appended
df_untransformed = df.drop(columns=cols)
df = pd.concat((df_pca, df_untransformed), axis=1)

### Shifting
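Shifting moves the values of a column forward by a fixed number of periods (rows), so that each row also contains the values of previous time steps (lag features). It is used for various purposes, such as aligning signals or adjusting time series for temporal considerations.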

In [None]:
# Define and select the columns which should be shifted.
# The selection has to be made in this cell: a list left over from an earlier cell
# may name columns that no longer exist in df (e.g. after the PCA replaced them),
# in which case no column would be shifted at all.
#cols = ['<Column 1>', '<Column 3>']
cols = list(df.columns)

# Select the number of periods to be shifted
periods = 1
# Select if you want to have all shifts between 1 and periods as own column
multi_shift = True

# Shifts to create: every shift from 1 to periods, or only the largest one
shifts = range(1, periods + 1) if multi_shift else [periods]

# Create all requested shifts for the selected columns (in the column order of df)
# and build the dataframe in one go instead of inserting column by column
shifted = {
    f'{col}_shifted_{i}': df[col].shift(i)
    for col in df.columns if col in cols
    for i in shifts
}
df_shifted = pd.DataFrame(shifted, index=df.index)

# Put the shifted columns in front of the existing columns
new_cols = list(df_shifted.columns)
df = pd.concat([df_shifted, df], axis=1)

# Fill na values which are created during the process
# (bfill replaces the deprecated DataFrame.backfill)
df = df.bfill()
df = df.ffill()

### Sliding Window
A Sliding Window extracts subsets of data points sequentially, creating a "window" that moves through the dataset. It is used for tasks like feature extraction or smoothing.

In [None]:
# Define and select the columns on which the sliding window is applied
#cols = ['<Column 1>', '<Column 3>']
cols = list(df.columns)

# Select a window size
window_size = 2
# Select between 'sum', 'mean', 'median', 'min', 'max', 'std'
operation = 'mean'

# Fail loudly on an unknown operation: otherwise no windowed column would be created
# while the original columns are still dropped below, which would leave df without data
supported_operations = ('sum', 'mean', 'median', 'min', 'max', 'std')
if operation not in supported_operations:
    raise ValueError(
        f"Unsupported operation '{operation}', choose one of {supported_operations}"
    )

# Apply the selected operation to a rolling window over each selected column.
# The operation name is also the name of the rolling method and the column suffix.
df_windowed = pd.DataFrame(
    {
        f'{col}_{operation}': getattr(df[col].rolling(window_size), operation)()
        for col in cols
    },
    index=df.index,
)

# Combine the windowed columns with the existing columns
new_cols = list(df_windowed.columns)
df_windowed = pd.concat([df_windowed, df], axis=1)

# Drop old columns
df = df_windowed.drop(cols, axis=1)

# Fill na values which are created during the process
# (bfill replaces the deprecated DataFrame.backfill)
df = df.bfill()
df = df.ffill()

### Differencing
Differencing calculates the difference between consecutive data points. It is often used to transform a time series into a stationary series by removing trend and seasonality.

In [None]:
# Define and select the columns which should be differenced
#cols = ['<Column 1>', '<Column 3>']
cols = list(df.columns)

# Number of rows between the two values that are subtracted from each other
periods = 1

# Difference each selected column and collect the results in a new dataframe
df_diff = pd.DataFrame(
    {f'{col}_diff': df[col].diff(periods=periods) for col in cols},
    index=df.index,
)

# Combine the differenced columns with the existing columns
new_cols = list(df_diff.columns)
df_diff = pd.concat([df_diff, df], axis=1)

# Drop old columns
df = df_diff.drop(cols, axis=1)

# Fill na values which are created during the process
# (bfill replaces the deprecated DataFrame.backfill)
df = df.bfill()
df = df.ffill()

### Kalman Filter
A Kalman Filter estimates the state of a dynamic system from a series of noisy measurements. It is widely used in signal processing, control systems and sensor fusion applications.

In [None]:
# Columns to filter. The default is every column; use an explicit list for a subset.
#cols = ['<Column 1>', '<Column 3>']
cols = list(df.columns)

# A single Kalman filter created with pykalman's default arguments is reused for all columns
kf = pykalman.KalmanFilter()

# Result dataframe that shares the index of the input data
df_filtered = pd.DataFrame(index=df.index)

for col in cols:
    # Filter the raw values of the column; the second return value of filter() is discarded
    data = df[col].to_numpy()
    data_filtered, _ = kf.filter(data)

    # Store the filtered values under the original column name
    df_filtered[col] = data_filtered

# Filtered columns come first, followed by the columns that were not selected
df_untransformed = df.drop(columns=cols)
df = pd.concat((df_filtered, df_untransformed), axis=1)

## Save Transformed Dataset
Save the transformed data in a new file. Rename if you need multiple files.

In [None]:
# Target file for the transformed dataset (rename it if you need multiple files)
output_path = Path('wind-speed-analysis/data/transformed/transformed.csv')

# to_csv does not create missing folders, so make sure the target folder exists
output_path.parent.mkdir(parents=True, exist_ok=True)

# Write the transformed data without the index column
df.to_csv(output_path, index=False)