# 1.0 Data Exploration
This notebook visualizes the features of the raw dataset using different plots.

## Imports and loading
Import necessary packages and load the raw data.

In [None]:
import sys
# Only when running on Google Colab: clone the project repository so that the
# relative 'wind-speed-analysis/...' paths used by later cells are available.
if 'google.colab' in sys.modules:
    ! git clone https://github.com/nischa564/wind-speed-analysis.git # clone repository for colab
    ! ls

In [None]:
!pip install pykalman

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objs as go
import plotly.express as px

import statsmodels.api as sm
from statsmodels.graphics.tsaplots import plot_predict

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, explained_variance_score, accuracy_score, balanced_accuracy_score, f1_score, precision_score, recall_score
from sklearn.dummy import DummyRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import IsolationForest, RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.decomposition import PCA

import pykalman

from dateutil import parser

In [None]:
# Names of the integer and floating point dtypes that count as "numeric"
# when selecting columns (e.g. for the correlation map).
_INT_DTYPES = ['int16', 'int32', 'int64']
_FLOAT_DTYPES = ['float16', 'float32', 'float64']
NUMERICS = _INT_DTYPES + _FLOAT_DTYPES

In [None]:
# Read the raw dataset (path is relative to the cloned repository) into a DataFrame
raw_csv_path = 'wind-speed-analysis/data/raw/wind_dataset.csv'
df = pd.read_csv(raw_csv_path)

## Display the Dataset
Shows a few samples of the dataset.

In [None]:
# Show only the first rows; the section promises "a few samples", not the whole frame
df.head()

## Statistics about the Data
Shows common meta information and statistics of the dataset like datatypes, number of missing values, ...

In [None]:
# Dimensions of the loaded dataset
df.shape
# (#rows, #columns)

In [None]:
# Data type of every column of the loaded dataset
df.dtypes

In [None]:
# Descriptive statistics of the dataset (pandas defaults)
df.describe()

In [None]:
# Number of missing entries per column
missing_values = df.isna().sum()
print(missing_values)

## Visualize the Data

### Line Plot
Line plots are commonly used to visualize the trend of a variable over a continuous time. The plot connects data points with straight lines, making it easy to see the overall trend or pattern in the data.

In [None]:
# Line plot of the first column against the row index (explicit Axes interface)
fig, ax = plt.subplots()
ax.plot(df.iloc[:, 0], label='<Column 1>')
#ax.plot(df.iloc[:, 1], label='<Column 2>')

# Axis labels, title and legend
ax.set_xlabel('Index')
ax.set_ylabel('Feature(s)')
ax.set_title('Line Plot')
ax.legend()

plt.show()

### Histogram
Histograms are used to represent the distribution of a single variable and show the frequency of different values or ranges. The plot consists of bars where the height of each bar corresponds to the frequency of data within a specified bin or range.

In [None]:
# Histogram of the first column with 10 bins (explicit Axes interface)
fig, ax = plt.subplots()
ax.hist(df.iloc[:, 0], bins=10, alpha=0.5, label='<Column 1>')
#ax.hist(df.iloc[:, 1], bins=10, alpha=0.5, label='<Column 2>')

# Axis labels, title and legend
ax.set_xlabel('Feature Values')
ax.set_ylabel('Frequency')
ax.set_title('Histogram')
ax.legend()

plt.show()

### Violin Plot
Violin plots are useful for visualizing the distribution of a variable or comparing the distributions of multiple variables. The plot combines aspects of box plots and kernel density estimation, providing insights into the distribution, quartiles, and probability density.

In [None]:
# Number of violins shown initially; further columns start hidden so the
# figure stays readable for wide datasets.
max_visible = 6

# Create one Violin trace per column
traces = []
for position, column in enumerate(df.columns):
    trace = go.Violin(y=df[column], name=column)

    if position >= max_visible:
        # 'legendonly' hides the trace but keeps its legend entry, so it can
        # still be switched on by clicking the legend (visible=False removes
        # it from the figure entirely).
        trace.visible = 'legendonly'

    traces.append(trace)

# Create the layout
layout = go.Layout(title='Violin Plot of Columns', xaxis=dict(title='Columns'), yaxis=dict(title='Values'))

# Create the figure
fig = go.Figure(data=traces, layout=layout)

# Show the plot
fig.show()

In [None]:
### Correlation Map
# Correlation heatmaps are used to visualize the correlation structure between numeric variables in a dataset. Each cell in the heatmap represents the correlation coefficient between two variables. The color scale typically ranges from cool colors (e.g., blue) for negative correlations to warm colors (e.g., red) for positive correlations. A high positive correlation is represented by a lighter color, while a high negative correlation is represented by a darker color.

In [None]:
# Restrict the data to numeric columns, since a correlation can only be
# computed for those
df_num = df.select_dtypes(include=NUMERICS)
cols_num = list(df_num.columns)

# Pairwise correlation matrix rendered as a heatmap
corr = df_num.corr()
fig = px.imshow(corr)
fig.show()

### Scatterplot
Scatter plots with color encoding are useful for visualizing the relationship between two variables, where the color represents a third variable. Data points are represented as markers, and the color of each marker encodes information about a third variable, providing insights into multivariate relationships.

In [None]:
# Columns shown on the x and y axis (placeholders - replace with real column names)
x_column = '<Column 1>'
y_column = '<Column 2>'
# Optional: column used for colour encoding, e.g. a class label
#color_column = '<Class Column>'

# Marker styling; uncomment the colour entries together with color_column above
marker_style = dict(
    size=10,
#    color=df[color_column],  # Use the values from the ColorColumn for color encoding
#    colorscale='Viridis',  # You can choose a different colorscale if needed
#    colorbar=dict(title=color_column)
)

# One marker per row of the dataset
scatter_trace = go.Scatter(
    x=df[x_column],
    y=df[y_column],
    mode='markers',
    marker=marker_style,
    name=f'{x_column} vs {y_column}',
)

# Title and axis captions
layout = go.Layout(
    title=f'Scatter Plot of {x_column} vs {y_column}',
    xaxis=dict(title=x_column),
    yaxis=dict(title=y_column),
)

# Assemble and render the figure
fig = go.Figure(data=[scatter_trace], layout=layout)
fig.show()

# 2.0 Data Preprocessing
This notebook cleans the dataset.

## Delete Unwanted Features
If you already know you don't need certain features, remove them before the preprocessing.

In [None]:
# Drop columns by names
#columns_to_drop = ['<Column1>', '<Column3>']
#df = df.drop(columns=columns_to_drop, axis=1)

# Drop columns by index
#index = 0
#df = df.drop(df.columns[index], axis=1)

# Drop columns by index range
#start_index = 0
#end_index = 1
#df = df.drop(df.columns[start_index:end_index + 1], axis=1)

## Convert Categorical Features
The further preprocessing requires only numeric features (no strings). So convert all categorical to numeric features before continuing.

### Label Encoding
Label Encoding is suitable when the categorical values have an ordinal relationship, meaning there is a meaningful order among the categories. Each category is assigned a unique numerical label. The labels are often assigned in ascending order based on their alphabetical or numerical order.

In [None]:
# Define the columns which should be encoded (placeholders - replace with real column names)
cols_cat = ['<Column 1>', '<Column 3>']

# Label-encode every selected column while keeping missing values missing
for col in cols_cat:
    # Rows that actually carry a category (NaN / None are left untouched)
    present = df[col].notna()

    # Start from an all-NaN column and fill in the codes of the present rows.
    # Fitting on the present rows only keeps the codes contiguous (0..k-1);
    # encoding the whole column as strings would reserve a code for 'nan'.
    encoded = pd.Series(np.nan, index=df.index, dtype='float64')
    if present.any():
        encoded.loc[present] = LabelEncoder().fit_transform(df.loc[present, col].astype(str))

    # Integer codes when nothing is missing, float (to hold NaN) otherwise
    df[col] = encoded.astype('int64') if present.all() else encoded

### One Hot Encoding
One-Hot Encoding is suitable when the categorical values are nominal, meaning there is no inherent order among the categories. Each category is represented by a binary column (0 or 1) in a new matrix. The column corresponding to the category is marked with a 1, and others are marked with 0.

In [None]:
# Define and select the columns which should be encoded (placeholders - replace with real column names)
cols_cat = ['<Column 1>', '<Column 3>']
df_cat = df[cols_cat]

# Define the One Hot Encoder; drop='first' omits one indicator per feature
encoder = OneHotEncoder(drop='first', handle_unknown='ignore')

# Encode the selected columns (values are compared as strings) and densify the result
df_cat_encoded = encoder.fit_transform(df_cat.astype(str)).toarray()

# Save the result in a dataframe that is aligned with the original rows
df_encoded = pd.DataFrame(df_cat_encoded, index=df_cat.index, columns=encoder.get_feature_names_out(df_cat.columns))

# Delete the old features
df = df.drop(cols_cat, axis=1)

# Concat the onehot features back to the data
df = pd.concat([df, df_encoded], axis=1)

### Date Encoding
Encode cyclic data using sine and cosine functions.

In [None]:
# Define and select the date columns which should be encoded
# (placeholder - replace with real column names)
date_cols = ['<Column 1>']

# Length of one full cycle per date component. Using the fixed period instead
# of the observed maximum keeps e.g. hour 0 and hour 23 apart and avoids a
# division by zero when a component is 0 in every row.
component_periods = {
    'month': 12,
    'day': 31,
    'hour': 24,
    'minute': 60,
    'second': 60,
    'microsecond': 1_000_000,
}

for col in date_cols:
    # Parse string entries with dateutil and make sure the result is a
    # datetime column so that the .dt accessor is available
    parsed = pd.to_datetime(df[col].apply(lambda x: parser.parse(x) if isinstance(x, str) else x))

    # Encode year linearly
    df[col + ' year'] = parsed.dt.year

    # Encode the cyclic components as a point on the unit circle
    for comp, period in component_periods.items():
        angle = 2 * np.pi * getattr(parsed.dt, comp) / period
        df[col + ' ' + comp + ' sin'] = np.sin(angle)
        df[col + ' ' + comp + ' cos'] = np.cos(angle)

# Remove the original date columns
df = df.drop(columns=date_cols)

### Already Numeric
Some columns already consist of numeric values but are stored as text. Convert them to a numeric type. If the decimal numbers use German notation (decimal comma), replace the commas with points before converting.

In [None]:
def convert_column_comma_and_set_type_float(col: pd.Series) -> pd.Series:
    """
    Converts a Pandas Series of numeric strings to float values.

    Strings that contain a comma are treated as German notation: '.' is
    removed as thousands separator and ',' becomes the decimal point
    ('1.234,56' -> 1234.56). Strings without a comma are parsed as they are
    ('2.5' -> 2.5). Non-string entries (e.g. NaN or numbers) are passed through.

    Parameters:
    - col: The input Pandas Series containing numeric strings.

    Returns:
    - The converted Pandas Series with values converted to float.

    Raises:
    - ValueError: If an entry cannot be interpreted as a number.
    """
    def _normalize(value):
        # Leave missing values and real numbers untouched
        if not isinstance(value, str):
            return value
        text = value.strip()
        # A lone '.' was mapped to zero by the previous implementation.
        # NOTE(review): confirm it should mean 0.0 rather than "missing".
        if text == '.':
            return '0.0'
        # German notation: drop thousands separators, then use a decimal point
        if ',' in text:
            text = text.replace('.', '').replace(',', '.')
        return text

    # Normalize every entry, then convert the whole column to float
    return col.map(_normalize).astype(float)

In [None]:
# Define the columns which should be converted (placeholders - replace with real column names)
cols_cat = ['<Column 1>', '<Column 3>']

# Loop through each selected column
for col in cols_cat:
    # Columns that are already numeric need no conversion (and have no .str accessor)
    if pd.api.types.is_numeric_dtype(df[col]):
        continue

    # Literal search: with the default regex matching '.' would match any character
    as_text = df[col].astype(str)
    has_separator = (
        as_text.str.contains('.', regex=False).any()
        or as_text.str.contains(',', regex=False).any()
    )

    if has_separator:
        # Decimal / thousands separators present: use the custom converter
        df[col] = convert_column_comma_and_set_type_float(df[col])
    else:
        # Plain integers stored as text
        df[col] = pd.to_numeric(df[col])

## Fill Missing Values
Further operations require a dataset without missing values. So fill all missing values before continuing.

In [None]:
# Define the imputer: replace NaN by the mean of the respective column
# (requires that all columns are numeric at this point)
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# Apply the imputation to the dataset
imputed = imputer.fit_transform(df)

# Columns without any observed value have no mean (NaN statistic) and are
# discarded by the imputer, so keep only the names of the surviving columns
kept_cols = df.columns[~np.isnan(imputer.statistics_)]

# Restore the DataFrame structure (fit_transform returns a plain array)
df = pd.DataFrame(imputed, index=df.index, columns=kept_cols)

## Detect and Remove Outlier
Outlier detection is important in various fields and applications because outliers, which are data points that significantly differ from the majority of the data, can have a significant impact on the analysis, interpretation and performance of statistical and machine learning models.

In [None]:
# Create an Isolation Forest outlier detector with 100 estimators
# (fixed seed so that the same rows are removed on every run)
detector = IsolationForest(n_estimators=100, random_state=42)

# fit_predict labels outliers with -1 and inliers with 1
is_outlier = pd.Series(detector.fit_predict(df) == -1, index=df.index)

# Copy of the data with an 'is_outlier' marker column, kept for inspection
df_outlier = df.assign(is_outlier=is_outlier)

# Keep only the rows that were not flagged as outliers
df = df.loc[~is_outlier].copy()

## Save Preprocessed Dataset
Save the processed data in a new file. Rename if you need multiple files.

In [None]:
# Write the preprocessed data (without the index column) to the processed folder
processed_csv_path = 'wind-speed-analysis/data/processed/processed.csv'
df.to_csv(processed_csv_path, index=False)

# 3.0 - Data Transformation
This notebook is for creating one/multiple datasets with different feature subsets and transformations.

## Delete Unwanted Features
Often only a subset of features is required. So delete the rest of the features.

In [None]:
# Drop columns by names
#columns_to_drop = ['<Column1>', '<Column3>']
#df = df.drop(columns=columns_to_drop, axis=1)

# Drop columns by index
#index = 0
#df = df.drop(df.columns[index], axis=1)

# Drop columns by index range
#start_index = 0
#end_index = 1
#df = df.drop(df.columns[start_index:end_index + 1], axis=1)

## Apply Transformations

### Normalization
Normalization scales data to a standard range, usually between 0 and 1. It is useful when the features have different scales and ensures that all features contribute equally to the analysis.

In [None]:
# Columns to normalize (default: all columns)
#cols = ['<Column 1>', '<Column 3>']
cols = list(df.columns)

# Scale every selected column with a MinMaxScaler
scaler = MinMaxScaler()
data_normalized = scaler.fit_transform(df[cols].values)

# Wrap the scaled array in a dataframe with the original row and column labels
df_normalized = pd.DataFrame(data_normalized, index=df.index, columns=cols)

# Put the untouched columns first, followed by the normalized ones
df_untransformed = df.drop(columns=cols)
df = pd.concat([df_untransformed, df_normalized], axis=1)

### Standardization
Standardization transforms data to have a mean of 0 and a standard deviation of 1. It is effective when features have different scales and you can assume a normal distribution.

In [None]:
# Columns to standardize (default: all columns)
#cols = ['<Column 1>', '<Column 3>']
cols = list(df.columns)

# Scale every selected column with a StandardScaler
scaler = StandardScaler()
data_standardized = scaler.fit_transform(df[cols].values)

# Wrap the scaled array in a dataframe with the original row and column labels
df_standardized = pd.DataFrame(data_standardized, index=df.index, columns=cols)

# Put the untouched columns first, followed by the standardized ones
df_untransformed = df.drop(columns=cols)
df = pd.concat([df_untransformed, df_standardized], axis=1)

### PCA
PCA is a dimensionality reduction technique that transforms data into a new set of uncorrelated variables (principal components). It is used to capture the most significant variability in the data while reducing its dimensionality.

In [None]:
# Columns that are fed into the PCA (default: all columns)
#cols = ['<Column 1>', '<Column 3>']
cols = list(df.columns)

# Project the selected columns onto two principal components
pca = PCA(n_components=2)
data_pca = pca.fit_transform(df[cols])

# Name the components '<feature_name>_1', '<feature_name>_2', ...
feature_name = 'pca_feature'
new_cols = [f'{feature_name}_{number}' for number in range(1, data_pca.shape[1] + 1)]

# Wrap the components in a dataframe aligned with the original rows
df_pca = pd.DataFrame(data_pca, columns=new_cols, index=df.index)

# Replace the selected columns by the components; unselected columns are kept
df_untransformed = df.drop(columns=cols)
df = pd.concat([df_pca, df_untransformed], axis=1)

### Shifting
Shifting moves the values of a column by a fixed number of rows (time steps), which creates lagged copies of a feature. It is used for various purposes, such as aligning signals or adjusting time series for temporal considerations.

In [None]:
# Define and select the columns which should be shifted.
# This has to be set here: a list left over from an earlier section may name
# columns that no longer exist, in which case no shift would be created.
#cols = ['<Column 1>', '<Column 3>']
cols = list(df.columns)

# Select the number of periods (rows) to be shifted
periods = 1
# Select if you want to have all shifts between 1 and periods as own column
multi_shift = True

# Shift distances to create: 1..periods, or only `periods`
shift_steps = range(1, periods + 1) if multi_shift else [periods]

# Dataframe that collects the shifted columns, aligned with the original rows
df_shifted = pd.DataFrame(index=df.index)

# Create the requested shifts for every selected column
for col in cols:
    for step in shift_steps:
        df_shifted[f'{col}_shifted_{step}'] = df[col].shift(step)

# Prepend the shifted columns to the existing data
new_cols = list(df_shifted.columns)
df = pd.concat([df_shifted, df], axis=1)

# Shifting leaves NaN in the first rows: fill them backwards, then forwards
# (bfill replaces the deprecated backfill alias)
df = df.bfill()
df = df.ffill()

### Sliding Window
A Sliding Window extracts subsets of data points sequentially, creating a "window" that moves through the dataset. It is used for tasks like feature extraction or smoothing.

In [None]:
# Define and select the columns on which the sliding window is applied
#cols = ['<Column 1>', '<Column 3>']
cols = list(df.columns)

# Select a window size (number of consecutive rows per window)
window_size = 2
# Select between 'sum', 'mean', 'median', 'min', 'max', 'std'
operation = 'mean'

# Fail early on an unknown operation; otherwise no windowed column would be
# created and the selected columns would simply be dropped below
supported_operations = ('sum', 'mean', 'median', 'min', 'max', 'std')
if operation not in supported_operations:
    raise ValueError(f'Unsupported operation {operation!r}, choose one of {supported_operations}')

# Apply the chosen aggregation over a rolling window to all selected columns
# and name the results '<column>_<operation>'
rolling_windows = df[cols].rolling(window_size)
df_windowed = getattr(rolling_windows, operation)().add_suffix(f'_{operation}')

# Replace the selected columns by their windowed versions
new_cols = list(df_windowed.columns)
df = pd.concat([df_windowed, df.drop(columns=cols)], axis=1)

# The first window_size - 1 rows have no complete window: fill the resulting
# NaN backwards, then forwards (bfill replaces the deprecated backfill alias)
df = df.bfill()
df = df.ffill()

### Differencing
It calculates the difference between consecutive data points. It is often used to transform a time series into a stationary series for trend and seasonality removal.

In [None]:
# Define and select the columns on which the differencing is applied
#cols = ['<Column 1>', '<Column 3>']
cols = list(df.columns)

# Distance (in rows) between the two values that are subtracted
periods = 1

# Difference every selected column and name the results '<column>_diff'
df_diff = df[cols].diff(periods=periods).add_suffix('_diff')

# Replace the selected columns by their differenced versions
new_cols = list(df_diff.columns)
df = pd.concat([df_diff, df.drop(columns=cols)], axis=1)

# The first `periods` rows have no predecessor: fill the resulting NaN
# backwards, then forwards (bfill replaces the deprecated backfill alias)
df = df.bfill()
df = df.ffill()

### Kalman Filter
A Kalman Filter estimates the state of a dynamic system from a series of noisy measurements. It is widely used in signal processing, control systems and sensor fusion applications.

In [None]:
# Columns that are smoothed with the Kalman filter (default: all columns)
#cols = ['<Column 1>', '<Column 3>']
cols = list(df.columns)

# Kalman filter with the library's default parameters
kf = pykalman.KalmanFilter()

# Result frame, aligned with the original rows
df_filtered = pd.DataFrame(index=df.index)

# Filter each selected column independently
for col in cols:
    # filter() returns two values; the first is stored as the filtered column,
    # the second is not needed here
    filtered_values, _ = kf.filter(df[col].values)
    df_filtered[col] = filtered_values

# Replace the selected columns by the filtered ones; unselected columns are kept
df_untransformed = df.drop(columns=cols)
df = pd.concat([df_filtered, df_untransformed], axis=1)

## Save Transformed Dataset
Save the transformed data in a new file. Rename if you need multiple files.

In [None]:
# Write the transformed data (without the index column) to the transformed folder
transformed_csv_path = 'wind-speed-analysis/data/transformed/transformed.csv'
df.to_csv(transformed_csv_path, index=False)

# 4.0 - Data Analysis
This notebook is for analysing the transformed data.

## Imports and loading
Import necessary packages and load the transformed data.

In [None]:
# Scores of every evaluated model, in evaluation order (train / test)
train_scores, test_scores = [], []

# Display names of the evaluated models, parallel to the score lists
models = []

## Analysis

### Split Data in Trainset and Testset
In order to train and evaluate the model, we need a train set and a test set.

In [None]:
# Define X and y (placeholder - replace with the real target column name)
y = df['<Target Column>']
X = df.drop(columns=['<Target Column>'])

# Split the data chronologically: the first 70% of the rows are the train set,
# the last 30% the test set. The rows must not be shuffled, because the ARIMA
# baseline and the prediction plots treat the rows from position train_size
# onwards as the test period.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=False)

# Get train and test size
train_size = len(y_train)
test_size = len(y_test)

### Define a Baseline
A baseline model is a simple, often naive, model that serves as a point of reference for evaluating the performance of more sophisticated machine learning models. A baseline model provides a benchmark against which the performance of more complex models can be compared. It serves as a starting point for assessing the effectiveness of your machine learning solution.

#### Simple Baseline

In [None]:
# Register the model name for the final comparison
models.append('Simple Baseline')

# Dummy regressor with the 'mean' strategy, fitted on the train set
dr = DummyRegressor(strategy='mean')
dr.fit(X_train, y_train)

# Predictions for both splits
pred_train, pred_test = dr.predict(X_train), dr.predict(X_test)

# Mean absolute error on both splits
train_score = mean_absolute_error(y_train, pred_train)
test_score = mean_absolute_error(y_test, pred_test)

# Record the scores
train_scores.append(train_score)
test_scores.append(test_score)

In [None]:
# Plot the target together with the test prediction
fig, ax = plt.subplots()
df['<Target Column>'].plot(ax=ax, label='Target')

# Draw the prediction at the index positions of the test rows (sorted so the
# line runs left to right) instead of at positions 0..len(pred_test)-1
pred_test_series = pd.Series(np.asarray(pred_test), index=y_test.index).sort_index()
pred_test_series.plot(ax=ax, label='Baseline Prediction')

ax.set_xlabel('Index')
ax.set_ylabel('Target')
# Without a legend the labels above would never be shown
ax.legend()
plt.show()

#### ARIMA Baseline
ARIMA is designed to handle univariate time series data and is effective for capturing and forecasting temporal patterns in a dataset. It is a versatile model that combines autoregression, differencing, and moving averages to make predictions.

In [None]:
# Name the model
models.append('ARIMA')

# Define the order (p, d, q)
order = (0,1,0)

# Define the ARIMA model on the complete target series.
# NOTE(review): the model is fitted on train and test rows together, so the
# test predictions below are in-sample predictions - confirm this is intended.
target = df['<Target Column>']
model = sm.tsa.arima.ARIMA(target, order=order)

# Fit the model
results = model.fit()

# Get train and test prediction: the first train_size rows are the train
# period, all remaining rows the test period
pred_train = results.predict(end=train_size-1)
pred_test = results.predict(start=train_size)

# Compute the score against the same chronological slices of the target.
# The predictions are positional, so they must not be compared with a
# differently ordered split.
train_score = mean_absolute_error(target.iloc[:train_size], pred_train)
test_score = mean_absolute_error(target.iloc[train_size:], pred_test)

# Add score to list
train_scores.append(train_score)
test_scores.append(test_score)

In [None]:
# Plot the target together with the ARIMA prediction for the test period
fig, ax = plt.subplots()
# The column key must match the one used for training ('<Target Column>')
ax = df['<Target Column>'].plot(ax=ax)
plot_predict(results, train_size, train_size + test_size - 1, ax=ax)
ax.set_xlabel('Index')
ax.set_ylabel('Target')
plt.show()

### Train and Evaluate ML Models
Train and evaluate different models with different hyperparameters.

#### Linear Regression

In [None]:
# Register the model name for the final comparison
models.append('Linear Regression')

# Linear regression fitted on the train set
lr = LinearRegression()
lr.fit(X_train, y_train)

# Predictions for both splits
pred_train, pred_test = lr.predict(X_train), lr.predict(X_test)

# Mean absolute error on both splits
train_score = mean_absolute_error(y_train, pred_train)
test_score = mean_absolute_error(y_test, pred_test)

# Record the scores
train_scores.append(train_score)
test_scores.append(test_score)

In [None]:
# Plot the target together with the test prediction
fig, ax = plt.subplots()
# The column key must match the one used for training ('<Target Column>')
df['<Target Column>'].plot(ax=ax, label='Target')

# Draw the prediction at the index positions of the test rows (sorted so the
# line runs left to right) instead of at positions 0..len(pred_test)-1
pred_test_series = pd.Series(np.asarray(pred_test), index=y_test.index).sort_index()
pred_test_series.plot(ax=ax, label='Linear Regression Prediction')

ax.set_xlabel('Index')
ax.set_ylabel('Target')
# Without a legend the labels above would never be shown
ax.legend()
plt.show()

#### Decision Tree

In [None]:
# Register the model name for the final comparison
models.append('Decision Tree')

# Unrestricted-depth decision tree with squared error criterion, fitted on the train set
dt = DecisionTreeRegressor(criterion='squared_error', max_depth=None)
dt.fit(X_train, y_train)

# Predictions for both splits
pred_train, pred_test = dt.predict(X_train), dt.predict(X_test)

# Mean absolute error on both splits
train_score = mean_absolute_error(y_train, pred_train)
test_score = mean_absolute_error(y_test, pred_test)

# Record the scores
train_scores.append(train_score)
test_scores.append(test_score)

In [None]:
# Plot the target together with the test prediction
fig, ax = plt.subplots()
# The column key must match the one used for training ('<Target Column>')
df['<Target Column>'].plot(ax=ax, label='Target')

# Draw the prediction at the index positions of the test rows (sorted so the
# line runs left to right) instead of at positions 0..len(pred_test)-1
pred_test_series = pd.Series(np.asarray(pred_test), index=y_test.index).sort_index()
pred_test_series.plot(ax=ax, label='Decision Tree Prediction')

ax.set_xlabel('Index')
ax.set_ylabel('Target')
# Without a legend the labels above would never be shown
ax.legend()
plt.show()

#### Random Forest

In [None]:
# Name the model
models.append('Random Forest')

# Define a random forest (fixed seed so that the scores are reproducible)
rf = RandomForestRegressor(n_estimators=100, criterion='squared_error', max_depth=None, random_state=42)

# Fit the model
rf.fit(X_train, y_train)

# Get train and test prediction from the random forest itself
# (not from the decision tree `dt` of the previous section)
pred_train = rf.predict(X_train)
pred_test = rf.predict(X_test)

# Compute the score
train_score = mean_absolute_error(y_train, pred_train)
test_score = mean_absolute_error(y_test, pred_test)

# Add score to list
train_scores.append(train_score)
test_scores.append(test_score)

In [None]:
# Plot the target together with the test prediction
fig, ax = plt.subplots()
# The column key must match the one used for training ('<Target Column>')
df['<Target Column>'].plot(ax=ax, label='Target')

# Draw the prediction at the index positions of the test rows (sorted so the
# line runs left to right) instead of at positions 0..len(pred_test)-1
pred_test_series = pd.Series(np.asarray(pred_test), index=y_test.index).sort_index()
pred_test_series.plot(ax=ax, label='Random Forest Prediction')

ax.set_xlabel('Index')
ax.set_ylabel('Target')
# Without a legend the labels above would never be shown
ax.legend()
plt.show()

### Compare the Results

In [None]:
# Grouped bar chart: one train bar and one test bar per model
x = np.arange(len(models))
width = 0.3
offset = 0.17  # horizontal distance of each bar centre from the model's tick

fig, ax = plt.subplots()
ax.bar(x - offset, train_scores, width, label='Train')
ax.bar(x + offset, test_scores, width, label='Test')

# Label the ticks with the model names
ax.set_xticks(x)
ax.set_xticklabels(models, rotation=45)
ax.set_xlabel('Models')
ax.set_ylabel('Scores')
ax.legend()
plt.show()