# 1.0 Data Exploration

## Imports and loading
Import necessary packages and load the raw data.

In [None]:
# Detect Google Colab by checking whether its module has already been loaded.
import sys
if 'google.colab' in sys.modules:
    # The data paths used further down are relative to a local
    # wind-speed-analysis/ checkout, so fetch the repository first.
    # Lines starting with "!" are IPython shell escapes, not Python.
    ! git clone https://github.com/nischa564/wind-speed-analysis.git # clone repository for colab
    # List the working directory after cloning.
    ! ls

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objs as go

In [None]:
# Read the raw wind dataset from the repository checkout into a DataFrame
raw_csv_path = 'wind-speed-analysis/data/raw/wind_dataset.csv'
df = pd.read_csv(raw_csv_path)

## Display the Dataset

In [None]:
# Bare expression as the last statement: the notebook shows the DataFrame as the cell output
df

## Statistics about the Data

In [None]:
# Dimensions of the loaded dataset
df.shape
# (#rows, #columns)

In [None]:
# Data type of every column
df.dtypes

In [None]:
# Summary statistics of the dataset, using pandas' default settings
df.describe()

In [None]:
# Count the missing entries per column and print the totals
missing_values = df.isna().sum()
print(missing_values)

## Visualize the Data

In [None]:
# add more plots if needed

### Line Plot

In [None]:
# Draw a single column as a line plot
column_to_plot = '<Column Name>'  # placeholder: replace with a real column name
selected_series = df[column_to_plot]
plt.plot(selected_series)

# Render the figure
plt.show()

# 2.0 Data Preprocessing

## Convert Categorical Features

In [None]:
# Add complex functions if needed

### Date Encoding
Encode cyclic data using sine and cosine functions.

In [None]:
from dateutil import parser
import math

# Define and select the columns which should be encoded
date_cols = ['<time column>']

# Each cyclic component maps to (first value, cycle length).
# The angle is computed from the true cycle length rather than the largest
# value observed in the data: dividing by the observed maximum puts that
# maximum at an angle of 2*pi, which is indistinguishable from 0 (hour 23
# would get the same encoding as hour 0), and makes the scale depend on
# which values happen to occur in the dataset.
cyclic_components = {
    'month': (1, 12),
    'day': (1, 31),
    'hour': (0, 24),
    'minute': (0, 60),
    'second': (0, 60),
    'microsecond': (0, 1_000_000),
}

for col in date_cols:
    # Parse the date format (values that are not strings are left untouched)
    df[col] = df[col].apply(lambda x: parser.parse(x) if isinstance(x, str) else x)

    # Encode year linearly
    df[col + ' year'] = df[col].dt.year

    # Encode the other components as a point on the unit circle
    for comp, (first, period) in cyclic_components.items():
        values = getattr(df[col].dt, comp)

        # A component that is zero in every row (e.g. microseconds in
        # second-resolution data) carries no information, so no columns are
        # created for it.
        if not values.any():
            continue

        angle = 2 * math.pi * (values - first) / period
        df[col + ' ' + comp + ' sin'] = np.sin(angle)
        df[col + ' ' + comp + ' cos'] = np.cos(angle)

# Remove the original date columns
df.drop(date_cols, axis=1, inplace=True)

# Remove columns with only NaN values
df.dropna(axis=1, how='all', inplace=True)

## Fill Missing Values

In [None]:
# Swap in a more elaborate imputation strategy if needed
# Every missing value is replaced with zero
df = df.fillna(value=0)

## Save Preprocessed Dataset
Save the processed data in a new file. Rename if you need multiple files.

In [None]:
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing.
from pathlib import Path

processed_path = Path('wind-speed-analysis/data/processed/processed.csv')
processed_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(processed_path, index=False)

# 3.0 - Data Transformation

## Apply Transformations

In [None]:
# add your transformations here

## Save Transformed Dataset

In [None]:
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing.
from pathlib import Path

transformed_path = Path('wind-speed-analysis/data/transformed/transformed.csv')
transformed_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(transformed_path, index=False)

# 4.0 - Data Analysis

## Analysis

### Split Data in Trainset and Testset

In [None]:
# Define X and y
y = df['<Target Column>']
X = df.drop(columns=['<Target Column>'])

# Split the data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=False)

# Get train and test size
train_size = len(y_train)
test_size = len(y_test)

### Train and Evaluate ML Models
Train and evaluate different models with different hyperparameters.

#### Decision Tree

In [None]:
# Define a decision tree
from sklearn.tree import DecisionTreeRegressor
dt = DecisionTreeRegressor(criterion='squared_error', max_depth=None)

# Fit the model
dt.fit(X_train, y_train)

# Get train and test prediction
pred_train = dt.predict(X_train)
pred_test = dt.predict(X_test)

# Compute the score
from sklearn.metrics import mean_absolute_error
train_score = mean_absolute_error(y_train, pred_train)
test_score = mean_absolute_error(y_test, pred_test)

In [None]:
# Plot the test prediction
fig, ax = plt.subplots()
ax = df['<Target Column>'].plot(ax=ax)
plt.plot(np.concatenate((pred_train, pred_test)), label='Decision Tree Prediction')
plt.xlabel('Index')
plt.ylabel('Target')
plt.show()