# 1.0 Data Exploration

## Imports and loading
Import necessary packages and load the raw data.

In [None]:
# On Google Colab the repository is not available locally, so it is cloned
# into the working directory first. The data paths used further down
# ('wind-speed-analysis/data/...') are relative to that clone.
import sys
if 'google.colab' in sys.modules:
    # The two lines below are IPython shell escapes, not Python statements
    ! git clone https://github.com/nischa564/wind-speed-analysis.git # clone repository for colab
    ! ls

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objs as go

In [None]:
# Read the raw CSV from the repository's data/raw folder into a DataFrame
df = pd.read_csv(filepath_or_buffer='wind-speed-analysis/data/raw/wind_dataset.csv')

## Display the Dataset

In [None]:
# Show the loaded DataFrame as the output of this cell
df

## Statistics about the Data

In [None]:
# Dimensions of the dataset as a tuple: (number of rows, number of columns)
df.shape

In [None]:
# Data type of every column
df.dtypes

In [None]:
# Descriptive summary statistics as produced by pandas' default describe()
df.describe()

In [None]:
# Count the missing entries of every column and print the per-column totals
missing_values = df.isna().sum()
print(missing_values)

## Visualize the Data

### Line Plot

In [None]:
# Plot the first column of the dataset as a line plot
plt.plot(df.iloc[:, 0], label='<Column 1>')

# A label is only drawn once a legend is requested; without this call the
# label passed to plt.plot() above never appears in the figure
plt.legend()

# Show the plot
plt.show()

# 2.0 Data Preprocessing

## Convert Categorical Features

### Label Encoding

In [None]:
# Define the columns which should be encoded
cols_cat = ['<Column 1>', '<Column 3>']


# Label-encode each categorical column while keeping missing entries as NaN
for col in cols_cat:
    # Rows that actually hold a category
    present = df[col].notna()

    # Start from an all-NaN column so that missing entries stay missing
    encoded = pd.Series(np.nan, index=df.index, dtype=float)

    # Fit the encoder on the observed values only. Encoding the whole column
    # after astype(str) would turn NaN into the string 'nan', which then
    # occupies a code of its own and shifts the codes of every label that
    # sorts after it.
    if present.any():
        encoded[present] = LabelEncoder().fit_transform(df.loc[present, col].astype(str))

    # A complete column is stored as integers; a column with gaps has to stay
    # float because NaN cannot be represented in an integer column
    df[col] = encoded.astype(int) if present.all() else encoded

## Fill Missing Values

In [None]:
# Define the imputer: every NaN is replaced by the mean of its column
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# Apply the imputation to the dataset. The imputer has to be the object that
# is fitted and applied (df.transform(df) never used it and fails, because a
# DataFrame is not a valid transform function).
imputed = imputer.fit_transform(df)

# fit_transform returns a plain array, and columns without any observed value
# (NaN statistic) are discarded by the imputer, so rebuild the DataFrame with
# the labels of the columns that were kept
kept_cols = df.columns[~np.isnan(imputer.statistics_)]
df = pd.DataFrame(imputed, columns=kept_cols, index=df.index)

## Save Preprocessed Dataset
Save the processed data in a new file. Rename if you need multiple files.

In [None]:
# Write the preprocessed data to the processed folder, leaving out the row index
df.to_csv(path_or_buf='wind-speed-analysis/data/processed/processed.csv', index=False)

# 3.0 - Data Transformation

## Apply Transformations

### PCA

In [None]:
# Define and select the columns on which the pca is applied,
# e.g. cols = ['<Column 1>', '<Column 3>'] to restrict it to specific columns.
# By default every column except the target is used: the target must survive
# this step untouched, because the analysis section reads
# df['<Target Column>'] and it must not leak into the features.
cols = [col for col in df.columns if col != '<Target Column>']

# Fit the PCA and transform on the selected columns
pca = PCA(n_components=2)
data_pca = pca.fit_transform(df[cols])

# Define a new name for the new features
feature_name = 'pca_feature'

# Convert the PCA data to a pandas dataframe (components are numbered from 1)
new_cols = [f'{feature_name}_{i}' for i in range(1, data_pca.shape[1] + 1)]
df_pca = pd.DataFrame(data_pca, columns=new_cols, index=df.index)

# Concatenate the pca columns with the unselected columns
df_untransformed = df.drop(columns=cols)
df = pd.concat([df_pca, df_untransformed], axis=1)

## Save Transformed Dataset

In [None]:
# Write the transformed data to the transformed folder, leaving out the row index
df.to_csv(path_or_buf='wind-speed-analysis/data/transformed/transformed.csv', index=False)

# 4.0 - Data Analysis

## Analysis

### Split Data in Trainset and Testset

In [None]:
# Separate the feature columns (X) from the target column (y)
X = df.drop('<Target Column>', axis=1)
y = df['<Target Column>']

# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Number of samples in each partition
train_size, test_size = len(y_train), len(y_test)

### Train and Evaluate ML Models
Train and evaluate different models with different hyperparameters.

#### Decision Tree

In [None]:
# Regression tree without a depth limit, splitting on the squared error
dt = DecisionTreeRegressor(criterion='squared_error', max_depth=None)

# Train the tree on the training partition
dt.fit(X_train, y_train)

# Predict on both partitions
pred_train, pred_test = (dt.predict(features) for features in (X_train, X_test))

# Mean absolute error per partition (an error measure, so lower is better)
train_score, test_score = (
    mean_absolute_error(actual, predicted)
    for actual, predicted in ((y_train, pred_train), (y_test, pred_test))
)

In [None]:
# Plot the test prediction on top of the true target
fig, ax = plt.subplots()

# True target over the whole dataset (the column key needs its closing '>')
ax = df['<Target Column>'].plot(ax=ax, label='Target')

# pred_test follows the row order of the shuffled test split, so attach the
# test index to it and sort; plotted against 0..n-1 the predictions would not
# line up with the target curve
pred_series = pd.Series(pred_test, index=y_test.index).sort_index()
pred_series.plot(ax=ax, label='Decision Tree Prediction')

ax.set_xlabel('Index')
ax.set_ylabel('Target')

# Labels are only rendered once a legend is requested
ax.legend()
plt.show()