# 4.0 - Data Analysis
This notebook is for analysing the transformed data.

## Imports and loading
Import necessary packages and load the transformed data.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.graphics.tsaplots import plot_predict
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, explained_variance_score, accuracy_score, balanced_accuracy_score, f1_score, precision_score, recall_score
from sklearn.dummy import DummyRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

In [None]:
# Read the transformed data from disk into a DataFrame
df = pd.read_csv(filepath_or_buffer='../data/transformed/transformed.csv')

In [None]:
# Per-model scores; every model cell appends exactly one value to each list
train_scores, test_scores = [], []

# Display names of the models, appended in the same order as the scores so that
# position i in all three lists refers to the same model
models = list()

## Analysis

### Split Data into Train Set and Test Set
In order to train and evaluate the model, we need a train set and a test set.

In [None]:
# Define X (features) and y (target)
y = df['<Target Column>']
X = df.drop(columns=['<Target Column>'])

# Split the data chronologically: the first 70% of the rows become the train set and
# the last 30% the test set. shuffle=False is required because the ARIMA cell addresses
# its predictions by row position (rows [0, train_size) = train, the rest = test);
# a shuffled split would score those predictions against unrelated rows and would also
# leak future observations into the training data of the other models.
# (random_state is not needed any more: without shuffling the split is deterministic.)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=False)

# Get train and test size
train_size = len(y_train)
test_size = len(y_test)

# Sanity check: the two splits partition the data
assert train_size + test_size == len(df)

### Define a Baseline
A baseline model is a simple, often naive, model that serves as a point of reference for evaluating the performance of more sophisticated machine learning models. A baseline model provides a benchmark against which the performance of more complex models can be compared. It serves as a starting point for assessing the effectiveness of your machine learning solution.

#### Simple Baseline

In [None]:
# Register the model name used in the comparison chart
models.append('Simple Baseline')

# Dummy regressor: ignores the features and always predicts the mean of the training target
dr = DummyRegressor(strategy='mean')
dr.fit(X_train, y_train)

# Predictions for both splits
pred_train, pred_test = (dr.predict(features) for features in (X_train, X_test))

# Mean absolute error on each split
train_score = mean_absolute_error(y_train, pred_train)
test_score = mean_absolute_error(y_test, pred_test)

# Record the scores next to the model name
train_scores.append(train_score)
test_scores.append(test_score)

In [None]:
# Plot the test prediction on top of the actual target series
fig, ax = plt.subplots()
df['<Target Column>'].plot(ax=ax, label='Actual')

# pred_test is a plain array in X_test row order. Plotting it against y_test.index puts
# every prediction at the x-position of the observation it belongs to (plotting the bare
# array would start it at x=0). Markers without a connecting line stay correct even when
# the test rows are not contiguous.
ax.plot(y_test.index, pred_test, linestyle='none', marker='.', label='Baseline Prediction')

ax.set_xlabel('Index')
ax.set_ylabel('Target')
ax.set_title('Simple Baseline: test prediction')
ax.legend()  # without this call the labels above are never displayed
plt.show()

#### ARIMA Baseline
ARIMA is designed to handle univariate time series data and is effective for capturing and forecasting temporal patterns in a dataset. It is a versatile model that combines autoregression, differencing, and moving averages to make predictions.

In [None]:
# Name the model
models.append('ARIMA')

# Define the (p, d, q) order
order = (0,1,0)

# Define the ARIMA model on the complete target series
target = df['<Target Column>']
model = sm.tsa.arima.ARIMA(target, order=order)

# Fit the model
# NOTE(review): the model is fitted on the whole series, test period included, so the
# "test" predictions below are in-sample predictions — confirm this is the intended baseline.
results = model.fit()

# Get train and test prediction; both are addressed by row position in the series
pred_train = results.predict(end=train_size-1)
pred_test = results.predict(start=train_size)

# Ground truth for exactly the rows that were predicted. These chronological slices are
# used instead of y_train / y_test because the predictions come back in row order, while
# y_train / y_test only have that order if the split was made without shuffling.
arima_y_train = target.iloc[:train_size]
arima_y_test = target.iloc[train_size:]

# Compute the score
train_score = mean_absolute_error(arima_y_train, pred_train)
test_score = mean_absolute_error(arima_y_test, pred_test)

# Add score to list
train_scores.append(train_score)
test_scores.append(test_score)

In [None]:
# Plot the ARIMA prediction for the test period on top of the actual target series
fig, ax = plt.subplots()
# Column key fixed: the closing '>' was missing, which raised a KeyError
df['<Target Column>'].plot(ax=ax)
# Draw the prediction for rows train_size .. train_size + test_size - 1 (the test period)
plot_predict(results, train_size, train_size + test_size - 1, ax=ax)
ax.set_xlabel('Index')
ax.set_ylabel('Target')
ax.set_title('ARIMA: test prediction')
plt.show()

### Train and Evaluate ML Models
Train and evaluate different models with different hyperparameters.

#### Linear Regression

In [None]:
# Register the model name used in the comparison chart
models.append('Linear Regression')

# Ordinary least-squares regression, created and fitted on the train set in one step
# (fit returns the estimator itself)
lr = LinearRegression().fit(X_train, y_train)

# Predictions for the train and the test set
pred_train = lr.predict(X_train)
pred_test = lr.predict(X_test)

# Mean absolute error per split, stored next to the model name
train_score = mean_absolute_error(y_train, pred_train)
train_scores.append(train_score)

test_score = mean_absolute_error(y_test, pred_test)
test_scores.append(test_score)

In [None]:
# Plot the test prediction on top of the actual target series
fig, ax = plt.subplots()
# Column key fixed: the closing '>' was missing, which raised a KeyError
df['<Target Column>'].plot(ax=ax, label='Actual')

# pred_test is a plain array in X_test row order. Plotting it against y_test.index puts
# every prediction at the x-position of the observation it belongs to (plotting the bare
# array would start it at x=0). Markers without a connecting line stay correct even when
# the test rows are not contiguous.
ax.plot(y_test.index, pred_test, linestyle='none', marker='.', label='Linear Regression Prediction')

ax.set_xlabel('Index')
ax.set_ylabel('Target')
ax.set_title('Linear Regression: test prediction')
ax.legend()  # without this call the labels above are never displayed
plt.show()

#### Decision Tree

In [None]:
# Name the model
models.append('Decision Tree')

# Define a decision tree
# random_state is fixed so that repeated runs build the same tree: scikit-learn permutes
# the features at every split, so ties between equally good splits are otherwise broken
# differently from run to run.
# max_depth=None places no limit on the depth of the tree.
dt = DecisionTreeRegressor(criterion='squared_error', max_depth=None, random_state=42)

# Fit the model
dt.fit(X_train, y_train)

# Get train and test prediction
pred_train = dt.predict(X_train)
pred_test = dt.predict(X_test)

# Compute the score (mean absolute error)
train_score = mean_absolute_error(y_train, pred_train)
test_score = mean_absolute_error(y_test, pred_test)

# Add score to list
train_scores.append(train_score)
test_scores.append(test_score)

In [None]:
# Plot the test prediction on top of the actual target series
fig, ax = plt.subplots()
# Column key fixed: the closing '>' was missing, which raised a KeyError
df['<Target Column>'].plot(ax=ax, label='Actual')

# pred_test is a plain array in X_test row order. Plotting it against y_test.index puts
# every prediction at the x-position of the observation it belongs to (plotting the bare
# array would start it at x=0). Markers without a connecting line stay correct even when
# the test rows are not contiguous.
ax.plot(y_test.index, pred_test, linestyle='none', marker='.', label='Decision Tree Prediction')

ax.set_xlabel('Index')
ax.set_ylabel('Target')
ax.set_title('Decision Tree: test prediction')
ax.legend()  # without this call the labels above are never displayed
plt.show()

#### Random Forest

In [None]:
# Name the model
models.append('Random Forest')

# Define a random forest
# random_state is fixed so that the bootstrap samples, and with them the scores, are
# reproducible across runs.
rf = RandomForestRegressor(n_estimators=100, criterion='squared_error', max_depth=None, random_state=42)

# Fit the model
rf.fit(X_train, y_train)

# Get train and test prediction
# Fixed: these calls previously used the decision tree (dt), so the scores recorded under
# 'Random Forest' were in fact the decision tree's scores.
pred_train = rf.predict(X_train)
pred_test = rf.predict(X_test)

# Compute the score (mean absolute error)
train_score = mean_absolute_error(y_train, pred_train)
test_score = mean_absolute_error(y_test, pred_test)

# Add score to list
train_scores.append(train_score)
test_scores.append(test_score)

In [None]:
# Plot the test prediction on top of the actual target series
fig, ax = plt.subplots()
# Column key fixed: the closing '>' was missing, which raised a KeyError
df['<Target Column>'].plot(ax=ax, label='Actual')

# pred_test is a plain array in X_test row order. Plotting it against y_test.index puts
# every prediction at the x-position of the observation it belongs to (plotting the bare
# array would start it at x=0). Markers without a connecting line stay correct even when
# the test rows are not contiguous.
ax.plot(y_test.index, pred_test, linestyle='none', marker='.', label='Random Forest Prediction')

ax.set_xlabel('Index')
ax.set_ylabel('Target')
ax.set_title('Random Forest: test prediction')
ax.legend()  # without this call the labels above are never displayed
plt.show()

### Compare the Results

In [None]:
# Grouped bar chart: one group per model, train and test score side by side
x = np.arange(len(models))  # one x-position per model
width = 0.3                 # width of a single bar

fig, ax = plt.subplots()

# The two bars of a group are shifted left / right of the model's x-position
ax.bar(x - 0.17, train_scores, width, label='Train')
ax.bar(x + 0.17, test_scores, width, label='Test')

# Label every group with its model name
ax.set_xticks(x)
ax.set_xticklabels(models, rotation=45)

ax.set_xlabel('Models')
ax.set_ylabel('Scores')
ax.legend()
plt.show()