In [None]:
# Setup - import basic data libraries
import numpy as np
import pandas as pd

## Task Group 1 - Load, Inspect, and Merge Datasets

### Task 1
The file 'datasets/EV charging reports.csv' contains electric vehicle (EV) charging data. These come from various residential apartment buildings in Norway. The data includes specific user and garage information, plug-in and plug-out times, charging loads, and the dates of the charging sessions.

Import this CSV file to a pandas DataFrame named `ev_charging_reports`.

Use the `.head()` method to preview the first five rows.

In [None]:
# Load the EV charging session reports from CSV into a DataFrame.
ev_charging_reports = pd.read_csv('datasets/EV charging reports.csv')

# Preview the first five rows. Leaving the expression bare (instead of wrapping
# it in print) lets the notebook render it as a formatted table.
ev_charging_reports.head()

### Task 2
Import the file 'datasets/Local traffic distribution.csv' to a pandas DataFrame named `traffic_reports`. This dataset contains the hourly local traffic density counts at 5 nearby traffic locations.

Preview the first five rows.

In [None]:
# Load the hourly local traffic counts from CSV into a DataFrame.
traffic_reports = pd.read_csv('datasets/Local traffic distribution.csv')

# Preview the first five rows. Leaving the expression bare (instead of wrapping
# it in print) lets the notebook render it as a formatted table.
traffic_reports.head()

### Task 3
Merge the `ev_charging_reports` and `traffic_reports` datasets together into a DataFrame named `ev_charging_traffic` using the columns:
- `Start_plugin_hour` in `ev_charging_reports`
- `Date_from` in `traffic_reports`

In [None]:
# Join each charging session to the traffic row whose `Date_from` equals the
# session's `Start_plugin_hour`. The join is an inner join (the pandas default),
# so sessions without a matching traffic row are not carried over.
ev_charging_traffic = pd.merge(
    ev_charging_reports,
    traffic_reports,
    how='inner',
    left_on='Start_plugin_hour',
    right_on='Date_from',
)

### Task 4
Use `.info()` to inspect the merged dataset. Specifically, pay attention to the data types and number of missing values in each column.

In [None]:
# DataFrame.info() writes its report (dtypes, non-null counts, memory usage)
# straight to stdout and returns None, so it is called directly; wrapping it in
# print() would append a stray "None" line to the output.
ev_charging_traffic.info()

## Task Group 2 - Data Cleaning and Preparation

### Task 5
Drop the columns that won't be used for training from `ev_charging_traffic`, and store the result in a new DataFrame named `ev_charging_traffic_hourly`.

In [None]:
# Columns excluded from training (IDs, category labels, and plug-in/plug-out
# and traffic timestamps, judging by their names).
unused_columns = [
    'session_ID', 'Garage_ID', 'User_ID', 'Shared_ID',
    'Plugin_category', 'Duration_category',
    'Start_plugin', 'Start_plugin_hour',
    'End_plugout', 'End_plugout_hour',
    'Date_from', 'Date_to',
]

# drop() returns a new frame, so the merged `ev_charging_traffic` stays intact.
ev_charging_traffic_hourly = ev_charging_traffic.drop(columns=unused_columns)

### Task 6
Replace commas with periods in the `El_kWh` and `Duration_hours` columns.

In [None]:
# These two columns are stored as text with a comma as the decimal separator;
# swap it for a period so the values can be parsed as floats later.
for column in ('El_kWh', 'Duration_hours'):
    ev_charging_traffic_hourly[column] = (
        ev_charging_traffic_hourly[column].str.replace(',', '.', regex=False)
    )

### Task 7
Convert the data types of all the columns of `ev_charging_traffic_hourly` to floats.

In [None]:
# Cast every remaining column to 64-bit float so the whole frame is numeric.
# NOTE(review): astype raises if any column still holds text that cannot be
# parsed as a number — confirm all remaining columns are numeric-like.
ev_charging_traffic_hourly = ev_charging_traffic_hourly.astype('float64')

## Task Group 3 - Train Test Split

### Task 8
Create two datasets from `ev_charging_traffic_hourly`:
- `X` contains only the input numerical features
- `y` contains only the target column `El_kWh`

In [None]:
# Separate the regression target from the input features.
target_column = 'El_kWh'

# X: every column except the target; y: the target column on its own.
X = ev_charging_traffic_hourly.drop(columns=[target_column])
y = ev_charging_traffic_hourly[target_column]

### Task 9
Use sklearn to split X and y into training and testing datasets. The training set should use 80% of the data. Set the random_state parameter to 2.

In [None]:
from sklearn.model_selection import train_test_split

# 80% of the rows go to training and 20% to testing; the fixed random_state
# makes the shuffle (and therefore the split) reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=2,
)

## Task Group 4 - Linear Regression Baseline

### Task 10
Use Scikit-learn to train a Linear Regression model using the training data to predict EV charging loads.

In [None]:
from sklearn.linear_model import LinearRegression

# Fit an ordinary least-squares baseline on the training split.
# fit() returns the estimator itself, so construction and fitting can be chained.
linear_model = LinearRegression().fit(X_train, y_train)

# Show the fitted estimator as the cell output.
linear_model

### Task 11
Print the coefficients and intercept of the model.

In [None]:
# Report the learned parameters: one coefficient per input feature (in the
# column order of X_train) plus the intercept term.
print(f'Coefficients: {linear_model.coef_}')
print(f'Intercept: {linear_model.intercept_}')

### Task 12
Evaluate the performance of the model on the test set using Root Mean Squared Error (RMSE) and R-squared metrics.

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Predict on the held-out test split and score those predictions.
y_pred = linear_model.predict(X_test)

# R-squared of the predictions against the true test targets.
r2 = r2_score(y_test, y_pred)

# RMSE: square root of the mean squared error, which puts the error back in
# the same units as the target column.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f'RMSE: {rmse}')
print(f'R-squared: {r2}')