In [10]:
import os
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
%matplotlib inline

pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 200)

In [11]:
def list_directory_tree_with_os_walk(starting_directory):
    for root, directories, files in os.walk(starting_directory):
        print(f"Directory: {root}")
        for file in files:
            print(f"  File: {file}")

list_directory_tree_with_os_walk('.')

Directory: .
  File: make_org_datasets.ipynb
  File: test.csv
  File: feature_engineering.ipynb
  File: modelling.ipynb
  File: sample_submission.csv
Directory: ./make_org_datasets
  File: test.csv
  File: training_and_validation.csv
Directory: ./.venv
  File: pyvenv.cfg
  File: .gitignore
Directory: ./.venv/bin
  File: pip3.9
  File: pyftsubset
  File: jupyter-run
  File: Activate.ps1
  File: python3
  File: ttx
  File: easy_install
  File: python
  File: pip3
  File: ipython
  File: activate.fish
  File: easy_install-3.9
  File: fonttools
  File: python3.9
  File: f2py
  File: ipython3
  File: pip
  File: pip3.11
  File: jupyter-troubleshoot
  File: pygmentize
  File: jupyter-migrate
  File: activate
  File: jupyter-kernelspec
  File: jupyter-kernel
  File: jupyter
  File: pyftmerge
  File: activate.csh
Directory: ./.venv/include
Directory: ./.venv/lib
Directory: ./.venv/lib/python3.9
Directory: ./.venv/lib/python3.9/site-packages
  File: easy_install.py
  File: threadpoolctl.py
  Fi

In [12]:
train_a = pd.read_parquet('data/A/train_targets.parquet')
train_b = pd.read_parquet('data/B/train_targets.parquet')
train_c = pd.read_parquet('data/C/train_targets.parquet')

train_a['Location'] = 'A'
train_b['Location'] = 'B'
train_c['Location'] = 'C'

target = pd.concat([train_a, train_b, train_c], ignore_index=True)
target.reset_index(drop=True, inplace=True)
target.rename(columns={'time': 'date_forecast'}, inplace=True)

# One-hot encoding for locations
location_dummies = pd.get_dummies(target['Location'], prefix='Location')
location_dummies = location_dummies.astype(int)
target = pd.concat([target, location_dummies], axis=1)
target = target.drop('Location', axis=1)

In [13]:
locations = [('A', 'data/A/X_train_estimated.parquet'),
             ('B', 'data/B/X_train_estimated.parquet'),
             ('C', 'data/C/X_train_estimated.parquet')]

# Initialize an empty list to store the processed DataFrames
processed_dataframes = []

for location, file_path in locations:
    # Load the dataset
    df = pd.read_parquet(file_path)
    # Assuming 'date_forecast' is a datetime column
    df['hourly_timestamp'] = df['date_forecast'].dt.floor('H')
    # Group by 'hourly_timestamp' and calculate the mean for each group
    df = df.groupby('hourly_timestamp').mean().reset_index()
    # Drop the 'hourly_timestamp' column if you don't need it in the final DataFrame
    df.drop(columns=['hourly_timestamp'], inplace=True)
    df['date_forecast'] = df['date_forecast'].dt.floor('H')
    # Store the 'Location' information in a new column
    df['Location'] = location
    # Append the processed DataFrame to the list
    processed_dataframes.append(df)

# Concatenate all processed DataFrames into one
validation = pd.concat(processed_dataframes, ignore_index=True)

# One hot encoding for locations
location_dummies = pd.get_dummies(validation['Location'], prefix='Location')
location_dummies = location_dummies.astype(int)
validation = pd.concat([validation, location_dummies], axis=1)
validation = validation.drop('Location', axis=1)

In [14]:
locations = [('A', 'data/A/X_train_observed.parquet'),
             ('B', 'data/B/X_train_observed.parquet'),
             ('C', 'data/C/X_train_observed.parquet')]

# Initialize an empty list to store the processed DataFrames
processed_dataframes = []

for location, file_path in locations:
    # Load the dataset
    df = pd.read_parquet(file_path)
    # Assuming 'date_forecast' is a datetime column
    df['hourly_timestamp'] = df['date_forecast'].dt.floor('H')
    # Group by 'hourly_timestamp' and calculate the mean for each group
    df = df.groupby('hourly_timestamp').mean().reset_index()
    # Drop the 'hourly_timestamp' column if you don't need it in the final DataFrame
    df.drop(columns=['hourly_timestamp'], inplace=True)
    df['date_forecast'] = df['date_forecast'].dt.floor('H')
    # Store the 'Location' information in a new column
    df['Location'] = location
    # Append the processed DataFrame to the list
    processed_dataframes.append(df)

# Concatenate all processed DataFrames into one
training_set = pd.concat(processed_dataframes, ignore_index=True)

# One hot encoding for locations
location_dummies = pd.get_dummies(training_set['Location'], prefix='Location')
location_dummies = location_dummies.astype(int)
training_set = pd.concat([training_set, location_dummies], axis=1)
training_set = training_set.drop('Location', axis=1)

In [15]:
locations = [('A', 'data/A/X_test_estimated.parquet'),
             ('B', 'data/B/X_test_estimated.parquet'),
             ('C', 'data/C/X_test_estimated.parquet')]

# Initialize an empty list to store the processed DataFrames
processed_dataframes = []

for location, file_path in locations:
    # Load the dataset
    df = pd.read_parquet(file_path)
    # Assuming 'date_forecast' is a datetime column
    df['hourly_timestamp'] = df['date_forecast'].dt.floor('H')
    # Group by 'hourly_timestamp' and calculate the mean for each group
    df = df.groupby('hourly_timestamp').mean().reset_index()
    # Drop the 'hourly_timestamp' column if you don't need it in the final DataFrame
    df.drop(columns=['hourly_timestamp'], inplace=True)
    df['date_forecast'] = df['date_forecast'].dt.floor('H')
    # Store the 'Location' information in a new column
    df['Location'] = location
    # Append the processed DataFrame to the list
    processed_dataframes.append(df)

# Concatenate all processed DataFrames into one
test = pd.concat(processed_dataframes, ignore_index=True)

# One hot encoding for locations
location_dummies = pd.get_dummies(test['Location'], prefix='Location')
location_dummies = location_dummies.astype(int)
test = pd.concat([test, location_dummies], axis=1)
test = test.drop('Location', axis=1)

In [16]:
training = pd.concat([training_set, validation], axis=0)

training = training.merge(target[['Location_A', 'Location_B', 'Location_C', 'date_forecast', 'pv_measurement']], 
                        how='left', 
                        on=['Location_A', 'Location_B', 'Location_C', 'date_forecast']) 
                        #suffixes=('', '_target'))

training = training.dropna(subset=['pv_measurement'])

In [17]:
# Create the 'generated_datasets' folder if it doesn't exist
output_folder = 'make_org_datasets'
os.makedirs(output_folder, exist_ok=True)

test_file = os.path.join(output_folder, 'test.csv')
test.to_csv(test_file, index=False)

training_and_validation_file = os.path.join(output_folder, 'training_and_validation.csv')
training.to_csv(training_and_validation_file, index=False)

#training_file = os.path.join(output_folder, 'training.csv')
#training_set.to_csv(training_file, index=False)

#validation_file = os.path.join(output_folder, 'validation.csv')
#validation.to_csv(validation_file, index=False)

### Code to get the generated dataframes

In [18]:
test = pd.read_csv('make_org_datasets/test.csv')
#validation = pd.read_csv('generated_datasets/validation.csv')
#training = pd.read_csv('generated_datasets/training.csv')
training_and_validation = pd.read_csv('make_org_datasets/training_and_validation.csv')

  training_and_validation = pd.read_csv('make_org_datasets/training_and_validation.csv')
