In [15]:
import os
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
%matplotlib inline

# Let pandas show up to 200 rows and 200 columns before it truncates a frame.
for option_name in ('display.max_rows', 'display.max_columns'):
    pd.set_option(option_name, 200)

In [16]:
def list_directory_tree_with_os_walk(starting_directory):
    """Print every directory under ``starting_directory`` and the files it holds.

    Directories and file names are listed in sorted order so the printed tree
    is the same on every run and every filesystem (``os.walk`` itself yields
    entries in arbitrary, OS-dependent order).

    Args:
        starting_directory: Path of the directory to start walking from.

    Returns:
        None. The listing is written to standard output.
    """
    for root, directories, files in os.walk(starting_directory):
        # Sorting in place also fixes the order in which os.walk descends
        # into the sub-directories (top-down walks honour in-place edits).
        directories.sort()
        print(f"Directory: {root}")
        for file in sorted(files):
            print(f"  File: {file}")

# Print the directories and files found below the notebook's working directory.
list_directory_tree_with_os_walk('.')

Directory: .
  File: modelling_oneLoc.ipynb
  File: my_first_submission.csv
  File: .DS_Store
  File: explore_locA.ipynb
  File: test.csv
  File: Readme.md
  File: modelling_allLoc.ipynb
  File: sample_submission.csv
  File: read_files.ipynb
Directory: ./A
  File: X_train_observed.parquet
  File: train_targets.parquet
  File: X_train_estimated.parquet
  File: X_test_estimated.parquet
Directory: ./C
  File: X_train_observed.parquet
  File: train_targets.parquet
  File: X_train_estimated.parquet
  File: X_test_estimated.parquet
Directory: ./B
  File: X_train_observed.parquet
  File: train_targets.parquet
  File: X_train_estimated.parquet
  File: X_test_estimated.parquet


In [17]:
# Target tables, one per location folder (A, B, C).
train_a, train_b, train_c = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        'A/train_targets.parquet',
        'B/train_targets.parquet',
        'C/train_targets.parquet',
    )
)

In [18]:
# "Estimated" training features, one table per location folder (A, B, C).
X_train_estimated_a, X_train_estimated_b, X_train_estimated_c = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        'A/X_train_estimated.parquet',
        'B/X_train_estimated.parquet',
        'C/X_train_estimated.parquet',
    )
)

In [19]:
# "Observed" training features, one table per location folder (A, B, C).
X_train_observed_a, X_train_observed_b, X_train_observed_c = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        'A/X_train_observed.parquet',
        'B/X_train_observed.parquet',
        'C/X_train_observed.parquet',
    )
)

In [20]:
# "Estimated" test features, one table per location folder (A, B, C).
X_test_estimated_a, X_test_estimated_b, X_test_estimated_c = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        'A/X_test_estimated.parquet',
        'B/X_test_estimated.parquet',
        'C/X_test_estimated.parquet',
    )
)

In [21]:
def _hourly_mean(frame):
    """Collapse a feature frame to one row per hour of ``date_forecast``.

    Rows are bucketed by ``date_forecast`` floored to the hour and every
    column is averaged within its bucket. The averaged ``date_forecast`` is
    then floored again so it carries the bucket's hour rather than the mean
    timestamp of the rows inside it.

    Args:
        frame: DataFrame with a datetime ``date_forecast`` column.

    Returns:
        A new DataFrame with the same columns, one row per hour.
    """
    per_hour = (
        frame
        .assign(hourly_timestamp=frame['date_forecast'].dt.floor('H'))
        .groupby('hourly_timestamp')
        .mean()
        .reset_index()
        # The grouping key was only a helper; date_forecast carries the hour.
        .drop(columns=['hourly_timestamp'])
    )
    per_hour['date_forecast'] = per_hour['date_forecast'].dt.floor('H')
    return per_hour


# Bring all three location-A feature tables to hourly resolution.
X_train_estimated_a = _hourly_mean(X_train_estimated_a)
X_train_observed_a = _hourly_mean(X_train_observed_a)
X_test_estimated_a = _hourly_mean(X_test_estimated_a)

In [22]:
# The target table names its timestamp 'time'; rename it in place so it shares
# the 'date_forecast' key with the feature tables.
train_a.rename(columns={'time': 'date_forecast'}, inplace=True)

# Attach the measured target to each row of the estimated features (left join
# keeps every feature row, matched or not).
validation = pd.merge(
    X_train_estimated_a,
    train_a[['date_forecast', 'pv_measurement']],
    how='left',
    on='date_forecast',
    suffixes=('', '_target'),
)

# Rows without a target are discarded rather than filled with 0.
validation = validation[validation['pv_measurement'].notna()]

In [23]:
# Make sure the target table uses 'date_forecast' as its timestamp column name
# (a no-op when the column has already been renamed).
train_a.rename(columns={'time': 'date_forecast'}, inplace=True)

# Attach the measured target to each row of the observed features (left join
# keeps every feature row, matched or not).
training = pd.merge(
    X_train_observed_a,
    train_a[['date_forecast', 'pv_measurement']],
    how='left',
    on='date_forecast',
    suffixes=('', '_target'),
)

# Rows without a target are discarded rather than filled with 0.
training = training[training['pv_measurement'].notna()]

In [24]:
# Step 1: Split your data into features (X) and the target variable (y)
X_train = training.drop(columns=['pv_measurement'])  # Exclude the target column
X_train = X_train.drop(columns=['date_forecast'])
X_train.fillna(0, inplace=True)  # Fill NaN values with 0 in X_train
y_train = training['pv_measurement']

X_val = validation.drop(columns=['pv_measurement'])  # Exclude the target column
X_val = X_val.drop(columns=['date_forecast'])
X_val = X_val.drop(columns=['date_calc'])
X_val.fillna(0, inplace=True)  # Fill NaN values with 0 in X_val
y_val = validation['pv_measurement']

In [25]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Step 2: fit a random forest with 10 trees and a fixed seed (both can be tuned)
rf_model = RandomForestRegressor(n_estimators=10, random_state=10).fit(X_train, y_train)

# Step 3: predict on the validation features and score with mean squared error
y_pred = rf_model.predict(X_val)
mse = mean_squared_error(y_val, y_pred)
print(f'Mean Squared Error on validation data: {mse:.2f}')

Mean Squared Error on validation data: 123092.49


In [26]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Fit a linear regression on the training features
model = LinearRegression().fit(X_train, y_train)

# Predict on the validation features (not the test set)
predictions = model.predict(X_val)

# Score the predictions with mean squared error
mse = mean_squared_error(y_val, predictions)
print(f"Mean Squared Error: {mse}")

Mean Squared Error: 141436.63295475199
