In [1]:
using Pkg
Pkg.activate("."); Pkg.instantiate()
Pkg.precompile()

using CSV
using DataFrames
using Plots
using Glob
using Dates

[32m[1m  Activating[22m[39m project at `~/proyectos/TFG-ProduccionElectrica/codigo/Julia`


In [2]:
filepath = "../data/predict-energy-behavior-of-prosumers"
files = glob("*.csv", filepath)

8-element Vector{String}:
 "../data/predict-energy-behavior-of-prosumers/client.csv"
 "../data/predict-energy-behavior-of-prosumers/county_lon_lats.csv"
 "../data/predict-energy-behavior-of-prosumers/electricity_prices.csv"
 "../data/predict-energy-behavior-of-prosumers/forecast_weather.csv"
 "../data/predict-energy-behavior-of-prosumers/gas_prices.csv"
 "../data/predict-energy-behavior-of-prosumers/historical_weather.csv"
 "../data/predict-energy-behavior-of-prosumers/train.csv"
 "../data/predict-energy-behavior"[93m[1m ⋯ 20 bytes ⋯ [22m[39m"r_station_to_county_mapping.csv"

In [3]:
dfs = Dict(
    splitext(basename(archivo))[1] => CSV.read(archivo, DataFrame)
    for archivo in files
)
dfs["weather_station"] = dfs["weather_station_to_county_mapping"]
delete!(dfs, "weather_station_to_county_mapping")

Dict{String, DataFrame} with 8 entries:
  "historical_weather" => [1m1710802×18 DataFrame[0m[0m…
  "electricity_prices" => [1m15286×4 DataFrame[0m[0m…
  "weather_station"    => [1m112×4 DataFrame[0m[0m…
  "client"             => [1m41919×7 DataFrame[0m[0m…
  "train"              => [1m2018352×9 DataFrame[0m[0m…
  "forecast_weather"   => [1m3424512×18 DataFrame[0m[0m…
  "county_lon_lats"    => [1m75×4 DataFrame[0m[0m…
  "gas_prices"         => [1m637×5 DataFrame[0m[0m…

In [15]:
struct DataProcessor
    gas_join::Vector{String}
    ep_join::Vector{String}
    hw_join::Vector{String}
    client_join::Vector{String}
    lat_lon::Vector{String}
end

function DataProcessor()
    DataProcessor(
        ["data_block_id"],
        ["datetime", "data_block_id"],
        ["datetime", "county", "data_block_id"],
        ["county", "is_business", "product_type", "data_block_id"],
        ["latitude", "longitude"]
    )
end

function change_names(df::DataFrame, suffix::AbstractString, no_change::Vector{String})
    renamed_cols = [
        col in no_change ? col : col * suffix for col in names(df)
    ]

    renamed_df = rename(df, names(df) .=> renamed_cols)
    return renamed_df
end

function to_datetime(df::DataFrame, col::AbstractString)
    new_df = deepcopy(df)
    formato = DateFormat("yyyy-mm-dd HH:MM:SS")

    new_df[!, col] .= DateTime.(new_df[!, col], formato)
    return new_df
end

function get_client_features(client_df::DataFrame, client_join::Vector{String})
    return change_names(client_df, "_client", client_join)
end

function get_gas_features(gas_df::DataFrame, gas_join::Vector{String})
    df = deepcopy(gas_df)

    df[!, "mean"] = (df.highest_price_per_mwh + df.lowest_price_per_mwh) / 2

    df = change_names(df, "_gas", gas_join)

    return df
end

function get_electricity_features(ep_df::DataFrame, ep_join::Vector{String})
    df = to_datetime(ep_df, "forecast_date")
    df = rename(df, :forecast_date => :datetime)
    df = change_names(df, "_ep", ep_join)
    return df
end

function get_data_features(data::DataFrame)
    df = to_datetime(data, "datetime")
    df.date = Date.(df.datetime)
    df.year = year.(df.datetime)
    df.quarter = quarterofyear.(df.datetime)
    df.month = month.(df.datetime)
    df.week = week.(df.datetime)
    df.hour = hour.(df.datetime)
    df.day_of_year = dayofyear.(df.datetime)
    df.day_of_month = dayofmonth.(df.datetime)
    df.day_of_week = dayofweek.(df.datetime)

    return df
end

function process_data(dp::DataProcessor, df_dict::Dict)
    data = get_data_features(df_dict["train"])
    client = get_client_features(df_dict["client"], dp.client_join)
    gas = get_gas_features(df_dict["gas_prices"], dp.gas_join)
    electricity = get_electricity_features(df_dict["electricity_prices"], dp.ep_join)

    joins = [dp.client_join, dp.ep_join, dp.gas_join]
    df = leftjoin(data, gas, client, electricity on=joins)
    
    return df
end

Base.Meta.ParseError: ParseError:
# Error @ /home/pablo/proyectos/TFG-ProduccionElectrica/codigo/Julia/jl_notebook_cell_df34fa98e69747e1a8f8a730347b8e2f_W3sZmlsZQ==.jl:79:50
    joins = [dp.client_join, dp.ep_join, dp.gas_join]
    df = leftjoin(data, gas, client, electricity on=joins)
#                                                └──────┘ ── Expected `)`

In [13]:
processor = DataProcessor()
process_data(processor, dfs)

Row,county,is_business,product_type,target,is_consumption,datetime,data_block_id,row_id,prediction_unit_id,date,year,quarter,month,week,hour,day_of_year,day_of_month,day_of_week,forecast_date_gas,lowest_price_per_mwh_gas,highest_price_per_mwh_gas,origin_date_gas,mean_gas
Unnamed: 0_level_1,Int64,Int64,Int64,Float64?,Int64,DateTime,Int64,Int64,Int64,Date,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Date?,Float64?,Float64?,Date?,Float64?
1,0,0,1,1.687,0,2021-09-02T00:00:00,1,2928,0,2021-09-02,2021,3,9,35,0,245,2,4,2021-09-01,45.23,46.32,2021-08-31,45.775
2,0,0,1,109.366,1,2021-09-02T00:00:00,1,2929,0,2021-09-02,2021,3,9,35,0,245,2,4,2021-09-01,45.23,46.32,2021-08-31,45.775
3,0,0,2,0.0,0,2021-09-02T00:00:00,1,2930,1,2021-09-02,2021,3,9,35,0,245,2,4,2021-09-01,45.23,46.32,2021-08-31,45.775
4,0,0,2,21.008,1,2021-09-02T00:00:00,1,2931,1,2021-09-02,2021,3,9,35,0,245,2,4,2021-09-01,45.23,46.32,2021-08-31,45.775
5,0,0,3,1.003,0,2021-09-02T00:00:00,1,2932,2,2021-09-02,2021,3,9,35,0,245,2,4,2021-09-01,45.23,46.32,2021-08-31,45.775
6,0,0,3,735.696,1,2021-09-02T00:00:00,1,2933,2,2021-09-02,2021,3,9,35,0,245,2,4,2021-09-01,45.23,46.32,2021-08-31,45.775
7,0,1,0,0.0,0,2021-09-02T00:00:00,1,2934,3,2021-09-02,2021,3,9,35,0,245,2,4,2021-09-01,45.23,46.32,2021-08-31,45.775
8,0,1,0,75.8,1,2021-09-02T00:00:00,1,2935,3,2021-09-02,2021,3,9,35,0,245,2,4,2021-09-01,45.23,46.32,2021-08-31,45.775
9,0,1,1,0.0,0,2021-09-02T00:00:00,1,2936,4,2021-09-02,2021,3,9,35,0,245,2,4,2021-09-01,45.23,46.32,2021-08-31,45.775
10,0,1,1,502.241,1,2021-09-02T00:00:00,1,2937,4,2021-09-02,2021,3,9,35,0,245,2,4,2021-09-01,45.23,46.32,2021-08-31,45.775


In [70]:
as = rename(
    dfs["electricity_prices"],
    :forecast_date => :datetime
);

formato = DateFormat("yyyy-mm-dd HH:MM:SS");

as.datetime = DateTime.(as.datetime, formato);
as.date = Date.(as.datetime)
as.year = year.(as.datetime)
as.quarter = quarterofyear.(as.datetime)
as.month = month.(as.datetime)
as.day_of_year = dayofyear.(as.datetime)
as.day_of_month = dayofmonth.(as.datetime)
as.day_of_week = dayofweek.(as.datetime)
as.week = week.(as.datetime)
as.hour = hour.(as.datetime)

as

Row,datetime,euros_per_mwh,origin_date,data_block_id,date,year,quarter,month,day_of_year,day_of_month,day_of_week,week,hour
Unnamed: 0_level_1,DateTime,Float64,String31,Int64,Date,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64
1,2021-09-01T00:00:00,92.51,2021-08-31 00:00:00,1,2021-09-01,2021,3,9,244,1,3,35,0
2,2021-09-01T01:00:00,88.9,2021-08-31 01:00:00,1,2021-09-01,2021,3,9,244,1,3,35,1
3,2021-09-01T02:00:00,87.35,2021-08-31 02:00:00,1,2021-09-01,2021,3,9,244,1,3,35,2
4,2021-09-01T03:00:00,86.88,2021-08-31 03:00:00,1,2021-09-01,2021,3,9,244,1,3,35,3
5,2021-09-01T04:00:00,88.43,2021-08-31 04:00:00,1,2021-09-01,2021,3,9,244,1,3,35,4
6,2021-09-01T05:00:00,93.58,2021-08-31 05:00:00,1,2021-09-01,2021,3,9,244,1,3,35,5
7,2021-09-01T06:00:00,118.7,2021-08-31 06:00:00,1,2021-09-01,2021,3,9,244,1,3,35,6
8,2021-09-01T07:00:00,135.44,2021-08-31 07:00:00,1,2021-09-01,2021,3,9,244,1,3,35,7
9,2021-09-01T08:00:00,140.72,2021-08-31 08:00:00,1,2021-09-01,2021,3,9,244,1,3,35,8
10,2021-09-01T09:00:00,130.0,2021-08-31 09:00:00,1,2021-09-01,2021,3,9,244,1,3,35,9


In [64]:
as2 = copy(dfs["gas_prices"])

Row,forecast_date,lowest_price_per_mwh,highest_price_per_mwh,origin_date,data_block_id
Unnamed: 0_level_1,DateTime,Float64,Float64,Date,Int64
1,2021-09-01T00:00:00,45.23,46.32,2021-08-31,1
2,2021-09-02T00:00:00,45.62,46.29,2021-09-01,2
3,2021-09-03T00:00:00,45.85,46.4,2021-09-02,3
4,2021-09-04T00:00:00,46.3,46.8,2021-09-03,4
5,2021-09-05T00:00:00,46.3,46.58,2021-09-04,5
6,2021-09-06T00:00:00,46.17,46.95,2021-09-05,6
7,2021-09-07T00:00:00,46.35,47.6,2021-09-06,7
8,2021-09-08T00:00:00,46.4,47.64,2021-09-07,8
9,2021-09-09T00:00:00,44.96,47.72,2021-09-08,9
10,2021-09-10T00:00:00,47.82,48.29,2021-09-09,10


In [None]:
as2[!, "mean"] = (as2.lowest_price_per_mwh .+ as2.highest_price_per_mwh) ./ 2

637-element Vector{Float64}:
 45.775
 45.955
 46.125
 46.55
 46.44
 46.56
 46.975
 47.019999999999996
 46.34
 48.055
  ⋮
 37.105
 34.7
 35.260000000000005
 32.5
 31.6
 31.200000000000003
 31.1
 32.57
 31.5

In [66]:
as2

Row,forecast_date,lowest_price_per_mwh,highest_price_per_mwh,origin_date,data_block_id,mean
Unnamed: 0_level_1,DateTime,Float64,Float64,Date,Int64,Float64
1,2021-09-01T00:00:00,45.23,46.32,2021-08-31,1,45.775
2,2021-09-02T00:00:00,45.62,46.29,2021-09-01,2,45.955
3,2021-09-03T00:00:00,45.85,46.4,2021-09-02,3,46.125
4,2021-09-04T00:00:00,46.3,46.8,2021-09-03,4,46.55
5,2021-09-05T00:00:00,46.3,46.58,2021-09-04,5,46.44
6,2021-09-06T00:00:00,46.17,46.95,2021-09-05,6,46.56
7,2021-09-07T00:00:00,46.35,47.6,2021-09-06,7,46.975
8,2021-09-08T00:00:00,46.4,47.64,2021-09-07,8,47.02
9,2021-09-09T00:00:00,44.96,47.72,2021-09-08,9,46.34
10,2021-09-10T00:00:00,47.82,48.29,2021-09-09,10,48.055
