In [1]:
using Pkg
Pkg.activate(".")

using DataFrames, Statistics, CSV, Markdown, CategoricalArrays

[32m[1m  Activating[22m[39m project at `d:\Clases\4o Curso\2o Cuatrimestre\TFG\TFG-ProduccionElectrica\codigo\Julia`


In [2]:
"""
    md(s)

Hand `s` to `display` under the `"text/markdown"` MIME type, so that a front end
able to show that MIME type (presumably the notebook) renders it as Markdown.
"""
function md(s)
    return display("text/markdown", s)
end

md (generic function with 1 method)

In [3]:
# Directory that holds the CSV inputs (relative to the current working directory).
data_path = "../data"

# List the CSV files found directly inside `data_path`.
filter(endswith(".csv"), readdir(data_path))

6-element Vector{String}:
 "client.csv"
 "electricity_prices.csv"
 "gas_prices.csv"
 "historical_weather.csv"
 "train.csv"
 "weather_station_to_county_mapping.csv"

# Carga de los datos

In [4]:
"""
    load_dict(path::AbstractString) -> Dict{String, DataFrame}

Read every `.csv` file found directly inside `path` into a `DataFrame` and return
the tables in a dictionary.

Each table is keyed by its file name without the extension, except that any file
whose name contains `"weather_station"` is stored under the short key
`"weather_station"`. If several files map to the same key, the one read last
overwrites the earlier ones.

# Arguments
- `path`: directory to scan; it is not searched recursively.

# Returns
A `Dict{String, DataFrame}`; empty when `path` contains no `.csv` file.
"""
function load_dict(path::AbstractString)::Dict{String, DataFrame}
    # Local helper instead of a `global` assigned inside the generator filter:
    # no state leaks out of the function and the key is always a `String`.
    table_key(file) =
        let stem = first(splitext(file))
            occursin("weather_station", stem) ? "weather_station" : stem
        end

    # The typed constructor keeps the result well-typed even for an empty directory.
    return Dict{String, DataFrame}(
        table_key(file) => CSV.read(joinpath(path, file), DataFrame)
        for file in readdir(path) if endswith(file, ".csv")
    )
end

load_dict (generic function with 1 method)

"""
    load_data(path::AbstractString) -> Dict{String, DataFrame}

Loop-based loader: read every `.csv` file found directly inside `path` into a
`DataFrame` and collect the tables in a dictionary.

Each table is keyed by its file name without the extension, except that any file
whose name contains `"weather_station"` is stored under the short key
`"weather_station"`. A later file that maps to an existing key replaces it.

# Arguments
- `path`: directory to scan. Any `AbstractString` is accepted, matching the
  signature of `load_dict`.

# Returns
A `Dict{String, DataFrame}`; empty when `path` contains no `.csv` file.
"""
function load_data(path::AbstractString)::Dict{String, DataFrame}
    data = Dict{String, DataFrame}()

    for file in readdir(path)
        # Skip anything that is not a CSV file.
        endswith(file, ".csv") || continue

        stem = first(splitext(file))
        # Collapse the long weather-station mapping file name into a short key.
        key = occursin("weather_station", stem) ? "weather_station" : stem

        data[key] = CSV.read(joinpath(path, file), DataFrame)
    end

    return data
end

In [5]:
# Load every CSV file under `data_path` into a Dict of DataFrames keyed by table name.
data_dict = load_dict(data_path)

Dict{String, DataFrame} with 6 entries:
  "historical_weather" => 1710802×18 DataFrame…
  "electricity_prices" => 15286×4 DataFrame…
  "weather_station"    => 112×4 DataFrame…
  "client"             => 41919×7 DataFrame…
  "train"              => 2018352×9 DataFrame…
  "gas_prices"         => 637×5 DataFrame…

# Descripción de los datos

In [6]:
"""
    to_categorical!(df::DataFrame, cols::AbstractVector{<:AbstractString})

Replace, in place, each column of `df` whose name appears in `cols` with a
`CategoricalArray` built from its current values.

Names in `cols` that are not columns of `df` are skipped silently, so the same
list can be applied to tables with different schemas.

# Arguments
- `df`: table to modify.
- `cols`: names of the columns to convert.

# Returns
`nothing`; `df` is mutated.
"""
function to_categorical!(df::DataFrame, cols::AbstractVector{<:AbstractString})
    # Build the lookup once instead of calling `names(df)` on every iteration.
    present = Set(names(df))

    for col in cols
        col in present || continue
        df[!, col] = CategoricalArray(df[!, col])
    end

    return nothing
end

to_categorical! (generic function with 1 method)

In [7]:
# Names of the columns to convert to categorical. The list is applied to every
# loaded table; `to_categorical!` skips the names a given table does not have.
categorical_columns = [
    "county",
    "product_type",
    "is_business",
    "county_name",
    "is_consumption"
] 

5-element Vector{String}:
 "county"
 "product_type"
 "is_business"
 "county_name"
 "is_consumption"

In [8]:
# Convert the categorical columns of every loaded table in place.
foreach(values(data_dict)) do table
    to_categorical!(table, categorical_columns)
end

In [43]:
"""
    print_stats(df::DataFrame)

Show the `describe(df)` summary as Markdown: for each described variable, its
name in bold followed by one `name: value` line per remaining summary column.
"""
function print_stats(df::DataFrame)
    stats_table = describe(df)

    foreach(eachrow(stats_table)) do entry
        md("**$(entry.variable)**")

        # Every column after the first one holds a statistic of the variable.
        foreach(Iterators.drop(names(entry), 1)) do stat
            md("\t$stat: $(entry[stat])")
        end
    end
end

print_stats (generic function with 1 method)

In [46]:
"""
    display_stats(df::DataFrame, name::String)

Show an overview of `df` under a Markdown heading built from `name`: the first
five rows, the table's shape, and the `describe(df)` summary, in that order.
"""
function display_stats(df::DataFrame, name::String)
    # Data visualization
    md("### **$name**")

    preview = first(df, 5)
    display(preview)

    dims = size(df)
    md("shape: $(dims)")

    summary_table = describe(df)
    return display(summary_table)
end

display_stats (generic function with 1 method)

In [47]:
# Show the preview and summary of each loaded table, one section per table.
for (key, table) in data_dict
    display_stats(table, key)
end

### **historical_weather**

Row,datetime,temperature,dewpoint,rain,snowfall,surface_pressure,cloudcover_total,cloudcover_low,cloudcover_mid,cloudcover_high,windspeed_10m,winddirection_10m,shortwave_radiation,direct_solar_radiation,diffuse_radiation,latitude,longitude,data_block_id
Unnamed: 0_level_1,String31,Float64,Float64,Float64,Float64,Float64,Int64,Int64,Int64,Int64,Float64,Int64,Float64,Float64,Float64,Float64,Float64,Float64
1,2021-09-01 00:00:00,14.2,11.6,0.0,0.0,1015.9,31,31,0,11,7.08333,8,0.0,0.0,0.0,57.6,21.7,1.0
2,2021-09-01 00:00:00,13.9,11.5,0.0,0.0,1010.7,33,37,0,0,5.11111,359,0.0,0.0,0.0,57.6,22.2,1.0
3,2021-09-01 00:00:00,14.0,12.5,0.0,0.0,1015.0,31,34,0,0,6.33333,355,0.0,0.0,0.0,57.6,22.7,1.0
4,2021-09-01 00:00:00,14.6,11.5,0.0,0.0,1017.3,0,0,0,0,8.08333,297,358.0,277.0,81.0,57.6,23.2,1.0
5,2021-09-01 00:00:00,15.7,12.9,0.0,0.0,1014.0,22,25,0,0,8.41667,5,0.0,0.0,0.0,57.6,23.7,1.0


shape: (1710802, 18)

Row,variable,mean,min,median,max,nmissing,eltype
Unnamed: 0_level_1,Symbol,Union…,Any,Union…,Any,Int64,DataType
1,datetime,,2021-09-01 00:00:00,,2023-05-30 10:00:00,0,String31
2,temperature,5.74097,-23.7,5.1,32.6,0,Float64
3,dewpoint,2.24031,-25.9,1.7,23.8,0,Float64
4,rain,0.0496201,0.0,0.0,16.8,0,Float64
5,snowfall,0.016049,0.0,0.0,2.66,0,Float64
6,surface_pressure,1009.28,942.9,1010.4,1049.3,0,Float64
7,cloudcover_total,60.9127,0,72.0,100,0,Int64
8,cloudcover_low,46.6859,0,39.0,100,0,Int64
9,cloudcover_mid,34.407,0,16.0,100,0,Int64
10,cloudcover_high,36.0514,0,10.0,100,0,Int64


### **electricity_prices**

Row,forecast_date,euros_per_mwh,origin_date,data_block_id
Unnamed: 0_level_1,String31,Float64,String31,Int64
1,2021-09-01 00:00:00,92.51,2021-08-31 00:00:00,1
2,2021-09-01 01:00:00,88.9,2021-08-31 01:00:00,1
3,2021-09-01 02:00:00,87.35,2021-08-31 02:00:00,1
4,2021-09-01 03:00:00,86.88,2021-08-31 03:00:00,1
5,2021-09-01 04:00:00,88.43,2021-08-31 04:00:00,1


shape: (15286, 4)

Row,variable,mean,min,median,max,nmissing,eltype
Unnamed: 0_level_1,Symbol,Union…,Any,Union…,Any,Int64,DataType
1,forecast_date,,2021-09-01 00:00:00,,2023-05-30 23:00:00,0,String31
2,euros_per_mwh,157.064,-10.06,128.28,4000.0,0,Float64
3,origin_date,,2021-08-31 00:00:00,,2023-05-29 23:00:00,0,String31
4,data_block_id,318.991,1,319.0,637,0,Int64


### **weather_station**

Row,county_name,longitude,latitude,county
Unnamed: 0_level_1,Cat…?,Float64,Float64,Cat…?
1,missing,21.7,57.6,missing
2,missing,21.7,57.9,missing
3,missing,21.7,58.2,missing
4,missing,21.7,58.5,missing
5,missing,21.7,58.8,missing


shape: (112, 4)

Row,variable,mean,min,median,max,nmissing,eltype
Unnamed: 0_level_1,Symbol,Union…,Any,Union…,Any,Int64,Type
1,county_name,,Harjumaa,,Võrumaa,63,"Union{Missing, CategoricalValue{String15, UInt32}}"
2,longitude,24.95,21.7,24.95,28.2,0,Float64
3,latitude,58.65,57.6,58.65,59.7,0,Float64
4,county,,0,,15,63,"Union{Missing, CategoricalValue{Int64, UInt32}}"


### **client**

Row,product_type,county,eic_count,installed_capacity,is_business,date,data_block_id
Unnamed: 0_level_1,Cat…,Cat…,Int64,Float64,Cat…,Date,Int64
1,1,0,108,952.89,0,2021-09-01,2
2,2,0,17,166.4,0,2021-09-01,2
3,3,0,688,7207.88,0,2021-09-01,2
4,0,0,5,400.0,1,2021-09-01,2
5,1,0,43,1411.0,1,2021-09-01,2


shape: (41919, 7)

Row,variable,mean,min,median,max,nmissing,eltype
Unnamed: 0_level_1,Symbol,Union…,Any,Any,Any,Int64,DataType
1,product_type,,0,,3,0,"CategoricalValue{Int64, UInt32}"
2,county,,0,,15,0,"CategoricalValue{Int64, UInt32}"
3,eic_count,73.3451,5,32.0,1517,0,Int64
4,installed_capacity,1450.77,5.5,645.2,19314.3,0,Float64
5,is_business,,0,,1,0,"CategoricalValue{Int64, UInt32}"
6,date,,2021-09-01,2022-07-20,2023-05-29,0,Date
7,data_block_id,322.899,2,324.0,637,0,Int64


### **train**

Row,county,is_business,product_type,target,is_consumption,datetime,data_block_id,row_id,prediction_unit_id
Unnamed: 0_level_1,Cat…,Cat…,Cat…,Float64?,Cat…,String31,Int64,Int64,Int64
1,0,0,1,0.713,0,2021-09-01 00:00:00,0,0,0
2,0,0,1,96.59,1,2021-09-01 00:00:00,0,1,0
3,0,0,2,0.0,0,2021-09-01 00:00:00,0,2,1
4,0,0,2,17.314,1,2021-09-01 00:00:00,0,3,1
5,0,0,3,2.904,0,2021-09-01 00:00:00,0,4,2


shape: (2018352, 9)

Row,variable,mean,min,median,max,nmissing,eltype
Unnamed: 0_level_1,Symbol,Union…,Any,Union…,Any,Int64,Type
1,county,,0,,15,0,"CategoricalValue{Int64, UInt32}"
2,is_business,,0,,1,0,"CategoricalValue{Int64, UInt32}"
3,product_type,,0,,3,0,"CategoricalValue{Int64, UInt32}"
4,target,274.856,0.0,31.133,15480.3,528,"Union{Missing, Float64}"
5,is_consumption,,0,,1,0,"CategoricalValue{Int64, UInt32}"
6,datetime,,2021-09-01 00:00:00,,2023-05-31 23:00:00,0,String31
7,data_block_id,321.875,0,323.0,637,0,Int64
8,row_id,1009180.0,0,1009180.0,2018351,0,Int64
9,prediction_unit_id,33.0454,0,33.0,68,0,Int64


### **gas_prices**

Row,forecast_date,lowest_price_per_mwh,highest_price_per_mwh,origin_date,data_block_id
Unnamed: 0_level_1,Date,Float64,Float64,Date,Int64
1,2021-09-01,45.23,46.32,2021-08-31,1
2,2021-09-02,45.62,46.29,2021-09-01,2
3,2021-09-03,45.85,46.4,2021-09-02,3
4,2021-09-04,46.3,46.8,2021-09-03,4
5,2021-09-05,46.3,46.58,2021-09-04,5


shape: (637, 5)

Row,variable,mean,min,median,max,nmissing,eltype
Unnamed: 0_level_1,Symbol,Union…,Any,Any,Any,Int64,DataType
1,forecast_date,,2021-09-01,2022-07-16,2023-05-30,0,Date
2,lowest_price_per_mwh,95.0368,28.1,85.21,250.0,0,Float64
3,highest_price_per_mwh,107.755,34.0,93.47,305.0,0,Float64
4,origin_date,,2021-08-31,2022-07-15,2023-05-29,0,Date
5,data_block_id,319.0,1,319.0,637,0,Int64


In [27]:
# Summary statistics of the "client" table on its own.
describe(data_dict["client"])

Row,variable,mean,min,median,max,nmissing,eltype
Unnamed: 0_level_1,Symbol,Union…,Any,Any,Any,Int64,DataType
1,product_type,,0,,3,0,"CategoricalValue{Int64, UInt32}"
2,county,,0,,15,0,"CategoricalValue{Int64, UInt32}"
3,eic_count,73.3451,5,32.0,1517,0,Int64
4,installed_capacity,1450.77,5.5,645.2,19314.3,0,Float64
5,is_business,,0,,1,0,"CategoricalValue{Int64, UInt32}"
6,date,,2021-09-01,2022-07-20,2023-05-29,0,Date
7,data_block_id,322.899,2,324.0,637,0,Int64
