In [1]:
import pandas as pd
from pathlib import Path
from IPython.display import display, Markdown

In [2]:
def transform_data(df: pd.DataFrame):
    df_copy = df.copy()

    df_copy['user'] = df_copy['user'].astype('string')

    df_copy['start_date'] = pd.to_datetime(
        df_copy['start_date'],
        format="%Y-%m-%dT%H:%M:%SZ"
    )

    df_copy['end_date'] = pd.to_datetime(
        df_copy['end_date'],
        format="%Y-%m-%dT%H:%M:%SZ"
    )
    
    df_copy['contract_start_date'] = pd.to_datetime(
        df_copy['end_date']
    )

    df_copy['contract_end_date'] = pd.to_datetime(
        df_copy['end_date']
    )

    df_copy['contracted_tariff'] = df_copy['contracted_tariff'].astype('category')
    df_copy['municipality'] = df_copy['municipality'].astype('category')
    df_copy['province'] = df_copy['province'].astype('category')

    return df_copy

In [3]:
def describe_data(df: pd.DataFrame):
    columns = df.columns
    stats = df.describe()
    display(stats)
    
    display(Markdown("### **Data description**"))
    for col in columns:
        col_name = col
        dtype = df[col].dtype
        display(Markdown(f"{col_name}:    {dtype}"))

    return None

In [4]:
metadata_path = Path("Data/smartmetersdata/7362094/metadata.csv").resolve()

In [5]:
metadata = pd.read_csv(metadata_path, parse_dates=True)

metadata = transform_data(metadata)


metadata['user'] = metadata['user'].astype('string')

In [6]:
# Data visualization
display(Markdown(f'## **{metadata_path.name}**'))
display(metadata.head(3))
display(Markdown(f"shape: {metadata.shape}"))

display(metadata.describe(include='all'))

# Data statistics
describe_data(metadata)
#display(metadata.describe().loc['count', 'p1'].item())

## **metadata.csv**

Unnamed: 0,user,start_date,end_date,length_days,length_years,potential_samples,actual_samples,missing_samples_abs,missing_samples_pct,contract_start_date,...,p1,p2,p3,p4,p5,p6,province,municipality,zip_code,cnae
0,00000c5a448d9faa097b761cc98036d45a4e7d36032903...,2022-05-30 01:00:00,2022-06-05,6.0,0.016427,144,144,0,0.0,2022-06-05,...,2.2,2.2,0.0,0.0,0.0,0.0,Gipuzkoa,,,9329.0
1,0001b3b2f18c01c62ed9b2a87de7b4e33e7836f786f790...,2017-05-31 01:00:00,2022-06-05,1831.0,5.013005,43944,43863,81,0.184326,2022-06-05,...,3.45,3.45,,,,,Bizkaia,,,9820.0
2,0003de2700e20a1681d69fe287441d9041407a7698d5c8...,2017-05-31 01:00:00,2019-11-14,897.0,2.455852,21528,21476,52,0.241546,2019-11-14,...,4.6,,,,,,Gipuzkoa,Donostia/San Sebastian,20013.0,9820.0


shape: (25559, 23)

Unnamed: 0,user,start_date,end_date,length_days,length_years,potential_samples,actual_samples,missing_samples_abs,missing_samples_pct,contract_start_date,...,p1,p2,p3,p4,p5,p6,province,municipality,zip_code,cnae
count,25559,25559,25559,25559.0,25559.0,25559.0,25559.0,25559.0,25559.0,25559,...,25371.0,24078.0,3596.0,3284.0,3286.0,3286.0,25371,9215,9215.0,25340.0
unique,25559,,,,,,,,,,...,,,,,,,47,102,,
top,00489151865c9806ea977413603cc72b8bc27ff57350e8...,,,,,,,,,,...,,,,,,,Gipuzkoa,Bilbao,,
freq,1,,,,,,,,,,...,,,,,,,9291,2067,,
mean,,2019-05-17 15:46:11.939433984,2022-03-22 01:22:22.024335616,1039.441764,2.845836,24946.602332,24736.542392,210.05994,0.673996,2022-03-22 01:22:22.024335616,...,4.850001,5.013765,8.039664,8.214998,8.209998,11.208181,,,26451.625285,9207.880979
min,,2014-11-02 01:00:00,2015-11-02 00:00:00,1.0,0.002738,24.0,24.0,0.0,0.0,2015-11-02 00:00:00,...,0.1,0.0,0.0,0.0,0.0,0.0,,,1001.0,0.0
25%,,2017-07-06 01:00:00,2022-06-04 00:00:00,461.0,1.262149,11064.0,10989.0,3.0,0.021433,2022-06-04 00:00:00,...,3.3,3.3,0.0,0.0,0.0,0.0,,,20003.0,9820.0
50%,,2019-03-12 01:00:00,2022-06-05 00:00:00,1017.0,2.784394,24408.0,24163.0,6.0,0.024539,2022-06-05 00:00:00,...,3.45,3.45,0.0,0.0,0.0,0.0,,,28010.0,9820.0
75%,,2020-12-17 01:00:00,2022-06-05 00:00:00,1685.0,4.613279,40440.0,40217.0,33.0,0.122884,2022-06-05 00:00:00,...,4.6,4.6,11.0,12.0,12.0,15.001,,,48003.0,9820.0
max,,2022-06-04 01:00:00,2022-06-08 00:00:00,2584.125,7.074949,62019.0,62012.0,38431.0,99.598326,2022-06-08 00:00:00,...,550.0,550.0,550.0,550.0,550.0,800.0,,,50617.0,9949.0


Unnamed: 0,start_date,end_date,length_days,length_years,potential_samples,actual_samples,missing_samples_abs,missing_samples_pct,contract_start_date,contract_end_date,p1,p2,p3,p4,p5,p6,zip_code,cnae
count,25559,25559,25559.0,25559.0,25559.0,25559.0,25559.0,25559.0,25559,25559,25371.0,24078.0,3596.0,3284.0,3286.0,3286.0,9215.0,25340.0
mean,2019-05-17 15:46:11.939433984,2022-03-22 01:22:22.024335616,1039.441764,2.845836,24946.602332,24736.542392,210.05994,0.673996,2022-03-22 01:22:22.024335616,2022-03-22 01:22:22.024335616,4.850001,5.013765,8.039664,8.214998,8.209998,11.208181,26451.625285,9207.880979
min,2014-11-02 01:00:00,2015-11-02 00:00:00,1.0,0.002738,24.0,24.0,0.0,0.0,2015-11-02 00:00:00,2015-11-02 00:00:00,0.1,0.0,0.0,0.0,0.0,0.0,1001.0,0.0
25%,2017-07-06 01:00:00,2022-06-04 00:00:00,461.0,1.262149,11064.0,10989.0,3.0,0.021433,2022-06-04 00:00:00,2022-06-04 00:00:00,3.3,3.3,0.0,0.0,0.0,0.0,20003.0,9820.0
50%,2019-03-12 01:00:00,2022-06-05 00:00:00,1017.0,2.784394,24408.0,24163.0,6.0,0.024539,2022-06-05 00:00:00,2022-06-05 00:00:00,3.45,3.45,0.0,0.0,0.0,0.0,28010.0,9820.0
75%,2020-12-17 01:00:00,2022-06-05 00:00:00,1685.0,4.613279,40440.0,40217.0,33.0,0.122884,2022-06-05 00:00:00,2022-06-05 00:00:00,4.6,4.6,11.0,12.0,12.0,15.001,48003.0,9820.0
max,2022-06-04 01:00:00,2022-06-08 00:00:00,2584.125,7.074949,62019.0,62012.0,38431.0,99.598326,2022-06-08 00:00:00,2022-06-08 00:00:00,550.0,550.0,550.0,550.0,550.0,800.0,50617.0,9949.0
std,,,622.855142,1.705284,14948.523415,14890.49466,1871.215495,5.437353,,,8.058459,9.041275,23.144941,23.591731,23.585419,38.989945,16959.03826,1632.046788


### **Data description**

user:    string

start_date:    datetime64[ns]

end_date:    datetime64[ns]

length_days:    float64

length_years:    float64

potential_samples:    int64

actual_samples:    int64

missing_samples_abs:    int64

missing_samples_pct:    float64

contract_start_date:    datetime64[ns]

contract_end_date:    datetime64[ns]

contracted_tariff:    category

self_consumption_type:    object

p1:    float64

p2:    float64

p3:    float64

p4:    float64

p5:    float64

p6:    float64

province:    category

municipality:    category

zip_code:    float64

cnae:    float64