In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

# Load Data

In [27]:
def _load_data(path: str, file_name: str):
    return pd.read_csv(path + file_name)

In [28]:
# Directory holding the raw CSV inputs, relative to the working directory.
path = '../../data/raw/'

# Weather observations, plus the table describing their columns.
raw = _load_data(path=path, file_name='weather_data.csv')
cols_description = _load_data(path=path, file_name='column_descriptions.csv')

In [29]:
# Allow up to 200 characters per cell so the long description texts are
# not truncated when the table is rendered.
pd.options.display.max_colwidth = 200
display(cols_description)

Unnamed: 0,column,description
0,date,Date and time of the observation.
1,p,Atmospheric pressure in millibars (mbar).
2,T,Air temperature in degrees Celsius (°C).
3,Tpot,"Potential temperature in Kelvin (K), representing the temperature an air parcel would have if moved to a standard pressure level."
4,Tdew,"Dew point temperature in degrees Celsius (°C), indicating the temperature at which air becomes saturated with moisture."
5,rh,"Relative humidity as a percentage (%), showing the amount of moisture in the air relative to the maximum it can hold at that temperature."
6,VPmax,"Maximum vapor pressure in millibars (mbar), representing the maximum pressure exerted by water vapor at the given temperature."
7,VPact,"Actual vapor pressure in millibars (mbar), indicating the current water vapor pressure in the air."
8,VPdef,"Vapor pressure deficit in millibars (mbar), measuring the difference between maximum and actual vapor pressure, used to gauge drying potential."
9,sh,"Specific humidity in grams per kilogram (g/kg), showing the mass of water vapor per kilogram of air."


In [30]:
def _prepare_data(df: pd.DataFrame, date_col: str) -> pd.DataFrame:

    df[date_col] = pd.to_datetime(df[date_col])
    df.set_index(date_col, inplace=True)

    df["hour"] = df.index.hour
    df["month"] = df.index.month
    df["day"] = df.index.day

    return df

# Re-bind `raw` to the datetime-indexed frame, then preview its first rows.
raw = _prepare_data(df=raw, date_col='date')
raw.head(n=5)

Unnamed: 0_level_0,p,T,Tpot,Tdew,rh,VPmax,VPact,VPdef,sh,H2OC,...,wd,rain,raining,SWDR,PAR,max. PAR,Tlog,hour,month,day
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-01-01 00:10:00,1008.89,0.71,273.18,-1.33,86.1,6.43,5.54,0.89,3.42,5.49,...,224.3,0.0,0.0,0.0,0.0,0.0,11.45,0,1,1
2020-01-01 00:20:00,1008.76,0.75,273.22,-1.44,85.2,6.45,5.49,0.95,3.39,5.45,...,206.8,0.0,0.0,0.0,0.0,0.0,11.51,0,1,1
2020-01-01 00:30:00,1008.66,0.73,273.21,-1.48,85.1,6.44,5.48,0.96,3.39,5.43,...,197.1,0.0,0.0,0.0,0.0,0.0,11.6,0,1,1
2020-01-01 00:40:00,1008.64,0.37,272.86,-1.64,86.3,6.27,5.41,0.86,3.35,5.37,...,206.4,0.0,0.0,0.0,0.0,0.0,11.7,0,1,1
2020-01-01 00:50:00,1008.61,0.33,272.82,-1.5,87.4,6.26,5.47,0.79,3.38,5.42,...,209.6,0.0,0.0,0.0,0.0,0.0,11.81,0,1,1


In [31]:
def basic_data_info(data: pd.DataFrame) -> dict:
    """
    Summarise the structure of a dataframe.

    Args:
        data (pd.DataFrame): Frame to inspect.

    Returns:
        dict: Mapping with the keys
            - "shape": (rows, columns) tuple,
            - "columns": list of column labels,
            - "dtypes": column label -> dtype,
            - "missing_values": column label -> number of null entries,
            - "memory_usage": total bytes used, counting object contents.
    """
    null_counts = data.isnull().sum()
    bytes_per_column = data.memory_usage(deep=True)

    summary = {}
    summary["shape"] = data.shape
    summary["columns"] = [label for label in data.columns]
    summary["dtypes"] = data.dtypes.to_dict()
    summary["missing_values"] = null_counts.to_dict()
    summary["memory_usage"] = bytes_per_column.sum()
    return summary

basic_info = basic_data_info(data=raw)

# One line per fact: shape, column count, and the total number of nulls
# summed over every column.
print(
    f"Dataset Shape: {basic_info['shape']}",
    f"Number of Columns: {len(basic_info['columns'])}",
    f"Number of Missing Values: {sum(basic_info['missing_values'].values())}",
    sep="\n",
)


Dataset Shape: (52696, 23)
Number of Columns: 23
Number of Missing Values: 0


In [40]:
def _basic_statistics(data: pd.DataFrame) -> tuple:
    """
    Compute basic statistics for the numerical columns in the dataframe.
    """
    return data.describe(), data.corr()

# NOTE(review): the saved output of this cell lists daily aggregate columns
# (rain_total, T_mean, ..., rain_day) that are only created by
# `_daily_columns` in a later cell, so it was last executed out of order.
# After Restart & Run All it will only cover the columns that exist here.
describe, correlations = _basic_statistics(data=raw)

display(describe, correlations)

Unnamed: 0,p,T,Tpot,Tdew,rh,VPmax,VPact,VPdef,sh,H2OC,...,hour,month,day,rain_total,T_mean,T_min,T_max,rh_mean,SWDR_sum,rain_day
count,52696.0,52696.0,52696.0,52696.0,52696.0,52696.0,52696.0,52696.0,52696.0,52696.0,...,52696.0,52696.0,52696.0,52696.0,52696.0,52696.0,52696.0,52696.0,52696.0,52696.0
mean,989.989233,10.818241,284.796938,5.409105,72.487133,14.487046,9.676828,4.810131,6.111159,9.782341,...,11.500171,6.513891,15.754497,1.695339,10.818241,6.169912,15.343137,72.487133,18905.526562,0.382572
std,9.207149,7.468671,7.616995,5.956722,19.23026,7.63296,4.023504,5.53932,2.561536,4.082684,...,6.922783,3.451478,8.810597,5.383562,6.703272,6.013112,7.840298,13.967279,13525.979953,0.48602
min,955.58,-6.44,266.19,-13.81,21.16,3.77,2.09,0.0,1.3,2.09,...,0.0,1.0,1.0,0.0,-2.705417,-6.44,-0.12,43.65875,0.0,0.0
25%,984.8,4.59,278.55,0.7775,58.82,8.48,6.46,1.17,4.07,6.53,...,5.0,4.0,8.0,0.0,4.836875,1.28,8.82,61.112292,6754.43,0.0
50%,990.92,10.23,284.32,5.26,75.4,12.48,8.89,2.74,5.61,8.99,...,12.0,7.0,16.0,0.0,10.582153,5.65,15.04,72.993194,14105.5,0.0
75%,995.93,16.18,290.26,9.7,87.9,18.42,12.05,6.44,7.62,12.2,...,18.0,10.0,23.0,0.8,16.131528,10.84,21.7,83.393819,30622.32,1.0
max,1020.07,34.8,309.13,20.5,100.0,55.67,24.16,42.1,15.4,24.53,...,23.0,12.0,31.0,46.4,26.869375,19.25,34.8,100.0,49617.22,1.0


Unnamed: 0,p,T,Tpot,Tdew,rh,VPmax,VPact,VPdef,sh,H2OC,...,hour,month,day,rain_total,T_mean,T_min,T_max,rh_mean,SWDR_sum,rain_day
p,1.0,-0.127305,-0.22345,-0.20728,-0.068386,-0.098905,-0.183765,-0.002805,-0.200955,-0.201174,...,-0.014812,-0.15495,0.040475,-0.216411,-0.106707,-0.206373,-0.038445,-0.132758,0.156898,-0.288388
T,-0.127305,1.0,0.995228,0.78274,-0.540784,0.967901,0.76286,0.779616,0.76155,0.761934,...,0.193087,0.149844,0.003643,0.040782,0.897519,0.836326,0.869004,-0.384982,0.566242,-0.055229
Tpot,-0.22345,0.995228,1.0,0.789478,-0.52488,0.961013,0.767773,0.766556,0.768193,0.76859,...,0.191264,0.161709,-0.000435,0.061513,0.892399,0.842058,0.857709,-0.365345,0.540917,-0.025764
Tdew,-0.20728,0.78274,0.789478,1.0,0.087203,0.712665,0.977413,0.272064,0.976878,0.977256,...,0.013741,0.344544,-0.028797,0.184342,0.840526,0.89267,0.759881,0.059886,0.248666,0.12506
rh,-0.068386,-0.540784,-0.52488,0.087203,1.0,-0.561612,0.085528,-0.836007,0.086812,0.086729,...,-0.295063,0.268277,-0.0502,0.190412,-0.311547,-0.15232,-0.368289,0.726318,-0.571145,0.24082
VPmax,-0.098905,0.967901,0.961013,0.712665,-0.561612,1.0,0.712554,0.86039,0.711273,0.711482,...,0.192277,0.143217,0.00358,0.011555,0.849436,0.777852,0.833189,-0.38112,0.553485,-0.090241
VPact,-0.183765,0.76286,0.767773,0.977413,0.085528,0.712554,1.0,0.255506,0.999814,0.999816,...,0.007999,0.308538,-0.023154,0.192119,0.827552,0.880848,0.749457,0.046136,0.25488,0.116861
VPdef,-0.002805,0.779616,0.766556,0.272064,-0.836007,0.86039,0.255506,1.0,0.253875,0.254162,...,0.25914,-0.026768,0.021749,-0.12363,0.569386,0.432033,0.603723,-0.558685,0.577545,-0.209235
sh,-0.200955,0.76155,0.768193,0.976878,0.086812,0.711273,0.999814,0.253875,1.0,0.999997,...,0.008172,0.308783,-0.023792,0.196425,0.82558,0.880647,0.746589,0.048809,0.250587,0.121821
H2OC,-0.201174,0.761934,0.76859,0.977256,0.086729,0.711482,0.999816,0.254162,0.999997,1.0,...,0.008212,0.309186,-0.023779,0.196376,0.825924,0.880965,0.746901,0.048804,0.250708,0.121847


In [41]:
# For every variable keep its 10 largest absolute correlations with the
# *other* variables (the self-correlation is dropped first). Each column
# keeps a different set of partners, so the combined table is aligned on
# the union of row labels and shows NaN where a partner was not retained.
top_corr = correlations.apply(
    lambda col: (
        col.drop(labels=col.name)
        .abs()
        .sort_values(ascending=False)
        .iloc[:10]
    )
)
display(top_corr)

Unnamed: 0,p,T,Tpot,Tdew,rh,VPmax,VPact,VPdef,sh,H2OC,...,hour,month,day,rain_total,T_mean,T_min,T_max,rh_mean,SWDR_sum,rain_day
H2OC,0.201174,,0.76859,0.977256,,,0.999816,,0.999997,,...,,0.309186,0.023779,0.196376,0.825924,0.880965,0.746901,,,
PAR,,,,,0.558661,,,0.576877,,,...,0.031493,,,,,,,,,
SWDR,,,,,0.558295,,,,,,...,,,,,,,,,,0.15115
SWDR_sum,,,,,0.571145,,,0.577545,,,...,,,,,,,,0.786358,,0.346789
T,,,0.995228,0.78274,0.540784,0.967901,0.76286,0.779616,0.76155,0.761934,...,0.193087,,,,0.897519,0.836326,0.869004,0.384982,0.566242,
T_max,,0.869004,0.857709,0.759881,,0.833189,0.749457,0.603723,0.746589,0.746901,...,,,,,0.968229,0.829678,,0.507064,0.715486,0.148248
T_mean,,0.897519,0.892399,0.840526,,0.849436,0.827552,0.569386,0.82558,0.825924,...,,0.166953,,,,0.93182,0.968229,0.428941,0.630897,
T_min,0.206373,0.836326,0.842058,0.89267,,0.777852,0.880848,,0.880647,0.880965,...,,0.233625,,,0.93182,,0.829678,,,
Tdew,0.20728,0.78274,0.789478,,,0.712665,0.977413,,0.976878,0.977256,...,,0.344544,0.028797,0.184342,0.840526,0.89267,0.759881,,,
Tlog,,0.981562,0.972736,0.741963,0.558603,0.962748,0.727446,0.798241,0.725602,0.725934,...,0.203697,,,,0.868541,0.787411,0.856469,0.388493,0.590245,


In [None]:
def _daily_columns(df: pd.DataFrame) -> pd.DataFrame:
    rain_day = df.groupby(df.index.date).agg(
        rain_sum=('rain', 'sum'),
        T_mean=('T', 'mean'),
        T_min=('T', 'min'),
        T_max=('T', 'max'),
        rh_mean=('rh', 'mean'),
    SWDR_sum=('SWDR', 'sum')
)

    # Add daily rain sum to original dataframe
    date_index = pd.Series(df.index.date, index=df.index)
    df['rain_total'] = date_index.map(rain_day['rain_sum'])
    df['T_mean'] = date_index.map(rain_day['T_mean'])
    df['T_min'] = date_index.map(rain_day['T_min'])
    df['T_max'] = date_index.map(rain_day['T_max'])
    df['rh_mean'] = date_index.map(rain_day['rh_mean'])
    df['SWDR_sum'] = date_index.map(rain_day['SWDR_sum'])

    df['rain_day'] = df['rain_total'].apply(lambda x: 1 if x > 0 else 0)

    return df

# Re-bind `raw` to the frame extended with the per-day aggregate columns.
raw = _daily_columns(df=raw)

Unnamed: 0_level_0,p,T,Tpot,Tdew,rh,VPmax,VPact,VPdef,sh,H2OC,...,hour,month,day,rain_total,T_mean,T_min,T_max,rh_mean,SWDR_sum,rain_day
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-01-01 00:10:00,1008.89,0.71,273.18,-1.33,86.1,6.43,5.54,0.89,3.42,5.49,...,0,1,1,0.0,-0.514196,-3.46,4.58,86.429301,8250.91,0
2020-01-01 00:20:00,1008.76,0.75,273.22,-1.44,85.2,6.45,5.49,0.95,3.39,5.45,...,0,1,1,0.0,-0.514196,-3.46,4.58,86.429301,8250.91,0
2020-01-01 00:30:00,1008.66,0.73,273.21,-1.48,85.1,6.44,5.48,0.96,3.39,5.43,...,0,1,1,0.0,-0.514196,-3.46,4.58,86.429301,8250.91,0
2020-01-01 00:40:00,1008.64,0.37,272.86,-1.64,86.3,6.27,5.41,0.86,3.35,5.37,...,0,1,1,0.0,-0.514196,-3.46,4.58,86.429301,8250.91,0
2020-01-01 00:50:00,1008.61,0.33,272.82,-1.50,87.4,6.26,5.47,0.79,3.38,5.42,...,0,1,1,0.0,-0.514196,-3.46,4.58,86.429301,8250.91,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-12-31 23:20:00,978.32,2.28,277.16,-0.80,80.0,7.20,5.76,1.44,3.67,5.89,...,23,12,31,0.0,1.870139,-0.07,3.43,83.545694,3902.74,0
2020-12-31 23:30:00,978.30,2.13,277.01,-0.43,83.1,7.12,5.92,1.20,3.77,6.05,...,23,12,31,0.0,1.870139,-0.07,3.43,83.545694,3902.74,0
2020-12-31 23:40:00,978.26,1.99,276.88,-0.71,82.2,7.05,5.80,1.26,3.69,5.93,...,23,12,31,0.0,1.870139,-0.07,3.43,83.545694,3902.74,0
2020-12-31 23:50:00,978.26,2.07,276.95,-0.77,81.4,7.09,5.77,1.32,3.68,5.90,...,23,12,31,0.0,1.870139,-0.07,3.43,83.545694,3902.74,0


In [None]:
def _coordinate_hourly_monthly_data(df: pd.DataFrame) -> pd.DataFrame:
    df['hour_sin'] = np.sin(2 * np.pi * df.index.hour / 24)
    df['hour_cos'] = np.cos(2 * np.pi * df.index.hour / 24)
    df['month_sin'] = np.sin(2 * np.pi * df.index.month / 12)
    df['month_cos'] = np.cos(2 * np.pi * df.index.month / 12)
    return df

# Re-bind `raw` to the frame extended with the cyclical hour/month encodings.
raw = _coordinate_hourly_monthly_data(df=raw)


In [45]:
# Summary statistics of the daily means for a handful of columns.
# NOTE(review): in the saved output the daily-mean `wv` has a minimum of
# about -65.9, which is not a plausible wind speed - presumably a
# sentinel/missing-value code in the raw data; verify before using `wv`.
# NOTE(review): the saved output reports 367 daily bins - TODO confirm the
# date range covered by the index.
(
    raw.resample('D')
    .mean()
    [['rain_total', 'T_mean', 'T_min', 'T_max', 'rh_mean', 'wv']]
    .describe()
)

Unnamed: 0,rain_total,T_mean,T_min,T_max,rh_mean,wv
count,367.0,367.0,367.0,367.0,367.0,367.0
mean,1.690463,10.794832,6.159019,15.307439,72.510647,1.992138
std,5.38387,6.718604,6.016605,7.870443,13.978555,3.701414
min,0.0,-2.705417,-6.44,-0.12,43.65875,-65.882431
25%,0.0,4.836076,1.34,8.815,61.169688,1.452118
50%,0.0,10.582153,5.65,15.04,72.993194,1.879653
75%,0.8,16.074028,10.735,21.69,83.360451,2.685799
max,46.4,26.869375,19.25,34.8,100.0,6.453403


In [None]:
# Human-readable description of each raw column, keyed by column name.
# The entries shown in the rendered `cols_description` table above
# (date ... sh) carry the same text as the corresponding entries here.
column_descriptions = {
    "date": "Date and time of the observation.",
    "p": "Atmospheric pressure in millibars (mbar).",
    "T": "Air temperature in degrees Celsius (°C).",
    "Tpot": "Potential temperature in Kelvin (K), representing the temperature an air parcel would have if moved to a standard pressure level.",
    "Tdew": "Dew point temperature in degrees Celsius (°C), indicating the temperature at which air becomes saturated with moisture.",
    "rh": "Relative humidity as a percentage (%), showing the amount of moisture in the air relative to the maximum it can hold at that temperature.",
    "VPmax": "Maximum vapor pressure in millibars (mbar), representing the maximum pressure exerted by water vapor at the given temperature.",
    "VPact": "Actual vapor pressure in millibars (mbar), indicating the current water vapor pressure in the air.",
    "VPdef": "Vapor pressure deficit in millibars (mbar), measuring the difference between maximum and actual vapor pressure, used to gauge drying potential.",
    "sh": "Specific humidity in grams per kilogram (g/kg), showing the mass of water vapor per kilogram of air.",
    "H2OC": "Concentration of water vapor in millimoles per mole (mmol/mol) of dry air.",
    "rho": "Air density in grams per cubic meter (g/m³), reflecting the mass of air per unit volume.",
    "wv": "Wind speed in meters per second (m/s), measuring the horizontal motion of air.",
    "max. wv": "Maximum wind speed in meters per second (m/s), indicating the highest recorded wind speed over the period.",
    "wd": "Wind direction in degrees (°), representing the direction from which the wind is blowing.",
    "rain": "Total rainfall in millimeters (mm), showing the amount of precipitation over the observation period.",
    "raining": "Duration of rainfall in seconds (s), recording the time for which rain occurred during the observation period.",
    "SWDR": "Short-wave downward radiation in watts per square meter (W/m²), measuring incoming solar radiation.",
    "PAR": "Photosynthetically active radiation in micromoles per square meter per second (µmol/m²/s), indicating the amount of light available for photosynthesis.",
    "max. PAR": "Maximum photosynthetically active radiation recorded in the observation period in µmol/m²/s.",
    "Tlog": "Temperature logged in degrees Celsius (°C), potentially from a secondary sensor or logger.",
    # NOTE(review): this description is itself a guess ("Likely ... may need
    # clarification") - confirm what OT means against the dataset's docs.
    "OT": "Likely refers to an operational timestamp or an offset in time, but may need clarification depending on the dataset's context."
}
