# Data of Maximum Temperature in California (NOAA)

Author: Martin Pavez

Creation: January 2025

This notebook shows the very beginning steps in heatwave detection from meteorological stations data. 
1. Detection of missing data: quantification and cleaning.
2. Selection of stations. 
3. We generate cleaned data for heatwave detections.

## Libraries import

In [1]:
import numpy as np
import pandas as pd
import os


from datetime import datetime
from calendar import monthrange

import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning) # Suppress specific RuntimeWarnings

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
os.chdir("../")
# change working directory to project's root path
print(os.getcwd())

c:\Users\marti\Desktop\data\hw_extra


# Parameters


In [4]:
METADATA_PATH_AND_FILENAME = 'data/local_data/NOAA/stations.parquet'
TEMP_DATA_PATH = 'data/local_data/NOAA/original/3899963.csv'
CLEANED_DATA_PATH = 'data/local_data/NOAA/cleaned_2/'

# Utilities


In [31]:
# Function to concatenate CSV files in a folder
def concatenate_csv(folder_path):
    """
    Reads all .csv files in the specified folder, concatenates them (axis=0),
    and indexes the resulting DataFrame by the 'date' column.

    Parameters:
        folder_path (str): Path to the folder containing .csv files.

    Returns:
        pd.DataFrame: Concatenated DataFrame indexed by the 'date' column.
    """
    # List to store DataFrames from each CSV file
    dataframes = []

    # Iterate over all files in the folder
    for filename in os.listdir(folder_path):
        if filename.endswith('.csv'):
            file_path = os.path.join(folder_path, filename)
            # Read CSV file, parse 'date' column as datetime
            df = pd.read_csv(file_path, parse_dates=['date'])
            dataframes.append(df)

    # Concatenate all DataFrames along axis 0
    concatenated_df = pd.concat(dataframes, axis=0)

    # Set 'date' column as the index
    concatenated_df.set_index('date', inplace=True)

    ### Found duplicated items, these come from the end and start between files (some have them, some not)
    #duplicates = concatenated_df.index[concatenated_df.index.duplicated(keep=False)]
    #if not duplicates.empty:
    #    print("Duplicate indices found:")
    #    print(duplicates)
    concatenated_df = concatenated_df[~concatenated_df.index.duplicated(keep='first')]


    # Create a complete date range from the earliest to the latest date
    complete_index = pd.date_range(start=concatenated_df.index.min(), end=concatenated_df.index.max(), freq='D')

    # Reindex the DataFrame to the complete date range, filling gaps with NaN
    concatenated_df = concatenated_df.reindex(complete_index)

    # Rename the index to 'date'
    concatenated_df.index.name = 'date'

    # Sort by index for consistency
    concatenated_df.sort_index(inplace=True)

    concatenated_df["value"] = concatenated_df["value"]/10
    concatenated_df.rename(columns={"value":"max_temp"}, inplace=True)

    return concatenated_df

def Month_Days(year):
    return {"year": year, "month_days": {m: monthrange(year, m)[1] for m in range(1, 13)}}

def Tfilter(data, column_label, nperc, year_window_init: int, year_window_end: int):
    start_date = data.index[0]
    end_date = data.index[-1]
    perc_label = 'perc'
    Tadd = 0.0
    year_window_init = year_window_init
    year_window_end = year_window_end
    data_temp = data[column_label]

    data_threshold = pd.DataFrame(
        [],
        columns=[perc_label],
        index = data.index
    )
    for year in range(start_date.year, end_date.year + 1):
        month_days = Month_Days(year)
        for month in month_days["month_days"]:
            for day in range(1, month_days["month_days"][month] + 1):
                try:
                    current_date = datetime(year, month, day)
                except ValueError:
                    if current_date == start_date:
                        current_date = datetime(year, month, day+1)
                    if current_date == end_date:
                        current_date = datetime(year, month, day-1)
                if  current_date >= start_date and current_date <= end_date:
                    f_data_temp = data_temp[
                        (year_window_init <= data_temp.index.year)
                        & (data_temp.index.year <= year_window_end)
                        & (data_temp.index.day == day)
                        & (data_temp.index.month == month)
                    ]
                    try:
                        data_threshold.loc[datetime(year, month, day), perc_label] = f_data_temp.quantile(
                            nperc*0.01, interpolation="midpoint"
                        ).values[0]
                    except AttributeError:
                        data_threshold.loc[datetime(year, month, day), perc_label] = f_data_temp.quantile(
                            nperc*0.01, interpolation="midpoint"
                        )
                    except ValueError:
                        data_threshold.loc[datetime(year, month, day-1), perc_label] = f_data_temp.quantile(
                            nperc*0.01, interpolation="midpoint"
                        ).values[0] + Tadd

        smoothed_series = data_threshold.rolling(window=31, center=True).mean()
    return smoothed_series + Tadd
    #return data_threshold + Tadd

def to_format(data, max_temp_lim = 50, add_filter_year = None, filter_by_hist = False, filter = True, dropnans = True):

    #data["Date"] = pd.to_datetime(data["Date"],format="%Y-%m-%d")

    if dropnans:
        max_temp = data.set_index("date").dropna(subset=["max_temp"])[["max_temp"]]

    #max_temp = max_temp.rename(columns={"DayAirTmpMax":"max_temp"})
    #min_temp = min_temp.rename(columns={"DayAirTmpMin":"min_temp"})
    #mean_temp = mean_temp.rename(columns={"DayAirTmpAvg":"mean_temp"})
    max_temp = data[["max_temp"]]

    if filter:

        if add_filter_year is None:
            max_temp = max_temp.drop(max_temp[np.abs(max_temp["max_temp"])>max_temp_lim].index)
        else:
        #if add_filter_year is not None:
            max_temp.loc[(max_temp.index.year < add_filter_year) & (np.abs(max_temp['max_temp']) > max_temp_lim), 'max_temp'] = np.nan
            max_temp = max_temp.dropna(subset=["max_temp"])[["max_temp"]]


        if filter_by_hist:#add_filter_year is not None:
            perc_max = Tfilter(max_temp, 'max_temp', 99.9, add_filter_year, 2023)
            perc_max = perc_max.reindex(max_temp.index)


            max_temp.loc[(max_temp.index.year < add_filter_year) & (max_temp['max_temp'] > perc_max['perc'] + 10), 'max_temp'] = np.nan

        if dropnans:
            max_temp = max_temp.dropna(subset=["max_temp"])[["max_temp"]]
    
    else:
        return data

    concatenated_df = pd.concat([max_temp], axis=1)

    return concatenated_df

def separate_by_station(data, name_stat):
    station_data = data[data["STATION"]==name_stat]
    station_data["DATE"] = pd.to_datetime(station_data["DATE"],format='%Y-%m-%d')
    station_data.rename(columns={"DATE":"Date", "TMAX":"max_temp"}, inplace=True)
    station_data.set_index('Date', inplace=True)
    # Create a complete date range from the earliest to the latest date
    complete_index = pd.date_range(start=station_data.index.min(), end=station_data.index.max(), freq='D')

    # Reindex the DataFrame to the complete date range, filling gaps with NaN
    station_data = station_data.reindex(complete_index)
    return station_data

# Reading data

In [21]:
# get all cimis stations information
stations = pd.read_csv(TEMP_DATA_PATH)
stations.head(20)

Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,ELEVATION,DATE,TMAX,TMAX_ATTRIBUTES
0,USC00041253,"CACHUMA LAKE, CA US",34.58223,-119.98159,240.5,1970-01-01,19.4,",,0"
1,USC00041253,"CACHUMA LAKE, CA US",34.58223,-119.98159,240.5,1970-01-02,19.4,",,0"
2,USC00041253,"CACHUMA LAKE, CA US",34.58223,-119.98159,240.5,1970-01-03,17.8,",,0"
3,USC00041253,"CACHUMA LAKE, CA US",34.58223,-119.98159,240.5,1970-01-04,16.7,",,0"
4,USC00041253,"CACHUMA LAKE, CA US",34.58223,-119.98159,240.5,1970-01-05,17.2,",,0"
5,USC00041253,"CACHUMA LAKE, CA US",34.58223,-119.98159,240.5,1970-01-06,16.1,",,0"
6,USC00041253,"CACHUMA LAKE, CA US",34.58223,-119.98159,240.5,1970-01-07,18.3,",,0"
7,USC00041253,"CACHUMA LAKE, CA US",34.58223,-119.98159,240.5,1970-01-08,19.4,",,0"
8,USC00041253,"CACHUMA LAKE, CA US",34.58223,-119.98159,240.5,1970-01-09,17.2,",,0"
9,USC00041253,"CACHUMA LAKE, CA US",34.58223,-119.98159,240.5,1970-01-10,12.2,",,0"


In [22]:
#filter only stations that have data at least from 1971
statlist = list(stations["STATION"].unique())
statlist

['USC00041253',
 'USC00041277',
 'USC00041244',
 'USC00040449',
 'USC00040931',
 'USC00040983',
 'USC00040741',
 'USC00040212',
 'USC00041194',
 'USC00040693']

In [32]:
stations_data_no_filter = {}
stations_data_filter_nans = {}
stations_data_filter1 = {}
dropnans = False

for stat in statlist:
    print(stat)
    station_data_to_read = separate_by_station(stations, stat)

    stations_data_no_filter[stat] = to_format(station_data_to_read, max_temp_lim=50, add_filter_year=None, filter_by_hist = False, filter = False, dropnans=dropnans)
    stations_data_filter1[stat] = to_format(station_data_to_read, max_temp_lim=50, add_filter_year=None, filter_by_hist = False, filter = True, dropnans=dropnans)
    stations_data_filter_nans[stat] = to_format(station_data_to_read, max_temp_lim=50, add_filter_year=None, filter_by_hist = False, filter = False, dropnans=dropnans)

    stations_data_filter1[stat].to_parquet(CLEANED_DATA_PATH + f'Stat_{stat}.parquet')




USC00041253
USC00041277
USC00041244
USC00040449
USC00040931


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  station_data["DATE"] = pd.to_datetime(station_data["DATE"],format='%Y-%m-%d')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  station_data.rename(columns={"DATE":"Date", "TMAX":"max_temp"}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  station_data["DATE"] = pd.to_datetime(station_data["DATE"],format='%Y-%m-%d')
A value is trying to be set 

USC00040983
USC00040741
USC00040212
USC00041194
USC00040693


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  station_data["DATE"] = pd.to_datetime(station_data["DATE"],format='%Y-%m-%d')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  station_data.rename(columns={"DATE":"Date", "TMAX":"max_temp"}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  station_data["DATE"] = pd.to_datetime(station_data["DATE"],format='%Y-%m-%d')
A value is trying to be set 

In [33]:
#Percentage of missing and cleaned data

df_nans_and_deleted = pd.DataFrame(index=statlist)
df_total_days = {}
df_filter1_days = {}
total_days = np.zeros((len(statlist),))
filter_nan_days = np.zeros((len(statlist),))
filter1_days = np.zeros((len(statlist),))

for i, stat in enumerate(statlist):
    df_total_days = stations_data_no_filter[stat][stations_data_no_filter[stat].index.year>1970]['max_temp']
    #np.isnan(station_data_to_read[stat]['temperature'])
    df_filter_nan_days = stations_data_filter_nans[stat][stations_data_filter_nans[stat].index.year>1970]['max_temp']

    df_filter1_days = stations_data_filter1[stat][stations_data_filter1[stat].index.year>1970]['max_temp']

    #station_data_to_read[stat][np.isnan(station_data_to_read[stat]['temperature'])]
    total_days[i] = len(df_total_days)#[np.isnan(df_total_days)])
    filter_nan_days[i] = len(df_filter_nan_days[np.isnan(df_total_days)])
    filter1_days[i] = len(df_filter1_days[np.isnan(df_filter1_days)])

df_nans_and_deleted['total'] = np.array(total_days)
df_nans_and_deleted['nans'] = np.array(filter_nan_days)/(np.array(total_days)[0])
df_nans_and_deleted['>50'] = -np.array(filter_nan_days)/(np.array(total_days)[0])+np.array(filter1_days)/(np.array(total_days)[0])


print(df_nans_and_deleted)

               total      nans  >50
USC00041253  19693.0  0.015640  0.0
USC00041277  19693.0  0.104098  0.0
USC00041244  19693.0  0.062916  0.0
USC00040449  19358.0  0.118976  0.0
USC00040931  19693.0  0.049459  0.0
USC00040983  19693.0  0.006246  0.0
USC00040741  19693.0  0.036409  0.0
USC00040212  19693.0  0.035393  0.0
USC00041194  19693.0  0.013406  0.0
USC00040693  19479.0  0.099579  0.0


In [37]:
print(r'$\frac{total missing data}{total days} =$' +
      str(np.sum(df_nans_and_deleted['nans']*df_nans_and_deleted['total'])/np.sum(df_nans_and_deleted['total'])))

print(r'$\frac{total missing data}{total days} =$' +
      str(np.sum(df_nans_and_deleted['>50']*df_nans_and_deleted['total'])/np.sum(df_nans_and_deleted['total'])))

$\frac{total missing data}{total days} =$0.05405224099418335
$\frac{total missing data}{total days} =$0.0


In [38]:
df_nans_and_deleted.sort_values("nans", ascending=True, inplace=True)
df_nans_and_deleted = df_nans_and_deleted[df_nans_and_deleted["total"] > 18500]
df_nans_and_deleted.iloc[0:10]

Unnamed: 0,total,nans,>50
USC00040983,19693.0,0.006246,0.0
USC00041194,19693.0,0.013406,0.0
USC00041253,19693.0,0.01564,0.0
USC00040212,19693.0,0.035393,0.0
USC00040741,19693.0,0.036409,0.0
USC00040931,19693.0,0.049459,0.0
USC00041244,19693.0,0.062916,0.0
USC00040693,19479.0,0.099579,0.0
USC00041277,19693.0,0.104098,0.0
USC00040449,19358.0,0.118976,0.0


In [43]:
statlist_10 = list(df_nans_and_deleted.iloc[0:10].index)
statlist_10

['GHCND_USC00040983',
 'GHCND_USC00041194',
 'GHCND_USC00041253',
 'GHCND_USC00040741',
 'GHCND_USC00040212',
 'GHCND_USC00040931',
 'GHCND_USC00041277',
 'GHCND_USC00041244',
 'GHCND_USC00040693',
 'GHCND_USC00040449']

In [44]:
print(str(statlist_10))

['GHCND_USC00040983', 'GHCND_USC00041194', 'GHCND_USC00041253', 'GHCND_USC00040741', 'GHCND_USC00040212', 'GHCND_USC00040931', 'GHCND_USC00041277', 'GHCND_USC00041244', 'GHCND_USC00040693', 'GHCND_USC00040449']
