In [None]:
import pandas as pd
import numpy as np
import requests
import random
from io import StringIO
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Union, Dict, List

import seaborn as sns
import matplotlib.pyplot as plt

import xgboost as xgb
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# CoAgMet hourly CSV request: measurement fields and station ids are listed separately
# and joined into the comma-separated query parameters the endpoint expects.
_coagmet_fields = ['t', 'rh', 'dewpt', 'windSpeed', 'windDir', 'gustSpeed']
_coagmet_stations = [
    'bld01', 'bld02', 'bru01', 'btd01', 'crk01', 'eat01', 'ftc02', 'ftc04', 'ftl02',
    'ftm02', 'gil01', 'gly05', 'jcn01', 'lmt01', 'lmt02', 'lov01', 'ovd01',
]
url = (
    'https://coagmet.colostate.edu/data/nw/hourly.csv'
    '?header=yes&from=2018-01-01&to=2023-12-31&dateFmt=iso&tz=co'
    f"&fields={','.join(_coagmet_fields)}"
    f"&stations={','.join(_coagmet_stations)}"
)

In [None]:
# Download the hourly CSV directly from `url` into a DataFrame (needs network access).
# NOTE(review): later cells drop the first row and cast the measurement columns to float,
# which suggests the file carries a non-numeric row below the header — TODO confirm.
df = pd.read_csv(url)

In [None]:
# Display the frame exactly as it was loaded.
df

In [None]:
# Drop the first row of the loaded frame.
# NOTE: `df` is reassigned, so every re-run of this cell removes one more row;
# re-execute the load cell before running this one again.
df = df.iloc[1:]
df

In [None]:
# Show the column labels; the following cells refer to them by name.
df.columns

In [None]:
# Measurement columns that must be numeric for describe(), NaN handling and modelling.
columns_to_change = ['Air Temp', 'RH', 'Dewpoint', 'Wind', 'Wind Dir', 'Gust Speed']

# Work on an independent copy: `df` comes from a slice (`df.iloc[1:]`), and writing into
# a slice triggers SettingWithCopyWarning.
df = df.copy()

# Assign whole columns instead of `df.loc[:, cols] = ...`: since pandas 2.0 the `.loc`
# form writes into the existing columns in place, so object-dtype columns keep their
# object dtype and the cast silently has no effect.
# `errors='coerce'` turns unparseable entries into NaN, whereas
# `astype(float, errors='ignore')` leaves the whole column unconverted on any bad value.
# The final astype guarantees float64 even when a column holds only integral values.
df[columns_to_change] = (
    df[columns_to_change]
    .apply(pd.to_numeric, errors='coerce')
    .astype('float64')
)

In [None]:
# Summary statistics of the frame as produced by DataFrame.describe().
df.describe()

In [None]:
# Count NaN values per column (taken before any -999.0 entries are replaced below).
print(df.isna().sum())

In [None]:
# Map every -999.0 entry to NaN so it is counted as missing
df = df.replace(to_replace=[-999.0], value=np.nan)

# Re-count the NaN values per column after the replacement
missing_per_column = df.isna().sum()
print(missing_per_column)

In [None]:
# Share of NaN values per column, expressed in percent of all rows
total_rows = len(df)
nan_percentage = df.isna().sum().div(total_rows).mul(100)

# Report the percentages rounded to two decimals
print("\nPercentage of NaN values in each column (formatted):")
print(nan_percentage.round(2))

In [None]:
# Station ids present in the data: how many there are, and which ones
station_column = df['Station']

num_unique_stations = station_column.nunique()
print(f"Number of unique stations: {num_unique_stations}")

unique_stations_list = station_column.unique().tolist()
print(f"List of unique stations: {unique_stations_list}")

In [None]:
# Inspect the dtype of every column.
df.dtypes

In [None]:
# Work on an independent copy so the new columns are not written into a slice of the
# originally loaded frame (avoids SettingWithCopyWarning).
df = df.copy()

# Parse the 'Date and Time' strings into a datetime column named 'date'
df['date'] = pd.to_datetime(df['Date and Time'])

# Earliest timestamp in the data (kept for reference / later cells)
lowest_date = df['date'].min()

# 'day_index' counts calendar days since the first day, starting at 0.
# Both sides are truncated to midnight first: subtracting the raw minimum timestamp
# would shift every day boundary whenever the first record is not taken at 00:00.
calendar_day = df['date'].dt.normalize()
df['day_index'] = (calendar_day - calendar_day.min()).dt.days

# 'hour_index' is the hour of day, 0 to 23
df['hour_index'] = df['date'].dt.hour

df

In [None]:
# Pick up to three stations with a dedicated, seeded RNG so that Restart & Run All
# always selects the same stations; the global `random` state is left untouched.
# Capping the sample size avoids the ValueError raised when fewer than three stations exist.
sample_size = min(3, len(unique_stations_list))
selected_stations = random.Random(42).sample(unique_stations_list, sample_size)
print(f"Randomly selected stations: {selected_stations}")

In [None]:
# Display the full list of station ids found in the 'Station' column.
unique_stations_list

In [None]:
# Day-index window [lower_bound, upper_bound], both ends inclusive
upper_bound = 1460
lower_bound = upper_bound - 365

day_idx = df['day_index']

# Rows whose day_index lies inside the window
inside_window = day_idx.ge(lower_bound) & day_idx.le(upper_bound)
df_filtered = df[inside_window]

# Rows from day 0 up to, but not including, lower_bound
before_window = day_idx.ge(0) & day_idx.lt(lower_bound)
df_lower_range = df[before_window]

In [None]:
# Show the rows of the lower day range that belong to station 'lmt01'.
df_lower_range[df_lower_range['Station'] == 'lmt01']

In [None]:
def process_station_data(df, station):
    """Extract one station's rows and tag its measurement columns with the station id.

    Args:
        df: DataFrame containing a 'Station' column plus measurement columns.
        station: Value compared against the 'Station' column.

    Returns:
        A new DataFrame holding only the rows of ``station``, in which the columns
        'Air Temp', 'RH', 'Dewpoint' and 'Wind' (when present) are renamed to
        '<column>_<station>'; ``None`` when no row matches. ``df`` is not modified.
    """
    print(f"Processing station: {station}")
    print(f"Shape of df before filtering: {df.shape}")

    # Copy the matching rows so the rename below never touches the caller's frame
    station_rows = df.loc[df['Station'] == station].copy()
    print(f"Shape of df_station after filtering: {station_rows.shape}")

    if station_rows.empty:
        print(f"No data found for station {station}")
        return None

    # rename() skips mapping keys that are not column labels, so absent
    # measurement columns are simply left out.
    measurement_columns = ('Air Temp', 'RH', 'Dewpoint', 'Wind')
    suffixed_names = {name: f'{name}_{station}' for name in measurement_columns}
    return station_rows.rename(columns=suffixed_names)

In [None]:
# Keep only the station id, four measurement columns and the two time-key columns
kept_columns = ['Station', 'Air Temp', 'RH', 'Wind', 'Dewpoint', 'day_index', 'hour_index']
df = df.loc[:, kept_columns]
df

In [None]:
# Debugging aid: print the frame's shape, its columns and the station ids it contains,
# followed by unique_stations_list so the two can be compared by eye.
print(f"Shape of df: {df.shape}")
print(f"Columns in df: {df.columns}")
print(f"Unique values in 'Station' column: {df['Station'].unique()}")
print(f"unique_stations_list: {unique_stations_list}")

In [None]:
# Build one wide frame: a row per (day_index, hour_index) and one set of measurement
# columns per station. Starts empty and is replaced by the first station with data.
merged_df = pd.DataFrame()

for station in unique_stations_list:
    processed_df = process_station_data(df, station)

    # Guard clause: nothing usable came back for this station
    if processed_df is None or processed_df.empty:
        print(f"Skipping empty or None result for station {station}")
        continue

    # The station id now lives in the renamed column labels, so the column is dropped
    processed_df = processed_df.drop(columns=['Station'])

    if merged_df.empty:
        # First station with data seeds the wide frame
        merged_df = processed_df
    else:
        # Outer join on the time keys keeps timestamps that only some stations report
        merged_df = merged_df.merge(
            processed_df,
            on=['day_index', 'hour_index'],
            how='outer',
        )

    print(f"Station {station} processed successfully!")

In [None]:
# Summary statistics of the merged, one-column-set-per-station frame.
merged_df.describe()

In [None]:
# List the column labels of the merged frame.
merged_df.columns

In [None]:
# Air-temperature columns of merged_df (labels starting with 'Air Temp')
air_temp_columns = [col for col in merged_df.columns if col.startswith('Air Temp')]
air_temp_df = merged_df[air_temp_columns]

# Plot a reproducible sample: the fixed seed keeps the figure identical across runs, and
# capping n at the frame length avoids the ValueError sample() raises when n > len(df).
air_temp_sample = air_temp_df.sample(n=min(1000, len(air_temp_df)), random_state=42)

# pairplot is a figure-level function that creates its own figure, so no plt.figure()
# call is made here; it would only leave an empty extra figure in the cell output.
air_temp_grid = sns.pairplot(air_temp_sample, diag_kind='kde', plot_kws={'alpha': 0.1})

# Title the grid's own figure; y > 1 places it above the top row of axes
air_temp_grid.figure.suptitle(
    'Pair Plot of Air Temperature Across All Stations', y=1.02, fontsize=16
)

# Adjust the layout and display the plot
plt.tight_layout()
plt.show()

In [None]:
# Relative-humidity columns of merged_df (labels starting with 'RH')
rh_columns = [col for col in merged_df.columns if col.startswith('RH')]
rh_df = merged_df[rh_columns]

# Plot a reproducible sample: the fixed seed keeps the figure identical across runs, and
# capping n at the frame length avoids the ValueError sample() raises when n > len(df).
rh_sample = rh_df.sample(n=min(1000, len(rh_df)), random_state=42)

# pairplot is a figure-level function that creates its own figure, so no plt.figure()
# call is made here; it would only leave an empty extra figure in the cell output.
rh_grid = sns.pairplot(rh_sample, diag_kind='kde', plot_kws={'alpha': 0.3})

# Title the grid's own figure; y > 1 places it above the top row of axes
rh_grid.figure.suptitle(
    'Pair Plot of Relative Humidity Across All Stations', y=1.02, fontsize=16
)

# Adjust the layout and display the plot
plt.tight_layout()
plt.show()

In [None]:
# Wind-speed columns of merged_df (labels starting with 'Wind')
wind_columns = [col for col in merged_df.columns if col.startswith('Wind')]
wind_df = merged_df[wind_columns]

# Plot a reproducible sample, like the air-temperature and humidity pair plots do:
# drawing scatter and KDE panels for every row of the full frame is needlessly slow.
# Capping n at the frame length avoids the ValueError sample() raises when n > len(df).
wind_sample = wind_df.sample(n=min(1000, len(wind_df)), random_state=42)

# pairplot is a figure-level function that creates its own figure, so no plt.figure()
# call is made here; it would only leave an empty extra figure in the cell output.
wind_grid = sns.pairplot(wind_sample, diag_kind='kde', plot_kws={'alpha': 0.1})

# Title the grid's own figure; y > 1 places it above the top row of axes
wind_grid.figure.suptitle(
    'Pair Plot of Wind Speed Across All Stations', y=1.02, fontsize=16
)

# Adjust the layout and display the plot
plt.tight_layout()
plt.show()

## Splitting data for ML

In [None]:
# Recap of the available columns before choosing features and targets.
merged_df.columns

In [None]:
# Count NaN values per column of the merged frame (these are what the imputation fills).
print(merged_df.isna().sum())


In [None]:
def impute_with_rolling_average_and_median(df, window_size=5):
    """Return a copy of ``df`` whose numeric columns have their NaNs filled in two passes.

    Pass 1 replaces each NaN with the mean of the non-NaN values inside a centered
    window of ``window_size`` rows. Pass 2 fills whatever is still NaN (gaps wider
    than the window) with the median of the column after pass 1.

    Args:
        df: Input DataFrame; it is not modified.
        window_size: Number of rows in the centered rolling window.

    Returns:
        The imputed copy. Columns whose dtype kind is not bool/int/uint/float/complex
        are left untouched and reported on stdout.
    """
    df_imputed = df.copy()

    for column in df_imputed.columns:
        series = df_imputed[column]

        # Guard clause: only numeric dtype kinds are imputed
        if series.dtype.kind not in 'biufc':
            print(f"Column '{column}' is non-numeric. Skipping imputation.")
            continue

        # min_periods=1 yields a mean as soon as one neighbour in the window is present
        neighbourhood_mean = series.rolling(
            window=window_size, center=True, min_periods=1
        ).mean()
        filled = series.fillna(neighbourhood_mean)

        # Gaps wider than the window survive pass 1; fall back to the column median
        if filled.isna().any():
            column_median = filled.median()
            filled = filled.fillna(column_median)
            print(f"Column '{column}': Filled remaining NaNs with median ({column_median})")

        df_imputed[column] = filled

    return df_imputed

In [None]:
# Impute the merged frame with the default window_size; merged_df itself is left unchanged
# because the function works on a copy.
df_imputed = impute_with_rolling_average_and_median(merged_df)

In [None]:
# Count the NaN values that remain per column after imputation.
print(df_imputed.isna().sum())

In [None]:
# Column-name suffixes of the prediction targets: every column ending in one of these
# is treated as a target and removed from the features further below.
target_suffixes = ['ovd01', 'bru01']



In [None]:
# Chronological split by day: the first 80% of the distinct days go to the train set,
# the next 15% to the test set, and the remaining days to the validation set.

# Sort the distinct days explicitly. unique() returns values in order of appearance, so
# the positional cut-offs below are only meaningful on a sorted array.
unique_days = np.sort(df_imputed['day_index'].dropna().unique())

# Calculate the number of days for each split
total_days = len(unique_days)
train_days = int(total_days * 0.80)
test_days = int(total_days * 0.15)
validation_days = total_days - train_days - test_days  # This ensures we use all the data

# Fail early on tiny inputs: with train_days == 0 the index -1 below would silently wrap
# around to the last day and put every row into the train set.
if train_days < 1 or test_days < 1:
    raise ValueError(
        f"Not enough distinct days ({total_days}) for an 80/15/5 split by day"
    )

# Last day_index (inclusive) belonging to the train and test splits
train_end = unique_days[train_days - 1]
test_end = unique_days[train_days + test_days - 1]

# Split the data
day_index = df_imputed['day_index']
train_df = df_imputed[day_index <= train_end]
test_df = df_imputed[(day_index > train_end) & (day_index <= test_end)]
validation_df = df_imputed[day_index > test_end]

# Print the sizes of each split to verify
print(f"Train set: {len(train_df)} rows ({len(train_df)/len(df_imputed)*100:.2f}%)")
print(f"Test set: {len(test_df)} rows ({len(test_df)/len(df_imputed)*100:.2f}%)")
print(f"Validation set: {len(validation_df)} rows ({len(validation_df)/len(df_imputed)*100:.2f}%)")

In [None]:
def filter_columns(df, suffixes):
    """Return the columns of ``df`` whose label ends with one of ``suffixes``.

    Suffixes are compared literally, so characters such as '.' or '+' carry no regex
    meaning, and an empty ``suffixes`` selects no column at all. (A joined regex
    pattern would be '' for an empty list and would match every column.)

    Args:
        df: DataFrame to select columns from.
        suffixes: Iterable of suffix strings.

    Returns:
        A DataFrame with the matching columns in their original order and all rows
        of ``df``; it has zero columns when nothing matches.
    """
    literal_suffixes = tuple(str(suffix) for suffix in suffixes)
    # str.endswith accepts a tuple and is False for an empty tuple
    keep = [str(label).endswith(literal_suffixes) for label in df.columns]
    return df.loc[:, keep]


# Labels of the columns to predict: every column ending in one of the target suffixes
target_columns = filter_columns(train_df, target_suffixes).columns

# For each split, the features are all non-target columns restricted to float64 dtype,
# and the targets are the suffix-matched columns of that same split.
df_X_train = train_df.drop(columns=target_columns).select_dtypes(include=['float64'])
df_y_train = filter_columns(train_df, target_suffixes)

df_X_test = test_df.drop(columns=target_columns).select_dtypes(include=['float64'])
df_y_test = filter_columns(test_df, target_suffixes)

df_X_validation = validation_df.drop(columns=target_columns).select_dtypes(include=['float64'])
df_y_validation = filter_columns(validation_df, target_suffixes)

# Verify the shapes of the resulting dataframes
shape_report = [
    ("X_train", df_X_train),
    ("y_train", df_y_train),
    ("X_test", df_X_test),
    ("y_test", df_y_test),
    ("X_validation", df_X_validation),
    ("y_validation", df_y_validation),
]
for label, frame in shape_report:
    print(f"{label} shape:", frame.shape)

In [None]:
# 1. Prepare the data: short aliases for the feature/target frames of each split
X_train, y_train = df_X_train, df_y_train
X_test, y_test = df_X_test, df_y_test
X_validation, y_validation = df_X_validation, df_y_validation

In [None]:
# Base gradient-boosting regressor with the hyper-parameters used for every target
base_regressor = xgb.XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    random_state=42,
)

# Wrap it so a multi-column target frame can be fitted
xgb_model = MultiOutputRegressor(base_regressor)

In [None]:
# Train the model on the train split (float64 feature columns -> suffix-matched targets).
xgb_model.fit(X_train, y_train)

In [None]:
# Predict the targets for the test split
y_pred = xgb_model.predict(X_test)

# Per-target metrics ('raw_values' yields one score per target column)
mse = mean_squared_error(y_test, y_pred, multioutput='raw_values')
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred, multioutput='raw_values')

# One section per metric, one line per target
per_target_sections = [
    ("Mean Squared Error for each target:", mse),
    ("\nRoot Mean Squared Error for each target:", rmse),
    ("\nR2 Score for each target:", r2),
]
for heading, values in per_target_sections:
    print(heading)
    for target, value in zip(y_test.columns, values):
        print(f"{target}: {value:.4f}")

# Overall performance: plain average of each metric across the targets
avg_mse, avg_rmse, avg_r2 = (np.mean(metric) for metric in (mse, rmse, r2))
print(f"\nAverage MSE: {avg_mse:.2f}")
print(f"Average RMSE: {avg_rmse:.2f}")
print(f"Average R2 Score: {avg_r2:.2f}")

In [None]:
# Predict the targets for the validation split (overwrites the test-split y_pred)
y_pred = xgb_model.predict(X_validation)

# Per-target metrics ('raw_values' yields one score per target column)
mse = mean_squared_error(y_validation, y_pred, multioutput='raw_values')
rmse = np.sqrt(mse)
r2 = r2_score(y_validation, y_pred, multioutput='raw_values')

# One section per metric, one line per target
per_target_sections = [
    ("Mean Squared Error for each target:", mse),
    ("\nRoot Mean Squared Error for each target:", rmse),
    ("\nR2 Score for each target:", r2),
]
for heading, values in per_target_sections:
    print(heading)
    for target, value in zip(y_validation.columns, values):
        print(f"{target}: {value:.4f}")

# Overall performance: plain average of each metric across the targets
avg_mse, avg_rmse, avg_r2 = (np.mean(metric) for metric in (mse, rmse, r2))
print(f"\nAverage MSE: {avg_mse:.2f}")
print(f"Average RMSE: {avg_rmse:.2f}")
print(f"Average R2 Score: {avg_r2:.2f}")