In [1]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import pearsonr
from sklearn.neighbors import NearestNeighbors

import warnings

# Filter all warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the files
core_area = pd.read_csv("./data/core_area.csv")
grouped_precip = pd.read_csv("./data/grouped_precip.csv")
lcz = pd.read_csv("./data/lcz.csv")

# Remove leading and trailing spaces from column names of the LCZ
lcz.columns = lcz.columns.str.strip()
lcz = lcz.applymap(lambda x: x.strip() if isinstance(x, str) else x)

# Display the first few rows of each dataframe
core_area.head(), grouped_precip.head(),lcz.head()


(   Longitude   Latitude  Core Area
 0  -95.66735  28.891169       9.00
 1  -95.68225  28.901718       8.00
 2  -95.69984  28.910925       5.75
 3  -95.71833  28.922235       5.00
 4  -95.74094  28.937017       4.25,
    Longitude   Latitude  Precipitation
 0 -97.429273  29.487136            6.0
 1 -97.429028  29.523018            7.3
 2 -97.428804  29.451254            5.2
 3 -97.428066  29.558892            3.6
 4 -97.427620  29.415385           14.3,
    Station     TYPE  CA    PLAND  NP           PD     LPI     TE           ED  \
 0        1  openLow   0  24.5098  22  17973856209  6.5359  0.406  3316993.464   
 1        1  openLow   0  24.5098  22  17973856209  6.5359  0.406  3316993.464   
 2        1  openLow   0  24.5098  22  17973856209  6.5359  0.406  3316993.464   
 3        1  openLow   0  24.5098  22  17973856209  6.5359  0.406  3316993.464   
 4        1  openLow   0  24.5098  22  17973856209  6.5359  0.406  3316993.464   
 
       LSI  ...  LATITUDE  LONGITUDE  ELEVATION 

# We discover important variables from the LCZ dataset using RandomForest to calculate feature importance. The targt variable would be both the precipitation and updraft, given that we want to understand the impact of the LCZ features on both of these variables

In [3]:
core_area = pd.read_csv("./data/core_area.csv")
grouped_precip = pd.read_csv("./data/grouped_precip.csv")
lcz = pd.read_csv("./data/combined_data.csv")

# Remove leading and trailing spaces from column names of the LCZ
lcz.columns = lcz.columns.str.strip()
lcz = lcz.applymap(lambda x: x.strip() if isinstance(x, str) else x)

# Encode the 'TYPE' feature in the LCZ data
le = LabelEncoder()
lcz_numerical = le.fit_transform(lcz['TYPE'])

# Add the 'LCZ' variable to the LCZ data and drop the 'TYPE' feature
lcz['LCZ'] = lcz_numerical
lcz = lcz.drop(columns=['TYPE'])

# Function to append a suffix to duplicate column names
def rename_duplicates(old):
    seen = {}
    for x in old:
        if x in seen:
            seen[x] += 1
            yield "%s_%d" % (x, seen[x])
        else:
            seen[x] = 0
            yield x

# Apply the function to the column names in the LCZ data
lcz.columns = list(rename_duplicates(lcz.columns))

# Function to calculate correlation for each station
def calculate_correlation_for_station(station, core_area, grouped_precip, k=5):
    # Create a NearestNeighbors model
    nn = NearestNeighbors(n_neighbors=k)

    # Fit the model to the core_area data
    nn.fit(core_area[["Longitude", "Latitude"]])

    # Find the k nearest points in the core_area data for the station
    station_coordinates = station[["LONGITUDE", "LATITUDE"]].values.reshape(1, -1)
    distances, indices = nn.kneighbors(station_coordinates)

    # Get the core_area and grouped_precip values for the nearest points
    core_area_values = core_area.iloc[indices[0]]["Core Area"]
    grouped_precip_values = grouped_precip.iloc[indices[0]]["Precipitation"]

    # Calculate the correlation between the core_area and grouped_precip values
    correlation, _ = pearsonr(core_area_values, grouped_precip_values)

    return correlation

# Apply the function to all stations in the LCZ data
correlations = lcz.apply(calculate_correlation_for_station, args=(core_area, grouped_precip, 5), axis=1)

# Add the correlations as a new column in the LCZ data
lcz['Correlation'] = correlations

# Define the numerical and categorical features
numerical_features = lcz.select_dtypes(include=[np.number]).columns.drop('Correlation')
categorical_features = ['LCZ']

# Define preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])

# Preprocessing the LCZ data
lcz_preprocessed = preprocessor.fit_transform(lcz)

# Fit a RandomForest model to the preprocessed LCZ data
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(lcz_preprocessed, lcz['Correlation'])

# Get the feature importances
importances = rf.feature_importances_

# Get the names of the features from the preprocessor
feature_names = preprocessor.transformers_[0][1].get_feature_names_out(numerical_features)
feature_names = np.concatenate([feature_names, preprocessor.transformers_[1][1].get_feature_names_out(categorical_features)])

# Create a DataFrame that contains the feature importances
feature_importances = pd.DataFrame({'feature': feature_names, 'importance': importances})

# Sort the DataFrame by the importances, in descending order
feature_importances_sorted = feature_importances.sort_values(by='importance', ascending=False)

feature_importances_sorted.head(10)


KeyError: "None of [Index(['LONGITUDE', 'LATITUDE'], dtype='object')] are in the [index]"