# 1. Filling missing values for NDVI
- forward fill (which avoids lookahead);
- interpolation (possible lookahead)

In [2]:
#Check missing values and information for Metadata
# Import
import pandas as pd
import numpy as np

# Read monthly NDVI data
NDVI_data = pd.read_csv("../datasets/hokkaido_ndvi_data_monthly.csv").copy()

# Notice some months missing
NDVI_data.head()

Unnamed: 0,市町村名,緯度,経度,year,month,NDVI
0,札幌市,43.061936,141.354292,2014,4,0.134534
1,札幌市,43.061936,141.354292,2014,5,0.260263
2,札幌市,43.061936,141.354292,2014,6,0.335899
3,札幌市,43.061936,141.354292,2014,7,0.599395
4,札幌市,43.061936,141.354292,2014,8,0.413911


In [3]:
# Generate all possible combinations of city_code, year, and month (1-12)
cities = NDVI_data["市町村名"].unique()
years = NDVI_data["year"].unique()
months = range(1, 13)  # All months from 1 to 12

# Create a MultiIndex with all combinations
complete_index = pd.MultiIndex.from_product(
    [cities, years, months],
    names=["市町村名", "year", "month"]
)

# Set the current index to city_code, year, and month
NDVI_data = NDVI_data.set_index(["市町村名", "year", "month"])

# Reindex to the complete index (fills missing months with NaN)
NDVI_data = NDVI_data.reindex(complete_index).reset_index()

NDVI_data.head()

Unnamed: 0,市町村名,year,month,緯度,経度,NDVI
0,札幌市,2014,1,,,
1,札幌市,2014,2,,,
2,札幌市,2014,3,,,
3,札幌市,2014,4,43.061936,141.354292,0.134534
4,札幌市,2014,5,43.061936,141.354292,0.260263


In [8]:
# Check for missing values on the frontier data between train/test
frontier_NDVI_data = NDVI_data[(NDVI_data['month'] >= 5) & (NDVI_data['month'] <= 10)] #only harvest period
frontier_NDVI_data = frontier_NDVI_data[frontier_NDVI_data["year"] == 2021] #2021 is the frontier year
frontier_NDVI_data.info() #only 2 missing values, insignificant possible lookahead

<class 'pandas.core.frame.DataFrame'>
Int64Index: 654 entries, 88 to 13053
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   市町村名    654 non-null    object 
 1   year    654 non-null    int64  
 2   month   654 non-null    int64  
 3   緯度      652 non-null    float64
 4   経度      652 non-null    float64
 5   NDVI    652 non-null    float64
dtypes: float64(3), int64(2), object(1)
memory usage: 35.8+ KB


In [10]:
# Applying forward fill & linear interpolation on NDVI data and after filter for months [5, 10] related to haverst period

# ffill before filter to leverage information
NDVI_data["NDVI_ff"] = NDVI_data.groupby(["市町村名", "year"])["NDVI"].ffill()
NDVI_data["NDVI_li"] = NDVI_data.groupby(["市町村名", "year"])["NDVI"].apply(lambda x: x.interpolate())

filtered_NDVI_data = NDVI_data[(NDVI_data['month'] >= 5) & (NDVI_data['month'] <= 10)]

filtered_NDVI_data.info()
filtered_NDVI_data.describe()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6540 entries, 4 to 13077
Data columns (total 8 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   市町村名     6540 non-null   object 
 1   year     6540 non-null   int64  
 2   month    6540 non-null   int64  
 3   緯度       6510 non-null   float64
 4   経度       6510 non-null   float64
 5   NDVI     6510 non-null   float64
 6   NDVI_ff  6540 non-null   float64
 7   NDVI_li  6540 non-null   float64
dtypes: float64(5), int64(2), object(1)
memory usage: 459.8+ KB


To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  NDVI_data["NDVI_li"] = NDVI_data.groupby(["市町村名", "year"])["NDVI"].apply(lambda x: x.interpolate())


Unnamed: 0,year,month,緯度,経度,NDVI,NDVI_ff,NDVI_li
count,6540.0,6540.0,6510.0,6510.0,6510.0,6540.0,6540.0
mean,2018.5,7.5,43.139942,141.648139,0.678788,0.678731,0.678683
std,2.872501,1.707956,0.726526,0.961678,0.123645,0.123827,0.123798
min,2014.0,5.0,41.483942,139.512172,0.242342,0.242342,0.242342
25%,2016.0,6.0,42.584939,140.794788,0.594955,0.595017,0.594633
50%,2018.5,7.5,43.164178,141.784299,0.688965,0.688823,0.688768
75%,2021.0,9.0,43.72343,142.354421,0.780077,0.780139,0.780139
max,2023.0,10.0,44.722419,144.172737,0.944602,0.944602,0.944602


# 2. Outliers detection for NDVI

In [11]:
def detect_global_outliers(df, column, method="iqr", threshold=1.5, z_threshold=3):
    """
    Detect outliers considering cities as one.
    
    Parameters:
        method: "iqr" (interquartile interval) or "zscore" (std).
        threshold: IQR (default: 1.5).
        z_threshold: Z-Score (default: 3).
    """
    data = df[column].copy()
    
    if method == "iqr":
        Q1 = data.quantile(0.25)
        Q3 = data.quantile(0.75)
        IQR = Q3 - Q1
        lower = Q1 - threshold * IQR
        upper = Q3 + threshold * IQR
        outliers = df[(data < lower) | (data > upper)]
        
    elif method == "zscore":
        z_scores = np.abs(stats.zscore(data.dropna()))
        outliers = df[z_scores > z_threshold]
    
    return outliers

In [15]:
NDVI_list = ['NDVI', 'NDVI_ff, NDVI_li']

# List of variables to analyze again about outliers
print("Outliers for filtered_NDVI_data")
NDVI_list = ['NDVI', 'NDVI_ff', 'NDVI_li']

# Dictionary to store results
outliers_global = {}

print("For IQR threshold = 1.5")
for var in NDVI_list:
    outliers = detect_global_outliers(filtered_NDVI_data, var, method="iqr")
    outliers_global[var] = outliers
    print(f"Outliers in {var}: {len(outliers)} ({(len(outliers)/len(filtered_NDVI_data))*100:.2f}%)")
    
print()

print("For IQR threshold = 2")
for var in NDVI_list:
    outliers = detect_global_outliers(filtered_NDVI_data, var, threshold=2.0,  method="iqr")
    outliers_global[var] = outliers
    print(f"Outliers in {var}: {len(outliers)} ({(len(outliers)/len(filtered_NDVI_data))*100:.2f}%)")

print()

print("For IQR threshold = 3")
for var in NDVI_list:
    outliers = detect_global_outliers(filtered_NDVI_data, var, threshold=3.0, method="iqr")
    outliers_global[var] = outliers
    print(f"Outliers in {var}: {len(outliers)} ({(len(outliers)/len(filtered_NDVI_data))*100:.2f}%)")

Outliers for filtered_NDVI_data
For IQR threshold = 1.5
Outliers in NDVI: 23 (0.35%)
Outliers in NDVI_ff: 24 (0.37%)
Outliers in NDVI_li: 23 (0.35%)

For IQR threshold = 2
Outliers in NDVI: 0 (0.00%)
Outliers in NDVI_ff: 0 (0.00%)
Outliers in NDVI_li: 0 (0.00%)

For IQR threshold = 3
Outliers in NDVI: 0 (0.00%)
Outliers in NDVI_ff: 0 (0.00%)
Outliers in NDVI_li: 0 (0.00%)


In [17]:
filtered_NDVI_data.head()

Unnamed: 0,市町村名,year,month,緯度,経度,NDVI,NDVI_ff,NDVI_li
4,札幌市,2014,5,43.061936,141.354292,0.260263,0.260263,0.260263
5,札幌市,2014,6,43.061936,141.354292,0.335899,0.335899,0.335899
6,札幌市,2014,7,43.061936,141.354292,0.599395,0.599395,0.599395
7,札幌市,2014,8,43.061936,141.354292,0.413911,0.413911,0.413911
8,札幌市,2014,9,43.061936,141.354292,0.40363,0.40363,0.40363


In [16]:
filtered_NDVI_data.to_csv("../datasets/filtered_NDVI_data.csv", index=False)