In [None]:
"""
Land Use Land Cover Regression Analysis

This Python script performs a regression analysis on land use land cover (LULC) data for multiple years. 
The analysis aims to explore the relationship between LULC classes (Forest, Grass, or Shrubland) and the corresponding rasterized arrays of land use data. 
The script uses the GDAL library for geospatial data processing, numpy for array manipulation, pandas for handling dummy variables, and statsmodels for regression modeling.

The script's functionality includes the following steps:
1. Define the file paths for the geotiff files containing original land use land cover data and rasterized arrays for each year.
2. Reclassify the LULC values into binary classes: 1 if equal to 20, 30, or 40 (Forest, Grass, or Shrubland) and 0 otherwise.
3. Flatten the land use land cover geotiff and rasterized geotiff into 1D arrays and store them in the `lulc_arrays` and `rasterized_arrays` lists, respectively, for each year.
4. Concatenate the arrays in `lulc_arrays` to create a single 1D array `lulc_y`, which represents the dependent variable for the regression.
5. Create the array `lulc_X` with rasterized arrays and corresponding years. Add a constant column for the intercept in the regression.
6. Add dummy variables for each year to capture year-specific effects using the pandas `get_dummies` function.
7. Concatenate the dummy variables with `lulc_X`.
8. Run an Ordinary Least Squares (OLS) regression model using the statsmodels `OLS` function.
9. Display the regression output summary, including coefficient estimates, standard errors, p-values, and other statistics for the independent variables.

Note:
- Before running the script, ensure that the required GDAL, numpy, pandas, and statsmodels libraries are installed.
- The code provided assumes that the geotiff files for the LULC and rasterized data are available for the specified years.
- Additional preprocessing or data handling may be necessary based on the specific use case.

Author: Matthew Braaksma
Date: 08-03-2023
"""

In [1]:
from osgeo import gdal
import numpy as np
import pandas as pd
import statsmodels.api as sm

# Root folder of the GLASS-GLC inputs, relative to the notebook's working directory
DATA_DIR = "../../data/GLASS-GLC"

# Path templates for the original land use land cover rasters and the
# rasterized arrays; "{}" is filled with the year via str.format
geotiff_lulc_template = DATA_DIR + "/raw/GLASS-GLC_7classes_{}.tif"
geotiff_rasterized_template = DATA_DIR + "/rasterized/rasterized_{}.tif"

# Subset parameters: x/y offset and x/y size (in pixels) of the window that is
# read from each land use land cover raster
cr_xoff = 2095
cr_yoff = 1640
cr_xsize = 80
cr_ysize = 70

# Study period; both bounds are inclusive.
# NOTE(review): this comment previously read "1982 to 2015" while the range has
# always started at 1983 - confirm that 1982 is excluded on purpose.
FIRST_YEAR = 1983
LAST_YEAR = 2015
years = list(range(FIRST_YEAR, LAST_YEAR + 1))

In [2]:
def _open_geotiff(path, description, year):
    """Open a geotiff read-only and fail loudly if GDAL cannot open it.

    Parameters
    ----------
    path : str
        Path of the geotiff file.
    description : str
        Human-readable name of the dataset, used in the error message.
    year : int
        Year the file belongs to, used in the error message.

    Returns
    -------
    The dataset object returned by ``gdal.Open``.

    Raises
    ------
    RuntimeError
        If ``gdal.Open`` returns ``None``.
    """
    dataset = gdal.Open(path, gdal.GA_ReadOnly)
    if dataset is None:
        # Raise instead of calling exit(): inside a notebook exit() is the
        # kernel-exit helper, which is not a dependable way to stop the cell
        # and leaves no traceback pointing at the failing file.
        raise RuntimeError(f"Error: Could not open the {description} geotiff file for year {year}.")
    return dataset


# Per-year flattened arrays: rasterized values and reclassified land cover
rasterized_arrays = []
lulc_arrays = []

# Loop through each year
for year in years:
    # Read the subset window of the land use land cover geotiff for this year
    geotiff_lulc_path = geotiff_lulc_template.format(year)
    geotiff_lulc_dataset = _open_geotiff(geotiff_lulc_path, "land use land cover", year)
    lulc_data = geotiff_lulc_dataset.ReadAsArray(cr_xoff, cr_yoff, cr_xsize, cr_ysize)

    # Read band 1 of the rasterized geotiff for this year (whole raster)
    geotiff_rasterized_path = geotiff_rasterized_template.format(year)
    geotiff_rasterized_dataset = _open_geotiff(geotiff_rasterized_path, "rasterized", year)
    rasterized_data = geotiff_rasterized_dataset.GetRasterBand(1).ReadAsArray()

    # Validate BEFORE appending so a mismatching year never ends up in the
    # lists: the two arrays are later paired element by element.
    if lulc_data.shape != rasterized_data.shape:
        raise ValueError(f"Error: Size mismatch for the land use land cover data for year {year}.")

    # Reclassify the LULC values to 1 if equal to 20, 30, or 40, and 0 otherwise (Forest, Grass, or Shrubland)
    lulc_data_reclassified = np.where(np.isin(lulc_data, (20, 30, 40)), 1, 0)

    # Flatten both rasters to 1D and store them for this year
    lulc_arrays.append(lulc_data_reclassified.flatten())
    rasterized_arrays.append(rasterized_data.flatten())



In [3]:
# Concatenate lulc_y to create a single 1D array
lulc_y = np.concatenate(lulc_arrays)

# Create the array lulc_X with rasterized arrays and corresponding years
lulc_X = np.column_stack((np.concatenate(rasterized_arrays), np.repeat(years, cr_xsize * cr_ysize)))

# Add a constant column for the intercept in the regression
lulc_X = sm.add_constant(lulc_X)

# Add dummy variables for each year
dummy_years = pd.get_dummies(lulc_X[:, -1], prefix="Year")

# Concatenate dummy variables with lulc_X
lulc_X = np.column_stack((lulc_X[:, :-1], dummy_years))

In [4]:
# Run the regression
model = sm.OLS(lulc_y, lulc_X).fit()

# Column names for the regression results
col_names = ["const"] + ["PA"] + [f"Year_{year}" for year in years]

# Display the regression output with labeled columns
print(model.summary(xname=col_names))

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.278
Model:                            OLS   Adj. R-squared:                  0.278
Method:                 Least Squares   F-statistic:                     2093.
Date:                Thu, 03 Aug 2023   Prob (F-statistic):               0.00
Time:                        11:30:57   Log-Likelihood:            -1.0384e+05
No. Observations:              184800   AIC:                         2.077e+05
Df Residuals:                  184765   BIC:                         2.081e+05
Df Model:                          34                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       3.016e+08   1.96e+08      1.541      0.1