In [278]:
import pandas as pd
import numpy as np
import scipy.optimize as opt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [279]:
import os
print(os.getcwd()) #checking working directory 

/Users/luxifeyereisen/FWE438/Homework/Homework 4


In [280]:
file_path = 'Wcr_GPPdaily.csv' #load in dataset 
GPP = pd.read_csv(file_path)

In [281]:
# Data Preprocessing
# Rename Columns 

GPP.rename(columns={ #rename columns for easier reference
    'TA_F': 'TA',
    'SW_IN_F': 'SW',
    'VPD_F': 'VPD',
    'GPP_NT_VUT_REF': 'GPP'
}, inplace=True)

GPP.drop(columns=['TIMESTAMP'], errors='ignore', inplace=True) #drop timestamp column since it is not needed for regression
# The 'errors' argument prevents errors if the column does not exist
GPP.dropna(inplace=True) #drops any rows with Na values or NaNs

print(GPP.head(5))

       TA      SW    VPD       GPP
0 -20.063  72.603  0.413 -0.517364
1 -12.814  12.358  0.147 -0.094241
2 -12.625  33.132  0.128 -0.166819
3 -18.652  93.481  0.263 -0.582301
4 -20.269  45.502  0.261 -0.568240


In [282]:
# Feature Engineering - Interaction Terms

GPP['SW_VPD'] = GPP['SW'] * GPP['VPD'] #interaction terms capture potential variable interactions to help analyze combined effects of different environmental factors
GPP['SW_TA'] = GPP['SW'] * GPP['TA']
GPP['VPD_TA'] = GPP['VPD'] * GPP['TA']

print(GPP.head(5))

       TA      SW    VPD       GPP     SW_VPD        SW_TA    VPD_TA
0 -20.063  72.603  0.413 -0.517364  29.985039 -1456.633989 -8.286019
1 -12.814  12.358  0.147 -0.094241   1.816626  -158.355412 -1.883658
2 -12.625  33.132  0.128 -0.166819   4.240896  -418.291500 -1.616000
3 -18.652  93.481  0.263 -0.582301  24.585503 -1743.607612 -4.905476
4 -20.269  45.502  0.261 -0.568240  11.876022  -922.280038 -5.290209


In [294]:
# Build a Linear Regression Model using SciPy Optimization

X = GPP[['SW', 'VPD', 'TA', 'SW_VPD', 'SW_TA', 'VPD_TA']] #define independent (predictor)
y = GPP['GPP'] #and the dependent (target) variables

model = LinearRegression()
model.fit(X, y) #adds an intercept column for the regression model

coefficients = model.coef_
intercept = model.intercept_

y_pred = model.predict(X) #compute predictions using the linregress model which each prediction is weighted by its computed slope

r2_score_value = r2_score(y, y_pred)

print("Coefficients:", coefficients)  # Slopes for each predictor
print("Intercept:", intercept)  # Bias term
print("R² Score:", r2_score_value)

Coefficients: [-3.24914968e+00 -6.48574173e+02  8.21699813e+01  1.79249151e+00
 -4.56050357e-01  1.35954025e+01]
Intercept: -944.0672018208656
R² Score: 0.013582700933255087


In [284]:
GPP_results = pd.DataFrame({
    'Actual_GPP': y,
    'Predicted_GPP_Optimized': y_pred_opt,
    'Predicted_GPP_Linregress': y_pred_linreg
})

csv_filename = 'Feyereisen_Wcr_GPPdaily_HW4.csv' # creates a csv file 
GPP_results.to_csv(csv_filename, index=False) # this adds the annual_stats data frame into the newly made csv file 
print(f"Homework saved as {csv_filename}")

Homework saved as Feyereisen_Wcr_GPPdaily_HW4.csv
