In [19]:

import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import seaborn as sns
import numpy as np
from sklearn.feature_selection import r_regression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import cross_val_score

# Load the working dataset (parsed with read_csv defaults) into `df`,
# the frame every modelling cell below selects its columns from.
df = pd.read_csv(filepath_or_buffer='./data/workingData')




In [20]:
# Multiple linear regression: predict co2_including_luc from Fruit and Vegetables.

test_p = .2   # fraction of rows held out as the test split
seed = 123    # fixed random_state so the split is reproducible

# Selecting with a list of column names already yields 2-D arrays,
# (n, 2) for X and (n, 1) for y, so no reshape is needed.
X = df[['Fruit', 'Vegetables']].values
y = df[['co2_including_luc']].values

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_p, random_state=seed
)

linModel = LinearRegression()
linModel.fit(X_train, y_train)
y_pred = linModel.predict(X_test)

# Hold-out metrics on the test split.
print('MSE =', mean_squared_error(y_test, y_pred))
print('MAE =', mean_absolute_error(y_test, y_pred))
print('R-squared =', r2_score(y_test, y_pred))

# Cross-validated MSE on the training split only. The scorer returns
# negated MSE, so flip the sign back.
ten_fold_scores = -cross_val_score(
    linModel, X_train, y_train, scoring='neg_mean_squared_error', cv=10
)

# Leave-one-out CV: one fold per training row. The fold count is derived from
# the data instead of a hard-coded 49, which is only leave-one-out when the
# training split has exactly 49 rows (and fails outright if it has fewer).
LOOCV_scores = -cross_val_score(
    linModel, X_train, y_train, scoring='neg_mean_squared_error', cv=len(X_train)
)

print('k = 10: ', np.mean(ten_fold_scores))
print('LOOCV: ', np.mean(LOOCV_scores))




MSE = 1102.4368871910176
MAE = 27.728753962775183
R-squared = 0.5386424005059065
k = 10:  1918.2758827315424
LOOCV:  2029.6336312562075


In [21]:
# Multiple quadratic regression (polynomial features, degree 2):
# predict co2_including_luc from Fruit and Vegetables.
test_p = .2   # fraction of rows held out as the test split
seed = 123    # fixed random_state so the split is reproducible

# Selecting with a list of column names already yields 2-D arrays,
# (n, 2) for X and (n, 1) for y, so no reshape/asarray is needed.
X = df[['Fruit', "Vegetables"]].values
y = df[['co2_including_luc']].values

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_p, random_state=seed
)

# Fit the feature expansion on the training split only, then reuse the fitted
# transformer on the test split (transform, not fit_transform) so nothing is
# ever re-fitted on held-out data.
polyFeatures = PolynomialFeatures(degree=2, include_bias=False)
xPoly = polyFeatures.fit_transform(X_train)
polyModel = LinearRegression().fit(xPoly, y_train)
poly_test = polyFeatures.transform(X_test)
y_pred = polyModel.predict(poly_test)

# Hold-out metrics on the test split.
print('MSE =', mean_squared_error(y_test, y_pred))
print('MAE =', mean_absolute_error(y_test, y_pred))
print('R-squared =', r2_score(y_test, y_pred))

# Cross-validated MSE on the expanded training features. The scorer returns
# negated MSE, so flip the sign back.
ten_fold_scores = -cross_val_score(
    polyModel, xPoly, y_train, scoring='neg_mean_squared_error', cv=10
)

# Leave-one-out CV: one fold per training row. The fold count is derived from
# the data instead of a hard-coded 49, which is only leave-one-out when the
# training split has exactly 49 rows (and fails outright if it has fewer).
LOOCV_scores = -cross_val_score(
    polyModel, xPoly, y_train, scoring='neg_mean_squared_error', cv=len(xPoly)
)
print('k = 10: ', np.mean(ten_fold_scores))
print('LOOCV: ', np.mean(LOOCV_scores))

MSE = 1004.0455530970878
MAE = 26.30319932754222
R-squared = 0.5798180816138369
k = 10:  1273.0696345353342
LOOCV:  1221.7889558766985


In [22]:
# Multiple cubic regression (polynomial features, degree 3):
# predict co2_including_luc from Vegetables and Fruit.
test_p = .2   # fraction of rows held out as the test split
seed = 123    # fixed random_state so the split is reproducible

# Selecting with a list of column names already yields 2-D arrays,
# (n, 2) for X and (n, 1) for y, so no reshape/asarray is needed.
X = df[['Vegetables', 'Fruit']].values
y = df[['co2_including_luc']].values

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_p, random_state=seed
)

# Fit the feature expansion on the training split only, then reuse the fitted
# transformer on the test split (transform, not fit_transform) so nothing is
# ever re-fitted on held-out data.
polyFeatures = PolynomialFeatures(degree=3, include_bias=False)
xPoly = polyFeatures.fit_transform(X_train)
polyModel = LinearRegression().fit(xPoly, y_train)
poly_test = polyFeatures.transform(X_test)
y_pred = polyModel.predict(poly_test)

# Hold-out metrics on the test split.
print('MSE =', mean_squared_error(y_test, y_pred))
print('MAE =', mean_absolute_error(y_test, y_pred))
print('R-squared =', r2_score(y_test, y_pred))

# Cross-validated MSE on the expanded training features. The scorer returns
# negated MSE, so flip the sign back.
ten_fold_scores = -cross_val_score(
    polyModel, xPoly, y_train, scoring='neg_mean_squared_error', cv=10
)

# Leave-one-out CV: one fold per training row. The fold count is derived from
# the data instead of a hard-coded 49, which is only leave-one-out when the
# training split has exactly 49 rows (and fails outright if it has fewer).
LOOCV_scores = -cross_val_score(
    polyModel, xPoly, y_train, scoring='neg_mean_squared_error', cv=len(xPoly)
)
print('k = 10: ', np.mean(ten_fold_scores))
print('LOOCV: ', np.mean(LOOCV_scores))

MSE = 1050.1847669821454
MAE = 27.06257979181939
R-squared = 0.5605093327794317
k = 10:  1368.2500074298428
LOOCV:  1375.3592041736888
