In [None]:
# Dependencies and setup
import numpy as np
import os
import pandas as pd
import warnings
import seaborn as sn
import matplotlib.pyplot as plt
warnings.simplefilter("ignore")

In [None]:
# Load the annual all-state data and discard every row containing a missing value
annualData = pd.read_csv("resources/annualAllStateData.csv").dropna()

# Show the cleaned DataFrame as the cell output
annualData

In [None]:
# Remove the row(s) that hold the United States totals rather than a single state
is_national_total = annualData['State'] == 'United States'
annualData = annualData.loc[~is_national_total]

# Feature Selection

In [None]:
# Build the x values by removing the target column plus the 'State' and 'Year' columns
non_feature_columns = ['Transportation (MtCO2e)', 'State', 'Year']
features = annualData.drop(columns=non_feature_columns)
X = features
X.head()

In [None]:
# Pairwise correlation between the candidate features, computed once
# (the previous bare X.corr() call discarded its result and was recomputed).
corrMatrix = X.corr()

# Show result of correlation matrix as an annotated heatmap
plt.figure(figsize=(25,25))

sn.heatmap(corrMatrix, annot=True)

# NOTE(review): the CSV is read from "resources/" but the image is written to
# "Resources/"; on a case-sensitive filesystem only one of these can exist -
# confirm the directory name.
# NOTE(review): the later reduced-feature heatmap cell saves to this same path,
# so this image is overwritten when the notebook runs top to bottom.
plt.savefig("Resources/mnCorr.png")

plt.show()

In [None]:
# Rebuild the x values given the correlation above: the gasoline-use and
# vehicle columns are now removed in addition to the target, 'State' and 'Year'
excluded_columns = [
    'Transportation (MtCO2e)',
    'State',
    'Highway use of gasoline (thousand gallons)',
    'Vehicles',
    'Year',
]
features = annualData.drop(columns=excluded_columns)
X = features
X.head()

In [None]:
# Correlation matrix for the reduced feature set, computed once
# (the previous bare X.corr() call discarded its result and was recomputed).
corrMatrix = X.corr()

# Show result of correlation matrix as an annotated heatmap
plt.figure(figsize=(25,25))

sn.heatmap(corrMatrix, annot=True)

# NOTE(review): this writes to the same path as the earlier full-feature
# heatmap cell and therefore replaces that image - confirm this is intended.
# NOTE(review): the CSV is read from "resources/" but the image is written to
# "Resources/"; confirm the directory name on case-sensitive filesystems.
plt.savefig("Resources/mnCorr.png")

plt.show()

# Multiple Linear Regression

### Create a Train-Test Split
Use `Transportation (MtCO2e)` as the target (y) values.

In [None]:
# Take the target column and reshape it into a single-column 2-D array
target = annualData['Transportation (MtCO2e)']
y = target.values.reshape(-1, 1)
print(X.shape, y.shape)

In [None]:
# Preview the first 10 target values instead of printing the whole array,
# matching the [:10] previews used for the predictions and y_test further down
print(y[:10])

In [None]:
# Dependencies and setup
from sklearn.model_selection import train_test_split

# Split features and target into training and testing parts; the fixed
# random_state makes the split repeatable between runs
split_parts = train_test_split(X, y, random_state=42)
X_train, X_test, y_train, y_test = split_parts
X_train.head()

### Pre-Processing
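Scale the features with `StandardScaler`, fitting it on the training split only and applying the same transform to the test split. (Feature selection was already performed above using the correlation matrices.)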

In [None]:
# Dependencies and setup
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only, then apply that same
# transform to both the training and the testing features
X_scale = StandardScaler()
X_scale.fit(X_train)
X_train_scaled = X_scale.transform(X_train)
X_test_scaled = X_scale.transform(X_test)

In [None]:
# Preview the first 10 rows of the scaled training features instead of the
# whole array, matching the [:10] previews used further down
X_train_scaled[:10]

### Train the Linear Regression Model

In [None]:
# Dependencies and setup
from sklearn.linear_model import LinearRegression
# Unfitted linear regression estimator with default settings; it is trained in the next cell
model = LinearRegression()

In [None]:
# Train on the scaled training data, then record the score on both splits
model.fit(X_train_scaled, y_train)
training_score = model.score(X_train_scaled, y_train)
testing_score = model.score(X_test_scaled, y_test)
print(
    f"Training Score: {training_score}",
    f"Testing Score: {testing_score}",
    sep="\n",
)

### Coefficients (Independent Variables)

In [None]:
# Display coefficient factors.
# The model was fit on X_train_scaled, so these weights apply to the
# standardized features rather than to the raw column values.
print(model.coef_)

In [None]:
# Show the first coefficient on its own, at full precision (not rounded)
model.coef_[0, 0]

In [None]:
# Display the fitted intercept (constant term) of the regression
print(model.intercept_)

In [None]:
# Write out the fitted equation. The weights come from a model trained on the
# scaled features, so each term refers to the standardized value of that feature.
# NOTE(review): the labels assume the feature columns are ordered GDP, Population,
# Transit, VMT, SQMI, Temp - confirm against X.columns.
equation_template = "The linear equation is: Y = {:.5} + {:.5}*GDP + {:.5}*Population + {:.5}*Transit + {:.5}*VMT + {:.5}*SQMI + {:.5}*Temp"
fitted_weights = model.coef_[0]
print(equation_template.format(model.intercept_[0], *fitted_weights[:6]))

In [None]:
# Define function to predict GHG values
def make_prediction(GDP, population, transit, VMT, SQMI, temperature, coefs, intercept):
    """Evaluate the linear equation for a single observation.

    Parameters
    ----------
    GDP, population, transit, VMT, SQMI, temperature
        Feature values, paired in this order with ``coefs[0][0]`` .. ``coefs[0][5]``.
    coefs
        Two-level indexable of weights; only row 0 is read (e.g. ``model.coef_``).
    intercept
        Indexable whose element 0 is the constant term (e.g. ``model.intercept_``).

    Returns
    -------
    The sum of every feature value multiplied by its weight, plus ``intercept[0]``.
    """
    weights = coefs[0]
    inputs = (GDP, population, transit, VMT, SQMI, temperature)

    # Accumulate term by term, left to right, in the order the equation is written
    GHG = weights[0] * inputs[0]
    for position in range(1, 6):
        GHG = GHG + weights[position] * inputs[position]

    return GHG + intercept[0]

In [None]:
# Display the prediction for one scaled test observation using the hand-written equation
row = 3
observation = X_test_scaled[row]
GDP_param = observation[0]
population_param = observation[1]
transit_param = observation[2]
VMT_param = observation[3]
SQMI_param = observation[4]
temperature_param = observation[5]

make_prediction(
    GDP=GDP_param,
    population=population_param,
    transit=transit_param,
    VMT=VMT_param,
    SQMI=SQMI_param,
    temperature=temperature_param,
    coefs=model.coef_,
    intercept=model.intercept_,
)

In [None]:
# R squared on the scaled test data (same call as the testing score above, default sample weights)
model.score(X_test_scaled, y_test)

In [None]:
# Make predictions for every row of the scaled test features
predictions = model.predict(X_test_scaled)
# Show only the first 10 predictions
predictions[:10]

In [None]:
# Display the first 10 actual y test values, for comparison with the first 10 predictions
y_test[:10]

In [None]:
# Dependencies and setup
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Name the actual and predicted targets once; the MAE and RMSE cells reuse them
y_true, y_pred = y_test, predictions

# Calculate and display Mean Square Error
mean_squared_error(y_true, y_pred)

In [None]:
# Calculate and display Mean Absolute Error between the actual and predicted test targets
mean_absolute_error(y_true, y_pred)

In [None]:
# Calculate and display Root Mean Square Error: the square root of the test MSE
test_mse = mean_squared_error(y_true, y_pred)
np.sqrt(test_mse)