In [None]:
# Dependencies and setup
# Standard library
import os
import warnings

# Third-party
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn

# Suppress every warning for the rest of the session so cell output stays uncluttered
warnings.simplefilter("ignore")

In [None]:
# Load the annual state data and discard every row that contains a missing value
annualData = pd.read_csv("resources/annualAllStateData.csv").dropna()

# Display DataFrame
annualData

In [None]:
# Keep only rows whose 'State' is not 'United States'
# (presumably the national aggregate row -- confirm against the CSV)
annualData = annualData[annualData['State'].ne('United States')]

# Feature Selection

In [None]:
# Candidate predictors: every column except the target ('Transportation (MtCO2e)')
# and the 'State' / 'Year' columns
X = features = annualData.drop(
    columns=[
        'Transportation (MtCO2e)',
        'State',
        'Year',
    ]
)
X.head()

In [None]:
# Develop correlation matrix (computed once; the result is reused for the plot)
corrMatrix = X.corr()

# Save this first heatmap under its own name: the reduced-feature heatmap drawn
# later in the notebook is written to "Resources/mnCorr.png" and would otherwise
# overwrite this figure.
corrPath = "Resources/mnCorrAllFeatures.png"

# savefig raises FileNotFoundError when the target folder does not exist
# (the CSV is read from lowercase "resources/", so on a case-sensitive
# filesystem "Resources/" may be missing), so create it when needed.
os.makedirs(os.path.dirname(corrPath), exist_ok=True)

# Show result of correlation matrix
plt.figure(figsize=(25,25))

sn.heatmap(corrMatrix, annot=True)

plt.savefig(corrPath)

plt.show()

In [None]:
# Narrow the predictors after inspecting the correlation matrix: in addition to the
# target, 'State' and 'Year', also drop the gasoline-use and vehicle-count columns
X = features = annualData.drop(
    columns=[
        'Transportation (MtCO2e)',
        'State',
        'Highway use of gasoline (thousand gallons)',
        'Vehicles',
        'Year',
    ]
)
X.head()

In [None]:
# Re-develop correlation matrix with new features (computed once and reused)
corrMatrix = X.corr()

# Output location of the heatmap for the reduced feature set
corrPath = "Resources/mnCorr.png"

# savefig raises FileNotFoundError when the target folder does not exist
# (the CSV is read from lowercase "resources/", so on a case-sensitive
# filesystem "Resources/" may be missing), so create it when needed.
os.makedirs(os.path.dirname(corrPath), exist_ok=True)

# Show result of correlation matrix
plt.figure(figsize=(25,25))

sn.heatmap(corrMatrix, annot=True)

plt.savefig(corrPath)

plt.show()

# Multiple Linear Regression

### Create a Train-Test Split
Use the `Transportation (MtCO2e)` column for the y values.

In [None]:
# Target: the transportation emissions column as a 2-D column vector (n_samples, 1)
targetColumn = 'Transportation (MtCO2e)'
y = annualData[targetColumn].to_numpy().reshape(-1, 1)

# Sanity check: X and y must have the same number of rows
print(X.shape, y.shape)

In [None]:
# Inspect the target array built in the previous cell
print(y)

In [None]:
# Dependencies and setup
from sklearn.model_selection import train_test_split

# Hold out a test split; random_state is pinned so the same rows are chosen on every run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)
X_train.head()

### Pre-Processing
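Scale the features using `StandardScaler`, fitted on the training split only and then applied to both the training and testing splits. (Feature selection was already performed in the Feature Selection section above.)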

In [None]:
# Dependencies and setup
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply that
# same fitted scaler to the test split
X_scale = StandardScaler()
X_train_scaled = X_scale.fit_transform(X_train)
X_test_scaled = X_scale.transform(X_test)

### Train the Linear Regression Model

In [None]:
# Dependencies and setup
from sklearn.linear_model import LinearRegression
# Unfitted linear regression estimator created with its default arguments
model = LinearRegression()

In [None]:
# Train on the scaled training split
model.fit(X_train_scaled, y_train)

# Score the fitted model on the data it was trained on and on the held-out split
training_score, testing_score = (
    model.score(X_train_scaled, y_train),
    model.score(X_test_scaled, y_test),
)

print(f"Training Score: {training_score}")
print(f"Testing Score: {testing_score}")

### Coefficients (Independent Variables)

In [None]:
# Display coefficient factors
# The model was fitted on StandardScaler-transformed features, so each coefficient
# is on the standardized scale; entries follow the column order of X
print(model.coef_)

In [None]:
# Display intercept of the fitted model (fitted against the standardized features)
print(model.intercept_)

In [None]:
# R squared on the held-out test split (unweighted, i.e. the default sample_weight)
model.score(X_test_scaled, y_test)

In [None]:
# Make predictions on the scaled test features with the fitted linear regression model
# (no hyperparameter tuning is performed in the visible part of this notebook)
predictions = model.predict(X_test_scaled)
# Preview the first ten predictions
predictions[:10]

In [None]:
# First ten actual target values, for comparison with the first ten predictions
y_test[:10]

In [None]:
# Dependencies and setup
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Name the actual and predicted targets once; the metric cells below reuse them
y_true, y_pred = y_test, predictions

# MSE
mean_squared_error(y_true=y_true, y_pred=y_pred)

In [None]:
# MAE: mean absolute difference between actual and predicted test targets
mean_absolute_error(y_true=y_true, y_pred=y_pred)

In [None]:
# RMSE: square root of the test-set mean squared error
np.sqrt(mean_squared_error(y_true=y_true, y_pred=y_pred))