In [1]:
# Dependencies and setup.
# The scikit-learn pieces (train_test_split, StandardScaler, LinearRegression,
# metrics) are imported further down, in the cells that first use them.
import numpy as np
import os
import pandas as pd
import warnings
import seaborn as sn
import matplotlib.pyplot as plt
# Install an "ignore" filter so warnings are suppressed in all subsequent cells.
warnings.simplefilter("ignore")

In [2]:
# Load the CSV and discard, in the same expression, every row that
# contains at least one missing value
annualData = pd.read_csv("resources/annualAllStateData.csv").dropna()

# Show the cleaned DataFrame
annualData

Unnamed: 0,State,Year,Transportation (MtCO2e),State GDP (Million US$ (chained 1997/2005)),Population (People),Transit Ridership,Highway use of gasoline (thousand gallons),Highway vehicle-miles traveled (millions),Vehicles,sq mi,tempHigh
0,Alabama,1994,32.389399,94803.0,4260229,9.682137e+06,2220444.0,4.895600e+04,3176560.0,50645.33,74.10
1,Alaska,1994,11.402707,26188.0,603308,3.097134e+06,273319.0,4.150000e+03,533496.0,570640.95,33.70
2,Arizona,1994,27.085805,104104.0,4245089,5.289982e+07,1899942.0,3.877400e+04,2813460.0,113594.08,74.70
3,Arkansas,1994,19.010329,53641.0,2494019,4.535502e+06,1286100.0,2.494800e+04,1566840.0,52035.48,71.10
4,California,1994,206.555572,904778.0,31484435,1.130728e+09,12932907.0,2.719430e+05,22338870.0,155779.22,70.60
...,...,...,...,...,...,...,...,...,...,...,...
1269,Washington,2018,47.607909,524486.9,7523869,2.702230e+08,2765858.0,6.236666e+04,7152413.0,66455.52,57.70
1270,West Virginia,2018,13.154700,71858.7,1804291,7.762619e+06,791672.0,1.944732e+04,1693719.0,24038.21,63.30
1271,Wisconsin,2018,31.257005,303767.4,5807406,5.896453e+07,2491234.0,6.588543e+04,5683061.0,54157.80,53.10
1272,Wyoming,2018,8.110488,38696.3,577601,2.504852e+06,294731.0,1.043844e+04,837024.0,97093.14,55.50


In [3]:
# Keep only the rows whose 'State' value is not "United States"
# (presumably a national aggregate row -- confirm against the data source)
annualData = annualData[annualData['State'].ne('United States')]

# Feature Selection

In [4]:
# Candidate x values: every column except the emissions column (used later as y)
# and the State / Year columns
X = features = annualData.drop(
    columns=['Transportation (MtCO2e)', 'State', 'Year']
)
X.head()

Unnamed: 0,State GDP (Million US$ (chained 1997/2005)),Population (People),Transit Ridership,Highway use of gasoline (thousand gallons),Highway vehicle-miles traveled (millions),Vehicles,sq mi,tempHigh
0,94803.0,4260229,9682137.0,2220444.0,48956.0,3176560.0,50645.33,74.1
1,26188.0,603308,3097134.0,273319.0,4150.0,533496.0,570640.95,33.7
2,104104.0,4245089,52899820.0,1899942.0,38774.0,2813460.0,113594.08,74.7
3,53641.0,2494019,4535502.0,1286100.0,24948.0,1566840.0,52035.48,71.1
4,904778.0,31484435,1130728000.0,12932907.0,271943.0,22338870.0,155779.22,70.6


In [None]:
# Develop correlation matrix: pairwise correlation between the feature columns.
# Computed once; the earlier duplicate call whose result was discarded is removed.
corrMatrix = X.corr()

# Show result of correlation matrix as an annotated heatmap, drawn on an
# explicit figure/axes pair instead of the implicit pyplot current figure
fig, ax = plt.subplots(figsize=(25, 25))
sn.heatmap(corrMatrix, annot=True, ax=ax)

# Save next to the input data. The directory is spelled in lower case to match
# the path the CSV is read from, so it also resolves on case-sensitive filesystems.
fig.savefig("resources/mnCorr.png")

plt.show()

In [5]:
# Re-set the x values given the correlation matrix above: besides the emissions,
# State and Year columns, gasoline use and vehicle counts are now left out too
# (presumably because they correlate strongly with the remaining features -- confirm
# against the heatmap)
X = features = annualData.drop(
    columns=[
        'Transportation (MtCO2e)',
        'State',
        'Highway use of gasoline (thousand gallons)',
        'Vehicles',
        'Year',
    ]
)
X.head()

Unnamed: 0,State GDP (Million US$ (chained 1997/2005)),Population (People),Transit Ridership,Highway vehicle-miles traveled (millions),sq mi,tempHigh
0,94803.0,4260229,9682137.0,48956.0,50645.33,74.1
1,26188.0,603308,3097134.0,4150.0,570640.95,33.7
2,104104.0,4245089,52899820.0,38774.0,113594.08,74.7
3,53641.0,2494019,4535502.0,24948.0,52035.48,71.1
4,904778.0,31484435,1130728000.0,271943.0,155779.22,70.6


In [None]:
# Re-develop correlation matrix with the reduced feature set.
# Computed once; the earlier duplicate call whose result was discarded is removed.
corrMatrix = X.corr()

# Show result of correlation matrix as an annotated heatmap, drawn on an
# explicit figure/axes pair instead of the implicit pyplot current figure
fig, ax = plt.subplots(figsize=(25, 25))
sn.heatmap(corrMatrix, annot=True, ax=ax)

# Save next to the input data. The directory is spelled in lower case to match
# the path the CSV is read from, so it also resolves on case-sensitive filesystems.
# NOTE: the file name (mnCorr.png) is the one the first heatmap cell uses as well;
# when both resolve to the same directory, this image replaces the earlier one.
fig.savefig("resources/mnCorr.png")

plt.show()

# Multiple Linear Regression

### Create a Train-Test Split
Use the `Transportation (MtCO2e)` column for the y values.

In [6]:
# Set y values: selecting the column with a one-element list yields a
# single-column frame, so .values is already a 2-D (n_rows, 1) array
y = annualData[['Transportation (MtCO2e)']].values
print(X.shape, y.shape)

(1224, 6) (1224, 1)


In [7]:
# Inspect the target array (numpy abbreviates long arrays with "..." when printed)
print(y)

[[32.38939867]
 [11.40270658]
 [27.08580517]
 ...
 [13.15470028]
 [31.25700488]
 [ 8.11048825]]


In [8]:
# Dependencies and setup
from sklearn.model_selection import train_test_split

# Partition features and target into training and testing subsets.
# No test_size is passed, so scikit-learn's default split proportions apply;
# the fixed random_state makes the shuffle repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)
X_train.head()

Unnamed: 0,State GDP (Million US$ (chained 1997/2005)),Population (People),Transit Ridership,Highway vehicle-miles traveled (millions),sq mi,tempHigh
31,596435.0,18459470,2349900000.0,112970.0,47126.4,54.7
251,186541.0,5297672,95181440.0,56655.0,54157.8,57.4
484,70242.0,1738643,6222364.0,19016.0,76824.17,62.6
141,19410.0,742213,849503.0,7817.0,75811.0,53.0
1073,112938.9,2978048,6694851.0,34897.33423,52035.48,71.9


### Pre-Processing
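Scale the data using the StandardScaler, fitted on the training set only and then applied to both the training and testing sets (feature selection was already performed above).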

In [9]:
# Dependencies and setup
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only, then push both partitions
# through that same fitted scaler
X_scale = StandardScaler()
X_scale.fit(X_train)
X_train_scaled, X_test_scaled = X_scale.transform(X_train), X_scale.transform(X_test)

### Train the Linear Regression Model

In [10]:
# Dependencies and setup
from sklearn.linear_model import LinearRegression
# Linear regression estimator created with its default settings; it is fitted in the next cell
model = LinearRegression()

In [11]:
# Train the model on the scaled training features
model.fit(X_train_scaled, y_train)

# Score the fitted model on the data it was trained on and on the held-out data
training_score = model.score(X_train_scaled, y_train)
testing_score = model.score(X_test_scaled, y_test)

# Report both scores, one per line
print(
    f"Training Score: {training_score}",
    f"Testing Score: {testing_score}",
    sep="\n",
)

Training Score: 0.956382427410011
Testing Score: 0.953817967738916


### Coefficients (Independent Values)

In [12]:
# Display coefficient factors: one per feature, in the column order of X.
# The model was fitted on X_train_scaled, so these apply to the standardized features.
print(model.coef_)

[[-1.12059706 36.59716076 -6.65646531  6.30468334  4.46556114  3.11017275]]


In [13]:
# Display intercept (the constant term of the fitted model)
print(model.intercept_)

[38.87467442]


In [23]:
# Generate regression equation: the intercept fills the first placeholder and the
# coefficients fill the rest, in the column order of X.
# NOTE: the model was fitted on StandardScaler-transformed features, so each
# coefficient multiplies the standardized value of its feature, not the raw value.
print(
    "The linear model is: Y = {:.5} + {:.5}*GDP + {:.5}*Population + {:.5}*Transit + {:.5}*VMT + {:.5}*SQMI + {:.5}*Temp".format(
        model.intercept_[0],
        *model.coef_[0]
    )
)

The linear model is: Y = 38.875 + -1.1206*GDP + 36.597*Population + -6.6565*Transit + 6.3047*VMT + 4.4656*SQMI + 3.1102*Temp


In [14]:
# R squared on the scaled test set, with no per-sample weighting.
# Same inputs as the testing_score computed in the fitting cell.
model.score(X_test_scaled, y_test, sample_weight = None)

0.953817967738916

In [15]:
# Make predictions on the scaled test features with the fitted linear regression
# model (it was created with default settings; no hyperparameter tuning appears above)
predictions = model.predict(X_test_scaled)
# Preview the first ten predictions
predictions[:10]

array([[ 13.21350559],
       [ 12.63707102],
       [ 54.56146486],
       [168.24976787],
       [ 69.17802003],
       [ 22.58282139],
       [ 10.91935779],
       [ 32.05518002],
       [ 34.50114701],
       [ 33.0478583 ]])

In [16]:
# First ten actual test targets, for side-by-side comparison with predictions[:10]
y_test[:10]

array([[ 12.80369547],
       [ 14.09660168],
       [ 60.65783737],
       [207.1970538 ],
       [ 69.47666777],
       [ 25.47875385],
       [ 10.95469455],
       [ 60.77204619],
       [ 32.85436401],
       [ 31.95376058]])

In [17]:
# Dependencies and setup
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Bind the held-out targets and the model's predictions to the names
# the metric cells (MSE, MAE, RMSE) read from
y_true, y_pred = y_test, predictions

# MSE
mean_squared_error(y_true, y_pred)

85.70827538721714

In [18]:
# MAE: mean absolute error between the test targets and the predictions
mean_absolute_error(y_true, y_pred)

5.476542280548621

In [19]:
# RMSE: square root of the MSE computed on the same targets and predictions
np.sqrt(mean_squared_error(y_true, y_pred))

9.257876397274762