In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file under the Kaggle input directory, then print each one.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Global seaborn look applied to the figures drawn below: "whitegrid" style and "pastel" palette.
sns.set_style("whitegrid")
sns.set_palette("pastel")

from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import mean_absolute_error

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Read Data

In [None]:
# Load the CO2 emissions dataset and strip the spaces out of every column name
# (e.g. 'Vehicle Class' -> 'VehicleClass') so columns are easier to reference.
df = pd.read_csv('/kaggle/input/co2-emissions/CO2 Emissions.csv')
df = df.rename(columns=lambda column_name: column_name.replace(' ', ''))
df.head()

In [None]:
# Print a concise summary of the frame (columns, dtypes, non-null counts) to check the schema.
df.info()

In [None]:
# Count the missing values in each column.
missing_mask = df.isnull()
missing_mask.sum()

In [None]:
# Mean CO2 emissions and number of rows for each vehicle class.
by_vehicle_class = df.groupby('VehicleClass')
by_vehicle_class.agg({'CO2Emissions(g/km)': 'mean', 'Model': 'count'})

In [None]:
# Mean CO2 emissions and number of rows for each fuel type.
by_fuel_type = df.groupby('FuelType')
by_fuel_type.agg({'CO2Emissions(g/km)': 'mean', 'Model': 'count'})

Only one car has a Fuel Type of N, so let's drop it from our dataset.

In [None]:
# Keep every row whose fuel type is not 'N', then renumber the index from 0
# so later positional and label-based lookups agree.
df = df.loc[df['FuelType'].ne('N')].reset_index(drop=True)

# Explore Categorical Variables Relationship w/CO2 Emissions

In [None]:
# Distribution of CO2 emissions for each fuel type, drawn on an explicit axes.
fig, ax = plt.subplots()
sns.boxplot(x='FuelType', y='CO2Emissions(g/km)', data=df, ax=ax)
plt.show()

In [None]:
# Distribution of CO2 emissions for each vehicle class (horizontal, one row per class).
fig, ax = plt.subplots(figsize=(10, 8))
sns.boxenplot(x='CO2Emissions(g/km)', y='VehicleClass', data=df, ax=ax)
plt.show()

# Explore Continuous Variables' Relationship w/CO2 Emissions

In [None]:
# Joint plot of engine size against CO2 emissions, with points coloured by cylinder count.
sns.jointplot(data = df, x ='EngineSize(L)', y = 'CO2Emissions(g/km)', hue = 'Cylinders')
plt.show()

In [None]:
# Scatter of combined fuel consumption against CO2 emissions on an explicit axes.
fig, ax = plt.subplots(figsize=(6, 4))
sns.scatterplot(x='FuelConsumptionComb(L/100km)', y='CO2Emissions(g/km)', data=df, ax=ax)
ax.set_title("Relationship between Fuel Consumption\n and CO2 Emissions")
plt.show()

In [None]:
# Pairwise correlations between the numeric columns, rounded to two decimals for the annotations.
correlation_matrix = df.corr(numeric_only=True).round(2)
sns.heatmap(correlation_matrix, annot=True)
plt.show()

# Prepare Data for ML

Many of the numeric predictors are heavily correlated, so we will run PCA to collapse the three numeric columns (engine size, cylinders, and combined fuel consumption) into a single predictor value.

Also, the categorical variables seem to be fairly predictive, so we turn these into a series of one-hot encoded values.

# Split into Train/Test set

In [None]:
predictors = ['VehicleClass', 'EngineSize(L)', 'Cylinders', 'FuelType', 'FuelConsumptionComb(L/100km)']
target = 'CO2Emissions(g/km)'

# Use a stratified shuffle split to preserve the distribution of VehicleClass
# between the train and test sets.
split = StratifiedShuffleSplit(n_splits = 1, train_size = 0.8, random_state = 42)

# n_splits is 1, so the generator yields exactly one (train, test) pair of index arrays.
train_index, test_index = next(split.split(df, df['VehicleClass']))

# The splitter returns *positional* row indices, so select with .iloc.
# Label-based .loc only gives the same rows while df carries a default 0..n-1 index.
strat_train_set = df.iloc[train_index]
strat_test_set = df.iloc[test_index]

# Predictors stay as DataFrames (the preprocessing selects columns by name);
# targets become plain NumPy arrays.
X_train = strat_train_set[predictors].copy()
y_train = np.array(strat_train_set[target].copy())

X_test = strat_test_set[predictors].copy()
y_test = np.array(strat_test_set[target].copy())

# PCA

In [None]:
# Fit a single principal component on the three correlated numeric predictors and
# report the share of their total variance that this one component captures.
pca = PCA(n_components = 1, random_state = 42)
pca.fit(X_train[['EngineSize(L)', 'Cylinders', 'FuelConsumptionComb(L/100km)']])
# The ratio printed is for ONE component built from three features, so the message says so.
print("The first principal component of these three features explains {:.2f}% of the variance".format(pca.explained_variance_ratio_[0] * 100))

# One Hot Encoding of Categorical Features

In [None]:
# Learn the category levels of the two categorical predictors from the training set
# and display them (one array of levels per column).
categorical_train = X_train[['VehicleClass', 'FuelType']]
one_hot = OneHotEncoder().fit(categorical_train)
one_hot.categories_

In [None]:
# Encode the training categoricals and densify the result so the 0/1 matrix can be inspected.
categorical_train = X_train[['VehicleClass', 'FuelType']]
encoded_train = one_hot.fit_transform(categorical_train)
encoded_train.toarray()

# Create Pipelines to Prepare Data

In [None]:
# Numeric preprocessing: collapse the three correlated numeric columns into one
# principal component, then standardize that component.
# NOTE(review): PCA is applied to the unscaled columns, so the component is driven by the
# columns with the largest raw variance; scaling before PCA may have been intended - confirm.
numeric_steps = [
    ('pca', PCA(n_components=1, random_state=42)),
    ('scaler', StandardScaler()),
]
num_pipeline = Pipeline(steps=numeric_steps)

# Fit on the training numerics and preview the first five transformed rows.
numeric_train = X_train[['EngineSize(L)', 'Cylinders', 'FuelConsumptionComb(L/100km)']]
X_num_tr = num_pipeline.fit_transform(numeric_train)
X_num_tr[:5]

In [None]:
# Column groups routed to each preprocessing branch.
num_attribs = ['EngineSize(L)', 'Cylinders', 'FuelConsumptionComb(L/100km)']
cat_attribs = ['VehicleClass', 'FuelType']

# Numeric columns go through the PCA + scaler pipeline; categorical columns are one-hot encoded.
column_branches = [
    ('num', num_pipeline, num_attribs),
    ('cat', OneHotEncoder(), cat_attribs),
]
full_pipeline = ColumnTransformer(transformers=column_branches)

# Fit the preprocessing on the training predictors and convert the result to a dense array.
X_train_sparse = full_pipeline.fit_transform(X_train)
X_train_prepared = X_train_sparse.toarray()
X_train_prepared

# Modeling

In [None]:
# Prepare the test set with the pipeline that was already fitted on the training data.
# transform (not fit_transform) reuses the training PCA axes, scaling statistics and one-hot
# columns. Refitting on the test set would give features that do not line up with what the
# models were trained on, and would leak test-set statistics into the preprocessing.
x_test_prepared = full_pipeline.transform(X_test).toarray()
x_test_prepared[0:5]

# Random Forest

In [None]:
# Random forest of 100 trees, each limited to depth 7 and at least 20 samples per split.
# random_state is fixed so the bootstrap samples, and therefore the reported error,
# are reproducible on Restart & Run All (matching the seed used elsewhere in the notebook).
rf = RandomForestRegressor(n_estimators = 100, max_depth = 7, min_samples_split = 20, random_state = 42)
rf.fit(X_train_prepared, y_train)

In [None]:
# Score the random forest on the prepared test set using mean absolute error.
predictions = rf.predict(x_test_prepared)
rf_error = mean_absolute_error(y_true=y_test, y_pred=predictions)
rf_error

# Linear Regression

In [None]:
# Ordinary least-squares baseline; fit() returns the estimator itself, so build and fit in one step.
lin_reg = LinearRegression().fit(X_train_prepared, y_train)
lin_reg

In [None]:
# Score the linear regression on the prepared test set using mean absolute error.
lin_reg_predictions = lin_reg.predict(x_test_prepared)
lin_error = mean_absolute_error(y_true=y_test, y_pred=lin_reg_predictions)
lin_error

# Ridge Regression

In [None]:
# Ridge regression with regularization strength alpha=5; fit() returns the estimator itself.
ridge = Ridge(alpha=5).fit(X_train_prepared, y_train)
ridge

In [None]:
# Score the ridge regression on the prepared test set using mean absolute error.
ridge_predictions = ridge.predict(x_test_prepared)
ridge_error = mean_absolute_error(y_true=y_test, y_pred=ridge_predictions)
ridge_error

# Tensorflow Neural Network

In [None]:
# Fix TensorFlow's global random seed before the layers are created and trained.
tf.random.set_seed(42)

# Define a model with 4 Dense layers: 75, 50 and 25 units, then a single output unit.
# Every layer's kernel carries an L2 penalty of 0.05.
# NOTE(review): all four activations are 'linear', so the stacked layers compose into a
# single affine function of the inputs; a non-linear hidden activation (e.g. 'relu')
# may have been intended - confirm.

model = Sequential([
    Dense(75, activation = 'linear', kernel_regularizer=tf.keras.regularizers.l2(.05)),
    Dense(50, activation = 'linear', kernel_regularizer=tf.keras.regularizers.l2(.05)),
    Dense(25, activation = 'linear', kernel_regularizer=tf.keras.regularizers.l2(.05)),
    Dense(1, activation = 'linear', kernel_regularizer=tf.keras.regularizers.l2(.05))
])


# Configure training: Adam optimizer (learning rate 0.01) with mean absolute error as the loss
model.compile(
    optimizer = tf.keras.optimizers.Adam(learning_rate = .01),
    loss='mean_absolute_error'
)

# Train for 1000 epochs; `history` records the loss and val_loss of each epoch
history = model.fit(
    X_train_prepared,
    y_train,
    epochs = 1000,
    verbose = 0, # suppress logging
    validation_split = .2 # calculate validation results on 20% of training data
)

In [None]:
# Tabulate the per-epoch training metrics, add the epoch number as a column,
# and show the last few epochs.
hist = pd.DataFrame(history.history).assign(epoch=history.epoch)
hist.tail()

In [None]:
# Training vs. validation loss per epoch, drawn through the explicit axes interface.
fig, ax = plt.subplots()
for curve_name in ('loss', 'val_loss'):
    ax.plot(history.history[curve_name], label=curve_name)

# Clip the y-axis so the large early-epoch losses do not flatten the rest of the curve.
ax.set_ylim([0, 50])
ax.set_title("Line plot of MAE")
ax.set_xlabel('Epoch')
ax.set_ylabel('Error')
ax.legend()
ax.grid(True)

In [None]:
# The network ends in a single-unit Dense layer, so predict() returns shape (n_samples, 1).
# Flatten to 1-D so the array has the same shape as the other models' predictions
# and can be used directly as a DataFrame column.
tf_predictions = model.predict(x_test_prepared).ravel()

# Score the network on the prepared test set using mean absolute error.
tf_error = mean_absolute_error(y_test, tf_predictions)
tf_error

# Evaluate Models

Let's take a peek at some of the predictions

In [None]:
# Put the test rows (predictors followed by the actual target) next to each model's prediction.
# Selecting the columns straight from the test split replaces an index self-merge of two
# slices of the same frame, and avoids rebinding y_test (the NumPy array used by the
# metric cells) to a Series.
df_test = strat_test_set[predictors + [target]].copy()
df_test['RF_Pred'] = predictions.round(2)
df_test['LinReg_Pred'] = lin_reg_predictions.round(2)
df_test['Ridge_Pred'] = ridge_predictions.round(2)
# np.ravel accepts both an (n, 1) and an already 1-D prediction array.
df_test['NN_Pred'] = np.ravel(tf_predictions).round(2)
# Show a sample of rows rather than dumping the whole test set.
df_test.head(10)

In [None]:
# One-column table of each model's mean absolute error on the test set.
model_errors = {
    'RF Error': rf_error,
    'LinReg Error': lin_error,
    'Ridge Error': ridge_error,
    'Neural Net Error': tf_error,
}
eval_metrics = pd.Series(model_errors, name='Avg. Error').to_frame()
eval_metrics

The Random Forest regressor appears to fit the data best: its predictions are off by an average of ~6 g/km (mean absolute error) for the CO2 Emissions target variable.