In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import RidgeCV
import joblib

### **Importing CSV file**

In [None]:
# Mount Google Drive so files under /content/drive can be read (Colab-only import).
from google.colab import drive
drive.mount('/content/drive')

In [74]:
# Location of the dataset on the mounted Drive; adjust when running elsewhere.
file_path = '/content/drive/MyDrive/W5 AI/CO2 Emission Dataset.csv'
# Raw dataset exactly as loaded from the CSV.
co2_df = pd.read_csv(file_path)

### **Exploratory Data Analysis**

In [None]:
# Preview the first rows to check column names and value formats.
co2_df.head()

In [None]:
# Column dtypes and non-null counts.
co2_df.info()

In [None]:
# Summary statistics of the numeric columns.
co2_df.describe()

### **EDA conclusion:**

We can conclude that the dataset has 7385 rows and 12 columns, i.e. a shape of (7385, 12), and contains no null values; therefore modelling doesn't require cleaning.

### **More EDA**

In [None]:
# Mean CO2 emissions per manufacturer, displayed from highest to lowest.
car_emissions = (
    co2_df
    .groupby('Make')['CO2 Emissions(g/km)']
    .mean()
)
car_emissions.sort_values(ascending=False)

In [None]:
# Number of rows per manufacturer, to judge how reliable each per-make average is.
car_makes = co2_df['Make'].value_counts()
car_makes

The car make that generates the most CO2 emissions on average is Bugatti; however, this is unrepresentative because there are only 3 Bugatti entries, so we can treat its average emission as an outlier.

### **Corr map to identify the best feature for linear regression**

In [None]:
# Keep only the numeric columns: correlations are only defined for numeric data.
# Selecting with include='number' does not depend on how text columns are stored
# (excluding only 'object' would let any other non-numeric dtype through).
clean_co2_df = co2_df.select_dtypes(include='number')
# display() is required here: a bare expression in the middle of a cell is not rendered.
display(clean_co2_df.head())

# Visualise the pairwise correlations between the numeric columns.
corr_matrix = clean_co2_df.corr()
plt.figure()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title("Correlation Heatmap")
plt.show()  # render the figure without echoing the Text object returned by plt.title

## **Linear regression model based on correlation**

In [None]:
## Train/test split for the two-feature model

# Predictors: every numeric column except the target.
X = clean_co2_df.drop(columns=['CO2 Emissions(g/km)'])

# Target: CO2 emissions in g/km.
y = clean_co2_df['CO2 Emissions(g/km)']

# Keep only the two fuel-consumption columns picked for this first model.
X_corr = X.loc[:, ["Fuel Consumption City (L/100 km)", "Fuel Consumption Comb (L/100 km)"]]

# Hold out 20% of the rows; the fixed seed makes the split reproducible.
X_train_corr, X_test_corr, y_train, y_test = train_test_split(
    X_corr,
    y,
    test_size=0.2,
    random_state=42,
)

## **Training the model**

In [None]:
# Baseline: ordinary least squares on the two fuel-consumption features.
model = LinearRegression()
model.fit(X_train_corr, y_train)

## **Predicting y value**

In [None]:
# Predictions on the held-out split, used by the evaluation cell below.
y_pred = model.predict(X_test_corr)
# Fitted coefficients, in the same order as the feature names displayed next.
print(model.coef_)
model.feature_names_in_

## **Testing model**

In [None]:
def eval_metric(actual, pred):
    """Print the R2 score, MAE, MSE and RMSE of ``pred`` against ``actual``.

    Parameters
    ----------
    actual : array-like
        Ground-truth target values.
    pred : array-like
        Predicted values, in the same order as ``actual``.

    Returns
    -------
    None
        The metrics are written to stdout only.
    """
    squared_error = mean_squared_error(actual, pred)
    # Each label carries its own tab padding so the values line up in the output.
    report = [
        ("R2_score \t", r2_score(actual, pred)),
        ("MAE \t\t", mean_absolute_error(actual, pred)),
        ("MSE \t\t", squared_error),
        ("RMSE \t\t", np.sqrt(squared_error)),
    ]
    print("Model testing performance: ")
    print("---------------------------")
    for label, value in report:
        print(f"{label}: {value}")

In [None]:
# Score the two-feature baseline on the held-out test split.
eval_metric(y_test, y_pred)

# **Train model on all variables**

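To improve the R² value, this model was trained on all the numerical variables of the dataset except the city and highway fuel consumption, because the combined fuel consumption is representative of both of these variables. The combined fuel consumption in mpg is also dropped, since it duplicates the combined L/100 km column in different units.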

### **Initiating the model object**

In [None]:
## Drop the fuel-consumption columns not used by this model

X_comb_final = X.drop(
    columns=[
        'Fuel Consumption City (L/100 km)',
        'Fuel Consumption Hwy (L/100 km)',
        'Fuel Consumption Comb (mpg)',
    ]
)

## Hold out 20% of the rows with a fixed seed

(
    X_comb_final_train,
    X_comb_final_test,
    y_comb_final_train,
    y_comb_final_test,
) = train_test_split(X_comb_final, y, test_size=0.2, random_state=42)

display(X_comb_final_train.head())

In [None]:
# Ordinary least squares on the reduced numeric feature set.
model_all = LinearRegression()
model_all.fit(X_comb_final_train, y_comb_final_train)

### **Predictions**

In [None]:
# Predictions on the held-out split; coefficients follow the column order of X_comb_final.
y_pred_all = model_all.predict(X_comb_final_test)
print(model_all.coef_)

### **Evaluating the model metrics**

In [None]:
# Same metrics as for the two-feature model, so the two can be compared directly.
eval_metric(y_comb_final_test, y_pred_all)

## **Comparing the prediction to the actual results**

In [None]:
# Actual vs. predicted emissions for the test rows, aligned by position
# (taking .values drops the original row index).
y_test_pred = pd.DataFrame({
    'Actual': y_comb_final_test.values,
    'Predicted': y_pred_all,
})
y_test_pred


### **Saving the final model**

In [None]:
# Persist the fitted model; the relative path resolves against the kernel's working
# directory. NOTE(review): on Colab that directory is presumably not on the mounted
# Drive, so the file may be lost when the runtime ends — confirm.
joblib.dump(model_all, 'final_model.pkl')

### **Conclusion:**
Although this model needs the most features, it provides the highest R² score of all the models and therefore the smallest error.