# Global Sustainable Energy #

### Finding information about energy in Germany and the connection between the type of energy and CO2 emissions ###

In [None]:
import numpy as np
import pandas as pd
import sklearn
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_style('whitegrid')
%matplotlib inline
import sys
import matplotlib

print('python:',sys.version)
print('Numpy:',np.__version__)
print('Pandas:',pd.__version__)
print('Scikitlearn:',sklearn.__version__)
print('Seaborn: ',sns.__version__)
print('matplotlib:',matplotlib.__version__)


# I. EDA #

In [None]:
SE=pd.read_csv(r'Data-sustainable-energy2.csv')
SE


In [None]:
# For Google Colab ONLY: mount Google Drive and load the CSV from there.
# Outside Colab the `google.colab` package is not installed, so the import is
# guarded: the cell is then skipped and the DataFrame loaded by the local
# read_csv cell is kept, instead of aborting a "Run all".
try:
    from google.colab import drive
except ImportError:
    print('Not running on Google Colab - skipping the Drive mount.')
else:
    drive.mount('/content/drive')
    file_path = '/content/drive/My Drive/ML/Data-sustainable-energy2.csv'
    SE = pd.read_csv(file_path)
SE


In [None]:
SE.shape
SE.info()


In [None]:
SE.describe()
numeric_col=list(SE.describe().columns)
categorical_col=list(set(SE.columns).difference(numeric_col))
numeric_col


In [None]:
SE.iloc[:,17]
Target=np.array(SE['Value_co2_emissions_kt_by_country'])
Target


In [None]:
SE.isnull().any()


## 1) Imputation ##

In [None]:
for a in SE.columns:
    print(SE[a].value_counts(normalize=False, sort=True, ascending=False, bins=None, dropna=False))


In [None]:
from sklearn.impute import SimpleImputer

# define imputer
imputer = SimpleImputer(strategy='most_frequent')
# fit on the dataset
DF=pd.DataFrame(SE[numeric_col], columns=numeric_col)
imputer.fit(DF)
# transform the dataset
Xtran = imputer.transform(DF)
# print total missing
SE2=pd.DataFrame(Xtran, columns=numeric_col)
SE2


In [None]:
SE2.describe()


### 2nd Method Imputation ###

In [None]:
from sklearn.impute import KNNImputer
from numpy import isnan


# Impute every numeric column with a KNN imputer (default settings).
# The values are taken as a matrix in `numeric_col` order.
data = SE[numeric_col].values
# Keep all columns: the frame rebuilt below is labelled with the full
# `numeric_col` list, so the matrix must have exactly that many columns
# (the former hard-coded `i <= 20` cut-off would break on wider data).
ix = list(range(data.shape[1]))
x, y = data[:, ix], Target

# Fit the imputer and fill the missing cells in one step.
imputer = KNNImputer()
Xtran = imputer.fit_transform(x)

# Report how many cells were missing before and after imputation.
print('Missing: %d' % np.isnan(x).sum())
print('Missing after imputing: %d' % np.isnan(Xtran).sum())
SE3 = pd.DataFrame(Xtran, columns=numeric_col)
SE3


In [None]:
SE3.describe()


## 2) Plotting ##

In [None]:
plt.figure(figsize=(12,20))
#mask for upper triangle
mask=np.zeros_like(SE3[numeric_col].corr(), dtype=bool)
mask[np.triu_indices_from(mask)]=True
#Generate custom diverging colormap
#cmap=sns.color_palette("flare")
cmap=sns.diverging_palette(h_neg=12, h_pos=12, s=100, l=40, sep=2, n=10, center='light', as_cmap=False)
#Heatmap with mask with correct aspect ratio
sns.heatmap(SE3[numeric_col].corr(), mask=mask, cmap=cmap, vmin=-1,vmax=1,
            center=0, square=True, linewidths=0.5,annot=True, fmt='.2f', cbar_kws={'shrink':0.5})
plt.show()


In [None]:

SE3['Country']=SE['Entity']
SE3


In [None]:
numeric_col


## 3) Plotting per Country ##

In [None]:
# DataFrame per country

# Columns of interest for the per-country analysis.
MR=['Year','Access to electricity (% of population)','Renewable energy share in the total final energy consumption (%)','Electricity from fossil fuels (TWh)',
    'Electricity from nuclear (TWh)', 'Electricity from renewables (TWh)','Low-carbon electricity (% electricity)',
    'Primary energy consumption per capita (kWh/person)','Value_co2_emissions_kt_by_country','gdp_per_capita']

# Ask for a country and collect the index labels of its rows with a single
# vectorised comparison (no Python-level loop over every row).
Pays = input('insert country name plz: ').strip()
indx = SE3.index[SE3['Country'] == Pays].tolist()
# Fail here with a clear message rather than with an IndexError on
# `indx[0]` in the next cell when the name does not match any row.
if not indx:
    raise ValueError(f'No rows found for country {Pays!r} - check the spelling (names are case-sensitive).')

indx


In [None]:
# Rows of the selected country, restricted to the MR columns.
# Selecting by the collected index labels (instead of a positional
# first..last slice) stays correct even if the country's rows are not stored
# contiguously or the index is not a plain 0..n-1 range.
ia = indx[0]
ib = indx[-1]

SE_c = SE3.loc[indx, MR]
SE_c


In [None]:
# plot
# Percentage indicators of the selected country over time. The x-axis comes
# from the country's own 'Year' column so x and y always have the same
# length, even when the country does not cover every year from 2000 to 2020.
Years = SE_c['Year'].to_numpy().astype(np.int32)
sns.set_style('whitegrid')
fig, ax = plt.subplots(figsize=(10, 6))

percentage_columns = ['Access to electricity (% of population)',
                      'Renewable energy share in the total final energy consumption (%)',
                      'Low-carbon electricity (% electricity)']
# One scatter series per indicator; `palette` is omitted because it only
# applies together with `hue`.
for column in percentage_columns:
    sns.scatterplot(x=Years, y=SE_c[column], label=column, ax=ax)

ax.set_title('Evolution through Time', fontsize=10)
ax.set(xlabel='Years', ylabel='Percentage')

ax.legend(loc='best')
plt.show()


In [None]:
#pairplot
# sns.pairplot creates its own figure (sized through `height`), so no
# plt.figure() call is made beforehand: it would only open an additional,
# empty 100x120-inch figure that plt.show() then has to render.
sns.pairplot(SE_c, hue='Year', palette='bright', kind='scatter', markers=None, dropna=False, height=3)
plt.show()


# II. Data Plotting #

## 1) Renewable Energy Share Vs  ##

In [None]:
MR=['Year','Access to electricity (% of population)','Renewable energy share in the total final energy consumption (%)','Electricity from fossil fuels (TWh)',
    'Electricity from nuclear (TWh)', 'Electricity from renewables (TWh)','Low-carbon electricity (% electricity)',
    'Primary energy consumption per capita (kWh/person)','Value_co2_emissions_kt_by_country','gdp_per_capita']
SE3

In [None]:
fig, ax = plt.subplots(figsize=(10, 6))
sns.set_style('whitegrid') 
ax = sns.scatterplot(x=SE3['Primary energy consumption per capita (kWh/person)'] , y=SE3['Renewable energy share in the total final energy consumption (%)'], 
                     palette='Set2',label='Primary energy consumption per capita (kWh/person)')
ax = sns.scatterplot(x=SE3['gdp_per_capita'] , y=SE3['Renewable energy share in the total final energy consumption (%)'], palette='Set2', label='gdp_per_capita')
#ax = sns.scatterplot(x=SE3['Value_co2_emissions_kt_by_country'] , y=SE3['Renewable energy share in the total final energy consumption (%)'],label='Value_co2_emissions')
ax.set_title('Renewable Energy Share Regression', fontsize=10)
ax.set(ylabel='Renewable energy share in consumption')
ax.legend(loc='best')
plt.show()


## 2) Carbon Emissions Vs  ##

In [None]:
# Subplots for 1 country
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 8))
ax1.plot(SE_c['Electricity from fossil fuels (TWh)'])
ax1.set_title('Fossil Fuels')
ax2.plot(SE_c['Electricity from nuclear (TWh)'])
ax2.set_title('Nuclear')
ax3.plot(SE_c['Electricity from renewables (TWh)'])
ax3.set_title('Renewables')
plt.tight_layout()
plt.show()


In [None]:
h=SE3['Country']
fig, ax = plt.subplots(figsize=(15, 8))
ax = sns.scatterplot(x=SE3['Electricity from fossil fuels (TWh)'], y=SE3['Value_co2_emissions_kt_by_country'],
                     palette='Set2',label='fossil fuels (TWh)')
ax = sns.scatterplot(x=SE3['Electricity from nuclear (TWh)'] , y=SE3['Value_co2_emissions_kt_by_country'],
                     palette='Set2', label='nuclear (TWh)')
ax = sns.scatterplot(x=SE3['Electricity from renewables (TWh)'] , y=SE3['Value_co2_emissions_kt_by_country'],
                     label='Renewables (TWh)')
ax.set_title('Carbon Emissions Data', fontsize=16)
ax.set(xlabel='Energy Production TWH', ylabel='Value_co2_emissions')
ax.legend(loc='best')
plt.show()


## 3) Colormap Renewable Energy Share Vs :

In [None]:
SE3

In [None]:

df_subset = SE3[['Country', 'Year', 'Renewable energy share in the total final energy consumption (%)', 
                'Value_co2_emissions_kt_by_country', 'gdp_per_capita']]
df_subset


In [None]:
import plotly.express as px
import plotly.graph_objects as go
sns.set_theme(style='darkgrid', palette='bright') #'colorblind')
sns.set()

def plot_map(df, column, title, range_color=(0, 100)):
    """Build an animated choropleth world map of ``column``, one frame per year.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'Country' column (country names), a 'Year' column
        (used as the animation frame) and the column named by ``column``.
    column : str
        Name of the numeric column that drives the colour scale.
    title : str
        Figure title.
    range_color : sequence of two numbers, optional
        Lower and upper limit of the colour scale. Defaults to ``(0, 100)``,
        which suits percentage columns; pass other limits for other units.

    Returns
    -------
    plotly.graph_objects.Figure
        The configured figure (not yet shown).
    """
    # Create a choropleth map using Plotly Express; `animation_frame` asks
    # Plotly Express to generate the animation frames from the 'Year' column.
    fig = px.choropleth(
        df,
        locations='Country',
        locationmode='country names',
        color=column,
        hover_name='Country',
        color_continuous_scale='RdYlGn',
        animation_frame='Year',
        range_color=list(range_color))

    # Update geographic features
    fig.update_geos(
        showcoastlines=True,
        coastlinecolor="Black",
        showland=True,
        landcolor="white",
        showcountries=True,
        showocean=True,
        oceancolor="LightBlue")

    # The controls must address frames by the names the figure really holds,
    # so read them back from the figure instead of re-deriving them from df.
    frame_names = [frame.name for frame in fig.frames]

    # Choropleth traces are not cartesian scatter traces, so every animate
    # call has to request a redraw or the map does not change between frames.
    play_options = {"frame": {"duration": 300, "redraw": True},
                    "fromcurrent": True,
                    "transition": {"duration": 0}}
    halt_options = {"frame": {"duration": 0, "redraw": True},
                    "mode": "immediate",
                    "transition": {"duration": 0}}
    step_options = {"frame": {"duration": 300, "redraw": True},
                    "mode": "immediate",
                    "transition": {"duration": 0}}

    # Create slider steps for animation: one step per frame.
    slider_steps = [
        {"args": [[name], step_options], "label": str(name), "method": "animate"}
        for name in frame_names]

    # Play runs through all frames, Pause interrupts the running animation,
    # Stop interrupts it and jumps back to the first frame.
    play_button = {"label": "Play", "method": "animate", "args": [None, play_options]}
    pause_button = {"label": "Pause", "method": "animate", "args": [[None], halt_options]}
    stop_button = {"label": "Stop", "method": "animate", "args": [frame_names[:1], halt_options]}

    # Update the layout of the figure
    fig.update_layout(
        title_text=title,
        geo=dict(
            showframe=False,
            showcoastlines=False,
            projection_type='equirectangular',
            showland=True,
            landcolor="white",
            showcountries=True,
            showocean=True,
            oceancolor="LightBlue"),
        width=1000,
        height=850,
        dragmode='pan',
        hovermode='closest',
        coloraxis_colorbar=dict(
            title=column,
            title_font_size=14,
            title_side='right',
            lenmode='pixels',
            len=300,
            thicknessmode='pixels',
            thickness=15),
        updatemenus=[
            {"type": "buttons", "showactive": False, "x": 0.1, "y": 0.9, "buttons": [play_button]},
            {"type": "buttons", "showactive": False, "x": 0.18, "y": 0.9, "buttons": [pause_button]},
            {"type": "buttons", "showactive": False, "x": 0.26, "y": 0.9, "buttons": [stop_button]}],
        # The steps belong to the slider itself (they were previously written
        # into a button's missing `args`, which raised before returning).
        sliders=[{"yanchor": "top", "xanchor": "left", "currentvalue": {"font": {"size": 20}},
                  "steps": slider_steps}])

    return fig

In [None]:
plot_map(df_subset,'Renewable energy share in the total final energy consumption (%)', 'Renewable Energy Share Evolution')


## 4) Renewable Energy Share / Country :

In [None]:

average_Renewable_Energy = SE3.groupby('Country')['Renewable energy share in the total final energy consumption (%)'].mean()
top_5_countries = average_Renewable_Energy.nlargest(10)
top_5_countries


In [None]:
# Bar chart of the countries with the highest mean renewable-energy share.
plt.figure(figsize = (15, 7))
sns.barplot(x = top_5_countries.index, y = top_5_countries.values)
plt.xlabel('Country')
plt.ylabel('Renewable energy share in the total final energy consumption (%)')
# The count in the title is derived from the data actually plotted, so it
# cannot disagree with the number of bars (the selection holds 10 countries,
# not 5).
plt.title(f'Top {len(top_5_countries)} Countries with Highest Renewable energy share in the total energy')
plt.xticks(rotation = 45, ha = 'center')
plt.tight_layout()
plt.show()


## <b><span style="color: #FF0000"> Interpretation:</span> </b>
The highest renewable energy shares belong to the poorest countries on the planet (~90% of total energy production).

In [None]:
average_Renewable_Energy = SE3.groupby('Country')['gdp_per_capita'].mean()
top_5_countries = average_Renewable_Energy.nsmallest(10)
top_5_countries


In [None]:
# Bar chart of the countries with the lowest mean GDP per capita.
plt.figure(figsize = (15, 7))
sns.barplot(x = top_5_countries.index, y = top_5_countries.values)
plt.xlabel('Country')
plt.ylabel('gdp_per_capita')
# The count in the title is derived from the data actually plotted, so it
# cannot disagree with the number of bars (the selection holds 10 countries,
# not 5).
plt.title(f'Top {len(top_5_countries)} Countries with lowest GDP')
plt.xticks(rotation = 60, ha = 'center')
plt.tight_layout()
plt.show()


## <b><span style="color: #FF0000"> Interpretation:</span> </b>
Highest renewable energy share : Somalia, Uganda, Ethiopia, Burundi, Central African Republic,Bhutan, 
Guinea-Bissau,Liberia,Rwanda,Zambia.
All of these are third-world countries, and 4 of them have the lowest GDP per capita on the planet, which is very controversial, given that renewable energy usually has a higher production cost than fossil-fuel energy.


## 5) Carbon Emissions / Country :

In [None]:
SE3[MR]

In [None]:

average_Renewable_Energy = SE3.groupby('Country')['Value_co2_emissions_kt_by_country'].mean()
top_5_countries = average_Renewable_Energy.nlargest(10)
top_5_countries


In [None]:
# Bar chart of the countries with the highest mean CO2 emissions.
plt.figure(figsize = (15, 7))
sns.barplot(x = top_5_countries.index, y = top_5_countries.values)
plt.xlabel('Country')
plt.ylabel('Co2 emissions KT')
# The count in the title is derived from the data actually plotted, so it
# cannot disagree with the number of bars (the selection holds 10 countries,
# not 5).
plt.title(f'Top {len(top_5_countries)} Countries with Co2 emissions KT')
plt.xticks(rotation = 45, ha = 'center')
plt.tight_layout()
plt.show()


In [None]:

average_Renewable_Energy = SE3.groupby('Country')['gdp_per_capita'].mean()
top_5_countries = average_Renewable_Energy.nlargest(10)
top_5_countries


In [None]:
# Bar chart of the countries with the highest mean GDP per capita.
plt.figure(figsize = (15, 7))
sns.barplot(x = top_5_countries.index, y = top_5_countries.values)
plt.xlabel('Country')
plt.ylabel('GDP per_capita')
# The count in the title is derived from the data actually plotted, so it
# cannot disagree with the number of bars (the selection holds 10 countries,
# not 5).
plt.title(f'Top {len(top_5_countries)} Countries with Highest GDP per capita')
plt.xticks(rotation = 45, ha = 'center')
plt.tight_layout()
plt.show()



## 6) Biggest polluters - Evolution of CO2 Emissions and Renewable Energy Share :

In [None]:
# Countries treated as the biggest polluters in this section.
polluters = ['China', 'United States', 'India', 'Japan', 'Germany', 'Canada',
             'United Kingdom', 'Mexico', 'Indonesia', 'Saudi Arabia']
# Index labels of every SE3 row belonging to one of those countries,
# grouped in the order of the `polluters` list.
BP = [row_label
      for country in polluters
      for row_label in SE3.index[SE3['Country'] == country]]

len(BP)
SE3

In [None]:
# BP holds index *labels* (collected from `.index`), so the rows are selected
# with the label-based `.loc`; the positional `.iloc` only returned the same
# rows while the index happened to be a plain 0..n-1 range.
BP_DF = SE3.loc[BP, :]
BP_DF


In [None]:

fig, ax = plt.subplots(figsize=(15, 7))
sns.set_style('whitegrid') 
ax = sns.scatterplot(x=BP_DF['Year'] , y=BP_DF['Value_co2_emissions_kt_by_country'], palette='bright',hue=BP_DF['Country'], label='Co2')
#ax = sns.scatterplot(x=BP_DF['Year']  , y=BP_DF['Renewable energy share in the total final energy consumption (%)'], 
#                     hue=BP_DF['Country'],palette='colorblind',label='Renewables share in total energy')

ax.set_title('Evolution of the biggest polluters', fontsize=10)
ax.set(xlabel='Years')
ax.legend(loc='best')
plt.show()

## <b><span style="color: #FF0000"> Interpretation:</span> </b>



# III. Data Preprocessing #

## 1) Scaling ##

In [None]:
MR=['Year','Access to electricity (% of population)','Renewable energy share in the total final energy consumption (%)','Electricity from fossil fuels (TWh)',
    'Electricity from nuclear (TWh)', 'Electricity from renewables (TWh)','Low-carbon electricity (% electricity)',
    'Primary energy consumption per capita (kWh/person)','Value_co2_emissions_kt_by_country','gdp_per_capita']
SE3

In [None]:
SE3.describe()


In [None]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
SE4=pd.DataFrame(scaler.fit_transform(SE3[MR]), columns= MR)
SE4.describe()


In [None]:
#Optional
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
SE4=pd.DataFrame(scaler.fit_transform(SE3[MR]), columns= MR)
SE4.describe()


## 2) Label Encoding ##

In [None]:
SE3['Country']


In [None]:
from sklearn.preprocessing import LabelEncoder
# Work on a copy: a plain `SE5 = SE4` only creates a second name for the same
# DataFrame, so adding the 'Country' column would silently modify SE4 too.
SE5 = SE4.copy()
# Encode the country names as integer codes.
le = LabelEncoder()
SE5['Country'] = le.fit_transform(SE3['Country'])
SE5


## 3) PCA ##

In [None]:
from sklearn.decomposition import PCA

# Fit a 3-component PCA on the training features.
# NOTE(review): X_train appears to be first assigned by the train_test_split
# cell further down in this notebook, so on a fresh top-to-bottom run this
# line raises NameError - run the split cell first (or move this section
# below it).
pca=PCA(n_components=3)
pca.fit(X_train)


In [None]:
pca.components_, pca.explained_variance_, pca.explained_variance_ratio_


In [None]:
X_pca=pca.fit_transform(X_train)
c=Y_train

plt.figure(figsize=(8,15))
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=c, cmap='viridis')  # 'y' is the target variable for
#coloring points

plt.xlabel('First Principal Component')
plt.ylabel('Second Principal Component')
plt.title('PCA Scatter Plot')
plt.colorbar()  # Add a colorbar if you're using class labels for coloring
plt.show()


## 4) Data Splitting ##

In [None]:
from sklearn.model_selection import train_test_split

# Feature matrix: the first eight columns of SE5 plus the single column at
# position 10; the CO2-emissions column is the regression target.
x1 = SE5.iloc[:, :8]
x2 = SE5.iloc[:, 10:11]
X = pd.concat((x1, x2), axis=1)
Y = SE5.loc[:, 'Value_co2_emissions_kt_by_country']

# 70 % / 30 % split with a fixed seed so the partition is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=353,
)
X_train.shape, Y_test.shape
X_train.describe()


# IV. Model training and evaluation #

In [None]:
from sklearn.metrics import mean_squared_error,r2_score

def print_score(clf, X_train, X_test, Y_train, Y_test, train=True):
    """Print the MSE and R² of a fitted single-feature regressor.

    Parameters
    ----------
    clf : fitted estimator
        Object exposing ``predict``; it is called with a 2-D, one-column array.
    X_train, X_test : array-like
        Values of the single feature for the train / test split. Anything
        ``np.asarray`` accepts (ndarray, pandas Series, list); it is reshaped
        to ``(-1, 1)`` before prediction.
    Y_train, Y_test : array-like
        True target values of the train / test split.
    train : bool, default True
        If truthy, report on the training split, otherwise on the test split.
    """
    if train:
        header, tag, features, target = 'Train', 'Train', X_train, Y_train
    else:
        # The trailing blank keeps the printed "Test =" spelling unchanged.
        header, tag, features, target = 'Test', 'Test ', X_test, Y_test

    # np.asarray lets pandas objects through, which have no .reshape method.
    predictions = clf.predict(np.asarray(features).reshape(-1, 1))
    print(f'{header} Results:\n')
    print(f'MSE {tag}={mean_squared_error(target, predictions):.4f}')
    print(f'R² {tag}={r2_score(target, predictions):.4f}')


## A) Renewable Energy Share Vs Primary energy consumption per capita (kWh/person) ##

In [None]:
x1=X_train['Primary energy consumption per capita (kWh/person)'].values
y1=pd.DataFrame(X_train['Renewable energy share in the total final energy consumption (%)'])
x1_test=X_test['Primary energy consumption per capita (kWh/person)'].values
y1_test=pd.DataFrame(X_test['Renewable energy share in the total final energy consumption (%)'])
x1

## 1) Random Forest ##

In [None]:
from sklearn.ensemble import RandomForestRegressor

#Criterion: 'poisson', 'squared_error', 'friedman_mse', 'absolute_error'
forest=RandomForestRegressor(n_estimators=500, criterion='squared_error', random_state=42, n_jobs=-1)
forest.fit(x1.reshape(-1,1),y1)
forest


In [None]:
#sort_idx problem !!!!
sort_idx=x1.reshape(-1,1).flatten().argsort()
plt.figure(figsize=(10,6))
plt.scatter(x1,y1, c='b')
plt.plot(x1[sort_idx],forest.predict(x1[sort_idx].reshape(-1,1)), color='r', linewidth=2)
plt.xlabel('Primary energy consumption')
plt.ylabel('Renewable energy share')


In [None]:
print_score(forest, x1, x1_test, y1, y1_test, train=True)
print('------------------------------------------>>')
print_score(forest, x1, x1_test, y1, y1_test, train=False)


## 2) Decision Tree ##

In [None]:
from sklearn.tree import DecisionTreeRegressor

tree=DecisionTreeRegressor(max_depth=2, min_samples_leaf=10) 
T=tree.fit(x1.reshape(-1,1),y1)


In [None]:
# Training data with the tree's predictions on the test feature drawn as a
# line; the test points are sorted by x so the line runs left to right.
sort_idx = x1_test.reshape(-1, 1).flatten().argsort()
plt.figure(figsize=(10, 6))
plt.scatter(x1, y1, s=20, edgecolor='k', c='blue', label='data')
# The legend reads the depth from the fitted estimator, so it cannot drift
# from the value the tree was actually built with.
plt.plot(x1_test[sort_idx], tree.predict(x1_test.reshape(-1, 1))[sort_idx],
         color='red', label=f'max_depth={tree.max_depth}',
         linewidth=2)
plt.xlabel('Primary energy consumption')
plt.ylabel('Renewable energy share')
plt.legend()
plt.show()


In [None]:
print_score(tree, x1, x1_test, y1, y1_test, train=True)
print('------------------------------------------>>')
print_score(tree, x1, x1_test, y1, y1_test, train=False)


## <b><span style="color: #FF0000"> Interpretation:</span> </b>
The test R² is highest with the Decision Tree, which seems to be the best regressor for this case.

## 3) Polynomial ##

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

# Degree-5 polynomial regression of y1 on the single feature x1.
poly = PolynomialFeatures(degree=5)
X_poly = poly.fit_transform(np.array(x1).reshape(-1, 1))
poly2 = LinearRegression()
poly2.fit(X_poly, np.array(y1).reshape(-1, 1))
# Evaluation grid for drawing the fitted curve. A fixed number of points
# spanning [min, max] is used instead of a fixed 0.1 step: the features are
# scaled, so a 0.1 step yields only a handful of points and leaves out the
# maximum.
X_fit = np.linspace(x1.min(), x1.max(), 200)[:, np.newaxis]
# The expansion is already fitted above; new data is only transformed.
Y_pred = poly2.predict(poly.transform(X_fit))
X_fit


In [None]:

plt.figure(figsize=(12,6))
plt.scatter(x1,y1, c='green')
plt.plot(X_fit,Y_pred,c='yellow', linewidth=3)
plt.show()


In [None]:
print(f'model Coef={poly2.coef_}, R² ={r2_score(y1,poly2.predict(X_poly)):.3f}')


## 4) Ridge ##

In [None]:
from sklearn.linear_model import Ridge

#rng_N=preprocessing.normalize(rng.reshape(-1,1))
ridge=Ridge(alpha=0.5, fit_intercept=True)#, normalize=True)
ridge.fit(x1.reshape(-1,1), y1)
ridge_pred=ridge.predict(x1.reshape(-1,1))

plt.figure(figsize=(10,8))
plt.scatter(x1, y1, color='blue')
plt.plot(x1,ridge_pred, color='red' )
ridge.coef_


In [None]:
print_score(ridge, x1, x1_test, y1, y1_test, train=True)
print('------------------------------------------>>')
print_score(ridge, x1, x1_test, y1, y1_test, train=False)


## B) Renewable Energy Share Vs Access to electricity (% of population) ##

In [None]:
X=X_train['Access to electricity (% of population)'].values
Y=pd.DataFrame(X_train['Renewable energy share in the total final energy consumption (%)'])
x1_test=X_test['Access to electricity (% of population)'].values
y1_test=pd.DataFrame(X_test['Renewable energy share in the total final energy consumption (%)'])
X


### 1) Decision Tree 


In [None]:
from sklearn.tree import DecisionTreeRegressor

tree=DecisionTreeRegressor(max_depth=2, min_samples_leaf=10)
tree.fit(X.reshape(-1,1),Y)


In [None]:
# Training data with the tree's predictions on the test feature drawn as a
# line; the test points are sorted by x so the line runs left to right.
sort_idx = x1_test.reshape(-1, 1).flatten().argsort()
plt.figure(figsize=(10, 6))
plt.scatter(X, Y, s=20, edgecolor='k', c='blue', label='data')
# The legend reads the depth from the fitted estimator, so it cannot drift
# from the value the tree was actually built with.
plt.plot(x1_test[sort_idx], tree.predict(x1_test.reshape(-1, 1))[sort_idx],
         color='red', label=f'max_depth={tree.max_depth}', linewidth=2)
# In this section the feature is access to electricity, not primary energy
# consumption.
plt.xlabel('Access to electricity (% of population)')
plt.ylabel('Renewable energy share')
plt.legend()
plt.show()


In [None]:
print_score(tree, X, x1_test, Y, y1_test, train=True)
print('------------------------------------------>>')
print_score(tree, X, x1_test, Y, y1_test, train=False)


### 2) Ridge:

In [None]:
from sklearn.linear_model import Ridge

ridge=Ridge(alpha=0.5, fit_intercept=True)
ridge.fit(X.reshape(-1,1), Y)
ridge_pred=ridge.predict(X.reshape(-1,1))

plt.figure(figsize=(10,8))
plt.scatter(X,Y, color='blue',edgecolor='k')
plt.plot(X,ridge_pred, color='red', linewidth=3 )
ridge.coef_


In [None]:
print_score(ridge, X, x1_test, Y, y1_test, train=True)
print('------------------------------------------>>')
print_score(ridge, X, x1_test, Y, y1_test, train=False)


## <b><span style="color: #FF0000"> Interpretation:</span> </b>

The test R² is highest (with an MSE of 3%) with the Ridge model, which seems to be the best regressor for this case.

### 3) AdaBoost:

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor

ada=AdaBoostRegressor(DecisionTreeRegressor(max_depth=3), n_estimators=500, random_state=42)
ada.fit(X.reshape(-1,1) ,np.array(Y))
Y_train_predict=ada.predict(X.reshape(-1,1))
Y_test_predict=ada.predict(np.array(x1_test).reshape(-1,1))


In [None]:
# Training data and the AdaBoost predictions on the same points.
plt.figure(figsize=(12, 6))
plt.scatter(X, Y, color='blue', edgecolor='k')
# plt.plot joins points in the order given; the training rows are shuffled,
# so they are sorted by x first - otherwise the step-shaped prediction is
# drawn as a zig-zag scribble across the figure.
train_order = np.argsort(np.ravel(X))
plt.plot(np.ravel(X)[train_order], np.ravel(Y_train_predict)[train_order], color='red', linewidth=3)
plt.show()


In [None]:
print_score(ada, X, x1_test, Y, y1_test, train=True)
print('------------------------------------------>>')
print_score(ada, X, x1_test, Y, y1_test, train=False)


## C) Renewable Energy Share Vs Value_co2_emissions_kt_by_country ##

In [None]:
#SE4
X_train


In [None]:
X=np.array(Y_train)
Y=X_train['Renewable energy share in the total final energy consumption (%)']
x1_test=np.array(Y_test)
y1_test=X_test['Renewable energy share in the total final energy consumption (%)']
Y_train


### 1) Decision Tree 


In [None]:
from sklearn.tree import DecisionTreeRegressor

tree=DecisionTreeRegressor(max_depth=3, min_samples_leaf=10)
tree.fit(X.reshape(-1,1),Y)


In [None]:
sort_idx=x1_test.reshape(-1,1).flatten().argsort()
plt.figure(figsize=(10,6))
plt.scatter(X,Y, s=20,edgecolor='k', c='blue', label='data')
plt.plot(x1_test[sort_idx],tree.predict(x1_test.reshape(-1,1))[sort_idx], 
         color='red', label='max_depth=3',linewidth=2 )
plt.xlabel('Co2 Emissions')
plt.ylabel('Renewable energy share')
plt.legend()
plt.show()


In [None]:
print_score(tree, X, x1_test, Y, y1_test, train=True)
print('------------------------------------------>>')
print_score(tree, X, x1_test, Y, y1_test, train=False)


### 2) Ridge:

In [None]:
from sklearn.linear_model import Ridge

ridge=Ridge(alpha=0.5, fit_intercept=True)
ridge.fit(X.reshape(-1,1), Y)
ridge_pred=ridge.predict(X.reshape(-1,1))

plt.figure(figsize=(10,8))
plt.scatter(X,Y, color='blue',edgecolor='k')
plt.plot(X,ridge_pred, color='red', linewidth=3 )
ridge.coef_


In [None]:
print_score(ridge, X, x1_test, Y, y1_test, train=True)
print('------------------------------------------>>')
print_score(ridge, X, x1_test, Y, y1_test, train=False)


## 3) Polynomial ##

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

# Degree-2 polynomial regression of Y on the single feature X.
poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(np.array(X).reshape(-1, 1))
poly2 = LinearRegression()
poly2.fit(X_poly, np.array(Y).reshape(-1, 1))
# Evaluation grid for drawing the fitted curve. A fixed number of points
# spanning [min, max] is used instead of a fixed 0.1 step: the features are
# scaled, so a 0.1 step yields only a handful of points and leaves out the
# maximum.
X_fit = np.linspace(X.min(), X.max(), 200)[:, np.newaxis]
# The expansion is already fitted above; new data is only transformed.
Y_pred = poly2.predict(poly.transform(X_fit))
X_fit


In [None]:

# Data of this section (X, Y) with the fitted polynomial curve. The scatter
# previously used x1 / y1, which still hold the feature and target of
# section A and therefore did not match the curve drawn on top.
plt.figure(figsize=(12, 6))
plt.scatter(X, Y, c='green', edgecolor='k')
plt.plot(X_fit, Y_pred, c='yellow', linewidth=3)
plt.show()


In [None]:
# Score against Y, the target this model was fitted on in this section,
# rather than the leftover y1 from section A.
print(f'model Coef={poly2.coef_}, R² ={r2_score(Y,poly2.predict(X_poly)):.3f}')


### 4) AdaBoost:

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor

ada=AdaBoostRegressor(DecisionTreeRegressor(max_depth=3), n_estimators=500, random_state=42)
ada.fit(X.reshape(-1,1) ,np.array(Y))
Y_train_predict=ada.predict(X.reshape(-1,1))
Y_test_predict=ada.predict(np.array(x1_test).reshape(-1,1))


In [None]:
# Training data and the AdaBoost predictions for the test feature.
plt.figure(figsize=(12, 6))
plt.scatter(X, Y, color='blue', edgecolor='k')
# plt.plot joins points in the order given; the test rows are shuffled, so
# they are sorted by x first - otherwise the prediction is drawn as a
# zig-zag scribble across the figure.
test_order = np.argsort(np.ravel(x1_test))
plt.plot(np.ravel(x1_test)[test_order], np.ravel(Y_test_predict)[test_order], color='red', linewidth=3)
plt.show()


In [None]:
print_score(ada, X, x1_test, Y, y1_test, train=True)
print('------------------------------------------>>')
print_score(ada, X, x1_test, Y, y1_test, train=False)


### 5) Log:

In [None]:
# Replace exact zeros with a tiny positive value so that the logarithm taken
# in the next cell stays finite (log(0) is -inf).
# The previous loops used `==` (a comparison whose result was discarded) on
# the loop variable, so neither array was ever changed.
X = np.where(X == 0, 0.00000001, X)
x1_test = np.where(x1_test == 0, 0.00000001, x1_test)
x1_test


In [None]:
X_log=np.log(X.reshape(-1,1))
Y_log=Y.values.reshape(-1,1)
xtest_log=np.log(x1_test)
ytest_log=y1_test.values.reshape(-1,1)
xtest_log


In [None]:
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Log-linear fit: regress log(Y) on X with an intercept, i.e. model
# Y ~ exp(a + b * X).

# Add a constant to the independent variable
x_data = sm.add_constant(X)
x_values = x_data[:, 1]
# The logarithm needs strictly positive responses; exact zeros (log = -inf)
# are lifted to a tiny positive value so the fit does not fail on them.
# NOTE(review): assumes Y is non-negative (e.g. min-max scaled) - confirm;
# negative values would also be clipped to the same tiny value.
y_positive = np.clip(np.asarray(Y, dtype=float), 0.00000001, None)
# Fit the model
log = sm.OLS(np.log(y_positive), x_data)
results = log.fit()
# Extract the coefficients
a, b = np.asarray(results.params)
# Predict the values
y_pred = np.exp(a + b * x_values)
# Plot the data and the fitted curve; the curve is drawn over x in ascending
# order so it appears as one line instead of a zig-zag.
curve_order = np.argsort(x_values)
plt.scatter(x_values, Y, label="Data")
plt.plot(x_values[curve_order], y_pred[curve_order], 'r', label="Fitted Curve")
plt.legend()
# In this section X holds the CO2 emissions and Y the renewable-energy share
# (the two axis labels were previously swapped).
plt.xlabel("co2_emissions_kt_by_country")
plt.ylabel("Renewable Energy Share")
plt.show()

print(f"Estimated coefficients: a = {a}, b = {b}")


## D) Carbon Emissions Vs Primary energy consumption per capita (kWh/person) :

In [None]:

X=np.array(X_train['Primary energy consumption per capita (kWh/person)']).reshape(-1,1)
Y=Y_train
x1_test=np.array(X_test['Primary energy consumption per capita (kWh/person)'])
y1_test=Y_test
X


### 1) Ridge:

In [None]:
from sklearn.linear_model import Ridge

ridge=Ridge(alpha=0.5, fit_intercept=True)
ridge.fit(X,Y)
ridge_pred=ridge.predict(X)

plt.figure(figsize=(10,8))
plt.scatter(X,Y, color='blue',edgecolor='k',label='data')
plt.plot(X,ridge_pred, color='red', linewidth=3,label='regression curve' )
plt.legend(loc='best')
plt.xlabel('Primary energy consumption per capita (kWh/person)')
plt.ylabel("co2_emissions_kt_by_country")
ridge.coef_


In [None]:
print_score(ridge, X, x1_test, Y, y1_test, train=True)
print('------------------------------------------>>')
print_score(ridge, X, x1_test, Y, y1_test, train=False)


### 2) Decision Tree 


In [None]:
from sklearn.tree import DecisionTreeRegressor

tree=DecisionTreeRegressor(max_depth=3, min_samples_leaf=10)
tree.fit(X,Y)


In [None]:
sort_idx=x1_test.reshape(-1,1).flatten().argsort()
plt.figure(figsize=(10,6))
plt.scatter(X,Y, s=20,edgecolor='k', c='blue', label='data')
plt.plot(x1_test[sort_idx],tree.predict(x1_test.reshape(-1,1))[sort_idx], 
         color='red', label='Regression max_depth=3',linewidth=2 )
plt.xlabel('Primary energy consumption per capita (kWh/person)')
plt.ylabel('Co2 Emissions')
plt.legend()
plt.show()


In [None]:
print_score(tree, X, x1_test, Y, y1_test, train=True)
print('------------------------------------------>>')
print_score(tree, X, x1_test, Y, y1_test, train=False)


## <b><span style="color: #FF0000"> Interpretation:</span> </b>

R² test is -0.8% (with an MSE=0.4%) with Decision Tree  model.

## Plotting learning curve

In [None]:
from sklearn.model_selection import learning_curve

train_sizes, train_scores, validation_scores = learning_curve(tree,X,Y
                                                              ,train_sizes=[500,700, 1200, 1500, 2000, 2200],cv = 6,
                                                              scoring = 'r2')
print('Training scores:\n\n',train_scores)
print('\nValidation scores:\n\n', validation_scores)


In [None]:
# Mean R² per training-set size. The learning curve for the tree was
# computed with scoring='r2', where higher is better, so the scores are
# plotted as they are; negating them only applies to the 'neg_*' error
# scorers.
train_scores_mean = train_scores.mean(axis = 1)
validation_scores_mean = validation_scores.mean(axis = 1)
# Reference level: R² of the fitted tree on the data it is evaluated on here
# (the variable keeps its historical name although it holds an R² value).
y_pred=tree.predict(X)
MSE_p=r2_score(Y,y_pred)

plt.plot(train_sizes, train_scores_mean, label = 'Training score')
plt.plot(train_sizes, validation_scores_mean, label = 'Validation score')
# Horizontal reference line spanning exactly the evaluated training sizes.
plt.plot(train_sizes, [MSE_p] * len(train_sizes), label='Desired', color='r')
plt.ylabel('R²', fontsize = 14)
plt.xlabel('Training set size', fontsize = 14)
plt.title('Learning curves for Decision Tree model', fontsize = 14, y = 1.03)
plt.legend()
plt.ylim(-0.2,0.4)
plt.show()


## <b><span style="color: #FF0000"> Interpretation:</span> </b>

Underfitting?

### 3) AdaBoost:

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor

ada=AdaBoostRegressor(DecisionTreeRegressor(max_depth=3), n_estimators=500, random_state=42)
ada.fit(X ,np.array(Y))
Y_train_predict=ada.predict(X)
Y_test_predict=ada.predict(np.array(x1_test).reshape(-1,1))


In [None]:
plt.figure(figsize=(12,6))
plt.scatter(X,Y, color='blue',edgecolor='k', label='Data')
plt.scatter(x1_test,Y_test_predict, color='red', linewidth=3, label='Adaboost Regression' )
plt.xlabel('Primary energy consumption per capita (kWh/person)')
plt.ylabel('Co2 Emissions')
plt.ylim(-0.05,0.6)
plt.legend()
plt.show()


In [None]:
print_score(ada, X, x1_test, Y, y1_test, train=True)
print('------------------------------------------>>')
print_score(ada, X, x1_test, Y, y1_test, train=False)


## <b><span style="color: #FF0000"> Interpretation:</span> </b>

R² test is -18% (with an MSE=1%) with Ada Boost  model. low fitting

## Plotting learning curve

In [None]:
from sklearn.model_selection import learning_curve

train_sizes, train_scores, validation_scores = learning_curve(ada,X,Y
                                                              ,train_sizes=[500,700, 1200, 1500, 2000, 2200],cv = 6,
                                                              scoring = 'neg_mean_squared_error')
print('Training scores:\n\n',train_scores)
print('\nValidation scores:\n\n', validation_scores)


In [None]:
train_scores_mean = -train_scores.mean(axis = 1)
validation_scores_mean = -validation_scores.mean(axis = 1) #np.abs(
y_pred=ada.predict(X)
MSE_p=mean_squared_error(Y,y_pred)

plt.plot(train_sizes, train_scores_mean, label = 'Training error')
plt.plot(train_sizes, validation_scores_mean, label = 'Validation error')
plt.plot([500,700, 1200, 1500, 2000, 2200],[MSE_p,MSE_p,MSE_p,MSE_p,MSE_p,MSE_p], label='Desired', color='r')
plt.ylabel('MSE', fontsize = 14)
plt.xlabel('Training set size', fontsize = 14)
plt.title('Learning curves for Ada Boost model', fontsize = 14, y = 1.03)
plt.legend()
plt.ylim(0.005,0.013)
plt.show()


## <b><span style="color: #FF0000"> Interpretation:</span> </b>

Underfitting

## 3) Polynomial ##

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

# Degree-2 polynomial regression of Y on the single feature column X.
poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(X)
poly2 = LinearRegression()
poly2.fit(X_poly, np.array(Y).reshape(-1, 1))
# Evaluation grid for drawing the fitted curve. A fixed number of points
# spanning [min, max] is used instead of a fixed 0.1 step: the features are
# scaled, so a 0.1 step yields only a handful of points and leaves out the
# maximum.
X_fit = np.linspace(X.min(), X.max(), 200)[:, np.newaxis]
# The expansion is already fitted above; new data is only transformed.
Y_pred = poly2.predict(poly.transform(X_fit))
X_fit


In [None]:

plt.figure(figsize=(12,6))
plt.scatter(X,Y, c='lightblue', edgecolor='k', label='Data')
plt.plot(X_fit,Y_pred,c='orange', linewidth=3, label='Poly Regression')
plt.xlabel('Primary energy consumption per capita (kWh/person)')
plt.ylabel('Co2 Emissions')
plt.legend()
plt.show()


In [None]:

print(f'model Coef={poly2.coef_}, R² ={r2_score(Y,poly2.predict(X_poly)):.3f}')
print(f'R² test ={r2_score(np.array(y1_test).reshape(-1,1),poly2.predict(poly.fit_transform(x1_test.reshape(-1,1)))):.3f}')


## <b><span style="color: #FF0000"> Interpretation:</span> </b>

R² test is 2.1% with Polynomial model.

## E) Carbon Emissions Vs Electricity from fossil fuels :

In [None]:

X=np.array(X_train['Electricity from fossil fuels (TWh)']).reshape(-1,1)
Y=Y_train
X1_test=np.array(X_test['Electricity from fossil fuels (TWh)']).reshape(-1,1)
y1_test=Y_test
len(X1_test)


### 1) Ridge:

In [None]:
from sklearn.linear_model import Ridge

# Ridge regression (alpha=0.5, with intercept) of the target on the fossil-fuel feature.
ridge = Ridge(alpha=0.5, fit_intercept=True).fit(X, Y)
ridge_pred = ridge.predict(X)

# Training data with the fitted line; the cell output is the learned coefficient.
plt.figure(figsize=(10, 8))
plt.scatter(X, Y, label='Data', color='blue', edgecolor='k')
plt.plot(X, ridge_pred, label='Regression Curve', color='red', linewidth=3)
plt.legend(loc='best')
plt.ylabel("co2_emissions_kt_by_country")
plt.xlabel('Fossil Fuels(TWh)')
ridge.coef_


In [None]:
# Two score reports for the Ridge model separated by a divider line.
# print_score is defined elsewhere in the notebook; train=True presumably
# selects the training-set report and train=False the test-set report.
for _on_train in (True, False):
    if not _on_train:
        print('------------------------------------------>>')
    print_score(ridge, X, x1_test, Y, y1_test, train=_on_train)


## <b><span style="color: #FF0000"> Interpretation:</span> </b>

R² test is 75% (with an MSE=10^-3 %) with Ridge  model. High fitting

## Plotting learning curve

In [None]:
from sklearn.model_selection import learning_curve

# 6-fold cross-validated learning curve for the Ridge model, scored with
# negated MSE at six absolute training-set sizes.
train_sizes, train_scores, validation_scores = learning_curve(
    ridge,
    X,
    Y,
    scoring='neg_mean_squared_error',
    cv=6,
    train_sizes=[100, 500, 900, 1500, 2000, 2100],
)
print('Training scores:\n\n', train_scores)
print('\nValidation scores:\n\n', validation_scores)


In [None]:
# Average over the CV folds; the scores are negated MSE, so the sign is flipped
# to plot a positive error.
train_scores_mean = -np.mean(train_scores, axis=1)
validation_scores_mean = -np.mean(validation_scores, axis=1)

# Training-set MSE of the fitted Ridge model, drawn as a flat reference line.
y_pred = ridge.predict(X)
MSE_p = mean_squared_error(Y, y_pred)

plt.plot(train_sizes, train_scores_mean, label='Training error')
plt.plot(train_sizes, validation_scores_mean, label='Validation error')
plt.plot(
    [500, 700, 1200, 1500, 1900, 2100],
    [MSE_p] * 6,
    color='r',
    label='Desired',
)
plt.title('Learning curves for Ridge model', fontsize=14, y=1.03)
plt.xlabel('Training set size', fontsize=14)
plt.ylabel('MSE', fontsize=14)
plt.legend()
# Fixed axis windows chosen by the author for this plot.
plt.xlim(750, 2100)
plt.ylim(0, 0.0002)
plt.show()


## <b><span style="color: #FF0000"> Interpretation:</span> </b>

Overfitting

### 2) Decision Tree 


In [None]:
from sklearn.tree import DecisionTreeRegressor

# Shallow regression tree: at most 3 levels and at least 50 samples per leaf.
tree = DecisionTreeRegressor(min_samples_leaf=50, max_depth=3).fit(X, Y)
# Display the fitted estimator as the cell output.
tree


In [None]:
# Order the test points by feature value so the tree's step function is drawn
# left to right as a single line.
sort_idx = np.argsort(x1_test.reshape(-1))

plt.figure(figsize=(10, 6))
plt.scatter(X, Y, label='data', c='blue', s=20, edgecolor='k')
plt.plot(
    x1_test[sort_idx],
    tree.predict(x1_test.reshape(-1, 1))[sort_idx],
    label='Regression max_depth=3',
    color='red',
    linewidth=2,
)
plt.ylabel('Co2 Emissions')
plt.xlabel('Fossil Fuels(TWh)')
plt.legend()
plt.show()


In [None]:
# Two score reports for the decision tree separated by a divider line.
# print_score is defined elsewhere in the notebook; train=True presumably
# selects the training-set report and train=False the test-set report.
for _on_train in (True, False):
    if not _on_train:
        print('------------------------------------------>>')
    print_score(tree, X, x1_test, Y, y1_test, train=_on_train)


## Plotting learning curve

In [None]:
from sklearn.model_selection import learning_curve

# 6-fold cross-validated learning curve for the decision tree, scored with R²
# at six absolute training-set sizes.
train_sizes, train_scores, validation_scores = learning_curve(
    tree,
    X,
    Y,
    scoring='r2',
    cv=6,
    train_sizes=[500, 700, 1200, 1500, 2000, 2100],
)
print('Training scores:\n\n', train_scores)
print('\nValidation scores:\n\n', validation_scores)


In [None]:
# The learning curve for the tree was computed with scoring='r2'. R² is a
# "higher is better" score that scikit-learn does not negate, so the fold means
# are used as-is. Flipping the sign (as done for neg_mean_squared_error) would
# plot -R² against a positive R² reference line.
train_scores_mean = train_scores.mean(axis=1)
validation_scores_mean = validation_scores.mean(axis=1)

# R² of the fitted tree on its own training data, used as the reference level.
# The variable keeps its historical name MSE_p even though it holds an R² here.
y_pred = tree.predict(X)
MSE_p = r2_score(Y, y_pred)

plt.plot(train_sizes, train_scores_mean, label='Training score')
plt.plot(train_sizes, validation_scores_mean, label='Validation score')
plt.plot([500, 700, 1200, 1500, 2000, 2100], [MSE_p] * 6, label='Desired', color='r')
plt.ylabel('R²', fontsize=14)
plt.xlabel('Training set size', fontsize=14)
plt.title('Learning curves for Decision Tree model', fontsize=14, y=1.03)
plt.legend()
plt.show()


## <b><span style="color: #FF0000"> Interpretation:</span> </b>

Underfitting: the model needs more samples.

### 3) AdaBoost:

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor

# AdaBoost over depth-3 regression trees (500 rounds, fixed seed for repeatability).
ada = AdaBoostRegressor(
    DecisionTreeRegressor(max_depth=3),
    random_state=42,
    n_estimators=500,
).fit(X, np.array(Y))

# Predictions on the training feature and on the test feature (as a column vector).
Y_train_predict = ada.predict(X)
Y_test_predict = ada.predict(np.reshape(np.array(x1_test), (-1, 1)))


In [None]:
# Training data (blue) against the AdaBoost predictions at the test points (red).
plt.figure(figsize=(12, 6))
plt.scatter(X, Y, label='Data', color='blue', edgecolor='k')
plt.scatter(
    x1_test,
    Y_test_predict,
    label='Adaboost Regression',
    color='red',
    linewidth=3,
)
plt.ylabel('Co2 Emissions')
plt.xlabel('Fossil Fuel TW/h')
# Called without arguments, ylim only reads the current limits.
plt.ylim()
plt.legend()
plt.show()


In [None]:
# Two score reports for the AdaBoost model separated by a divider line.
# print_score is defined elsewhere in the notebook; train=True presumably
# selects the training-set report and train=False the test-set report.
for _on_train in (True, False):
    if not _on_train:
        print('------------------------------------------>>')
    print_score(ada, X, x1_test, Y, y1_test, train=_on_train)


## <b><span style="color: #FF0000"> Interpretation:</span> </b>

R² test is 76% (with an MSE=9*10^-4%) with Ada Boost  model. high fitting

## Plotting learning curve

In [None]:
from sklearn.model_selection import learning_curve

# 6-fold cross-validated learning curve for the AdaBoost model, scored with
# negated MSE at six absolute training-set sizes.
train_sizes, train_scores, validation_scores = learning_curve(
    ada,
    X,
    Y,
    scoring='neg_mean_squared_error',
    cv=6,
    train_sizes=[500, 700, 1200, 1500, 2000, 2100],
)
print('Training scores:\n\n', train_scores)
print('\nValidation scores:\n\n', validation_scores)


In [None]:
# Average over the CV folds; the scores are negated MSE, so the sign is flipped
# to plot a positive error.
train_scores_mean = -np.mean(train_scores, axis=1)
validation_scores_mean = -np.mean(validation_scores, axis=1)

# Training-set MSE of the fitted AdaBoost model, drawn as a flat reference line.
y_pred = ada.predict(X)
MSE_p = mean_squared_error(Y, y_pred)

plt.plot(train_sizes, train_scores_mean, label='Training error')
plt.plot(train_sizes, validation_scores_mean, label='Validation error')
plt.plot(
    [500, 700, 1200, 1500, 2000, 2200],
    [MSE_p] * 6,
    color='r',
    label='Desired',
)
plt.title('Learning curves for Ada Boost model', fontsize=14, y=1.03)
plt.xlabel('Training set size', fontsize=14)
plt.ylabel('MSE', fontsize=14)
plt.legend()
# Called without arguments, ylim only reads the current limits.
plt.ylim()
plt.show()


## <b><span style="color: #FF0000"> Interpretation:</span> </b>

Good variance/bias trade-off

## F\ Carbon Emissions Vs Electricity from Renewables :

In [None]:

# Single-feature inputs for this section: renewable electricity. The training
# feature is a column vector; the test feature is kept 1-D and reshaped by the
# cells that need a column.
X = np.reshape(np.array(X_train['Electricity from renewables (TWh)']), (-1, 1))
x1_test = np.array(X_test['Electricity from renewables (TWh)'])
Y, y1_test = Y_train, Y_test
len(x1_test)


### 1) Ridge:

In [None]:
from sklearn.linear_model import Ridge

# Ridge regression (alpha=0.5, with intercept) of the target on the renewables feature.
ridge = Ridge(alpha=0.5, fit_intercept=True).fit(X, Y)
ridge_pred = ridge.predict(X)

# Training data with the fitted line; the cell output is the learned coefficient.
plt.figure(figsize=(10, 8))
plt.scatter(X, Y, label='Data', color='blue', edgecolor='k')
plt.plot(X, ridge_pred, label='Regression Curve', color='red', linewidth=3)
plt.legend(loc='best')
plt.ylabel("co2_emissions_kt_by_country")
plt.xlabel('Electricity from renewables (TWh)')
ridge.coef_


In [None]:
# Two score reports for the Ridge model separated by a divider line.
# print_score is defined elsewhere in the notebook; train=True presumably
# selects the training-set report and train=False the test-set report.
for _on_train in (True, False):
    if not _on_train:
        print('------------------------------------------>>')
    print_score(ridge, X, x1_test, Y, y1_test, train=_on_train)


## Plotting learning curve

In [None]:
from sklearn.model_selection import learning_curve

# 6-fold cross-validated learning curve for the Ridge model, scored with
# negated MSE at six absolute training-set sizes.
train_sizes, train_scores, validation_scores = learning_curve(
    ridge,
    X,
    Y,
    scoring='neg_mean_squared_error',
    cv=6,
    train_sizes=[100, 500, 900, 1500, 2000, 2100],
)
print('Training scores:\n\n', train_scores)
print('\nValidation scores:\n\n', validation_scores)


In [None]:
# Average over the CV folds; the scores are negated MSE, so the sign is flipped
# to plot a positive error.
train_scores_mean = -np.mean(train_scores, axis=1)
validation_scores_mean = -np.mean(validation_scores, axis=1)

# Training-set MSE of the fitted Ridge model, drawn as a flat reference line.
y_pred = ridge.predict(X)
MSE_p = mean_squared_error(Y, y_pred)

plt.plot(train_sizes, train_scores_mean, label='Training error')
plt.plot(train_sizes, validation_scores_mean, label='Validation error')
plt.plot(
    [500, 700, 1200, 1500, 1900, 2100],
    [MSE_p] * 6,
    color='r',
    label='Desired',
)
plt.title('Learning curves for Ridge model', fontsize=14, y=1.03)
plt.xlabel('Training set size', fontsize=14)
plt.ylabel('MSE', fontsize=14)
plt.legend()
# Fixed vertical window chosen by the author for this plot.
plt.ylim(0.0013, 0.0018)
plt.show()


## <b><span style="color: #FF0000"> Interpretation:</span> </b>

Overfitting

### 2) Decision Tree 


In [None]:
from sklearn.tree import DecisionTreeRegressor

# Shallow regression tree: at most 3 levels and at least 50 samples per leaf.
tree = DecisionTreeRegressor(min_samples_leaf=50, max_depth=3).fit(X, Y)
# Display the fitted estimator as the cell output.
tree


In [None]:
# Order the test points by feature value so the tree's step function is drawn
# left to right as a single line.
sort_idx = np.argsort(x1_test.reshape(-1))

plt.figure(figsize=(10, 6))
plt.scatter(X, Y, label='data', c='blue', s=20, edgecolor='k')
plt.plot(
    x1_test[sort_idx],
    tree.predict(x1_test.reshape(-1, 1))[sort_idx],
    label='Regression max_depth=3',
    color='red',
    linewidth=2,
)
plt.ylabel('Co2 Emissions')
plt.xlabel('Electricity from renewables (TWh)')
plt.legend()
plt.show()


In [None]:
# Two score reports for the decision tree separated by a divider line.
# print_score is defined elsewhere in the notebook; train=True presumably
# selects the training-set report and train=False the test-set report.
for _on_train in (True, False):
    if not _on_train:
        print('------------------------------------------>>')
    print_score(tree, X, x1_test, Y, y1_test, train=_on_train)


## Plotting learning curve

In [None]:
from sklearn.model_selection import learning_curve

# 6-fold cross-validated learning curve for the decision tree, scored with R²
# at six absolute training-set sizes.
train_sizes, train_scores, validation_scores = learning_curve(
    tree,
    X,
    Y,
    scoring='r2',
    cv=6,
    train_sizes=[500, 700, 1200, 1500, 2000, 2100],
)
print('Training scores:\n\n', train_scores)
print('\nValidation scores:\n\n', validation_scores)


In [None]:
# The learning curve for the tree was computed with scoring='r2'. R² is a
# "higher is better" score that scikit-learn does not negate, so the fold means
# are used as-is. Flipping the sign (as done for neg_mean_squared_error) would
# plot -R² against a positive R² reference line.
train_scores_mean = train_scores.mean(axis=1)
validation_scores_mean = validation_scores.mean(axis=1)

# R² of the fitted tree on its own training data, used as the reference level.
# The variable keeps its historical name MSE_p even though it holds an R² here.
y_pred = tree.predict(X)
MSE_p = r2_score(Y, y_pred)

plt.plot(train_sizes, train_scores_mean, label='Training score')
plt.plot(train_sizes, validation_scores_mean, label='Validation score')
plt.plot([500, 700, 1200, 1500, 2000, 2100], [MSE_p] * 6, label='Desired', color='r')
plt.ylabel('R²', fontsize=14)
plt.xlabel('Training set size', fontsize=14)
plt.title('Learning curves for Decision Tree model', fontsize=14, y=1.03)
plt.legend()
plt.show()


## <b><span style="color: #FF0000"> Interpretation:</span> </b>

Underfitting: the model needs more samples.

### 3) AdaBoost:

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor

# AdaBoost over depth-3 regression trees (500 rounds, fixed seed for repeatability).
ada = AdaBoostRegressor(
    DecisionTreeRegressor(max_depth=3),
    random_state=42,
    n_estimators=500,
).fit(X, np.array(Y))

# Predictions on the training feature and on the test feature (as a column vector).
Y_train_predict = ada.predict(X)
Y_test_predict = ada.predict(np.reshape(np.array(x1_test), (-1, 1)))


In [None]:
# Training data (blue) against the AdaBoost predictions at the test points (red).
plt.figure(figsize=(12, 6))
plt.scatter(X, Y, color='blue', edgecolor='k', label='Data')
plt.scatter(x1_test, Y_test_predict, color='red', linewidth=3, label='Adaboost Regression')
# This section models electricity from renewables; the label was a leftover
# from the fossil-fuel section and now matches the other plots in this section.
plt.xlabel('Electricity from renewables (TWh)')
plt.ylabel('Co2 Emissions')
plt.legend()
plt.show()


In [None]:
# Two score reports for the AdaBoost model separated by a divider line.
# print_score is defined elsewhere in the notebook; train=True presumably
# selects the training-set report and train=False the test-set report.
for _on_train in (True, False):
    if not _on_train:
        print('------------------------------------------>>')
    print_score(ada, X, x1_test, Y, y1_test, train=_on_train)


## <b><span style="color: #FF0000"> Interpretation:</span> </b>

R² test is 76% (with an MSE=9*10^-4%)  with Ada Boost  model. high fitting

## Plotting learning curve

In [None]:
from sklearn.model_selection import learning_curve

# 6-fold cross-validated learning curve for the AdaBoost model, scored with
# negated MSE at six absolute training-set sizes.
train_sizes, train_scores, validation_scores = learning_curve(
    ada,
    X,
    Y,
    scoring='neg_mean_squared_error',
    cv=6,
    train_sizes=[500, 700, 1200, 1500, 2000, 2100],
)
print('Training scores:\n\n', train_scores)
print('\nValidation scores:\n\n', validation_scores)


In [None]:
# Average over the CV folds; the scores are negated MSE, so the sign is flipped
# to plot a positive error.
train_scores_mean = -np.mean(train_scores, axis=1)
validation_scores_mean = -np.mean(validation_scores, axis=1)

# Training-set MSE of the fitted AdaBoost model, drawn as a flat reference line.
y_pred = ada.predict(X)
MSE_p = mean_squared_error(Y, y_pred)

plt.plot(train_sizes, train_scores_mean, label='Training error')
plt.plot(train_sizes, validation_scores_mean, label='Validation error')
plt.plot(
    [500, 700, 1200, 1500, 2000, 2200],
    [MSE_p] * 6,
    color='r',
    label='Desired',
)
plt.title('Learning curves for Ada Boost model', fontsize=14, y=1.03)
plt.xlabel('Training set size', fontsize=14)
plt.ylabel('MSE', fontsize=14)
plt.legend()
# Called without arguments, ylim only reads the current limits.
plt.ylim()
plt.show()


## <b><span style="color: #FF0000"> Interpretation:</span> </b>

Good variance/bias trade-off

## 4) Polynomial ##

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline


# Degree-3 polynomial regression of the target on the single feature held in X:
# expand X into polynomial terms, then fit an ordinary linear model on them.
poly = PolynomialFeatures(degree=3)
X_poly = poly.fit_transform(X)
poly2 = LinearRegression()
poly2.fit(X_poly, np.array(Y).reshape(-1, 1))

# Unfitted pipeline with the same degree-3 expansion and linear model; it is
# the estimator passed to learning_curve further down in this section.
model = make_pipeline(PolynomialFeatures(3), LinearRegression())

# Grid over the observed feature range (step 0.1), as a column vector, used to
# draw the fitted curve.
X_fit = np.arange(X.min(), X.max(), 0.1)[:, np.newaxis]
# The expansion was already fitted on the training data, so the grid is only
# transformed here instead of re-fitting the transformer on prediction inputs.
Y_pred = poly2.predict(poly.transform(X_fit))
X_fit


In [None]:

# Training points with the fitted polynomial curve drawn on top.
plt.figure(figsize=(12, 6))
plt.scatter(X, Y, label='Data', c='lightblue', edgecolor='k')
plt.plot(X_fit, Y_pred, label='Poly Regression', c='orange', linewidth=3)
plt.ylabel('Co2 Emissions')
plt.xlabel('Energy from Renewables TW/h')
plt.legend()
plt.show()


In [None]:

# Training fit: coefficients of the linear model on the polynomial terms and its R².
print(f'model Coef={poly2.coef_}, R² ={r2_score(Y,poly2.predict(X_poly)):.3f}')
# Held-out fit: the test feature is expanded with the already-fitted transformer
# (transform rather than fit_transform, so evaluation never re-fits it).
print(f'R² test ={r2_score(np.array(y1_test).reshape(-1,1),poly2.predict(poly.transform(x1_test.reshape(-1,1)))):.3f}')


## <b><span style="color: #FF0000"> Interpretation:</span> </b>

R² test is 76% with Polynomial model. best fitting result.

## Plotting learning curve

In [None]:
from sklearn.model_selection import learning_curve

# 5-fold cross-validated learning curve for the polynomial pipeline, scored
# with negated MSE at six absolute training-set sizes.
train_sizes, train_scores, test_scores = learning_curve(
    model,
    X,
    Y,
    scoring='neg_mean_squared_error',
    cv=5,
    train_sizes=[500, 700, 1200, 1500, 1800, 2000],
)

# Per-size fold averages (sign flipped to a positive MSE) and fold spreads.
train_scores_mean = -train_scores.mean(axis=1)
train_scores_std = train_scores.std(axis=1)
test_scores_mean = -test_scores.mean(axis=1)
test_scores_std = test_scores.std(axis=1)

print('Training scores:\n\n', train_scores_mean)
print('\nTest scores:\n\n', test_scores_mean)


In [None]:
# Summarise the polynomial learning curve: fold averages (negated-MSE scores
# flipped to a positive error) and fold spreads per training size. Everything
# is derived from this model's own train_scores/test_scores; the original cell
# also averaged `validation_scores`, which still held the AdaBoost results and
# was never plotted.
train_scores_mean = -train_scores.mean(axis=1)
train_scores_std = train_scores.std(axis=1)
test_scores_mean = -test_scores.mean(axis=1)
test_scores_std = test_scores.std(axis=1)

# Reference level: training MSE of the fitted degree-3 polynomial model.
# (The original used ada.predict, i.e. the AdaBoost model's error.)
y_pred = poly2.predict(X_poly)
MSE_p = mean_squared_error(Y, y_pred)

plt.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, alpha=0.1, color="r")
plt.fill_between(train_sizes, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std, alpha=0.1, color="g")
plt.plot(train_sizes, train_scores_mean, '-.', color="b", label="Training error")
plt.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-validation score")
# Draw the reference over the same sizes that were actually evaluated.
plt.plot(train_sizes, [MSE_p] * len(train_sizes), label='Desired', color='r')

plt.ylabel('MSE', fontsize=14)
plt.xlabel('Training set size', fontsize=14)
plt.title('Learning curves for Polynomial 3', fontsize=14, y=1.03)
plt.legend()
# Fixed vertical window chosen by the author for this plot.
plt.ylim(0.0010, 0.0021)
plt.show()


## <b><span style="color: #FF0000"> Interpretation:</span> </b>

Overfitting