In [1]:
# Import the required libraries
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from matplotlib import pyplot as plt, style # for ploting
import seaborn as sns

In [2]:
import warnings


def warn(*args, **kwargs):
    """Do-nothing stand-in for ``warnings.warn``; accepts and ignores any arguments."""
    return None


# Rebind the module attribute so that every later ``warnings.warn(...)`` call
# made through the module resolves to the no-op above.
warnings.warn = warn

In [13]:
def num_plot(df, col, title, symb):
    """Plot a boxplot stacked above a histogram for one column of ``df``.

    The histogram panel is annotated with vertical lines at the column's mean,
    median and first mode, and a legend showing those values.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing ``col``.
    col : str
        Name of the column to plot.
    title : str
        Title placed above the boxplot panel.
    symb : str
        Suffix appended to each value in the legend (presumably a unit symbol).

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    fig, (ax_box, ax_hist) = plt.subplots(
        2, 1, sharex=True, figsize=(8, 5),
        gridspec_kw={"height_ratios": (.2, .8)},
    )

    # Top panel: boxplot with the title; y ticks carry no information here.
    ax_box.set_title(title, fontsize=18)
    sns.boxplot(x=col, data=df, ax=ax_box)
    ax_box.set(yticks=[])

    # Bottom panel: histogram sharing the x axis with the boxplot.
    sns.histplot(x=col, data=df, ax=ax_hist)
    ax_hist.set_xlabel(col, fontsize=16)

    # Compute each statistic once and reuse it for both the line and its label.
    values = df[col]
    mean = values.mean()
    median = values.median()
    mode = values.mode()[0]  # first mode when several values tie

    # Draw on the histogram axes explicitly instead of relying on pyplot's
    # "current axes", which only pointed at this panel because it was created last.
    ax_hist.axvline(mean, color='darkgreen', linewidth=2.2, label='mean=' + str(np.round(mean, 1)) + symb)
    ax_hist.axvline(median, color='red', linewidth=2.2, label='median=' + str(np.round(median, 1)) + symb)
    ax_hist.axvline(mode, color='purple', linewidth=2.2, label='mode=' + str(mode) + symb)
    ax_hist.legend(bbox_to_anchor=(1, 1.03), ncol=1, fontsize=17, fancybox=True, shadow=True, frameon=True)

    fig.tight_layout()
    plt.show()

In [14]:
sns.set_style("whitegrid")
# sns.despine() is intentionally not called here: with no figure open it has no
# axes to act on and only creates an empty figure as cell output.

# Matplotlib 3.6 renamed the bundled seaborn style sheets to "seaborn-v0_8-*"
# and later releases dropped the old names; use whichever one is installed.
_whitegrid_style = (
    "seaborn-v0_8-whitegrid"
    if "seaborn-v0_8-whitegrid" in plt.style.available
    else "seaborn-whitegrid"
)
plt.style.use(_whitegrid_style)

# Global figure/axes defaults applied to every plot created afterwards.
plt.rc("figure", autolayout=True)
plt.rc("axes", labelweight="bold", labelsize="large", titleweight="bold", titlesize=14, titlepad=10)

<Figure size 432x288 with 0 Axes>

In [3]:
# Load the three processed workbooks, using the first sheet column as the index
Res_Nonhydro_Capita = pd.read_excel("../data/processed/Res_Nonhydro_Capita.xlsx", index_col=0)
SDGs_Ranks_2021 = pd.read_excel("../data/processed/SDGs_Ranks_2021.xlsx", index_col=0)
SDGs_Scores_2021 = pd.read_excel("../data/processed/SDGs_Scores_2021.xlsx", index_col=0)

# Target: keep only the 2021 column (as a one-column frame) under a descriptive name
Res_Nonhydro_Capita_2021 = Res_Nonhydro_Capita[[2021]].rename(
    columns={2021: 'Res_Nonhydro_Capita_2021'}
)

# Drop the first remaining column of each SDG table, then attach the target
# as an extra column (pd.concat with axis=1 aligns rows on the index)
SDGs_Ranks_2021 = pd.concat(
    [SDGs_Ranks_2021.iloc[:, 1:], Res_Nonhydro_Capita_2021], axis=1
)
SDGs_Scores_2021 = pd.concat(
    [SDGs_Scores_2021.iloc[:, 1:], Res_Nonhydro_Capita_2021], axis=1
)

# Predicciones - Modelo KNN SDGs_Scores_2021

In [4]:
from sklearn.neighbors import KNeighborsRegressor  # specific to this model
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix

from sklearn.model_selection import train_test_split

# Features are every column except the target.
# `columns=` replaces the positional axis argument (`drop(label, 1)`), which
# pandas deprecated in 1.3 and rejects from 2.0 onwards.
X = SDGs_Scores_2021.drop(columns='Res_Nonhydro_Capita_2021')
y = SDGs_Scores_2021['Res_Nonhydro_Capita_2021']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training split only, then apply it to both splits.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

k_range = range(1, 40)
scores = []

# NOTE(review): k is selected using the test-split R^2, so the test score
# reported for the chosen k is optimistic; consider cross-validating on the
# training split instead.
for k in k_range:
    knn = KNeighborsRegressor(n_neighbors=k)
    knn.fit(X_train, y_train)
    scores.append(knn.score(X_test, y_test))

# Compare as an array: `list == scalar` is only element-wise when the scalar
# happens to be a NumPy type, and silently evaluates to False otherwise.
print('El k optimo es', np.array(k_range)[np.asarray(scores) == max(scores)])

El k optimo es [1]


In [6]:
# Refit a 1-nearest-neighbour regressor on the scaled training split
knn = KNeighborsRegressor(n_neighbors=1).fit(X_train, y_train)

# R^2 on the held-out split first, then on the data the model was fitted on
test_r2 = knn.score(X_test, y_test)
train_r2 = knn.score(X_train, y_train)
print("KNeighbors Test: coefficient of determination R^2 of the prediction: ", test_r2)
print("KNeighbors Train: coefficient of determination R^2 of the prediction: ", train_r2)

KNeighbors Test: coefficient of determination R^2 of the prediction:  0.9088257552301467
KNeighbors Train: coefficient of determination R^2 of the prediction:  1.0


**'KNeighborsRegressor' object has no attribute 'feature_importances_'**

# Predicciones - Modelo KNN SDGs_Ranks_2021

In [44]:
from sklearn.neighbors import KNeighborsRegressor  # specific to this model
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix

from sklearn.model_selection import train_test_split

# Features are every column except the target.
# `columns=` replaces the positional axis argument (`drop(label, 1)`), which
# pandas deprecated in 1.3 and rejects from 2.0 onwards.
X = SDGs_Ranks_2021.drop(columns='Res_Nonhydro_Capita_2021')
y = SDGs_Ranks_2021['Res_Nonhydro_Capita_2021']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training split only, then apply it to both splits.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

k_range = range(1, 40)
scores = []

# NOTE(review): k is selected using the test-split R^2, so the test score
# reported for the chosen k is optimistic; consider cross-validating on the
# training split instead.
for k in k_range:
    knn = KNeighborsRegressor(n_neighbors=k)
    knn.fit(X_train, y_train)
    scores.append(knn.score(X_test, y_test))

# Compare as an array: `list == scalar` is only element-wise when the scalar
# happens to be a NumPy type, and silently evaluates to False otherwise.
print('El k optimo es', np.array(k_range)[np.asarray(scores) == max(scores)])

El k optimo es [2]


In [48]:
# Refit a 2-nearest-neighbour regressor on the scaled training split
knn = KNeighborsRegressor(n_neighbors=2).fit(X_train, y_train)

# R^2 on the held-out split first, then on the data the model was fitted on
test_r2 = knn.score(X_test, y_test)
train_r2 = knn.score(X_train, y_train)
print("KNeighbors Test: coefficient of determination R^2 of the prediction: ", test_r2)
print("KNeighbors Train: coefficient of determination R^2 of the prediction: ", train_r2)

KNeighbors Test: coefficient of determination R^2 of the prediction:  0.9478716341292264
KNeighbors Train: coefficient of determination R^2 of the prediction:  0.8018615997859501


# CONCLUSIÓN

# El mejor modelo es el KNN sobre SDGs_Ranks_2021 con n_neighbors=2, que obtiene en test un coeficiente de determinación R^2 de 0.95, frente al KNN sobre SDGs_Scores_2021, que obtiene 0.91 con n_neighbors=1

In [49]:
from sklearn.neighbors import KNeighborsRegressor  # specific to this model
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix

from sklearn.model_selection import train_test_split

# Features are every column except the target.
# `columns=` replaces the positional axis argument (`drop(label, 1)`), which
# pandas deprecated in 1.3 and rejects from 2.0 onwards.
X = SDGs_Ranks_2021.drop(columns='Res_Nonhydro_Capita_2021')
y = SDGs_Ranks_2021['Res_Nonhydro_Capita_2021']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training split only, then apply it to both splits.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Selected model: KNN with two neighbours on the rank features.
knn = KNeighborsRegressor(n_neighbors=2)
knn.fit(X_train, y_train)

print("KNeighbors Test: coefficient of determination R^2 of the prediction: ", knn.score(X_test, y_test))
print("KNeighbors Train: coefficient of determination R^2 of the prediction: ", knn.score(X_train, y_train))

KNeighbors Test: coefficient of determination R^2 of the prediction:  0.9478716341292264
KNeighbors Train: coefficient of determination R^2 of the prediction:  0.8018615997859501
