We can use principal component analysis (PCA) for the following purposes:
* To reduce the number of dimensions in the dataset.
* To find patterns in the high-dimensional dataset.

**Eigenvectors and eigenvalues**
The eigenvectors and eigenvalues of a covariance (or correlation) matrix are the core of PCA. The eigenvectors (principal components) determine the directions of the new attribute space, and the eigenvalues determine their magnitude, i.e. how much of the data's variance lies along each direction.

In [10]:
# Import the required libraries
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import warnings


def warn(*args, **kwargs):
    """Do-nothing stand-in for ``warnings.warn``: accepts any arguments and discards them."""
    return None


# Route every later ``warnings.warn(...)`` call to the no-op above, so code that looks
# the function up on the ``warnings`` module at call time emits nothing.
warnings.warn = warn

In [3]:
def num_plot(df: pd.DataFrame, col: str, title: str, symb: str) -> None:
    """Plot the distribution of one numeric column as a box plot above a histogram.

    The two panels share the x-axis. Vertical lines on the histogram mark the mean,
    the median and the first mode of the column; the legend shows their values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains ``col``.
    col : str
        Name of the column to plot.
    title : str
        Title written above the box plot.
    symb : str
        Text appended to each value in the legend.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    fig, (ax_box, ax_hist) = plt.subplots(
        2, 1, sharex=True, figsize=(8, 5), gridspec_kw={"height_ratios": (.2, .8)}
    )

    # Top panel: box plot carrying the figure title, with the y ticks hidden.
    ax_box.set_title(title, fontsize=18)
    sns.boxplot(x=col, data=df, ax=ax_box)
    ax_box.set(yticks=[])

    # Bottom panel: histogram.
    sns.histplot(x=col, data=df, ax=ax_hist)
    ax_hist.set_xlabel(col, fontsize=16)

    # Compute each statistic once and reuse it for both the line and its label.
    values = df[col]
    mean_value = values.mean()
    median_value = values.median()
    modes = values.mode()

    # Draw on the histogram axes explicitly instead of relying on pyplot's
    # "current axes" state.
    ax_hist.axvline(mean_value, color='darkgreen', linewidth=2.2,
                    label='mean=' + str(np.round(mean_value, 1)) + symb)
    ax_hist.axvline(median_value, color='red', linewidth=2.2,
                    label='median=' + str(np.round(median_value, 1)) + symb)
    # ``mode()`` returns an empty Series when the column has no non-NaN values;
    # skip the line in that case instead of failing on the lookup.
    if not modes.empty:
        mode_value = modes.iloc[0]
        ax_hist.axvline(mode_value, color='purple', linewidth=2.2,
                        label='mode=' + str(mode_value) + symb)

    ax_hist.legend(bbox_to_anchor=(1, 1.03), ncol=1, fontsize=17,
                   fancybox=True, shadow=True, frameon=True)
    fig.tight_layout()
    plt.show()

In [4]:
# Global plotting style for the notebook.
# (No sns.despine() here: it acts on existing axes, and at this point there are none,
# so it would only create an empty figure.)
sns.set_style("whitegrid")

# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8-*" and later
# removed the old names, so use whichever spelling this installation provides.
_whitegrid_style = next(
    (name for name in ("seaborn-v0_8-whitegrid", "seaborn-whitegrid")
     if name in plt.style.available),
    None,
)
if _whitegrid_style is not None:
    plt.style.use(_whitegrid_style)

plt.rc("figure", autolayout=True)
plt.rc("axes", labelweight="bold", labelsize="large", titleweight="bold", titlesize=14, titlepad=10)

<Figure size 432x288 with 0 Axes>

In [11]:
# Import the required libraries
import pandas as pd
import numpy as np

# Target: non-hydro renewable capacity per capita. Keep only the 2021 column and give
# it an explicit name so it can be appended to the SDG tables.
Res_Nonhydro_Capita = pd.read_excel("../data/processed/Res_Nonhydro_Capita.xlsx", index_col=0)
Res_Nonhydro_Capita_2021 = Res_Nonhydro_Capita[[2021]].rename(
    columns={2021: 'Res_Nonhydro_Capita_2021'}
)

# SDG ranks: read the table, drop its first column and append the target column.
SDGs_Ranks_2021 = pd.concat(
    [
        pd.read_excel("../data/processed/SDGs_Ranks_2021.xlsx", index_col=0).iloc[:, 1:],
        Res_Nonhydro_Capita_2021,
    ],
    axis=1,
)

# SDG scores: same treatment as the ranks table.
SDGs_Scores_2021 = pd.concat(
    [
        pd.read_excel("../data/processed/SDGs_Scores_2021.xlsx", index_col=0).iloc[:, 1:],
        Res_Nonhydro_Capita_2021,
    ],
    axis=1,
)

# Conclusion - PCA + XGBRegressor model on SDGs_Scores_2021

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error, r2_score


# Separate the feature matrix from the regression target.
# `columns=` replaces the positional axis argument (`drop(label, 1)`), which was
# deprecated in pandas 1.3 and raises a TypeError from pandas 2.0 onwards.
X = SDGs_Scores_2021.drop(columns='Res_Nonhydro_Capita_2021')
y = SDGs_Scores_2021['Res_Nonhydro_Capita_2021']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [13]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error, r2_score


# Standardise the features. The scaler is fitted on the training split only and then
# applied unchanged to the test split.
ss = StandardScaler()
X_train_scaled = ss.fit_transform(X_train)
X_test_scaled = ss.transform(X_test)

# Search grid: number of principal components plus the XGBoost settings.
# Assumes X has at least 17 feature columns (PCA cannot keep more components than
# there are features) - TODO confirm.
maxComponents = list(range(1, 18))
list_n_estimators = list(range(1, 18))
list_max_depth = [5, 10, 15, 20, 25]
list_learning_rate = [0.01, 0.1, 0.5, 0.75, 1, 1.25, 1.5]

# Start from +inf so the first candidate goes through the same "strictly better"
# comparison as every later one. The previous first-iteration special case stayed
# active for a whole inner sweep and overwrote the running best unconditionally.
MAE_Best = float("inf")
n_estimators_Best = None
max_depth_Best = None
learning_rate_Best = None
max_components_Best = None

# NOTE(review): candidates are ranked by their error on the test split, so the winning
# MAE is an optimistic estimate. Consider selecting on a validation split or with
# cross-validation on the training data instead.
for componentsi in maxComponents:
    # The projection depends only on the number of components: fit it once per value
    # on the training data and reuse it for every XGBoost configuration.
    pca = PCA(n_components=componentsi, random_state=42)
    pca.fit(X_train_scaled)
    X_train_scaled_pca = pca.transform(X_train_scaled)
    X_test_scaled_pca = pca.transform(X_test_scaled)
    for n_estimatorsi in list_n_estimators:
        for max_depthi in list_max_depth:
            for learning_ratei in list_learning_rate:
                xgb_reg = XGBRegressor(n_estimators=n_estimatorsi, max_depth=max_depthi, learning_rate=learning_ratei, random_state=42)
                xgb_reg.fit(X_train_scaled_pca, y_train)
                predictions = xgb_reg.predict(X_test_scaled_pca)
                MAE_New = mean_absolute_error(y_test, predictions)
                # Strict "<" keeps the earliest candidate when several tie.
                if MAE_New < MAE_Best:
                    MAE_Best = MAE_New
                    n_estimators_Best = n_estimatorsi
                    max_depth_Best = max_depthi
                    learning_rate_Best = learning_ratei
                    max_components_Best = componentsi
print('MAE_Best', MAE_Best,
        'max_components_Best', max_components_Best,
        'n_estimators', n_estimators_Best,
        'max_depth', max_depth_Best,
        'learning_rate', learning_rate_Best)

MAE_Best 124.30026474949962 max_components_Best 2 n_estimators 7 max_depth 20 learning_rate 0.5


# CONCLUSION PIPELINE TO SAVE

In [14]:
from xgboost import XGBRegressor


# Final model as a single estimator, applied in order:
# standardise the features, project them with PCA, then fit an XGBoost regressor.
pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("pca", PCA()),
    ("xGBRegressor", XGBRegressor()),
])

# One candidate per parameter: the grid only pins the chosen settings, so the search
# evaluates a single configuration with 5-fold cross-validation.
pipeline_param = dict(
    pca__n_components=[2],
    pca__random_state=[42],
    xGBRegressor__n_estimators=[7],
    xGBRegressor__max_depth=[20],
    xGBRegressor__learning_rate=[0.5],
)

search = GridSearchCV(pipeline, pipeline_param, cv=5)
search.fit(X_train, y_train)

# Score of the selected estimator on the training split and on the held-out test split.
print("Train: Coeficiente de determinacion de la predicción:", search.best_estimator_.score(X_train, y_train))
print("Test: Coeficiente de determinacion de la predicción:", search.best_estimator_.score(X_test, y_test))

Train: Coeficiente de determinacion de la predicción: 0.9828559407903492
Test: Coeficiente de determinacion de la predicción: 0.914312814311357
