In [1]:
# Import the required libraries
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import warnings


def warn(*args, **kwargs):
    """Accept any arguments and do nothing; used as a silent stand-in for ``warnings.warn``."""
    return None


# Replace the standard entry point so calls to warnings.warn become no-ops for this session.
warnings.warn = warn

In [3]:
def num_plot(df, col, title, symb):
    """Plot a box plot stacked above a histogram for one numeric column.

    Vertical lines on the histogram mark the mean, the median and the first
    mode of ``df[col]``; the legend shows each value followed by ``symb``.

    Parameters
    ----------
    df : object indexable as ``df[col]`` (a pandas DataFrame in this notebook)
        Data to plot.
    col : str
        Column to plot; also used as the x-axis label.
    title : str
        Title shown above the box plot.
    symb : str
        Text appended to every legend value (it is concatenated to strings).
    """
    fig, (ax_box, ax_hist) = plt.subplots(
        2, 1,
        sharex=True,
        figsize=(8, 5),
        gridspec_kw={"height_ratios": (.2, .8)},
    )

    ax_box.set_title(title, fontsize=18)
    sns.boxplot(x=col, data=df, ax=ax_box)
    ax_box.set(yticks=[])

    sns.histplot(x=col, data=df, ax=ax_hist)
    ax_hist.set_xlabel(col, fontsize=16)

    # Compute each statistic once and reuse it for both the line and its label.
    values = df[col]
    mean_value = values.mean()
    median_value = values.median()
    mode_value = values.mode()[0]  # first mode when there are several

    # Draw on the histogram axes explicitly instead of relying on pyplot's
    # implicit "current axes".
    ax_hist.axvline(mean_value, color='darkgreen', linewidth=2.2,
                    label='mean=' + str(np.round(mean_value, 1)) + symb)
    ax_hist.axvline(median_value, color='red', linewidth=2.2,
                    label='median=' + str(np.round(median_value, 1)) + symb)
    ax_hist.axvline(mode_value, color='purple', linewidth=2.2,
                    label='mode=' + str(mode_value) + symb)
    ax_hist.legend(bbox_to_anchor=(1, 1.03), ncol=1, fontsize=17,
                   fancybox=True, shadow=True, frameon=True)

    fig.tight_layout()
    plt.show()

In [4]:
# Global plot styling for the notebook.
# sns.despine() is intentionally not called here: with no figure open it only
# created an empty figure ("<Figure ... with 0 Axes>") and styled nothing.
sns.set_style("whitegrid")

# Newer matplotlib releases ship the seaborn styles under "seaborn-v0_8-*";
# fall back to the historical name when the new one is not available.
_whitegrid_style = "seaborn-v0_8-whitegrid"
if _whitegrid_style not in plt.style.available:
    _whitegrid_style = "seaborn-whitegrid"
plt.style.use(_whitegrid_style)

plt.rc("figure", autolayout=True)
plt.rc("axes", labelweight="bold", labelsize="large", titleweight="bold", titlesize=14, titlepad=10)

<Figure size 432x288 with 0 Axes>

In [8]:
# Import the required libraries
import pandas as pd
import numpy as np

# Read the data
Res_Nonhydro_Capita = pd.read_excel("../data/processed/Res_Nonhydro_Capita.xlsx", index_col=0)
SDGs_Ranks_2021 = pd.read_excel("../data/processed/SDGs_Ranks_2021.xlsx", index_col=0)
SDGs_Scores_2021 = pd.read_excel("../data/processed/SDGs_Scores_2021.xlsx", index_col=0)

# Only consider the 2021 year
Res_Nonhydro_Capita_2021 = Res_Nonhydro_Capita[[2021]].rename(
    columns={2021: 'Res_Nonhydro_Capita_2021'}
)

# Do not consider the first column
SDGs_Ranks_2021 = SDGs_Ranks_2021.iloc[:, 1:]
SDGs_Scores_2021 = SDGs_Scores_2021.iloc[:, 1:]

# Dichotomize the 2021 value at its median: 1 = "Wealthy", 0 = otherwise.
# NaN compares as False, so rows without a 2021 value are labelled 0
# (NOTE(review): confirm this is intended rather than dropping those rows).
median = Res_Nonhydro_Capita_2021['Res_Nonhydro_Capita_2021'].median()
Res_Nonhydro_Capita_2021 = (
    Res_Nonhydro_Capita_2021['Res_Nonhydro_Capita_2021']
    .ge(median)
    .astype(int)
    .to_frame('Wealthy')
)

# Append ONLY the binary target to both datasets. The continuous column that
# "Wealthy" is derived from must not be kept among the features (it would leak
# the target), and the ranks table needs "Wealthy" for the later ranks model.
SDGs_Ranks_2021 = pd.concat([SDGs_Ranks_2021, Res_Nonhydro_Capita_2021], axis=1)
SDGs_Scores_2021 = pd.concat([SDGs_Scores_2021, Res_Nonhydro_Capita_2021], axis=1)

In [12]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# Features are every column except the binary target.
# `columns=` replaces the positional axis argument, which pandas 2.0 no longer accepts.
X = SDGs_Scores_2021.drop(columns='Wealthy')
y = SDGs_Scores_2021['Wealthy']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
from xgboost import XGBRegressor
from sklearn.metrics import accuracy_score

# Standardise the features; the scaler is fitted on the training split only.
ss = StandardScaler()
X_train_scaled = ss.fit_transform(X_train)
X_test_scaled = ss.transform(X_test)

# Candidate numbers of principal components and of neighbours (1..17 each).
maxComponents = list(range(1, 18))
list_n_neighbors = list(range(1, 18))

# NOTE(review): candidates are compared by accuracy on the test split, so the
# best score printed below is an optimistic estimate; prefer cross-validation
# on the training data (e.g. GridSearchCV) for model selection.
Accuracy_Best = float("-inf")  # sentinel: any real accuracy replaces it
n_neighbors_Best = None
max_components_Best = None

for componentsi in maxComponents:
    # The projection only depends on the number of components, so fit it once
    # per outer iteration.
    pca = PCA(n_components=componentsi, random_state=42)
    pca.fit(X_train_scaled)
    X_train_scaled_pca = pca.transform(X_train_scaled)
    X_test_scaled_pca = pca.transform(X_test_scaled)
    for n_neighborsi in list_n_neighbors:
        kn_clas = KNeighborsClassifier(n_neighbors=n_neighborsi)
        kn_clas.fit(X_train_scaled_pca, y_train)
        predictions = kn_clas.predict(X_test_scaled_pca)
        Accuracy_New = accuracy_score(y_test, predictions)
        # Strict comparison: on ties the earliest combination is kept.
        if Accuracy_Best < Accuracy_New:
            Accuracy_Best = Accuracy_New
            n_neighbors_Best = n_neighborsi
            max_components_Best = componentsi

# The last value is the number of neighbours (it was mislabelled 'n_estimators').
print('Accuracy_Best', Accuracy_Best,
      'max_components_Best', max_components_Best,
      'n_neighbors_Best', n_neighbors_Best)

Accuracy_Best 0.8214285714285714 max_components_Best 1 n_estimators 3


In [18]:
# Scale -> PCA -> k-NN, chained in one estimator.
pipeline = Pipeline(steps = [
    ("scaler", StandardScaler()), # first standardise the features
    ("pca", PCA()), # then project onto principal components
    ("kneighborsclassifier", KNeighborsClassifier()) # finally classify with k-NN
])

# A single candidate: the combination hard-coded from the manual search.
pipeline_param = {
    'pca__n_components' :  [1],
    'pca__random_state' :  [42],
    'kneighborsclassifier__n_neighbors' :  [3]
}

gs_pipeline = GridSearchCV(pipeline,
                            pipeline_param,
                            cv = 10,
                            scoring = 'accuracy',
                            verbose = 1, # progress messages
                            n_jobs = -1)

# Fit the search configured above instead of building a second, differently
# configured GridSearchCV and leaving `gs_pipeline` unused. `fit` returns the
# search object itself, so `search` and `gs_pipeline` are the same object.
search = gs_pipeline.fit(X_train, y_train)

# `score` on a classifier pipeline is mean accuracy, not a coefficient of determination.
print("Accuracy train", search.best_estimator_.score(X_train, y_train))
print("Accuracy test", search.best_estimator_.score(X_test, y_test))

Train: Coeficiente de determinacion de la predicción: 0.8807339449541285
Test: Coeficiente de determinacion de la predicción: 0.8214285714285714


# ABAJO ANTIGUO

In [35]:
# Scale -> PCA -> k-NN, chained in a single estimator.
pipeline = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("pca", PCA()),
        ("kneighborsclassifier", KNeighborsClassifier()),
    ]
)

# Search space: 1..16 principal components x 1..49 neighbours.
pipeline_param = {
    'pca__n_components': [*range(1, 17)],
    'kneighborsclassifier__n_neighbors': [*range(1, 50)],
}

# 10-fold cross-validated search scored by accuracy, using all CPU cores.
gs_pipeline = GridSearchCV(
    estimator=pipeline,
    param_grid=pipeline_param,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
    verbose=1,  # print a progress summary
)

grids = {"gs_pipeline": gs_pipeline}

# Fit every registered search on the training split.
for nombre in grids:
    grid_search = grids[nombre]
    grid_search.fit(X_train, y_train)

Fitting 10 folds for each of 784 candidates, totalling 7840 fits


In [36]:
# Best cross-validated score, the parameters that achieved it, and the refitted pipeline.
print(
    gs_pipeline.best_score_,
    gs_pipeline.best_params_,
    gs_pipeline.best_estimator_,
    sep="\n",
)

0.889090909090909
{'kneighborsclassifier__n_neighbors': 11, 'pca__n_components': 2}
Pipeline(steps=[('scaler', StandardScaler()), ('pca', PCA(n_components=2)),
                ('kneighborsclassifier', KNeighborsClassifier(n_neighbors=11))])


In [37]:
# Accuracy of the refitted best pipeline on the training and held-out splits.
best_pipeline = gs_pipeline.best_estimator_
print("Accuracy train", best_pipeline.score(X_train, y_train))
print("Accuracy test", best_pipeline.score(X_test, y_test))

Accuracy train 0.8807339449541285
Accuracy test 0.8214285714285714


## MEJOR MODELO
* n_neighbors 11
* n_components 2
* Accuracy train 0.8807339449541285
* Accuracy test 0.8214285714285714

In [187]:
import pickle

# Serialise the refitted best pipeline to the file 'model_v1' in the working directory.
with open('model_v1', mode='wb') as archivo_salida:
    pickle.dump(gs_pipeline.best_estimator_, archivo_salida)

In [188]:
# Read the pickled model back from 'model_v1'.
# Unpickling can execute arbitrary code, so only load files you created yourself.
with open('model_v1', mode='rb') as archivo_entrada:
    loaded_model = pickle.load(archivo_entrada)

In [189]:
# Score the reloaded model on the held-out split (displayed as the cell output).
loaded_model.score(X=X_test, y=y_test)

0.8214285714285714

In [171]:
# Put the held-out features and their target side by side in one table.
test_csv = pd.concat(objs=[X_test, y_test], axis="columns")
test_csv

Unnamed: 0_level_0,SDG1,SDG2,SDG3,SDG4,SDG5,SDG6,SDG7,SDG8,SDG9,SDG10,SDG11,SDG12,SDG13,SDG14,SDG15,SDG16,SDG17,Wealthy
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Portugal,99.3815,64.1045,90.954769,99.003333,83.27275,84.1368,95.022667,82.155333,75.417667,69.697,88.422,63.0565,75.819667,43.065,72.673,88.0677,62.67875,1
Poland,99.4655,68.159875,85.269923,98.298667,72.18,84.9112,89.265667,84.405333,71.76,75.4285,82.428,75.869333,75.097,65.122,92.32,81.4482,62.381,1
Belize,55.346,59.362571,72.339308,77.694333,56.37825,75.8738,92.3135,58.4702,22.748,36.501007,79.086,72.233,91.131667,67.9176,54.1744,60.812125,71.505,1
Central African Republic,0.0,43.6455,16.026071,9.207,30.88825,45.5114,12.8275,47.698,3.526,9.641,19.243,72.737667,99.673,48.160239,89.9138,44.737,37.324333,0
Turkey,99.624,65.068,82.928,96.273,46.32975,75.1284,89.161,59.060667,60.3015,38.105,73.9875,79.810833,86.967667,51.998167,53.2594,66.496444,71.96325,1
Italy,98.06,71.018625,94.043,96.708333,72.829,82.135,93.847333,76.572,83.6045,71.545,82.34125,63.609667,76.82,53.541167,80.2214,78.9895,62.95275,1
Ireland,99.795,75.894875,93.231846,99.738708,76.39025,87.2854,93.429333,87.3286,82.017,86.423,88.75525,45.179,58.196333,71.895167,89.083,90.434333,51.30775,1
Sri Lanka,91.544,62.129625,77.814571,97.413667,52.00425,56.627,66.947667,76.2195,34.159167,36.333,82.970667,85.360167,95.152,71.7165,55.9466,66.2264,49.136,0
Vietnam,95.1425,71.032625,72.467429,97.107667,72.20325,73.138,83.200333,72.149167,43.975667,70.4995,80.11625,91.899667,93.97,49.192167,46.689,65.96,59.68125,0
Mauritania,70.742,43.194571,41.840154,39.399,32.27375,58.3254,42.2215,45.026167,17.474833,81.838,40.36025,89.013,95.74,65.951667,76.103,52.6346,51.450667,0


In [172]:
# Export the held-out table, keeping the index as the first CSV column.
test_csv.to_csv(path_or_buf='test.csv', index=True)

In [173]:
import json
  
# Metadata written to info.json.
dictionary = dict(
    nombre_alumno="Miguel Chamochin",
    titulo="Conectando los Objetivos de Desarrollo Sostenible con el cambio climático y la transición energética",
    tipo_ml="O",
    target="Wealthy",
)

# Serialise to a string first, then write it out in one go.
with open("info.json", "w") as outfile:
    outfile.write(json.dumps(dictionary))

# Predicciones - SDGs_Ranks_2021

In [190]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# Features are every column except the binary target.
# `columns=` replaces the positional axis argument, which pandas 2.0 no longer accepts.
X = SDGs_Ranks_2021.drop(columns='Wealthy')
y = SDGs_Ranks_2021['Wealthy']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [191]:
# Scale -> PCA -> k-NN, chained in a single estimator.
pipeline = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("pca", PCA()),
        ("kneighborsclassifier", KNeighborsClassifier()),
    ]
)

# Search space: 1..16 principal components x 1..49 neighbours.
pipeline_param = {
    'pca__n_components': [*range(1, 17)],
    'kneighborsclassifier__n_neighbors': [*range(1, 50)],
}

# 10-fold cross-validated search scored by accuracy, using all CPU cores.
gs_pipeline = GridSearchCV(
    estimator=pipeline,
    param_grid=pipeline_param,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
    verbose=1,  # print a progress summary
)

grids = {"gs_pipeline": gs_pipeline}

# Fit every registered search on the training split.
for nombre in grids:
    grid_search = grids[nombre]
    grid_search.fit(X_train, y_train)

Fitting 10 folds for each of 784 candidates, totalling 7840 fits


In [192]:
# Best cross-validated score, the parameters that achieved it, and the refitted pipeline.
print(
    gs_pipeline.best_score_,
    gs_pipeline.best_params_,
    gs_pipeline.best_estimator_,
    sep="\n",
)

0.8890909090909093
{'kneighborsclassifier__n_neighbors': 1, 'pca__n_components': 10}
Pipeline(steps=[('scaler', StandardScaler()), ('pca', PCA(n_components=10)),
                ('kneighborsclassifier', KNeighborsClassifier(n_neighbors=1))])


In [193]:
# Accuracy of the refitted best pipeline on the training and held-out splits.
best_pipeline = gs_pipeline.best_estimator_
print("Accuracy train", best_pipeline.score(X_train, y_train))
print("Accuracy test", best_pipeline.score(X_test, y_test))

Accuracy train 1.0
Accuracy test 0.6785714285714286


# CONCLUSION