In [33]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_curve, auc, confusion_matrix, average_precision_score, precision_recall_curve


## Importing Data

In [2]:
goal_11 = pd.read_excel("sdg_data/Goal11.xlsx")

In [4]:
goal_11.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19296 entries, 0 to 19295
Data columns (total 23 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Goal               19296 non-null  int64  
 1   Target             19296 non-null  object 
 2   Indicator          19296 non-null  object 
 3   SeriesCode         19296 non-null  object 
 4   SeriesDescription  19296 non-null  object 
 5   GeoAreaCode        19296 non-null  int64  
 6   GeoAreaName        19296 non-null  object 
 7   TimePeriod         19296 non-null  int64  
 8   Value              19296 non-null  float64
 9   Time_Detail        19296 non-null  object 
 10  TimeCoverage       0 non-null      float64
 11  UpperBound         0 non-null      float64
 12  LowerBound         0 non-null      float64
 13  BasePeriod         0 non-null      float64
 14  Source             19296 non-null  object 
 15  GeoInfoUrl         0 non-null      float64
 16  FootNote           152

In [8]:
subset_goal_11 = goal_11[['GeoAreaName', 'SeriesCode', 'Value']]

In [14]:
goal_11_transformed = subset_goal_11.pivot_table(index='GeoAreaName', columns = 'SeriesCode', aggfunc='mean')

In [27]:
goal_11_transformed.loc[['World']]

Unnamed: 0_level_0,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value,Value
SeriesCode,EN_ATM_PM25,EN_LND_SLUM,EN_REF_WASCOL,SD_CPA_UPRDP,SG_DSR_LGRGSR,SG_DSR_SFDRR,SG_DSR_SILN,SG_DSR_SILS,SG_GOV_LOGV,VC_DSR_AFFCT,...,VC_DSR_IJILN,VC_DSR_LSGP,VC_DSR_MISS,VC_DSR_MMHN,VC_DSR_MORT,VC_DSR_MTMP,VC_DSR_OBDN,VC_DSR_PDAN,VC_DSR_PDLN,VC_DSR_PDYN
GeoAreaName,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
World,34.708362,23.5,,156.0,,118.0,,,,,...,,,,,,,,,,


In [28]:
goal_11_T_noWorld = goal_11_transformed.drop(index='World')

In [30]:
goal_11_T_noWorld.info()

<class 'pandas.core.frame.DataFrame'>
Index: 265 entries, Afghanistan to Åland Islands
Data columns (total 34 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   (Value, EN_ATM_PM25)    202 non-null    float64
 1   (Value, EN_LND_SLUM)    133 non-null    float64
 2   (Value, EN_REF_WASCOL)  89 non-null     float64
 3   (Value, SD_CPA_UPRDP)   264 non-null    float64
 4   (Value, SG_DSR_LGRGSR)  132 non-null    float64
 5   (Value, SG_DSR_SFDRR)   8 non-null      float64
 6   (Value, SG_DSR_SILN)    107 non-null    float64
 7   (Value, SG_DSR_SILS)    106 non-null    float64
 8   (Value, SG_GOV_LOGV)    110 non-null    float64
 9   (Value, VC_DSR_AFFCT)   142 non-null    float64
 10  (Value, VC_DSR_AGLH)    102 non-null    float64
 11  (Value, VC_DSR_BSDN)    99 non-null     float64
 12  (Value, VC_DSR_CDAN)    80 non-null     float64
 13  (Value, VC_DSR_CDYN)    27 non-null     float64
 14  (Value, VC_DSR_CHLN)    24 

In [36]:
#dropping countries that don't have any data for %in urban slums
goal_11_final = goal_11_transformed[goal_11_transformed[('Value', 'EN_LND_SLUM')].notna()]

In [37]:
goal_11_final.info()

<class 'pandas.core.frame.DataFrame'>
Index: 134 entries, Afghanistan to Zimbabwe
Data columns (total 34 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   (Value, EN_ATM_PM25)    130 non-null    float64
 1   (Value, EN_LND_SLUM)    134 non-null    float64
 2   (Value, EN_REF_WASCOL)  61 non-null     float64
 3   (Value, SD_CPA_UPRDP)   134 non-null    float64
 4   (Value, SG_DSR_LGRGSR)  81 non-null     float64
 5   (Value, SG_DSR_SFDRR)   8 non-null      float64
 6   (Value, SG_DSR_SILN)    64 non-null     float64
 7   (Value, SG_DSR_SILS)    64 non-null     float64
 8   (Value, SG_GOV_LOGV)    67 non-null     float64
 9   (Value, VC_DSR_AFFCT)   96 non-null     float64
 10  (Value, VC_DSR_AGLH)    73 non-null     float64
 11  (Value, VC_DSR_BSDN)    68 non-null     float64
 12  (Value, VC_DSR_CDAN)    60 non-null     float64
 13  (Value, VC_DSR_CDYN)    18 non-null     float64
 14  (Value, VC_DSR_CHLN)    15 non-n

In [38]:
# Setting Percent of Urban Population Living in Slums as Target to predict
target = np.array(goal_11_final[('Value', 'EN_LND_SLUM')])

#Remove Target from features
features = goal_11_final.drop(('Value', 'EN_LND_SLUM'), axis=1)

#Saving Feature names for future use
feature_list = list(features.columns)

#Converting to numpy array
features = np.array(features)


In [39]:
# Using Skicit-learn to split data into training and testing sets
# Split the data into training and testing sets
train_features, test_features, train_target, test_target = train_test_split(
    features, target, test_size = 0.25, random_state = 42)

In [40]:
print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_target.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_target.shape)

Training Features Shape: (100, 33)
Training Labels Shape: (100,)
Testing Features Shape: (34, 33)
Testing Labels Shape: (34,)


In [42]:
# # The baseline predictions are the historical averages
# baseline_preds = test_features[:, feature_list.index('average')]
# # Baseline errors, and display average baseline error
# baseline_errors = abs(baseline_preds - test_labels)
# print('Average baseline error: ', round(np.mean(baseline_errors), 2))

In [43]:
# Instantiate model with 1000 decision trees
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)
# Train the model on training data
rf.fit(train_features, train_target);

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').