In [None]:
#import dependencies

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive so the data file
# stored on Drive can be read by path from this notebook.
# NOTE(review): `google.colab` is a Colab-specific module — this cell is
# presumably expected to run only inside Colab; confirm before running elsewhere.

from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import glob

# Recursively scan every folder under MyDrive for the World Health workbook
# and show all matching paths (an empty list means nothing was found).
search_pattern = '/content/drive/MyDrive/**/*World_Health_Data.xlsx'
file_path = glob.glob(search_pattern, recursive=True)
print(file_path)

['/content/drive/MyDrive/Colab Notebooks/Project4/World_Health_Data.xlsx']


In [None]:
import pandas as pd
import os

# Absolute Drive path of the workbook. Edit the folder portion of this path
# if the file is stored in a different Drive folder.
file_path = '/content/drive/MyDrive/Colab Notebooks/Project4/World_Health_Data.xlsx'

# Read the Excel workbook into a DataFrame, then preview its first five rows.
world_health_df = pd.read_excel(io=file_path)
world_health_df.head(n=5)

Unnamed: 0,IND_NAME,DIM_GEO_NAME,IND_CODE,DIM_GEO_CODE,DIM_TIME_YEAR,DIM_1_CODE,VALUE_NUMERIC,VALUE_STRING,VALUE_COMMENTS
0,Adolescent birth rate (per 1000 women),Afghanistan,MDG_0000000003,AFG,2021,AGEGROUP_YEARS15-19,62.0,62.0,Afghanistan 2022-2023 Multiple Indicator Clust...
1,Adolescent birth rate (per 1000 women),Afghanistan,MDG_0000000003,AFG,2021,AGEGROUP_YEARS10-14,18.0,18.0,Afghanistan 2022-2023 Multiple Indicator Clust...
2,Age-standardized mortality rate attributed to ...,Afghanistan,SDGAIRBODA,AFG,2019,SEX_BTSX,265.66452,265.7,
3,Age-standardized prevalence of hypertension am...,Afghanistan,NCD_HYP_PREVALENCE_A,AFG,2019,SEX_BTSX,40.200001,40.2,
4,Age-standardized prevalence of obesity among a...,Afghanistan,NCD_BMI_30A,AFG,2022,SEX_BTSX,19.222589,19.2,


In [None]:
# ETL step 1: inspect the raw column names before cleaning anything.
list(world_health_df.columns)

['IND_NAME',
 'DIM_GEO_NAME',
 'IND_CODE',
 'DIM_GEO_CODE',
 'DIM_TIME_YEAR',
 'DIM_1_CODE',
 'VALUE_NUMERIC',
 'VALUE_STRING',
 'VALUE_COMMENTS']

In [None]:
# Replace the coded source headers with human-readable column names.
readable_column_names = {
    "IND_NAME": "Health Indicator Name",
    "DIM_GEO_NAME": "Country",
    "IND_CODE": "Health Indicator Code",
    "DIM_GEO_CODE": "Country Code",
    "DIM_TIME_YEAR": "Year",
    "DIM_1_CODE": "Demographic",
    "VALUE_NUMERIC": "Health Indicator Value",
    "VALUE_STRING": "Rounded Indicator Value",
    "VALUE_COMMENTS": "Data Source",
}
world_health_df = world_health_df.rename(columns=readable_column_names)
world_health_df.head()

Unnamed: 0,Health Indicator Name,Country,Health Indicator Code,Country Code,Year,Demographic,Health Indicator Value,Rounded Indicator Value,Data Source
0,Adolescent birth rate (per 1000 women),Afghanistan,MDG_0000000003,AFG,2021,AGEGROUP_YEARS15-19,62.0,62.0,Afghanistan 2022-2023 Multiple Indicator Clust...
1,Adolescent birth rate (per 1000 women),Afghanistan,MDG_0000000003,AFG,2021,AGEGROUP_YEARS10-14,18.0,18.0,Afghanistan 2022-2023 Multiple Indicator Clust...
2,Age-standardized mortality rate attributed to ...,Afghanistan,SDGAIRBODA,AFG,2019,SEX_BTSX,265.66452,265.7,
3,Age-standardized prevalence of hypertension am...,Afghanistan,NCD_HYP_PREVALENCE_A,AFG,2019,SEX_BTSX,40.200001,40.2,
4,Age-standardized prevalence of obesity among a...,Afghanistan,NCD_BMI_30A,AFG,2022,SEX_BTSX,19.222589,19.2,


In [None]:
# Print a per-column summary of the DataFrame (non-null counts and dtypes)
# to see what kind of data each column holds and where values are missing.
world_health_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10503 entries, 0 to 10502
Data columns (total 9 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Health Indicator Name    10503 non-null  object 
 1   Country                  10503 non-null  object 
 2   Health Indicator Code    10503 non-null  object 
 3   Country Code             10503 non-null  object 
 4   Year                     10503 non-null  int64  
 5   Demographic              3863 non-null   object 
 6   Health Indicator Value   10503 non-null  float64
 7   Rounded Indicator Value  10503 non-null  object 
 8   Data Source              1847 non-null   object 
dtypes: float64(1), int64(1), object(7)
memory usage: 738.6+ KB


In [None]:
# Count the non-missing entries in each column; any column whose count is
# below the number of rows contains missing values.
world_health_df.notna().sum()

Unnamed: 0,0
Health Indicator Name,10503
Country,10503
Health Indicator Code,10503
Country Code,10503
Year,10503
Demographic,3863
Health Indicator Value,10503
Rounded Indicator Value,10503
Data Source,1847


In [None]:
# Put the columns in reading order: when/where first, then the indicator
# and its values, then the demographic breakdown and the data source.
column_order = [
    'Year',
    'Country',
    'Country Code',
    'Health Indicator Name',
    'Health Indicator Code',
    'Health Indicator Value',
    'Rounded Indicator Value',
    'Demographic',
    'Data Source',
]
world_health_df = world_health_df[column_order]
world_health_df.head()


Unnamed: 0,Year,Country,Country Code,Health Indicator Name,Health Indicator Code,Health Indicator Value,Rounded Indicator Value,Demographic,Data Source
0,2021,Afghanistan,AFG,Adolescent birth rate (per 1000 women),MDG_0000000003,62.0,62.0,AGEGROUP_YEARS15-19,Afghanistan 2022-2023 Multiple Indicator Clust...
1,2021,Afghanistan,AFG,Adolescent birth rate (per 1000 women),MDG_0000000003,18.0,18.0,AGEGROUP_YEARS10-14,Afghanistan 2022-2023 Multiple Indicator Clust...
2,2019,Afghanistan,AFG,Age-standardized mortality rate attributed to ...,SDGAIRBODA,265.66452,265.7,SEX_BTSX,
3,2019,Afghanistan,AFG,Age-standardized prevalence of hypertension am...,NCD_HYP_PREVALENCE_A,40.200001,40.2,SEX_BTSX,
4,2022,Afghanistan,AFG,Age-standardized prevalence of obesity among a...,NCD_BMI_30A,19.222589,19.2,SEX_BTSX,


In [None]:
# Remove the columns that are not carried forward into the cleaned frame:
# the indicator code, the rounded value and the data-source comment.
columns_to_drop = ["Health Indicator Code", "Rounded Indicator Value", "Data Source"]
cleaned_health_df = world_health_df.drop(columns=columns_to_drop)
cleaned_health_df.head()

Unnamed: 0,Year,Country,Country Code,Health Indicator Name,Health Indicator Value,Demographic
0,2021,Afghanistan,AFG,Adolescent birth rate (per 1000 women),62.0,AGEGROUP_YEARS15-19
1,2021,Afghanistan,AFG,Adolescent birth rate (per 1000 women),18.0,AGEGROUP_YEARS10-14
2,2019,Afghanistan,AFG,Age-standardized mortality rate attributed to ...,265.66452,SEX_BTSX
3,2019,Afghanistan,AFG,Age-standardized prevalence of hypertension am...,40.200001,SEX_BTSX
4,2022,Afghanistan,AFG,Age-standardized prevalence of obesity among a...,19.222589,SEX_BTSX


In [None]:
#Begin with Random Forest Model
import pandas as pd
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
%matplotlib inline

# Define the feature set X: every column of the cleaned frame except the
# prediction target, "Health Indicator Value".
# `drop` returns a new DataFrame, so `cleaned_health_df` itself is left
# untouched and this cell can be re-run safely.
X = cleaned_health_df.drop(columns="Health Indicator Value")
X.head()

Unnamed: 0,Year,Country,Country Code,Health Indicator Name,Demographic
0,2021,Afghanistan,AFG,Adolescent birth rate (per 1000 women),AGEGROUP_YEARS15-19
1,2021,Afghanistan,AFG,Adolescent birth rate (per 1000 women),AGEGROUP_YEARS10-14
2,2019,Afghanistan,AFG,Age-standardized mortality rate attributed to ...,SEX_BTSX
3,2019,Afghanistan,AFG,Age-standardized prevalence of hypertension am...,SEX_BTSX
4,2022,Afghanistan,AFG,Age-standardized prevalence of obesity among a...,SEX_BTSX


In [None]:
# Define the target vector as a 1-D NumPy array of the indicator values.
# `Series.to_numpy()` is used instead of `Series.ravel()`, which is deprecated
# in recent pandas releases and emits a warning when called.
y = cleaned_health_df["Health Indicator Value"].to_numpy()
y[:5]


  y = cleaned_health_df["Health Indicator Value"].ravel()


array([ 62.        ,  18.        , 265.66452026,  40.20000076,
        19.22258949])

In [None]:
# One-hot encode the categorical columns with get_dummies; the result
# replaces X as the feature matrix.
# NOTE(review): "Year" is not in this column list, so it is not part of the
# resulting features; "Country" and "Country Code" also look like they carry
# the same information — confirm both choices are intended.
categorical_columns = ['Country', 'Country Code', 'Health Indicator Name', 'Demographic']
X = pd.get_dummies(cleaned_health_df[categorical_columns])
X

Unnamed: 0,Country_Afghanistan,Country_African Region,Country_Albania,Country_Algeria,Country_Andorra,Country_Angola,Country_Antigua and Barbuda,Country_Argentina,Country_Armenia,Country_Australia,...,Health Indicator Name_Tuberculosis incidence (per 100 000 population),Health Indicator Name_UHC: Service coverage index,Health Indicator Name_Under-five mortality rate (per 1000 live births),Demographic_AGEGROUP_YEARS10-14,Demographic_AGEGROUP_YEARS15-19,Demographic_ALCOHOL,Demographic_DRUG,Demographic_SEX_BTSX,Demographic_SEX_FMLE,Demographic_SEX_MLE
0,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
1,True,False,False,False,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
2,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
3,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
4,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10498,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
10499,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
10500,False,False,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
10501,False,False,False,False,False,False,False,False,False,False,...,False,True,False,False,False,False,False,False,False,False


In [None]:
# Hold out part of the data for evaluation; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=78,
)

In [None]:
# Create the StandardScaler and fit it on the training split only, so the
# test split does not influence the scaling parameters.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaling to both splits.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [None]:
# Fit the Random Forest model

# The target ("Health Indicator Value") is continuous, so this is a regression
# problem. A RandomForestClassifier rejects such a target with
# "ValueError: Unknown label type: continuous", so a regressor is used instead.
# Imported here so this cell works on its own.
from sklearn.ensemble import RandomForestRegressor

# Create a random forest regressor (same tree count and seed as before)
rf_model = RandomForestRegressor(n_estimators=500, random_state=78)

# Fitting the model on the scaled training features
rf_model = rf_model.fit(X_train_scaled, y_train)

# NOTE(review): any evaluation that follows should use regression metrics
# (e.g. R^2, mean squared error) rather than accuracy / confusion matrix.

ValueError: Unknown label type: continuous. Maybe you are trying to fit a classifier, which expects discrete classes on a regression target with continuous values.