In [9]:
import pandas as pd
import numpy as np

### Preprocessing and data visualization/analysis
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import PolynomialFeatures, normalize, MinMaxScaler
from scipy import stats

### Models and model parameters ###
from sklearn import linear_model
from sklearn import tree
from sklearn.model_selection import train_test_split, cross_val_score, ShuffleSplit

In [10]:
data_pretty = pd.read_csv('Data Collection - Literacy Rate.csv')
data = data_pretty.values
data_pretty

Unnamed: 0,literacy_rate_percent_all,Number of doctors per 1000 people (most recent),Nurses Per 1000,Health Spending as a percentage of GDP,Total Health Spending per Capita,Prepaid Private Spending Per Capita,Out-of-Pocket Spending per Capita,Development Assistance for Health Spending per Capita,Output
0,38.20,0.295,0.360,0.102,168,1,129,29,25.9
1,97.60,1.286,3.900,0.071,848,23,465,7,75.4
2,80.20,1.192,1.924,0.069,1026,13,282,0,63.1
3,100.00,3.690,1.924,0.119,9203,691,3306,0,94.7
4,71.10,0.144,1.442,0.026,197,9,63,8,33.4
5,98.90,0.158,3.064,0.051,1198,104,277,0,69.8
6,98.10,3.907,4.212,0.067,1457,151,264,9,68.1
7,99.80,2.803,5.423,0.092,849,12,678,11,70.7
8,99.00,3.496,12.566,0.090,4400,567,855,0,95.9
9,99.00,5.230,8.304,0.103,5183,338,928,0,93.9


In [12]:
### Extract input and output ###

HAQ = data[:,-1]
input_vals = np.delete(data,-1,axis=1)
input_vals_normalized = np.zeros(np.shape(input_vals))

#### Calculate covariances of raw data and normalized data

In [13]:
### Raw data ###

input_vals_df = data_pretty.drop('Output', axis=1)

input_vals_cov = input_vals_df.cov()
print(input_vals_cov)

                                                    literacy_rate_percent_all   \
literacy_rate_percent_all                                           370.480124   
Number of doctors per 1000 people (most recent)                      17.695202   
Nurses Per 1000                                                      39.185745   
Health Spending as a percentage of GDP                                0.004632   
Total Health Spending per Capita                                  14587.863064   
Prepaid Private Spending Per Capita                                 932.248522   
Out-of-Pocket Spending per Capita                                  3187.393174   
Development Assistance for Health Spending per ...                 -237.143788   

                                                    Number of doctors per 1000 people (most recent)  \
literacy_rate_percent_all                                                                 17.695202   
Number of doctors per 1000 people (most recent)        

In [14]:
### Normalized data ###

input_vals_norm_df = pd.DataFrame(input_vals_normalized, columns=input_vals_df.columns)
input_vals_norm_cov = input_vals_norm_df.cov()
print(input_vals_norm_cov)

                                                    literacy_rate_percent_all   \
literacy_rate_percent_all                                                  0.0   
Number of doctors per 1000 people (most recent)                            0.0   
Nurses Per 1000                                                            0.0   
Health Spending as a percentage of GDP                                     0.0   
Total Health Spending per Capita                                           0.0   
Prepaid Private Spending Per Capita                                        0.0   
Out-of-Pocket Spending per Capita                                          0.0   
Development Assistance for Health Spending per ...                         0.0   

                                                    Number of doctors per 1000 people (most recent)  \
literacy_rate_percent_all                                                                       0.0   
Number of doctors per 1000 people (most recent)        

In [18]:
### Rescale all data to [0,1] as required ###

indices_to_normalize = [1,2,4,5,6,7]
for i in indices_to_normalize:
    input_vals_normalized[:,i] = input_vals[:,i]/input_vals[:,i].max()
input_vals_normalized[:,0] = input_vals[:,0]/100.0 # make literacy rate [0,1]d

In [17]:
### Displaying the current set of input_vals 

input_vals_normalized

array([[3.82000000e-01, 3.92339407e-02, 1.97476687e-02, ...,
        2.64550265e-04, 3.90199637e-02, 6.57596372e-02],
       [9.76000000e-01, 1.71033382e-01, 2.13933077e-01, ...,
        6.08465608e-03, 1.40653358e-01, 1.58730159e-02],
       [8.02000000e-01, 1.58531720e-01, 1.05540318e-01, ...,
        3.43915344e-03, 8.52994555e-02, 0.00000000e+00],
       ...,
       [7.01000000e-01, 4.13618832e-02, 4.15249589e-02, ...,
        5.29100529e-04, 4.29522081e-02, 2.72108844e-02],
       [6.34000000e-01, 1.21026732e-02, 4.86012068e-02, ...,
        6.61375661e-03, 1.84513007e-02, 1.76870748e-01],
       [8.65000000e-01, 1.02407235e-02, 6.40153593e-02, ...,
        8.73015873e-03, 1.69388990e-02, 1.20181406e-01]])

In [21]:
### split into training and test data ###

percent_training = 0.8
training_index = int(input_vals.shape[0] // (1/percent_training))
x_training = input_vals_normalized[0:training_index]
x_test = input_vals_normalized[training_index:,]

y_training = HAQ[0:training_index]
y_test = HAQ[training_index:,]

#### Multivariate Linear Regression

In [22]:
lin_regr = linear_model.LinearRegression()

lin_regr.fit(x_training, y_training)
predicted_y = lin_regr.predict(x_test)

In [23]:
print('MSE for linear regression =', mean_squared_error(y_test, predicted_y))
print('R2 for linear regression =',r2_score(y_test, predicted_y))
print('linear regression coeffs =',lin_regr.coef_)

MSE for linear regression = 91.44275496187751
R2 for linear regression = 0.8025309848479995
linear regression coeffs = [ 4.06410659e+01  2.42538217e+01  1.63470756e+01  1.42108547e-14
  3.34033159e+01 -1.20042722e+01  1.40207113e+01 -1.63977769e+01]


#### 10-fold CV 

In [24]:
x_training, x_test, y_training, y_test = train_test_split(input_vals_normalized, HAQ, test_size=0.2)

In [25]:
### no data shuffling ###
scores = cross_val_score(lin_regr, input_vals_normalized, HAQ, cv=10)
print('No shuffle =',scores)
### with data shuffling ###
CV = ShuffleSplit(n_splits=10,test_size=0.2)
scores_rand_shuffle = cross_val_score(lin_regr, input_vals_normalized, HAQ, cv=CV)
print('With shuffle =', scores_rand_shuffle)
print('mean =', np.mean(scores_rand_shuffle), '\n2 STDs of error =', np.std(scores_rand_shuffle)*2)

No shuffle = [0.70791533 0.29882545 0.84829483 0.87275414 0.48178034 0.85793134
 0.8775729  0.88414836 0.82062416 0.78088546]
With shuffle = [0.73705667 0.3828078  0.70483875 0.85480302 0.73550601 0.61089792
 0.76138471 0.84164566 0.83364824 0.81346973]
mean = 0.7276058512025815 
2 STDs of error = 0.269903176281694


#### Regression Trees

In [26]:
tree_regrs = []
max_depths = [4, 5, 8]

for d in max_depths:
    tree_regrs.append(tree.DecisionTreeRegressor(max_depth=d))

tree_regr_test_results = []
for tr in tree_regrs:
    tr = tr.fit(x_training, y_training)
    tree_regr_test_results.append(tr.predict(x_test))

In [27]:
### Calculate mean squared error between D.T. Regressor and test data ###
error = []
for r in tree_regr_test_results:
    error.append(mean_squared_error(y_test, r))
print(error)

[109.07553191130098, 154.1203698865289, 138.68934751461987]


#### SDG Data

In [30]:
SDG_scaled_pretty = pd.read_csv('IHME_HEALTH_RELATED_SDG_1990_2030_SCALED.CSV')
SDG_unscaled_pretty = pd.read_csv('IHME_HEALTH_RELATED_SDG_1990_2030_UNSCALED.CSV')

In [31]:
SDG_scaled_pretty

Unnamed: 0,location_id,location_name,year_id,estimate_type,indicator_id,indicator_short,ihme_indicator_description,indicator_is_mdg,indicator_outline,indicator_unit,target_description,goal_description,scaled_value,scaled_lower,scaled_upper
0,6,China,1990,Past,1054,SDG Index,SDG Index: Geometric mean of all health-relate...,,,Scaled/Normalized value ranging from 0-100,,,33.2,31.1,35.8
1,6,China,1995,Past,1054,SDG Index,SDG Index: Geometric mean of all health-relate...,,,Scaled/Normalized value ranging from 0-100,,,39.7,37.7,41.2
2,6,China,2000,Past,1054,SDG Index,SDG Index: Geometric mean of all health-relate...,,,Scaled/Normalized value ranging from 0-100,,,45.7,44.5,46.8
3,6,China,2005,Past,1054,SDG Index,SDG Index: Geometric mean of all health-relate...,,,Scaled/Normalized value ranging from 0-100,,,51.4,50.4,52.4
4,6,China,2010,Past,1054,SDG Index,SDG Index: Geometric mean of all health-relate...,,,Scaled/Normalized value ranging from 0-100,,,54.0,52.4,55.8
5,6,China,2016,Past,1054,SDG Index,SDG Index: Geometric mean of all health-relate...,,,Scaled/Normalized value ranging from 0-100,,,60.6,59.5,62.0
6,6,China,2017,Projection,1054,SDG Index,SDG Index: Geometric mean of all health-relate...,,,Scaled/Normalized value ranging from 0-100,,,61.6,60.4,62.9
7,6,China,2018,Projection,1054,SDG Index,SDG Index: Geometric mean of all health-relate...,,,Scaled/Normalized value ranging from 0-100,,,62.6,61.4,63.8
8,6,China,2019,Projection,1054,SDG Index,SDG Index: Geometric mean of all health-relate...,,,Scaled/Normalized value ranging from 0-100,,,63.0,61.8,64.4
9,6,China,2020,Projection,1054,SDG Index,SDG Index: Geometric mean of all health-relate...,,,Scaled/Normalized value ranging from 0-100,,,63.6,62.3,65.0


In [34]:
# Filtering into the most recent past data and getting rid of projections

# Getting rid of estimate_type projections
SDG_without_projections = SDG_scaled_pretty[SDG_scaled_pretty[3]!='Projection', :]
print(SDG_without_projections)


KeyError: 3

## Tasks to complete

* Import in SDG data and rerun all current models
* Analyze current models and choose better ones
* Double check covariances and remove/combine attributes
* Find new unique attributes 