In [1]:
import numpy as np
import random 
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns

from regression_utils import *

np.set_printoptions(suppress=True)
%matplotlib inline

In [2]:
# convention followed in relation to scikit documentation 
# y = countryQuantity 
# y1 = countryQuantityTrain
# Y = countryQuantityTest
# y_pred = countryQuantityPredict

# X = years 
# X1 = yearsTrain
# x_mis = yearsTest
# x = yearsPredict

# c = combinedZip

In [3]:
#reading data from csv 
df = pd.read_csv('datasets_12603_17232_Life Expectancy Data.csv')
df.fillna(df.mean(), inplace=True)
# Quantity variation of 3 countries (Germany, Netherlands, Spain)
# change here 
country = 'Germany'
countryQuantity = 'Life expectancy'
countryDataFrame = countryDF(country, df)   

randomRowsTrain = countryDataFrame.sample(frac=0.7)
randomRowsTest = countryDataFrame.loc[~countryDataFrame.index.isin(randomRowsTrain.index)]

countryQuantityTrain = columnExtractor(randomRowsTrain, countryQuantity)
countryQuantityTest = columnExtractor(randomRowsTest, countryQuantity)
print('Training data (X) : \n',countryQuantityTrain)
print('Test data (X): \n',countryQuantityTest)


desiredFeatures = ['Year', 'Total expenditure']
allFeaturesTrain = variableColumnExtractor(randomRowsTrain, desiredFeatures)
allFeaturesTest = variableColumnExtractor(randomRowsTest, desiredFeatures)
print('Training data (Y): \n',allFeaturesTrain)
print('Test data (Y): \n',allFeaturesTest)

#df_percent = df.sample(frac=0.7)
#df_rest = df.loc[~df.index.isin(df_percent.index)]
#countryQuantityTrain, yearsTrain, countryQuantityTest, yearsTest


Training data (X) : 
 996     86.0
997     86.0
1002    79.8
998     85.0
995     89.0
1000    80.0
1005    79.1
999     81.0
1003    79.6
1004    79.2
1006    78.5
Name: Life expectancy, dtype: float64
Test data (X): 
 994     81.0
1001    79.9
1007    78.4
1008    78.3
1009    78.0
Name: Life expectancy, dtype: float64
Training data (Y): 
       Year  Total expenditure
996   2013              11.16
997   2012               1.99
1002  2007               1.18
998   2011               1.93
995   2014              11.30
1000  2009              11.40
1005  2004               1.37
999   2010              11.25
1003  2006               1.34
1004  2005               1.52
1006  2003               1.62
Test data (Y): 
       Year  Total expenditure
994   2015            5.93819
1001  2008            1.39000
1007  2002            1.40000
1008  2001            1.15000
1009  2000            1.10000


In [5]:
##Gaussian regression analysis 

lengthScale = np.random.randint(50) 
kernel = C(1.0, (1e-3, 1e3)) * RBF(lengthScale, (1e-2, 1e2))

gp = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=9)

gp.fit(allFeaturesTrain, countryQuantityTrain)
countryQuantityPredict, sigma = gp.predict(allFeaturesTest, return_std=True)

print('Test data (X): \n',countryQuantityTest)
print('Test data (Y): \n',allFeaturesTest)
print('Predicted data (X): \n',allFeaturesTest)
print('Predicted data (Y): \n',countryQuantityPredict)

Test data (X): 
 994     81.0
1001    79.9
1007    78.4
1008    78.3
1009    78.0
Name: Life expectancy, dtype: float64
Test data (Y): 
       Year  Total expenditure
994   2015            5.93819
1001  2008            1.39000
1007  2002            1.40000
1008  2001            1.15000
1009  2000            1.10000
Predicted data (X): 
       Year  Total expenditure
994   2015            5.93819
1001  2008            1.39000
1007  2002            1.40000
1008  2001            1.15000
1009  2000            1.10000
Predicted data (Y): 
 [97.92596316 80.78169504 79.11468646 80.08306333 80.23820585]




In [7]:
##Linear regression analysis 

regr = linear_model.LinearRegression()
regr.fit(allFeaturesTrain, countryQuantityTrain)

countryQuantityPredictLR = regr.predict(allFeaturesTest)

print(countryQuantityPredictLR)

[88.73253113 82.26206361 75.94968174 74.94695835 73.90502416]


In [None]:
# error computation for GPR 
years = columnExtractor(countryDataFrame,'Year').tolist()
error = errorComputation(countryDataFrame, countryQuantityPredict, 'Life expectancy',regression_type='GPR')
#print('GPR error: ',error)
errorPlot(years, error, 'Year', 'Relative Error in %',regression_type='GPR',color='#1f77b4')

#print('------------------------------------------')

years = columnExtractor(countryDataFrame,'Year').tolist()
error = errorComputation(countryDataFrame, countryQuantityPredictLR, 'Life expectancy',regression_type='LR')
#print('LR error ',error)
errorPlot(years, error, 'Year', 'Relative Error in %',regression_type='LR',color='#ff7f0e')

plt.show()

#go with rmse for outliers 
#try using normalization

In [None]:
#https://yugeten.github.io/posts/2019/09/GP/
#https://scikit-learn.org/stable/auto_examples/gaussian_process/plot_gpr_co2.html
#https://scikit-learn.org/stable/auto_examples/gaussian_process/plot_gpr_co2.html#sphx-glr-download-auto-examples-gaussian-process-plot-gpr-co2-py
#https://ogrisel.github.io/scikit-learn.org/sklearn-tutorial/tutorial/statistical_inference/supervised_learning.html
