In [86]:
#Import the necessary items from libraries

from statistics import mean
import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_squared_error

In [87]:
#Load the data and divide the data into sets, initiate K-Fold cross validation

df = pd.read_csv('CostOfLivingData.csv')
df = df[["City", "Country", "City Population", "City Cost of Living", "Country Cost of Living"]]

X = df[['City Population', 'Country Cost of Living']].values[:, 0:2]
y = df.values[:, 3]

X_rem, X_test, y_rem, y_test = train_test_split(X,y,test_size=0.1,random_state=42)
kFold = KFold(n_splits=5, shuffle=True, random_state=42)

In [88]:
#Linear regression model

tr_errors = []
val_errors = []
coefficients = []
intercepts = []
i = 0

print("===LINEAR REGRESSION===")

for (train_indices, val_indices) in kFold.split(X_rem):
    X_train, y_train, X_val, y_val = X[train_indices], y[train_indices], X[val_indices], y[val_indices]
        
    lin_regr = LinearRegression()
    lin_regr.fit(X_train, y_train)    

    y_pred_train = lin_regr.predict(X_train)   
    tr_error = mean_squared_error(y_train, y_pred_train)   
    y_pred_val = lin_regr.predict(X_val) 
    val_error = mean_squared_error(y_val, y_pred_val)

    tr_errors.append(tr_error)
    val_errors.append(val_error)
    coefficients.append(lin_regr.coef_)
    intercepts.append(lin_regr.intercept_)
    i += 1

    print("\n\nCV iteration " + str(i))
    print("Training error: " + str(tr_error))
    print("Validation error: " + str(val_error))

print("\n\nAverage training error: " + str(sum(tr_errors)/5))
print("Average validation error: " + str(sum(val_errors)/5))
print("\nLearned weights: ",)
for coef in coefficients:
    print(coef)
print("\nLearned intercepts: " + str(intercepts))

===LINEAR REGRESSION===


CV iteration 1
Training error: 22.241421351907544
Validation error: 10.440545211933722


CV iteration 2
Training error: 17.43881582219404
Validation error: 29.883323836956745


CV iteration 3
Training error: 21.454969202113478
Validation error: 13.49497389745292


CV iteration 4
Training error: 17.436935128651196
Validation error: 29.540055921295867


CV iteration 5
Training error: 20.56629531968704
Validation error: 17.06491193816013


Average training error: 19.827687364910663
Average validation error: 20.084762161159876

Learned weights: 
[1.84098082e-07 1.01210710e+00]
[1.63547091e-07 1.00679802e+00]
[1.59144116e-07 1.00953156e+00]
[1.80489106e-07 1.00907648e+00]
[1.87166681e-07 1.01172866e+00]

Learned intercepts: [0.4766260694986073, 0.5809287487540118, 0.802067402758226, 0.6757875643785525, 0.6151013780858676]


In [89]:
#Decision tree models

tr_errors = []
val_errors = []
maxDepth = 1
print("===Decision tree regressor===")
while maxDepth < 21:
    tr_errors = []
    val_errors = []
    for (train_indices, val_indices) in kFold.split(X_rem):
        X_train, y_train, X_val, y_val = X[train_indices], y[train_indices], X[val_indices], y[val_indices]
        
        tree_reg = DecisionTreeRegressor(max_depth=maxDepth)
        tree_reg.fit(X_train,y_train)

        y_pred_train = tree_reg.predict(X_train)
        tr_errors.append(mean_squared_error(y_train, y_pred_train))
        y_pred_val = tree_reg.predict(X_val)
        val_errors.append(mean_squared_error(y_val, y_pred_val))
    print("\n\nDecision tree max depth " + str(maxDepth) + ":")
    print("Average training error: " + str(sum(tr_errors)/5))
    print("Average validation error: " + str(sum(val_errors)/5))
    maxDepth += 1

===Decision tree regressor===


Decision tree max depth 1:
Average training error: 137.62533666126706
Average validation error: 143.1095726113014


Decision tree max depth 2:
Average training error: 49.74042874006814
Average validation error: 57.27070094432911


Decision tree max depth 3:
Average training error: 24.97944589061118
Average validation error: 33.3612001992028


Decision tree max depth 4:
Average training error: 17.563827324565757
Average validation error: 31.525029876085267


Decision tree max depth 5:
Average training error: 13.978593556539755
Average validation error: 28.840896377311584


Decision tree max depth 6:
Average training error: 11.659275778211022
Average validation error: 30.252709235612247


Decision tree max depth 7:
Average training error: 8.565431364307516
Average validation error: 32.300985055808816


Decision tree max depth 8:
Average training error: 5.798310270416641
Average validation error: 31.674608671914136


Decision tree max depth 9:
Average train

In [90]:
#After comparing the errors, linear regression model is chosen.
#The final model is obtained by fitting to the set for K-Fold validation (90%).

print("===Final model===")

lin_regr = LinearRegression()
lin_regr.fit(X_rem, y_rem)
y_pred_rem = lin_regr.predict(X_rem)
tr_error = mean_squared_error(y_rem, y_pred_rem)
print("\nTraining error: " + str(tr_error))

y_pred_test = lin_regr.predict(X_test)
test_error = mean_squared_error(y_test, y_pred_test)
print("Testing error: " + str(test_error))

===Final model===

Training error: 18.83074604885515
Testing error: 12.68378981952182
