In [38]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import BaggingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.cross_decomposition import PLSRegression
from sklearn.metrics import r2_score
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA
from sklearn.metrics import mean_absolute_error

In [3]:
test_file_path = '/home/lucyanthony/University/bristol/year_4/DataScienceToolbox/Assessment1/Datasets/test.xlsx'
train_file_path = '/home/lucyanthony/University/bristol/year_4/DataScienceToolbox/Assessment1/Datasets/train.xlsx'
test_data = pd.read_excel(test_file_path)
train_data = pd.read_excel(train_file_path)

In [14]:
indep_train = train_data[['Mean_Stringency_Index', 'CH_Index', 'Gov_Resp_Index', 'Econ_Sup_Index', 'days_since', 'total_cases', 'total_deaths', 'new_cases', 'new_deaths']]
dep_train = train_data['reproduction_rate']
indep_test = test_data[['Mean_Stringency_Index', 'CH_Index', 'Gov_Resp_Index', 'Econ_Sup_Index', 'days_since', 'total_cases', 'total_deaths', 'new_cases', 'new_deaths']]
dep_test = test_data['reproduction_rate']

# Knn models are distance based, so scaling of features (indep. variables) is essential

scaler = RobustScaler()

# We apply the function fit_transform to the training data which chooses model parameters according to the mean and standard deviation
# We apply simply the transform function to the test data since the model parameters have already been chosen

indep_scaled_train = scaler.fit_transform(indep_train)
indep_scaled_test = scaler.transform(indep_test)

In [21]:
knn = KNeighborsRegressor(n_neighbors=15, p=1, weights='uniform')
knn.fit(indep_scaled_train, dep_train)
prediction = knn.predict(indep_scaled_test)

# Now we will test the performance of our model

mse = mean_squared_error(dep_test, prediction)
print(f'Mean Squared Error for k=15: {mse}')

Mean Squared Error for k=15: 0.1888025598963954


In [26]:
base_model = LinearRegression()
knn_model = KNeighborsRegressor(n_neighbors=15, p=1, weights = 'uniform')

rfe = RFE(estimator=base_model, n_features_to_select=4)
rfe.fit(indep_scaled_train, dep_train)
train_rfe = rfe.transform(indep_scaled_train)
test_rfe = rfe.transform(indep_scaled_test)

# RFE and selects important features using the linear regression model
# Now we test the KNN model using these variables

knn_model.fit(train_rfe, dep_train)
predictions = knn_model.predict(test_rfe)
mse = mean_squared_error(dep_test, predictions)

print(f'Mean Squared Error for k=16, p=1, weights="uniform", with selected features: {mse}')
print(f'Selected Features: {rfe.support_}')

Mean Squared Error for k=16, p=1, weights="uniform", with selected features: 0.18681731698409274
Selected Features: [False  True  True  True  True False False False False]


In [27]:
knn = KNeighborsRegressor(n_neighbors=15, p=1, weights='uniform')
pca = PCA(n_components=0.95)
pca_train = pca.fit_transform(indep_scaled_train)
pca_test = pca.transform(indep_scaled_test)

scores_pca = cross_val_score(knn, pca_train, dep_train, cv=3, scoring='neg_mean_squared_error')
average_mse_pca = -scores_pca.mean()

print(f"Average MSE from 3-fold cross-validation with PCA: {average_mse_pca}")

# We can also train on the full data now, and test on the test data

knn.fit(pca_train, dep_train)
predictions_pca = knn.predict(pca_test)
mse_test_pca = mean_squared_error(dep_test, predictions_pca)

print(f"Test MSE with PCA: {mse_test_pca}")

Average MSE from 3-fold cross-validation with PCA: 0.189561857175738
Test MSE with PCA: 0.19194715950102612


In [43]:
# We will try PLS

pls = PLSRegression(n_components=3)
indep_train_pls = pls.fit_transform(indep_scaled_train, dep_train)[0]
indep_test_pls = pls.transform(indep_scaled_test)

knn = KNeighborsRegressor(n_neighbors=15, p=1, weights='uniform')
knn.fit(indep_train_pls, dep_train)

predictions = knn.predict(indep_test_pls)
mse = mean_squared_error(dep_test, predictions)
print(f'Mean Squared Error after PLS: {mse}')

Mean Squared Error after PLS: 0.1802949371897259


In [50]:
# Now we will try Bagging (indirectly) to help the knn

knn = KNeighborsRegressor(n_neighbors=15, p=1, weights='uniform')
bagging_knn = BaggingRegressor(knn, n_estimators=20)
bagging_knn.fit(indep_scaled_train, dep_train)
predictions = bagging_knn.predict(indep_scaled_test)
mse = mean_squared_error(dep_test, predictions)
print(f'Mean Squared Error after PLS: {mse}')

Mean Squared Error after PLS: 0.18603565048122678


In [None]:
# Now we will try t-SNE

