In [6]:
import pandas as pd

df = pd.read_csv("../data/999_selected_features_60days.csv", index_col=0)

features = df.iloc[:, 1:]
target = df.cases_60days

In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(features, target)

In [21]:
from sklearn.preprocessing import MinMaxScaler
def preprocessed_regression():
    #create the scaler
    scaler = MinMaxScaler()

    #fit the scaler to the training data(features only)
    scaler.fit(X_train)

    #transform X_train and X_test based on the (same) scaler
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    run_estimators(X_train_scaled, y_train, X_test_scaled, y_test)

    return X_train_scaled, X_test_scaled
X_train, X_test = preprocessed_regression()

Linear Regression:
	R-squared value for training set:  0.4936771522634782
	R-squared value for testing set:  0.3269537351662223

Ridge:
	R-squared value for training set:  0.45269096344985615
	R-squared value for testing set:  0.3176577382284371

Lasso:
	R-squared value for training set:  0.4585308454711695
	R-squared value for testing set:  0.3189119026850671

k-Nearest Neighbor:
	R-squared value for training set:  0.6559388171939207
	R-squared value for testing set:  0.265121489610776

Support Vector Machine:
	R-squared value for training set:  -0.021995301406087853
	R-squared value for testing set:  -0.02191612334885984



In [22]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

estimator = LinearRegression()
estimator.fit(X_train, y_train)


print("R-squared value for training set: ", r2_score(y_train, estimator.predict(X_train)))
print("R-squared value for testing set: ", r2_score(y_test, estimator.predict(X_test)))

R-squared value for training set:  0.4936771522634782
R-squared value for testing set:  0.3269537351662223


In [23]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import LinearSVR

estimators = {
    "Linear Regression": LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False),
    "Ridge": Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
                    normalize=False, random_state=None, solver='auto', tol=0.001),
    "Lasso": Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
                    normalize=False, positive=False, precompute=False, random_state=None,
                    selection='cyclic', tol=0.0001, warm_start=False),
    "k-Nearest Neighbor": KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                                                metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                                                weights='uniform'),
    "Support Vector Machine": LinearSVR(C=1.0, dual=True, epsilon=0.0, fit_intercept=True,
                                        intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=1000000,
                                        random_state=None, tol=0.0001, verbose=0),
}

def run_estimators(X_train, y_train, X_test, y_test):
    """
    This helper function runs through the estimators and prints the results with the given data.
    """
    for name, estimator in estimators.items():
        estimator.fit(X=X_train, y=y_train)

        print(f"{name}:")
        print("\tR-squared value for training set: ", r2_score(y_train, estimator.predict(X_train)))
        print("\tR-squared value for testing set: ", r2_score(y_test, estimator.predict(X_test)), end="\n\n")

In [24]:
run_estimators(X_train, y_train, X_test, y_test)

Linear Regression:
	R-squared value for training set:  0.4936771522634782
	R-squared value for testing set:  0.3269537351662223

Ridge:
	R-squared value for training set:  0.45269096344985615
	R-squared value for testing set:  0.3176577382284371

Lasso:
	R-squared value for training set:  0.4585308454711695
	R-squared value for testing set:  0.3189119026850671

k-Nearest Neighbor:
	R-squared value for training set:  0.6559388171939207
	R-squared value for testing set:  0.265121489610776

Support Vector Machine:
	R-squared value for training set:  -0.021985728357890144
	R-squared value for testing set:  -0.02190974504406662



In [26]:
from sklearn.model_selection import GridSearchCV

param_grid = {"n_neighbors":[1, 5, 10, 15, 20, 25], "metric": ["euclidean", "manhattan", "minkowski"]}

def grid_search_kNN():
  grid_search = GridSearchCV(estimators["k-Nearest Neighbor"], param_grid, cv=5)
  grid_search.fit(X=X_train, y=y_train)
  print("Best parameters: ", grid_search.best_params_)
  print("Training set score with best parameters: ", grid_search.score(X_train, y_train))
  print("Test set score with best parameters: ", grid_search.score(X_test, y_test))

In [27]:
grid_search_kNN()

Best parameters:  {'metric': 'manhattan', 'n_neighbors': 5}
Training set score with best parameters:  0.6380390822044386
Test set score with best parameters:  0.29718507978441644


Linear Regression:
	R-squared value for training set:  0.4936771522634782
	R-squared value for testing set:  0.3269537351662223

Ridge:
	R-squared value for training set:  0.45269096344985615
	R-squared value for testing set:  0.3176577382284371

Lasso:
	R-squared value for training set:  0.4585308454711695
	R-squared value for testing set:  0.3189119026850671

k-Nearest Neighbor:
	R-squared value for training set:  0.6559388171939207
	R-squared value for testing set:  0.265121489610776

Support Vector Machine:
	R-squared value for training set:  -0.02227640208655579
	R-squared value for testing set:  -0.022099411324228546

