# read polymer data

In [1]:
import polymer
# Load the polymer dataset from CSV; normalisation is explicitly switched off.
data = polymer.read("polymer.csv", normalise=False)

# split into train and test sets

In [2]:
p = 0.8   # share of the data used for training (0.8 = 80%)
s = 100   # number of train/test splits to generate

# One (train, test) pair per split index. The index is handed to data.split
# as its second argument (presumably a seed for the shuffle; TODO confirm).
splits = [tuple(data.split(p, i)) for i in range(s)]

# vanilla linear regression

In [3]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
from tqdm import tqdm
import numpy as np

mse, r2 = [], []
for train, test in tqdm(splits):
    # Ordinary least squares; a fresh estimator is fitted on every split.
    reg = linear_model.LinearRegression().fit(train["X"], train["Y"])
    Z = reg.predict(test["X"])

    # Score the predictions on the held-out part of this split.
    mse.append(mean_squared_error(test["Y"], Z))
    r2.append(r2_score(test["Y"], Z))

# Report mean +- standard deviation of each metric over all splits.
for label, scores in (("mse: ", mse), ("r2: ", r2)):
    print(label, round(np.mean(scores), 3), "+-", round(np.std(scores), 3))

100%|██████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 1323.46it/s]


mse:  2.395 +- 1.758
r2:  0.354 +- 1.1


# Lasso linear regression

In [4]:
mse, r2 = [], []
for train, test in tqdm(splits):
    # Lasso whose regularisation strength is selected by 5-fold
    # cross-validation on the training part of the split.
    reg = linear_model.LassoCV(cv=5, random_state=0).fit(train["X"], train["Y"])
    Z = reg.predict(test["X"])

    # Score the predictions on the held-out part of this split.
    mse.append(mean_squared_error(test["Y"], Z))
    r2.append(r2_score(test["Y"], Z))

# Report mean +- standard deviation of each metric over all splits.
for label, scores in (("mse: ", mse), ("r2: ", r2)):
    print(label, round(np.mean(scores), 3), "+-", round(np.std(scores), 3))

100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:04<00:00, 22.90it/s]


mse:  2.46 +- 1.781
r2:  0.279 +- 1.19


# Ridge linear regression

In [5]:
# Candidate regularisation strengths RidgeCV chooses from (1e-3 ... 1e3).
ridge_alphas = [1e-3, 1e-2, 1e-1, 1, 10, 100, 1000]

mse, r2 = [], []
for train, test in tqdm(splits):
    # Ridge regression; alpha is picked from the grid on the training part.
    reg = linear_model.RidgeCV(alphas=ridge_alphas).fit(train["X"], train["Y"])
    Z = reg.predict(test["X"])

    # Score the predictions on the held-out part of this split.
    mse.append(mean_squared_error(test["Y"], Z))
    r2.append(r2_score(test["Y"], Z))

# Report mean +- standard deviation of each metric over all splits.
for label, scores in (("mse: ", mse), ("r2: ", r2)):
    print(label, round(np.mean(scores), 3), "+-", round(np.std(scores), 3))

100%|███████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 672.76it/s]


mse:  2.407 +- 1.736
r2:  0.333 +- 1.117


# decision tree regression

In [6]:
from sklearn.tree import DecisionTreeRegressor
# Evaluate a decision tree regressor on every train/test split and report the
# mean +- standard deviation of the held-out MSE and R^2.
mse, r2 = [], []
for i in tqdm(range(s)):
    train, test = splits[i]

    # Pin random_state: the tree permutes features at each split and breaks
    # ties between equally good splits at random, so without a fixed seed the
    # numbers below change from run to run. 0 matches the seed used for LassoCV.
    reg = DecisionTreeRegressor(random_state=0)
    reg.fit(train["X"], train["Y"])
    Z = reg.predict(test["X"])

    # Score the predictions on the held-out part of this split.
    mse.append(mean_squared_error(test["Y"], Z))
    r2.append(r2_score(test["Y"], Z))

print("mse: ", round(np.mean(mse), 3), "+-", round(np.std(mse), 3))
print("r2: ", round(np.mean(r2), 3), "+-", round(np.std(r2), 3))

100%|██████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 1431.69it/s]


mse:  1.281 +- 1.204
r2:  0.796 +- 0.335


# support vector regression

In [7]:
from sklearn.svm import SVR
# Hand-picked SVR hyperparameters; the kernel is left at the library default.
svr_params = {"gamma": 0.1, "C": 10, "epsilon": 0.3}

mse, r2 = [], []
for train, test in tqdm(splits):
    # A fresh support vector regressor is fitted on every split.
    reg = SVR(**svr_params).fit(train["X"], train["Y"])
    Z = reg.predict(test["X"])

    # Score the predictions on the held-out part of this split.
    mse.append(mean_squared_error(test["Y"], Z))
    r2.append(r2_score(test["Y"], Z))

# Report mean +- standard deviation of each metric over all splits.
for label, scores in (("mse: ", mse), ("r2: ", r2)):
    print(label, round(np.mean(scores), 3), "+-", round(np.std(scores), 3))

100%|██████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 1089.48it/s]


mse:  1.303 +- 1.027
r2:  0.832 +- 0.202


# [run all cells](https://stackoverflow.com/questions/33143753/jupyter-ipython-notebooks-shortcut-for-run-all)

In [8]:
%%javascript

// Bind the 'r' key (command shortcuts) to an action that executes every cell
// of the notebook.
var runAllShortcut = {
    help : 'run all cells',
    help_index : 'zz',
    handler : function (event) {
        IPython.notebook.execute_all_cells();
        // Presumably tells the keyboard manager the event was handled (TODO confirm).
        return false;
    }
};

Jupyter.keyboard_manager.command_shortcuts.add_shortcut('r', runAllShortcut);

<IPython.core.display.Javascript object>

In [9]:
# Disabled experiment: the whole cell body is wrapped in a triple-quoted string,
# so none of it is executed; the cell only evaluates to (and displays) the
# string itself. The quoted code sketches SVR hyper-parameter tuning with
# optunity, using nested cross-validation on the first train/test split.
# NOTE(review): before re-enabling the quoted code, check the following:
#   - `ArgumentError` is neither a Python builtin nor imported there, so the
#     unknown-kernel branch would raise NameError instead of the intended error.
#   - keys are deleted from `optimal_pars` while iterating over
#     `optimal_pars.items()`, which raises RuntimeError on Python 3; iterate
#     over a copy (e.g. `list(optimal_pars.items())`) instead.
#   - the inner docstring mentions only the RBF kernel, but the search space
#     also covers the linear and poly kernels.
#   - it rebinds `data`, `train` and `test`, shadowing the notebook-level names.
'''import sklearn, math, itertools, optunity, optunity.metrics

# we explicitly generate the outer_cv decorator so we can use it twice
train, test = splits[0]
data, targets = train["X"], train["Y"]
outer_cv = optunity.cross_validated(x=data, y=targets, num_folds=3)

space = {'kernel': {'linear': {'C': [0, 100]},
                    'rbf': {'gamma': [0, 50], 'C': [1, 100]},
                    'poly': {'degree': [2, 5], 'C': [1000, 20000], 'coef0': [0, 1]}
                    }
         }

def compute_mse_all_tuned(x_train, y_train, x_test, y_test):
    """Computes MSE of an SVR with RBF kernel and optimized hyperparameters."""

    # define objective function for tuning
    @optunity.cross_validated(x=x_train, y=y_train, num_iter=2, num_folds=5)
    def tune_cv(x_train, y_train, x_test, y_test, kernel, C, gamma, degree, coef0):
        if kernel == 'linear':
            model = sklearn.svm.SVR(kernel=kernel, C=C)
        elif kernel == 'poly':
            model = sklearn.svm.SVR(kernel=kernel, C=C, degree=degree, coef0=coef0)
        elif kernel == 'rbf':
            model = sklearn.svm.SVR(kernel=kernel, C=C, gamma=gamma)
        else:
            raise ArgumentError("Unknown kernel function: %s" % kernel)
        model.fit(x_train, y_train)

        predictions = model.predict(x_test)
        return optunity.metrics.mse(y_test, predictions)

    # optimize parameters
    optimal_pars, _, _ = optunity.minimize_structured(tune_cv, num_evals=150, search_space=space)

    # remove hyperparameters with None value from optimal pars
    for k, v in optimal_pars.items():
        if v is None: del optimal_pars[k]
    print("optimal hyperparameters: " + str(optimal_pars))

    tuned_model = sklearn.svm.SVR(**optimal_pars).fit(x_train, y_train)
    predictions = tuned_model.predict(x_test)
    return optunity.metrics.mse(y_test, predictions)

# wrap with outer cross-validation
compute_mse_all_tuned = outer_cv(compute_mse_all_tuned)

compute_mse_all_tuned()'''

'import sklearn, math, itertools, optunity, optunity.metrics\n\n# we explicitly generate the outer_cv decorator so we can use it twice\ntrain, test = splits[0]\ndata, targets = train["X"], train["Y"]\nouter_cv = optunity.cross_validated(x=data, y=targets, num_folds=3)\n\nspace = {\'kernel\': {\'linear\': {\'C\': [0, 100]},\n                    \'rbf\': {\'gamma\': [0, 50], \'C\': [1, 100]},\n                    \'poly\': {\'degree\': [2, 5], \'C\': [1000, 20000], \'coef0\': [0, 1]}\n                    }\n         }\n\ndef compute_mse_all_tuned(x_train, y_train, x_test, y_test):\n    """Computes MSE of an SVR with RBF kernel and optimized hyperparameters."""\n\n    # define objective function for tuning\n    @optunity.cross_validated(x=x_train, y=y_train, num_iter=2, num_folds=5)\n    def tune_cv(x_train, y_train, x_test, y_test, kernel, C, gamma, degree, coef0):\n        if kernel == \'linear\':\n            model = sklearn.svm.SVR(kernel=kernel, C=C)\n        elif kernel == \'poly\':