# Imports libraries

In [1]:
import numpy as np
import pandas as pd
import os
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
import time
sns.set_style('whitegrid')
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import warnings
warnings.simplefilter("ignore")
from sklearn.model_selection import train_test_split
import time
import xlwt
from xlwt import Workbook
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from scipy.stats.mstats import winsorize
from pylab import rcParams
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import median_absolute_error
from sklearn.metrics import max_error
from sklearn.metrics import explained_variance_score
!pip install vowpalwabbit

Collecting vowpalwabbit
[?25l  Downloading https://files.pythonhosted.org/packages/00/f9/19792e9ce6b192e4b0475347c9c048f73c2bf2765a79fdf25bda7e78a26b/vowpalwabbit-8.10.2-cp37-cp37m-manylinux2010_x86_64.whl (2.6MB)
[K     |████████████████████████████████| 2.6MB 7.7MB/s 
[?25hInstalling collected packages: vowpalwabbit
Successfully installed vowpalwabbit-8.10.2


# Import dataset

In [2]:
path = "/content/sample_data/analcatdata_neavote.csv"
df = pd.read_csv(path)

In [3]:
df.shape

(100, 3)

In [4]:
df.columns

Index(['Party', 'Bills', 'Favorable'], dtype='object')

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Party      100 non-null    object
 1   Bills      100 non-null    int64 
 2   Favorable  100 non-null    int64 
dtypes: int64(2), object(1)
memory usage: 2.5+ KB


In [6]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
Party = df['Party']
Party = le.fit_transform(Party)
df['Party'] = Party

# Split the dataset

In [7]:
X = df.drop(['Favorable'], axis=1)
y = df['Favorable']

In [8]:
# Workbook is created
wb = Workbook()
sheet1 = wb.add_sheet('analcatdata_neavote')
sheet1.write(0, 1, 'Dataset name')
sheet1.write(0, 2, 'Algorithm Name')
sheet1.write(0, 3, 'Cross Validation [1-10]')
sheet1.write(0, 4, 'Hyper-Parameters Values')
sheet1.write(0, 5, 'Mean Squared Error')
sheet1.write(0, 6, 'Mean Absolute Error')
sheet1.write(0, 7, 'Median Absolute Error')
sheet1.write(0, 8, 'Max Error')
sheet1.write(0, 9, 'Explained Variance')
sheet1.write(0, 10, 'Train Time')
sheet1.write(0, 11, 'Inference Time')

sheet1.write(1, 1, 'analcatdata_neavote')

# VWRegressor evaluation

In [9]:
from vowpalwabbit.sklearn_vw import VWRegressor
from sklearn.model_selection import RandomizedSearchCV
# manual nested cross-validation for random forest on a classification dataset
from numpy import mean
from numpy import std
from sklearn.model_selection import KFold
from scipy.stats import uniform
from scipy.stats import loguniform
from scipy.stats import randint

# configure the cross-validation procedure
cv_outer = KFold(n_splits=10, shuffle=True, random_state=1)
# enumerate splits
mean_squared_results = list()
mean_absolute_results = list()
median_absolute_results = list()
max_err_results = list()
explained_var_results = list()
train_time = list()
test_time = list()
params = list()
i = 1
sheet1.write(1, 2, 'VWRegressor')

for train_ix, test_ix in cv_outer.split(X):
	# split data
	X_train, X_test = X.iloc[train_ix, :], X.iloc[test_ix, :]
	y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]
	# configure the cross-validation procedure
	cv_inner = KFold(n_splits=3, shuffle=True, random_state=1)
	# define the model
	vw = VWRegressor()
	# define search space
	distributions = {
     'l1': loguniform(1e-8, 1e-1),
     'l2': loguniform(1e-8, 1e-1),
     'l': loguniform(0.01, 10),
     'power_t': uniform(0.01, 1),
     'random_weights': ["on", "off"],
     'loss_function': ["squared", "hinge", "logistic", "quantile", "poisson"],
     'passes': randint(1, 10),
  }
	# define search
	clf = RandomizedSearchCV(vw, distributions, random_state=0, cv=cv_inner, n_iter=50)
	# execute search
	search = clf.fit(X_train, y_train)
	# get the best performing model fit on the whole training set
	best_model = search.best_estimator_
	print(i)
	print(best_model.get_params())
	params.append(str(best_model.get_params()))
	start = time.time()
	best_model = best_model.fit(X_train, y_train)
	end = time.time()
	train = (end - start)
	# evaluate model on the hold out dataset
	start = time.time()
	yhat = best_model.predict(X_test)
	end = time.time()
	test = (end - start)
	# evaluate the model
	mean_squared = mean_squared_error(y_test, yhat)
	mean_absolute = mean_absolute_error(y_test, yhat)
	median_absolute = median_absolute_error(y_test, yhat)
	max_err = max_error(y_test, yhat)
	explained_var = explained_variance_score(y_test, yhat)
	# store the result
	mean_squared_results.append(mean_squared)
	mean_absolute_results.append(mean_absolute)
	median_absolute_results.append(median_absolute)
	max_err_results.append(max_err)
	explained_var_results.append(explained_var)
	sheet1.write(i, 3, i)
	sheet1.write(i, 4, str(best_model.get_params()))
	sheet1.write(i, 5, (mean_squared))
	sheet1.write(i, 6, (mean_absolute))
	sheet1.write(i, 7, (median_absolute))
	sheet1.write(i, 8, (max_err))
	sheet1.write(i, 9, (explained_var))
	sheet1.write(i, 10, (train))
	sheet1.write(i, 11, (test))
	i += 1

# summarize the estimated performance of the model
print('Mean Squared Error: %.3f (%.3f)' % (mean(mean_squared_results), std(mean_squared_results)))
print('Mean Absolute Error: %.3f (%.3f)' % (mean(mean_absolute_results), std(mean_absolute_results)))
print('Median Absolute Error: %.3f (%.3f)' % (mean(median_absolute_results), std(median_absolute_results)))
print('Max Error: %.3f (%.3f)' % (mean(max_err_results), std(max_err_results)))
print('Explained Variance: %.3f (%.3f)' % (mean(explained_var_results), std(explained_var_results)))
print('Train Time: %.3f (%.3f)' % (mean(train_time), std(train_time)))
print('Test Time: %.3f (%.3f)' % (mean(test_time), std(test_time)))

1
{'convert_to_vw': True, 'convert_labels': True, 'ring_size': None, 'strict_parse': None, 'learning_rate': None, 'l': 0.5566099791809369, 'power_t': 0.34800761483889175, 'decay_learning_rate': None, 'initial_t': None, 'feature_mask': None, 'initial_regressor': None, 'i': None, 'initial_weight': None, 'random_weights': 'off', 'normal_weights': None, 'truncated_normal_weights': None, 'sparse_weights': None, 'input_feature_regularizer': None, 'quiet': True, 'random_seed': None, 'hash': None, 'hash_seed': None, 'ignore': None, 'ignore_linear': None, 'keep': None, 'redefine': None, 'bit_precision': None, 'b': None, 'noconstant': None, 'constant': None, 'C': None, 'ngram': None, 'skips': None, 'feature_limit': None, 'affix': None, 'spelling': None, 'dictionary': None, 'dictionary_path': None, 'interactions': None, 'permutations': None, 'leave_duplicate_interactions': None, 'quadratic': None, 'q': None, 'cubic': None, 'testonly': None, 't': None, 'holdout_off': None, 'holdout_period': None, 

# Suggestion for improvement

In [10]:
from vowpalwabbit.sklearn_vw import VWRegressor
from sklearn.model_selection import RandomizedSearchCV
# manual nested cross-validation for random forest on a classification dataset
from numpy import mean
from numpy import std
from sklearn.model_selection import KFold

# configure the cross-validation procedure
cv_outer = KFold(n_splits=10, shuffle=True, random_state=1)
# enumerate splits
mean_squared_results = list()
mean_absolute_results = list()
median_absolute_results = list()
max_err_results = list()
explained_var_results = list()
train_time = list()
test_time = list()
j = 1
sheet1.write(11, 2, 'Suggestion for improvement')
for train_ix, test_ix in cv_outer.split(X):
	# split data
	X_train, X_test = X.iloc[train_ix, :], X.iloc[test_ix, :]
	y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]
	# configure the cross-validation procedure
	cv_inner = KFold(n_splits=3, shuffle=True, random_state=1)
	# define the model
	vw_squared = VWRegressor(loss_function='squared')
	vw_hinge = VWRegressor(loss_function='hinge')
	vw_logistic = VWRegressor(loss_function='logistic')
	vw_quantile = VWRegressor(loss_function='quantile')
	vw_poisson = VWRegressor(loss_function='poisson')
	# define search space
	distributions = {
     'l1': loguniform(1e-8, 1e-1),
     'l2': loguniform(1e-8, 1e-1),
     'l': loguniform(0.01, 10),
     'power_t': uniform(0.01, 1),
     'random_weights': ["on", "off"],
     'passes': randint(1, 10),
	}
	# define search
	clf_squared = RandomizedSearchCV(vw_squared, distributions, random_state=0, cv=cv_inner, n_iter=50)
	clf_hinge = RandomizedSearchCV(vw_hinge, distributions, random_state=0, cv=cv_inner, n_iter=50)
	clf_logistic = RandomizedSearchCV(vw_logistic, distributions, random_state=0, cv=cv_inner, n_iter=50)
	clf_quantile = RandomizedSearchCV(vw_quantile, distributions, random_state=0, cv=cv_inner, n_iter=50)
	clf_poisson = RandomizedSearchCV(vw_poisson, distributions, random_state=0, cv=cv_inner, n_iter=50)
	# execute search
	search_squared = clf_squared.fit(X_train, y_train)
	search_hinge = clf_hinge.fit(X_train, y_train)
	search_logistic = clf_logistic.fit(X_train, y_train)
	search_quantile = clf_quantile.fit(X_train, y_train)
	search_poisson = clf_poisson.fit(X_train, y_train)
	# get the best performing model fit on the whole training set
	best_model_squared = search_squared.best_estimator_
	best_model_hinge = search_hinge.best_estimator_
	best_model_logistic = search_logistic.best_estimator_
	best_model_quantile = search_quantile.best_estimator_
	best_model_poisson = search_poisson.best_estimator_
	print(j)
	print(best_model_squared.get_params())
	print(best_model_hinge.get_params())
	print(best_model_logistic.get_params())
	print(best_model_quantile.get_params())
	print(best_model_poisson.get_params())
	# fit the model
	start = time.time()
	best_model_squared = best_model_squared.fit(X_train, y_train)
	best_model_hinge = best_model_hinge.fit(X_train, y_train)
	best_model_logistic = best_model_logistic.fit(X_train, y_train)
	best_model_quantile = best_model_quantile.fit(X_train, y_train)
	best_model_poisson = best_model_poisson.fit(X_train, y_train)
	end = time.time()
	train = (end - start)
	# evaluate model on the hold out dataset
	start = time.time()
	yhat_squared = best_model_squared.predict(X_test)
	yhat_hinge = best_model_hinge.predict(X_test)
	yhat_logistic = best_model_logistic.predict(X_test)
	yhat_quantile = best_model_quantile.predict(X_test)
	yhat_poisson = best_model_poisson.predict(X_test)
	avg = []
	for i in range(len(yhat_squared)):
		avg.append((yhat_squared[i] + yhat_hinge[i] + yhat_logistic[i] + yhat_quantile[i] + yhat_poisson[i]) / 5)
	end = time.time()
	test = (end - start)
	# evaluate the model
	mean_squared = mean_squared_error(y_test, avg)
	mean_absolute = mean_absolute_error(y_test, avg)
	median_absolute = median_absolute_error(y_test, avg)
	max_err = max_error(y_test, avg)
	explained_var = explained_variance_score(y_test, avg)
	# store the result
	mean_squared_results.append(mean_squared)
	mean_absolute_results.append(mean_absolute)
	median_absolute_results.append(median_absolute)
	max_err_results.append(max_err)
	explained_var_results.append(explained_var)
	sheet1.write(j + 10, 3, j)
	sheet1.write(j + 10, 4, str(best_model_squared.get_params()))
	sheet1.write(j + 10, 5, (mean_squared))
	sheet1.write(j + 10, 6,  (mean_absolute))
	sheet1.write(j + 10, 7, (median_absolute))
	sheet1.write(j + 10, 8, (max_err))
	sheet1.write(j + 10, 9, (explained_var))
	sheet1.write(j + 10, 10, (train))
	sheet1.write(j + 10, 11, (test))
	j += 1

# summarize the estimated performance of the model
print('Mean Squared Error: %.3f (%.3f)' % (mean(mean_squared_results), std(mean_squared_results)))
print('Mean Absolute Error: %.3f (%.3f)' % (mean(mean_absolute_results), std(mean_absolute_results)))
print('Median Absolute Error: %.3f (%.3f)' % (mean(median_absolute_results), std(median_absolute_results)))
print('Max Error: %.3f (%.3f)' % (mean(max_err_results), std(max_err_results)))
print('Explained Variance: %.3f (%.3f)' % (mean(explained_var_results), std(explained_var_results)))
print('Train Time: %.3f (%.3f)' % (mean(train_time), std(train_time)))
print('Test Time: %.3f (%.3f)' % (mean(test_time), std(test_time)))

1
{'convert_to_vw': True, 'convert_labels': True, 'ring_size': None, 'strict_parse': None, 'learning_rate': None, 'l': 0.8663279761354555, 'power_t': 0.2826562945801132, 'decay_learning_rate': None, 'initial_t': None, 'feature_mask': None, 'initial_regressor': None, 'i': None, 'initial_weight': None, 'random_weights': 'off', 'normal_weights': None, 'truncated_normal_weights': None, 'sparse_weights': None, 'input_feature_regularizer': None, 'quiet': True, 'random_seed': None, 'hash': None, 'hash_seed': None, 'ignore': None, 'ignore_linear': None, 'keep': None, 'redefine': None, 'bit_precision': None, 'b': None, 'noconstant': None, 'constant': None, 'C': None, 'ngram': None, 'skips': None, 'feature_limit': None, 'affix': None, 'spelling': None, 'dictionary': None, 'dictionary_path': None, 'interactions': None, 'permutations': None, 'leave_duplicate_interactions': None, 'quadratic': None, 'q': None, 'cubic': None, 'testonly': None, 't': None, 'holdout_off': None, 'holdout_period': None, '

# Random forest

In [11]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
# manual nested cross-validation for random forest on a classification dataset
from numpy import mean
from numpy import std
from sklearn.model_selection import KFold

# configure the cross-validation procedure
cv_outer = KFold(n_splits=10, shuffle=True, random_state=1)
# enumerate splits
mean_squared_results = list()
mean_absolute_results = list()
median_absolute_results = list()
max_err_results = list()
explained_var_results = list()
train_time = list()
test_time = list()
i = 1
sheet1.write(21, 2, 'Random forest')
for train_ix, test_ix in cv_outer.split(X):
	# split data
	X_train, X_test = X.iloc[train_ix, :], X.iloc[test_ix, :]
	y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]
	# configure the cross-validation procedure
	cv_inner = KFold(n_splits=3, shuffle=True, random_state=1)
	# define the model
	rf = RandomForestRegressor()
	# define search space
	distributions = {
     'criterion': ['mse', 'mae'],
     'min_samples_split': [2, 3, 4, 5],
     'min_samples_leaf': [1, 2, 3, 4],
     'min_weight_fraction_leaf': uniform(0.01, 0.5),
     'max_features': ['auto', 'sqrt', 'log2']
  }
	# define search
	clf = RandomizedSearchCV(rf, distributions, random_state=0, cv=cv_inner, n_iter=50)
	# execute search
	search = clf.fit(X_train, y_train)
	# get the best performing model fit on the whole training set
	best_model = search.best_estimator_
	print(i)
	print(best_model.get_params())
	start = time.time()
	best_model = best_model.fit(X_train, y_train)
	end = time.time()
	train = (end - start)
	# evaluate model on the hold out dataset
	start = time.time()
	yhat = best_model.predict(X_test)
	end = time.time()
	test = (end - start)
	# evaluate the model
	mean_squared = mean_squared_error(y_test, yhat)
	mean_absolute = mean_absolute_error(y_test, yhat)
	median_absolute = median_absolute_error(y_test, yhat)
	max_err = max_error(y_test, yhat)
	explained_var = explained_variance_score(y_test, yhat)
	# store the result
	mean_squared_results.append(mean_squared)
	mean_absolute_results.append(mean_absolute)
	median_absolute_results.append(median_absolute)
	max_err_results.append(max_err)
	explained_var_results.append(explained_var)
	sheet1.write(i + 20, 3, i)
	sheet1.write(i + 20, 4, str(best_model_squared.get_params()))
	sheet1.write(i + 20, 5, (mean_squared))
	sheet1.write(i + 20, 6, (mean_absolute))
	sheet1.write(i + 20, 7, (median_absolute))
	sheet1.write(i + 20, 8, (max_err))
	sheet1.write(i + 20, 9, (explained_var))
	sheet1.write(i + 20, 10, (train))
	sheet1.write(i + 20, 11, (test))
	i += 1

wb.save('analcatdata_neavote.xls')
# summarize the estimated performance of the model
print('Mean Squared Error: %.3f (%.3f)' % (mean(mean_squared_results), std(mean_squared_results)))
print('Mean Absolute Error: %.3f (%.3f)' % (mean(mean_absolute_results), std(mean_absolute_results)))
print('Median Absolute Error: %.3f (%.3f)' % (mean(median_absolute_results), std(median_absolute_results)))
print('Max Error: %.3f (%.3f)' % (mean(max_err_results), std(max_err_results)))
print('Explained Variance: %.3f (%.3f)' % (mean(explained_var_results), std(explained_var_results)))
print('Train Time: %.3f (%.3f)' % (mean(train_time), std(train_time)))
print('Test Time: %.3f (%.3f)' % (mean(test_time), std(test_time)))

1
{'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'mse', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'min_weight_fraction_leaf': 0.35973963765875216, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}
2
{'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'mae', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 3, 'min_weight_fraction_leaf': 0.26066219096335114, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}
3
{'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'mae', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrea