In [1]:
import numpy as np
import pandas as pd
import random

import time
from scipy.stats import gaussian_kde
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingRegressor
from sklearn.metrics import mean_squared_error

from statistics import mean, stdev
import timeit
import multiprocessing
from dask import compute, delayed
import dask.multiprocessing


In [2]:
auto = pd.read_csv('Auto.csv', na_values="?")
auto.dropna(inplace=True)
auto.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,1,ford torino


In [3]:
auto['orgn1'] = 0
auto.orgn1[auto['origin']==1] = 1
auto['orgn2'] = 0
auto.orgn2[auto['origin']==2] = 1
auto_median = auto['mpg'].median()
auto['mpg_high'] = 0
auto.mpg_high[auto['mpg']>=auto_median] = 1

### Question1

In [4]:
X = auto[['cylinders', 'displacement', 'horsepower', 'weight','acceleration', 'year', 'orgn1', 'orgn2']].values
y = auto['mpg_high'].values
start_time = timeit.default_timer()
mse = []

for i in range(100):
    j = i+1
    X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.35, random_state=j)
    LR = LogisticRegression(solver='lbfgs', max_iter=200, n_jobs=1)
    LR.fit(X_train, y_train)
    y_pred = LR.predict(X_test)
    mse_i = ((y_test - y_pred) ** 2).mean()
    mse.append(mse_i)
    
average_mse = np.array(mse).mean()
elapsed_time = timeit.default_timer() - start_time

print('Elapsed time: ', elapsed_time, 'seconds')
print('Average error rate:', average_mse)

Elapsed time:  2.8689723579999997 seconds
Average error rate: 0.10079710144927535


### Question2

In [6]:
num_cores = multiprocessing.cpu_count()

start_time = timeit.default_timer()

def cal_err_rate(boot_num, seed, data):
    print('Now on bootstrap number', boot_num)
    X,y = data
    X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.35, random_state=seed)
    LR = LogisticRegression(solver='lbfgs', max_iter=200, n_jobs=1)
    LR.fit(X_train, y_train)
    y_pred = LR.predict(X_test)
    mse = ((y_test - y_pred) ** 2).mean()
    return mse

mse = []
for i in range(100):
    mse.append(delayed(cal_err_rate)(i+1,i+1,[X,y]))
results_par= compute(*mse, scheduler=dask.multiprocessing.get, num_workers=num_cores)
average_mse2 = np.array(results_par).mean()

elapsed_time = timeit.default_timer() - start_time
print('Elapsed time:', elapsed_time, 'seconds') 

print('Average error rate:', average_mse2)

Now on bootstrap number 81
Now on bootstrap number 89
Now on bootstrap number 38
Now on bootstrap number 45
Now on bootstrap number 98
Now on bootstrap number 65
Now on bootstrap number 95
Now on bootstrap number 2
Now on bootstrap number 16
Now on bootstrap number 93
Now on bootstrap number 72
Now on bootstrap number 83
Now on bootstrap number 27
Now on bootstrap number 22
Now on bootstrap number 26
Now on bootstrap number 96
Now on bootstrap number 78
Now on bootstrap number 85
Now on bootstrap number 17
Now on bootstrap number 97
Now on bootstrap number 7
Now on bootstrap number 18
Now on bootstrap number 87
Now on bootstrap number 68
Now on bootstrap number 73
Now on bootstrap number 79
Now on bootstrap number 47
Now on bootstrap number 34
Now on bootstrap number 53
Now on bootstrap number 36
Now on bootstrap number 37
Now on bootstrap number 13
Now on bootstrap number 60
Now on bootstrap number 24
Now on bootstrap number 11
Now on bootstrap number 30
Now on bootstrap number 70
Now

From the two results above, the error rate is the same and this means that both methods are returning the same result. However, it seems that parallel computing does not shorten the computation time. 