In [9]:
import pandas as pd 
import numpy as np
import warnings
warnings.filterwarnings("ignore")
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from dask import compute, delayed
import dask.multiprocessing
import multiprocessing
import timeit

In [10]:
# Read the Auto data set, treating '?' as a missing value, and keep only the
# rows that have no missing entries.
df = pd.read_csv('data/Auto.csv', na_values='?').dropna()
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,1,ford torino


In [11]:
# Binary response: 1 when mpg is at or above the sample median, 0 otherwise.
df['mpg_high'] = (df['mpg'] >= np.median(df['mpg'])).astype(int)

# Indicator columns for origin codes 1 and 2; any other origin value is the
# implicit baseline (both indicators are 0).
df['orgn1'] = (df['origin'] == 1).astype(int)
df['orgn2'] = (df['origin'] == 2).astype(int)

## a)

In [16]:
# Response vector and predictor matrix as NumPy arrays.
y = df['mpg_high'].values
X = df[['cylinders', 'displacement', 'horsepower', 'weight',
        'acceleration', 'year', 'orgn1', 'orgn2']].values

# Serial baseline: 100 random 65/35 hold-out splits, one logistic fit per split.
mse_lst = []
start_time = timeit.default_timer()

for i in range(100):
    # The loop index doubles as the split seed, so each split is reproducible.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=i, test_size=0.35)
    LR = LogisticRegression(solver='lbfgs', max_iter=1000, n_jobs=1).fit(X_train, y_train)
    y_pred = LR.predict(X_test)
    # With 0/1 labels the mean squared error equals the misclassification rate.
    mse = np.mean(np.square(y_test - y_pred))
    mse_lst.append(mse)

print('The average error rate is ', np.mean(mse_lst))
print('The computation takes ', timeit.default_timer() - start_time, 'seconds')

The average error rate is  0.10166666666666666
The computation takes  6.282585936000004 seconds


## b)

In [13]:
# Number of CPUs reported by the system; used as the worker count for the
# parallel run.
num_cores = multiprocessing.cpu_count()
print('Number of available cores is', num_cores)

Number of available cores is 4


In [15]:
def cal_err(bs_num, seed, X, y):
    """Fit a logistic regression on one random split and return its test error.

    Parameters
    ----------
    bs_num
        Not used in the body; kept so existing callers keep working.
    seed
        Forwarded to ``train_test_split`` as ``random_state``.
    X, y
        Predictors and response; 35% of the rows are held out for testing.

    Returns
    -------
    The mean of the squared differences between the held-out labels and the
    model's predictions for them.
    """
    X_fit, X_hold, y_fit, y_hold = train_test_split(
        X, y, random_state=seed, test_size=0.35)

    model = LogisticRegression(solver='lbfgs', max_iter=1000, n_jobs=1)
    model.fit(X_fit, y_fit)

    residual = y_hold - model.predict(X_hold)
    return (residual ** 2).mean()

# Parallel version: build one lazy cal_err task per seed (0-99), then evaluate
# them together with dask's multiprocessing scheduler.
start_time2 = timeit.default_timer()
lazy_value = [delayed(cal_err)(i, i, X, y) for i in range(100)]

results = compute(
    *lazy_value,
    scheduler=dask.multiprocessing.get,
    num_workers=num_cores,
)

print('The average error rate is ', np.mean(results))
print('The computation takes ', timeit.default_timer() - start_time2, 'seconds')

The average error rate is  0.10166666666666666
The computation takes  4.028785011000011 seconds


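As we can see, the average error rate is identical in both runs (about 0.102), because the same 100 split seeds are used, while the parallel computation on 4 cores is faster (about 4.0 seconds versus 6.3 seconds for the serial loop).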