                                                                                                   Thoucentric 

# Scaling Existing Scikit-learn Code on a Cluster

## Importing libraries

In [12]:
from sklearn.datasets import make_classification #to create dataset
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
import pandas as pd

## Creating dataset

In [13]:
# Synthetic classification data set.
# n_features and n_classes are left at their defaults (20 features, 2 classes).
N_SAMPLES = 1000
DATASET_SEED = 0

X, y = make_classification(n_samples=N_SAMPLES, random_state=DATASET_SEED)

# Preview the first five feature rows.
X[:5]

array([[-1.06377997,  0.67640868,  1.06935647, -0.21758002,  0.46021477,
        -0.39916689, -0.07918751,  1.20938491, -0.78531472, -0.17218611,
        -1.08535744, -0.99311895,  0.30693511,  0.06405769, -1.0542328 ,
        -0.52749607, -0.0741832 , -0.35562842,  1.05721416, -0.90259159],
       [ 0.0708476 , -1.69528125,  2.44944917, -0.5304942 , -0.93296221,
         2.86520354,  2.43572851, -1.61850016,  1.30071691,  0.34840246,
         0.54493439,  0.22532411,  0.60556322, -0.19210097, -0.06802699,
         0.9716812 , -1.79204799,  0.01708348, -0.37566904, -0.62323644],
       [ 0.94028404, -0.49214582,  0.67795602, -0.22775445,  1.40175261,
         1.23165333, -0.77746425,  0.01561602,  1.33171299,  1.08477266,
        -0.97805157, -0.05012039,  0.94838552, -0.17342825, -0.47767184,
         0.76089649,  1.00115812, -0.06946407,  1.35904607, -1.18958963],
       [-0.29951677,  0.75988955,  0.18280267, -1.55023271,  0.33821802,
         0.36324148, -2.10052547, -0.4380675 , -

## Defining parameters for grid search

In [15]:
# Hyper-parameter grid: 8 values of C x 3 kernels x 2 shrinking settings
# = 48 candidates, each evaluated with 3-fold cross-validation.
param_grid = {"C": [0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0],
              "kernel": ['rbf', 'poly', 'sigmoid'],
              "shrinking": [True, False]}

# `iid` is deliberately not passed: the parameter was deprecated in
# scikit-learn 0.22 and removed in 0.24, where supplying it raises a TypeError.
# On releases whose default is iid=False the mean test score is the plain
# average over folds instead of a fold-size-weighted one; with 1000 samples
# and cv=3 the folds are almost equal in size, so the difference is marginal.
grid_search = GridSearchCV(SVC(gamma='auto', random_state=0, probability=True),
                           param_grid=param_grid,
                           return_train_score=False,
                           cv=3,
                           n_jobs=-1)  # -1: use every core/worker the active joblib backend offers

## Time Taken To Do Grid Search Using Only Sklearn (Joblib)

In [16]:
import time

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() follows the wall clock and can jump if the system
# time is adjusted during the run.
start = time.perf_counter()
grid_search.fit(X, y)  # baseline: scikit-learn with its default joblib backend
elapsed = time.perf_counter() - start
print(elapsed)  # elapsed seconds

8.304359197616577


## Setting Up The Cluster

In [17]:
#distributed
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23. Try the bundled copy first so that old installations keep using the
# same joblib instance scikit-learn itself uses, and fall back to the
# standalone package on current releases where the bundled copy is gone.
try:
    from sklearn.externals import joblib
except ImportError:
    import joblib
from dask.distributed import Client

In [18]:
# Set up a local cluster: one worker with four threads and a 2GB memory limit.
from dask import delayed, compute
from distributed import Client, LocalCluster

local_cluster_options = dict(
    processes=False,
    threads_per_worker=4,
    n_workers=1,
    memory_limit='2GB',
)
client = Client(**local_cluster_options)

# To set up a cluster that spans more than one laptop, connect to the
# address of the remote scheduler instead:
# client = Client('10.0.2.4:8786')

Port 8787 is already in use. 
Perhaps you already have a cluster running?
Hosting the diagnostics dashboard on a random port instead.


In [19]:
# Display the client summary (scheduler address, dashboard link, workers, cores, memory).
client

0,1
Client  Scheduler: inproc://192.168.1.104/22737/10  Dashboard: http://localhost:45192/status,Cluster  Workers: 1  Cores: 4  Memory: 2.00 GB


In [20]:
import pandas as pd

# Read the train / test tables from the notebook's working directory.
df = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Only the last expression of a cell is rendered automatically, so inspections
# in the middle of a cell must go through display() (provided by the IPython
# kernel); as bare expressions their results were silently discarded.
display(df.head())

#finding the null values in the dataset
display(df.isnull().sum())

#defining the data and the target
categorical_variables = df[['Gender','Age','Occupation', 'City_Category','Stay_In_Current_City_Years','Marital_Status']]
target = df['Purchase']

#creating dummies for the categorical variables
# NOTE(review): by default pd.get_dummies only expands object/string/category
# columns; any of the selected columns stored with a numeric dtype would pass
# through unchanged - confirm the dtypes, or cast them before encoding.
data = pd.get_dummies(categorical_variables)

#converting dataframe to array
datanew = data.values


In [21]:
#Creating Sklearn linear regression model
# NOTE(review): `lr` is neither fitted nor used in the visible part of this
# notebook - confirm that it is still needed.
from sklearn.linear_model import LinearRegression
lr = LinearRegression()

## Time Taken To Do Grid Search On Cluster

In [22]:
# with joblib.parallel_backend('dask', scheduler_host='192.168.1.108:8786'): #To run code on other laptop of cluster
with joblib.parallel_backend('dask'): #To run code on local cluster
    # Re-fit the same grid search on X, y with joblib dispatching to dask.
    # perf_counter() is monotonic and high-resolution, unlike time.time(),
    # which follows the wall clock and can jump if the system time changes.
    start = time.perf_counter()
    grid_search.fit(X, y)
    print(time.perf_counter() - start)  # elapsed seconds

6.828246831893921


In [24]:
# Tabulate the per-candidate cross-validation results and preview the first rows.
cv_results = pd.DataFrame(grid_search.cv_results_)
cv_results.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,param_shrinking,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.214357,0.01312,0.022193,0.004731,0.001,rbf,True,"{'C': 0.001, 'kernel': 'rbf', 'shrinking': True}",0.502994,0.501502,0.501502,0.502,0.000704,41
1,0.211826,0.014536,0.017964,0.001202,0.001,rbf,False,"{'C': 0.001, 'kernel': 'rbf', 'shrinking': False}",0.502994,0.501502,0.501502,0.502,0.000704,41
2,0.157212,0.029619,0.009336,0.000556,0.001,poly,True,"{'C': 0.001, 'kernel': 'poly', 'shrinking': True}",0.502994,0.501502,0.501502,0.502,0.000704,41
3,0.152463,0.021071,0.010361,0.000663,0.001,poly,False,"{'C': 0.001, 'kernel': 'poly', 'shrinking': Fa...",0.502994,0.501502,0.501502,0.502,0.000704,41
4,0.25848,0.020127,0.02285,0.002552,0.001,sigmoid,True,"{'C': 0.001, 'kernel': 'sigmoid', 'shrinking':...",0.502994,0.501502,0.501502,0.502,0.000704,41


In [25]:
# Predictions of the fitted search for X (the data it was fitted on); show the first five labels.
grid_search.predict(X)[:5]

array([0, 1, 1, 1, 0])

In [26]:
# Score of the fitted search on X, y - the same data that was passed to fit(),
# so this is a training-set score, not an estimate on held-out data.
grid_search.score(X, y)

0.972

In [14]:
#To close the scheduler
# client.sync(client.scheduler.terminate)