# Playing with Scikit-learn

## Defining applications for data science

http://scikit-learn.org/stable/developers/<BR>
http://scikit-learn.org/stable/faq.html<BR>

In [1]:
from sklearn.datasets import load_boston
boston = load_boston()
X, y = boston.data,boston.target
print("X:%s y:%s" % (X.shape, y.shape))

X:(506, 13) y:(506,)


In [2]:
from sklearn.linear_model import LinearRegression
hypothesis = LinearRegression(normalize=True)
hypothesis.fit(X, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=True)

In [3]:
print(hypothesis.coef_)

[-1.07170557e-01  4.63952195e-02  2.08602395e-02  2.68856140e+00
 -1.77957587e+01  3.80475246e+00  7.51061703e-04 -1.47575880e+00
  3.05655038e-01 -1.23293463e-02 -9.53463555e-01  9.39251272e-03
 -5.25466633e-01]


In [4]:
import numpy as np
new_observation = np.array([1, 0, 1, 0, 0.5, 7, 59, 
                            6, 3, 200, 20, 350, 4], 
                           dtype=float).reshape(1, -1)
print(hypothesis.predict(new_observation))

[25.8972784]


In [5]:
hypothesis.score(X, y)

0.7406077428649428

In [6]:
#help(LinearRegression)
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(X)
print(scaler.transform(new_observation))

[[0.01116872 0.         0.01979472 0.         0.23662551 0.65893849
  0.57775489 0.44288845 0.08695652 0.02480916 0.78723404 0.88173887
  0.06263797]]


# Performing the Hashing Trick

## Demonstrating the hashing trick

In [7]:
print(hash('Python'))

-2668182151156858128


In [8]:
print(abs(hash('Python')) % 1000)

128


In [9]:
from sklearn.feature_extraction.text import *
oh_enconder = CountVectorizer()
oh_enconded = oh_enconder.fit_transform([
'Python for data science','Python for machine learning'])

print(oh_enconder.vocabulary_)

{'python': 4, 'for': 1, 'data': 0, 'science': 5, 'machine': 3, 'learning': 2}


In [10]:
string_1 = 'Python for data science'
string_2 = 'Python for machine learning'

def hashing_trick(input_string, vector_size=20):
    feature_vector = [0] * vector_size
    for word in input_string.split(' '):
        index = abs(hash(word)) % vector_size
        feature_vector[index] = 1
    return feature_vector

In [11]:
print(hashing_trick(
    input_string='Python for data science', 
    vector_size=20))

[0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]


In [12]:
print(hashing_trick(
    input_string='Python for machine learning', 
    vector_size=20))

[0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0]


## Working with deterministic selection

In [13]:
from scipy.sparse import csc_matrix
print(csc_matrix([1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0]))

  (0, 0)	1
  (0, 5)	1
  (0, 16)	1
  (0, 18)	1


http://scikit-learn.org/stable/modules/feature_extraction.html
http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.HashingVectorizer.html

In [14]:
import sklearn.feature_extraction.text as txt
htrick = txt.HashingVectorizer(n_features=20, 
                           binary=True, norm=None)
hashed_text = htrick.transform(['Python for data science',
                           'Python for machine learning'])
hashed_text

<2x20 sparse matrix of type '<class 'numpy.float64'>'
	with 8 stored elements in Compressed Sparse Row format>

In [15]:
oh_enconder.transform(['New text has arrived']).todense()

matrix([[0, 0, 0, 0, 0, 0]], dtype=int64)

In [16]:
htrick.transform(['New text has arrived']).todense()

matrix([[1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
         0., 0., 0., 1.]])

# Considering Timing and Performance

## Benchmarking with timeit

In [17]:
%timeit l = [k for k in range(10**6)]

72.4 ms ± 252 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [18]:
%timeit -n 20 -r 5 l = [k for k in range(10**6)]

74 ms ± 45.4 µs per loop (mean ± std. dev. of 5 runs, 20 loops each)


In [19]:
%%timeit 
l = list()
for k in range(10**6):
    l.append(k)

124 ms ± 347 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [20]:
import sklearn.feature_extraction.text as txt
htrick = txt.HashingVectorizer(n_features=20, 
                           binary=True, 
                           norm=None) 
oh_enconder = txt.CountVectorizer()
texts = ['Python for data science', 
         'Python for machine learning']

In [21]:
%timeit oh_enconded = oh_enconder.fit_transform(texts)

653 µs ± 4.66 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [22]:
%timeit hashing = htrick.transform(texts)

105 µs ± 448 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [23]:
import timeit
cumulative_time = timeit.timeit(
    "hashing = htrick.transform(texts)", 
    "from __main__ import htrick, texts", 
    number=10000)
print(cumulative_time / 10000.0)

0.00010664665180141313


## Working with the memory profiler

In [24]:
# Installation procedures
import sys
!{sys.executable} -m pip install memory_profiler



In [25]:
# Initialization from IPython (to be repeat at every IPython start)
%load_ext memory_profiler

In [26]:
hashing = htrick.transform(texts)
%memit dense_hashing = hashing.toarray()

peak memory: 91.91 MiB, increment: 1.31 MiB


In [27]:
%%writefile example_code.py
def comparison_test(text):
    import sklearn.feature_extraction.text as txt
    htrick = txt.HashingVectorizer(n_features=20, 
                                   binary=True, 
                                   norm=None) 
    oh_enconder = txt.CountVectorizer()
    oh_enconded = oh_enconder.fit_transform(text)
    hashing = htrick.transform(text)
    return oh_enconded, hashing

Writing example_code.py


In [28]:
from example_code import comparison_test
text = ['Python for data science',
        'Python for machine learning']
%mprun -f comparison_test comparison_test(text)




# Running in Parallel on Multiple Cores

## Demonstrating multiprocessing

In [29]:
from sklearn.datasets import load_digits
digits = load_digits()
X, y = digits.data,digits.target
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

In [30]:
%timeit single_core = cross_val_score(SVC(), X, y, \
                                      cv=20, n_jobs=1)

10.9 s ± 30.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [31]:
%timeit multi_core = cross_val_score(SVC(), X, y, \
                                     cv=20, n_jobs=-1)

4.44 s ± 44.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
