# Découverte de Scikit-learn (Chap 12)

## Sélection d'applications pour la datalogie

http://scikit-learn.org/stable/developers/<BR>
http://scikit-learn.org/stable/faq.html<BR>

In [1]:
from sklearn.datasets import load_boston
boston = load_boston()
X, y = boston.data,boston.target
print("X:%s y:%s" % (X.shape, y.shape))

X:(506, 13) y:(506,)


In [2]:
from sklearn.linear_model import LinearRegression
hypothesis = LinearRegression(normalize=True)
hypothesis.fit(X, y)
print(hypothesis.coef_)

[-1.07170557e-01  4.63952195e-02  2.08602395e-02  2.68856140e+00
 -1.77957587e+01  3.80475246e+00  7.51061703e-04 -1.47575880e+00
  3.05655038e-01 -1.23293463e-02 -9.53463555e-01  9.39251272e-03
 -5.25466633e-01]


In [3]:
print(hypothesis.coef_)

[-1.07170557e-01  4.63952195e-02  2.08602395e-02  2.68856140e+00
 -1.77957587e+01  3.80475246e+00  7.51061703e-04 -1.47575880e+00
  3.05655038e-01 -1.23293463e-02 -9.53463555e-01  9.39251272e-03
 -5.25466633e-01]


In [4]:
import numpy as np
new_observation = np.array([1, 0, 1, 0, 0.5, 7, 59, 
                            6, 3, 200, 20, 350, 4], 
                           dtype=float).reshape(1, -1)
print(hypothesis.predict(new_observation))

[25.8972784]


In [5]:
hypothesis.score(X, y)

0.7406077428649428

In [6]:
#help(LinearRegression)
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(X)
print(scaler.transform(new_observation))

[[0.01116872 0.         0.01979472 0.         0.23662551 0.65893849
  0.57775489 0.44288845 0.08695652 0.02480916 0.78723404 0.88173887
  0.06263797]]


# La technique de hachage

## Démonstration de la technique de hachage

In [7]:
print(hash('Python'))

-7222251431502010719


In [8]:
print(abs(hash('Python')) % 1000)

719


In [9]:
from sklearn.feature_extraction.text import *
oh_enconder = CountVectorizer()
oh_enconded = oh_enconder.fit_transform([
'Python for data science','Python for machine learning'])
#'Python pour les datalogues','Python pour les mécapprentis'])

print(oh_enconder.vocabulary_)

{'python': 4, 'for': 1, 'data': 0, 'science': 5, 'machine': 3, 'learning': 2}


In [10]:
string_1 = 'Python for data science'
string_2 = 'Python for machine learning'

def hashing_trick(input_string, vector_size=20):
    feature_vector = [0] * vector_size
    for word in input_string.split(' '):
        index = abs(hash(word)) % vector_size
        feature_vector[index] = 1
    return feature_vector

In [11]:
print(hashing_trick(
    input_string='Python for data science', 
    vector_size=20))

[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]


In [12]:
print(hashing_trick(
    input_string='Python for machine learning', 
    vector_size=20))

[0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]


## Matrices creuses et sélection déterministe

In [13]:
from scipy.sparse import csc_matrix
print(csc_matrix([1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0]))

  (0, 0)	1
  (0, 5)	1
  (0, 16)	1
  (0, 18)	1


http://scikit-learn.org/stable/modules/feature_extraction.html
http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.HashingVectorizer.html

In [14]:
import sklearn.feature_extraction.text as txt
htrick = txt.HashingVectorizer(n_features=20, 
                           binary=True, norm=None)
hashed_text = htrick.transform(['Python for data science',
                           'Python for machine learning'])
hashed_text

<2x20 sparse matrix of type '<class 'numpy.float64'>'
	with 8 stored elements in Compressed Sparse Row format>

In [15]:
oh_enconder.transform(['Nouveau texte arrivé']).todense()

matrix([[0, 0, 0, 0, 0, 0]], dtype=int64)

In [16]:
htrick.transform(['Nouveau texte arrivé']).todense()

matrix([[0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0.]])

# Considérations de temps et de performances

## Chronométrage avec timeit

In [17]:
%timeit l = [k for k in range(10**6)]

96.1 ms ± 10.1 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [19]:
%timeit -n 20 -r 5 l = [k for k in range(10**6)]

90.5 ms ± 1.78 ms per loop (mean ± std. dev. of 5 runs, 20 loops each)


In [20]:
%%timeit 
l = list()
for k in range(10**6):
    l.append(k)

139 ms ± 3.25 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [21]:
import sklearn.feature_extraction.text as txt
htrick = txt.HashingVectorizer(n_features=20, 
                           binary=True, 
                           norm=None) 
oh_enconder = txt.CountVectorizer()
texts = ['Python for data science', 
         'Python for machine learning']

In [22]:
%timeit oh_enconded = oh_enconder.fit_transform(texts)

791 µs ± 49.9 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [23]:
%timeit hashing = htrick.transform(texts)

122 µs ± 1.44 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [24]:
import timeit
cumulative_time = timeit.timeit(
    "hashing = htrick.transform(texts)", 
    "from __main__ import htrick, texts", 
    number=10000)
print(cumulative_time / 10000.0)

0.00013299488999999766


## Utilisation du profileur mémoire

In [18]:
# Installation procedures
import sys
!{sys.executable} -m pip install memory_profiler



distributed 1.21.8 requires msgpack, which is not installed.
You are using pip version 10.0.1, however version 19.3.1 is available.
You should consider upgrading via the 'python -m pip install --upgrade pip' command.


In [22]:
# Initialization from IPython (to be repeat at every IPython start)
%load_ext memory_profiler

The memory_profiler extension is already loaded. To reload it, use:
  %reload_ext memory_profiler


In [23]:
hashing = htrick.transform(texts)
%memit dense_hashing = hashing.toarray()

peak memory: 91.83 MiB, increment: 0.03 MiB


In [24]:
%%writefile example_code.py
def comparison_test(text):
    import sklearn.feature_extraction.text as txt
    htrick = txt.HashingVectorizer(n_features=20, 
                                   binary=True, 
                                   norm=None) 
    oh_enconder = txt.CountVectorizer()
    oh_enconded = oh_enconder.fit_transform(text)
    hashing = htrick.transform(text)
    return oh_enconded, hashing

Overwriting example_code.py


In [26]:
from example_code import comparison_test
text = ['Python for data science',
        'Python for machine learning']
%mprun -f comparison_test comparison_test(text)




# Exécution parallèle et multi-cœurs

## Démonstration du multi-traitement

In [27]:
from sklearn.datasets import load_digits
digits = load_digits()
X, y = digits.data,digits.target
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

In [28]:
%timeit single_core = cross_val_score(SVC(), X, y, \
                                      cv=20, n_jobs=1)

11.7 s ± 106 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [30]:
%timeit multi_core = cross_val_score(SVC(), X, y, \
                                     cv=20, n_jobs=-1)

9.77 s ± 643 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## Note : Il est possible que l'exec multicoeurs se bloque (n_jobs=-1). Bogue en cours de correction.