Source code for Python Machine Learning By Example 4th Edition (Packt Publishing)

Chapter 10 Machine Learning Best Practices

Author: Yuxi (Hayden) Liu (yuxi.liu.ece@gmail.com)

## Best practice 14 – Extracting features from text data 

### Word embedding

In [1]:
from gensim.models import Word2Vec

In [2]:
# Tiny pre-tokenized corpus: one list of lowercase tokens per sentence
sentences = [
    ["i", "love", "machine", "learning", "by", "example"],
    ["machine", "learning", "and", "deep", "learning", "are", "fascinating"],
    ["word", "embedding", "is", "essential", "for", "many", "nlp", "tasks"],
    ["word2vec", "produces", "word", "embeddings"]
]

# Hyperparameters gathered in one place; min_count=1 keeps every token of
# this tiny corpus in the vocabulary, sg=0 selects the CBOW variant
w2v_params = dict(vector_size=100, window=5, min_count=1, sg=0)

# Build the vocabulary and train in a single constructor call
model = Word2Vec(sentences=sentences, **w2v_params)

# Look up the learned embedding of a single token
vector = model.wv["machine"]
print("Vector for 'machine':", vector)

Vector for 'machine': [ 9.2815855e-05  3.0779743e-03 -6.8117767e-03 -1.3753572e-03
  7.6693585e-03  7.3465472e-03 -3.6724545e-03  2.6435424e-03
 -8.3174659e-03  6.2051434e-03 -4.6373457e-03 -3.1652437e-03
  9.3113342e-03  8.7273103e-04  7.4911476e-03 -6.0739564e-03
  5.1591368e-03  9.9220201e-03 -8.4587047e-03 -5.1362212e-03
 -7.0644980e-03 -4.8613679e-03 -3.7768795e-03 -8.5355258e-03
  7.9550967e-03 -4.8430962e-03  8.4243221e-03  5.2609886e-03
 -6.5501807e-03  3.9575580e-03  5.4708594e-03 -7.4282014e-03
 -7.4055856e-03 -2.4756377e-03 -8.6252270e-03 -1.5801827e-03
 -4.0236043e-04  3.3001360e-03  1.4415972e-03 -8.8241365e-04
 -5.5940133e-03  1.7302597e-03 -8.9826871e-04  6.7939684e-03
  3.9741215e-03  4.5290575e-03  1.4341431e-03 -2.6994087e-03
 -4.3666936e-03 -1.0321270e-03  1.4369689e-03 -2.6467817e-03
 -7.0735654e-03 -7.8056543e-03 -9.1217076e-03 -5.9348154e-03
 -1.8470082e-03 -4.3242811e-03 -6.4605214e-03 -3.7180765e-03
  4.2892280e-03 -3.7388816e-03  8.3797537e-03  1.5337169e-03
 -

In [3]:
import torch
import torch.nn as nn

# Embedding table: one trainable row of `embedding_dim` floats per token id
vocab_size = 10  # Total number of unique words
embedding_dim = 3  # Dimensionality of the embeddings
embedding_layer = nn.Embedding(num_embeddings=vocab_size,
                               embedding_dim=embedding_dim)

# Two sequences of four integer token ids each (ids must be < vocab_size)
input_data = torch.tensor(
    [[1, 2, 3, 4],
     [5, 1, 6, 3]],
    dtype=torch.long,
)

# Each id is replaced by its row of the table -> shape (2, 4, embedding_dim)
embedded_data = embedding_layer(input_data)

# Show the looked-up vectors
print("Embedded Data:\n", embedded_data)

Embedded Data:
 tensor([[[-0.3944,  0.8023, -0.0500],
         [-1.9144, -1.3619, -1.4696],
         [-0.2408, -0.1748, -0.1506],
         [ 0.8162, -0.7899,  0.3673]],

        [[-0.3459, -2.2255,  0.1240],
         [-0.3944,  0.8023, -0.0500],
         [ 0.1515,  0.0214, -0.6475],
         [-0.2408, -0.1748, -0.1506]]], grad_fn=<EmbeddingBackward0>)


# Best practices in the deployment and monitoring stage 

## Best practice 19 – Saving, loading, and reusing models

### Saving and restoring models using pickle 

In [4]:
from sklearn import datasets
# Load the diabetes regression data as a feature matrix and a target vector
dataset = datasets.load_diabetes()
X = dataset.data
y = dataset.target

# Hold out the final rows to play the role of unseen "new" data
num_new = 30    # the last 30 samples as new data set
X_train, X_new = X[:-num_new], X[-num_new:]
y_train, y_new = y[:-num_new], y[-num_new:]

In [5]:
# Data pre-processing
# The scaler is fitted on the training rows only; the held-out X_new rows
# are not involved in this step.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)

In [6]:
import pickle
# Save the scaler
# The context manager closes the file as soon as the dump finishes, so the
# bytes are flushed to disk and no file handle is left open in the kernel.
with open("scaler.p", "wb") as scaler_file:
    pickle.dump(scaler, scaler_file)

In [7]:
# Standardize the training features with the scaler fitted above
X_scaled_train = scaler.transform(X_train)

In [8]:
# Regression model training
# Support vector regressor with C=20, trained on the standardized features
from sklearn.svm import SVR
regressor = SVR(C=20)
regressor.fit(X_scaled_train, y_train)

In [9]:
# Save the regressor
# Use a context manager so the file is flushed and closed once the model
# has been written, instead of leaving an open handle behind.
with open("regressor.p", "wb") as regressor_file:
    pickle.dump(regressor, regressor_file)

In [10]:
# Deployment
# Restore the pre-processing step and the model from disk. Each file is
# opened in a context manager so its handle is closed right after loading.
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
with open("scaler.p", "rb") as scaler_file:
    my_scaler = pickle.load(scaler_file)
with open("regressor.p", "rb") as regressor_file:
    my_regressor = pickle.load(regressor_file)

In [11]:
# Run the restored pipeline on the held-out rows: scale first, then predict
X_scaled_new = my_scaler.transform(X_new)
predictions = my_regressor.predict(X_scaled_new)

In [12]:
# Monitor
# Compare the restored model's predictions with the true targets of the
# held-out rows and report R^2 rounded to three decimals.
from sklearn.metrics import r2_score
print(f'Health check on the model, R^2: {r2_score(y_new, predictions):.3f}')

Health check on the model, R^2: 0.613


### Saving and restoring models in TensorFlow 

In [13]:
import tensorflow as tf
from tensorflow import keras

# Load the breast cancer classification data
cancer_data = datasets.load_breast_cancer()

# Standardize with a dedicated scaler. Re-fitting the earlier `scaler` object
# here would overwrite the statistics it learned from the diabetes training
# data (the ones pickled to scaler.p), which breaks any re-run of the cells
# that still rely on it.
X = StandardScaler().fit_transform(cancer_data.data)
y = cancer_data.target

In [14]:
# Training hyperparameters (reused later by the PyTorch cells)
learning_rate = 0.005
n_iter = 10

# Fix TensorFlow's global seed before any layer is created
tf.random.set_seed(42)

# Logistic regression expressed as a single sigmoid unit
model = keras.Sequential()
model.add(keras.layers.Dense(units=1, activation='sigmoid'))

# Binary cross-entropy loss minimized with Adam
adam = keras.optimizers.Adam(learning_rate=learning_rate)
model.compile(optimizer=adam, loss='binary_crossentropy')

In [15]:
# Train on the full standardized data set for n_iter epochs
model.fit(X, y, epochs=n_iter)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2b76a2dff70>

In [16]:
# Print the layer table and parameter counts of the trained model
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 1)                 31        
                                                                 
Total params: 31
Trainable params: 31
Non-trainable params: 0
_________________________________________________________________


In [17]:
# Persist the whole Keras model under `path`
# NOTE(review): the path has no file extension; newer Keras releases may
# require a `.keras` or `.h5` suffix - confirm against the installed version.
path = './model_tf'
model.save(path)

INFO:tensorflow:Assets written to: ./model_tf\assets


In [18]:
# Restore the model from the location it was saved to
new_model = tf.keras.models.load_model(path)

# Print the restored model's layer table for comparison with the original
new_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 1)                 31        
                                                                 
Total params: 31
Trainable params: 31
Non-trainable params: 0
_________________________________________________________________


### Saving and restoring models in PyTorch

In [19]:
# Wrap the NumPy features and labels as float tensors; the labels are
# reshaped into a single column so they line up with the model's output
X_torch = torch.FloatTensor(X)
y_torch = torch.FloatTensor(y.reshape(-1, 1))

In [20]:
# Seed PyTorch before the layer is created
torch.manual_seed(42)

# Logistic regression: one linear unit followed by a sigmoid
n_features = X.shape[1]
model = nn.Sequential(
    nn.Linear(in_features=n_features, out_features=1),
    nn.Sigmoid(),
)

# Binary cross-entropy on the sigmoid output, optimized with Adam
loss_function = nn.BCELoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

In [21]:
def train_step(model, X_train, y_train, loss_function, optimizer):
    """Run one full-batch optimization step and return the loss as a float.

    Args:
        model: Module whose parameters are being trained.
        X_train: Input tensor passed to ``model``.
        y_train: Target tensor compared against the model output.
        loss_function: Callable ``(prediction, target) -> scalar loss tensor``.
        optimizer: Optimizer that updates the parameters of ``model``.

    Returns:
        The loss computed before the parameter update, as a Python float.
    """
    # Clear gradients left over from the previous step before backprop
    model.zero_grad()

    # Forward pass and loss
    batch_loss = loss_function(model(X_train), y_train)

    # Backward pass, then a single parameter update
    batch_loss.backward()
    optimizer.step()

    return batch_loss.item()


# Full-batch training: every epoch performs one update on the whole data set
# and prints the loss measured before that update.
for epoch in range(n_iter):
    loss = train_step(model, X_torch, y_torch, loss_function, optimizer)
    print(f"Epoch {epoch} - loss: {loss}")


Epoch 0 - loss: 0.8387020826339722
Epoch 1 - loss: 0.7999904751777649
Epoch 2 - loss: 0.76298588514328
Epoch 3 - loss: 0.7277476787567139
Epoch 4 - loss: 0.6943162679672241
Epoch 5 - loss: 0.6627081036567688
Epoch 6 - loss: 0.6329135298728943
Epoch 7 - loss: 0.6048969030380249
Epoch 8 - loss: 0.5786024332046509
Epoch 9 - loss: 0.5539639592170715


In [22]:
# Show the module structure of the trained model
print(model)

Sequential(
  (0): Linear(in_features=30, out_features=1, bias=True)
  (1): Sigmoid()
)


In [23]:
# Serialize the whole module object (not only its state_dict) to disk.
# Note that `path` is rebound here; it previously held the TensorFlow
# model location.
path = './model.pth'
torch.save(model, path)

In [24]:
# Restore the full model object written by torch.save(model, path).
# The file holds a pickled nn.Sequential rather than bare tensors, so it
# must be loaded with weights_only=False; PyTorch releases that default to
# weights_only=True refuse to unpickle it. Unpickling can execute arbitrary
# code, so only load files from a trusted source.
new_model = torch.load(path, weights_only=False)
print(new_model)

Sequential(
  (0): Linear(in_features=30, out_features=1, bias=True)
  (1): Sigmoid()
)


---

Readers may ignore the next cell.

In [25]:
# Export this notebook to a plain .py script, omitting the In[...] prompts
!jupyter nbconvert --to python ch10_part2.ipynb --TemplateExporter.exclude_input_prompt=True

[NbConvertApp] Converting notebook ch10_part2.ipynb to python
[NbConvertApp] Writing 4152 bytes to ch10_part2.py
