Web crawl of all names of garden plant

In [55]:
from bs4 import BeautifulSoup
import requests
import pickle
from tqdm import tqdm, tnrange, tqdm_notebook
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import copy
import re

The page link is analyzed using beautiful soup. All names are included in a div section with class equal to columns.

In [5]:
page_link = 'https://en.wikipedia.org/wiki/List_of_garden_plants'
response = requests.get(page_link)
soup = BeautifulSoup(response.content, 'html.parser')

In [6]:
list_items = soup.find_all('div', {'class': 'columns'})

In [11]:
res = []
for item in list_items:
    x = item.find_all('li')
    res.extend([t.text for t in x])

In [38]:
res[:5]

['Abelia',
 'Abeliophyllum (white forsythia)',
 'Abelmoschus (okra)',
 'Abies (fir)',
 'Abroma']

Some of the plants names include the common used daily names, which are labelled in brackets. Anything included in brackets will be deleted/removed.

In [56]:
res_removed = [i[:(i.find('(') - 1)] if i.find('(') != -1 else i for i in res]
res_no_syn = []
for plant in res_removed:
    syn_id = plant.find('syn.')
    if syn_id != -1:
        res_no_syn.append(plant[:(syn_id - 1)])
        res_no_syn.append(plant[(syn_id + 5):])
    else:
        res_no_syn.append(re.sub(r'[^a-zA-Z]', '', plant))

res_no_syn[:5]

['Abelia', 'Abeliophyllum', 'Abelmoschus', 'Abies', 'Abroma']

The size of the data and 

In [64]:
data_lower = list(map(lambda x: x.lower(), res_no_syn))
chunks = '\n'.join(list(map(str, data_lower)))
chars = list(set(chunks))
data = copy.deepcopy(chunks)
data_size, vocab_size = len(data), len(chars)
print('There are %d total characters and %d unique characters in your data.' % (data_size, vocab_size))

There are 22841 total characters and 29 unique characters in your data.


In [65]:
data_lower[-5:]

['zephyranthes', 'zigadenus', 'zinnia', 'zizania', 'zygopetalum']

In [66]:
char_to_ix = { ch:i for i,ch in enumerate(sorted(chars)) }
ix_to_char = { i:ch for i,ch in enumerate(sorted(chars)) }
print(ix_to_char)

{0: '\n', 1: ' ', 2: ',', 3: 'a', 4: 'b', 5: 'c', 6: 'd', 7: 'e', 8: 'f', 9: 'g', 10: 'h', 11: 'i', 12: 'j', 13: 'k', 14: 'l', 15: 'm', 16: 'n', 17: 'o', 18: 'p', 19: 'q', 20: 'r', 21: 's', 22: 't', 23: 'u', 24: 'v', 25: 'w', 26: 'x', 27: 'y', 28: 'z'}


In [110]:
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, Masking, Embedding, Input
from keras import optimizers

Prepare features and targets

In [80]:
features = []
labels = []

training_length = np.max([len(plant) for plant in data_lower])

for plant in data_lower:

    # Create multiple training examples from each sequence
    for i in range(2, len(plant) + 1):
        
        # Extract the features and label
        sec = plant[:i]
        extract = [char_to_ix[cha] for cha in sec]
        
        feature = extract[:-1]
        features.append(feature + [0 for i in range(training_length - len(feature))])
        labels.append(extract[-1])
        #print(sec)
features = np.array(features)

In [81]:
features

array([[ 3,  0,  0, ...,  0,  0,  0],
       [ 3,  4,  0, ...,  0,  0,  0],
       [ 3,  4,  7, ...,  0,  0,  0],
       ...,
       [28, 27,  9, ...,  0,  0,  0],
       [28, 27,  9, ...,  0,  0,  0],
       [28, 27,  9, ...,  0,  0,  0]])

In [82]:
features_pd = pd.DataFrame(features)
features_pd.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,26,27,28,29,30,31,32,33,34,35
0,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,3,4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,4,7,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,4,7,14,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,3,4,7,14,11,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [83]:
from keras.utils import to_categorical

Using TensorFlow backend.


In [85]:
labels_array = to_categorical(labels)

In [86]:
labels_array.shape

(18328, 29)

In [87]:
features_pd.shape

(18328, 36)

In [137]:
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(
    features_pd, labels_array, test_size=0.20, random_state=1402)

In [148]:
model = Sequential()
'''
# Embedding layer
model.add(
    Embedding(input_dim=num_words,
              input_length = training_length,
              output_dim=100,
              weights=[embedding_matrix],
              trainable=False,
              mask_zero=True))

# Masking layer for pre-trained embeddings
model.add(Masking(mask_value=0.0))
'''
model.add(Embedding(input_dim=vocab_size,
                    input_length = training_length,
                    output_dim=20))

# Recurrent layer
model.add(LSTM(64, return_sequences=False, 
               dropout=0.1, recurrent_dropout=0.1))

# Fully connected layer
model.add(Dense(64, activation='relu'))

# Dropout for regularization
model.add(Dropout(0.5))

# Output layer
model.add(Dense(vocab_size, activation='softmax'))


sgd = optimizers.SGD(lr=0.1, decay=1e-5, momentum=0.85, nesterov=True)
adam = optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999)

# Compile the model
model.compile(
    optimizer='Adam', loss='categorical_crossentropy', metrics=['categorical_accuracy'])

In [149]:
from keras.callbacks import EarlyStopping, ModelCheckpoint

# Create callbacks
callbacks = [EarlyStopping(monitor='val_loss', patience=5),
             ModelCheckpoint(('./models/model.h5'), save_best_only=True, 
                             save_weights_only=False)]

In [None]:
history = model.fit(X_train,  y_train, 
                    batch_size=4096, epochs=150,
                    callbacks=callbacks,
                    validation_data=(X_valid, y_valid))

Train on 14662 samples, validate on 3666 samples
Epoch 1/150
Epoch 2/150
Epoch 3/150