# Trends-based recommendation
In this notebook we will classify the tweets into trends and these trends will help us decide which hashtags to recommend 

In [1]:
import sys

sys.path.append(sys.path[0] + './../')

In [2]:
from NLP.files_reader import *
import tensorflow as tf
import nltk

## Get the data

In [3]:
tweets_and_trends = []
tweets = []
trends = []

files_prefix = './../'

tweets_and_trends += (FilesReader.read_file(files_prefix + new_US_file))
tweets_and_trends += (FilesReader.read_file(files_prefix + new_UK_file))
tweets_and_trends += (FilesReader.read_file(files_prefix + new_AUS_file))
tweets_and_trends += (FilesReader.read_file(files_prefix + new_CAN_file))
tweets_and_trends += (FilesReader.read_file(files_prefix + new_IR_file))

random.shuffle(tweets_and_trends)

tweets, trends = FilesReader.split_tweets_and_trends(tweets_and_trends)
print(f"We have {len(tweets)} tweets.")


We have 18771 tweets.


## Stemming and lemmatization

In [4]:
from NLP.lemmatizer_and_stemmer import LemmatizerAndStemmer

print(f"before: {tweets[:3]}")
for i, tweet in enumerate(tweets):
    processed_tweet = LemmatizerAndStemmer.stem_and_lemmatize_tweet(tweet)
    tweets[i] = processed_tweet
    

print(f"after: {tweets[:3]}")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


before: ['fighting back watch live tbs', 'booking backroom staffer nearly took gus', 'talk social expectations impact men']
after: ['fight back watch live tb  ', 'book backroom staffer near took gu  ', 'talk social expect impact men  ']


## Tokenize the text

In [5]:
from keras.preprocessing.text import Tokenizer

tweets_tokenizer = Tokenizer(oov_token="<OOV>")
tweets_tokenizer.fit_on_texts(tweets)
tweets_word_index = tweets_tokenizer.word_index
tweets_index_word = tweets_tokenizer.index_word

print(f"We have {len(tweets_word_index)} different words")
print(tweets_word_index)

We have 14779 different words


## Create the padded sequences

In [6]:
from keras.utils import pad_sequences
sequence_length = 15

tweets_sequences = tweets_tokenizer.texts_to_sequences(tweets)
tweets_sequences_padded = pad_sequences(tweets_sequences, padding="post", maxlen=sequence_length)

## Map the trends to numbers

In [7]:
trends_map = {}

counter = 0

for trend in trends:
    if not (trend in trends_map):
        trends_map[trend] = counter
        counter += 1

no_of_trends = len(trends_map)
inv_trends_map = {v: k for k, v in trends_map.items()}
print(f"We have {no_of_trends} different trends")
print(trends_map)

We have 32 different trends
{'Ariana': 0, 'Raquel': 1, 'PumpRules': 2, 'NRLBroncosPanthers': 3, 'Madrid': 4, 'Yankees': 5, 'MentalHealthAwarenessWeek': 6, 'ChampionsLeague': 7, 'PUSB': 8, 'Heat': 9, 'MCIRMA': 10, 'Courtois': 11, 'jennie': 12, 'AEWDynamite': 13, 'Meghan': 14, 'Sandoval': 15, 'Houston': 16, 'Billy Graham': 17, 'r4today': 18, 'NRLIndigenousRound': 19, 'Quebec City': 20, 'Coyotes': 21, 'IDAHOBIT': 22, 'Jimmy Butler': 23, 'YouTube TV': 24, 'Penrith': 25, 'Ivan Toney': 26, 'Luton': 27, 'Adam Gee': 28, 'Reece Walsh': 29, 'Ashley Klein': 30, 'Playoff Jimmy': 31}


## Create the trends sequences

In [8]:
trends_sequences = [trends_map[trend] for trend in trends]
print(trends_sequences)

[0, 1, 2, 3, 2, 3, 4, 5, 6, 4, 7, 2, 8, 8, 5, 7, 9, 1, 6, 10, 11, 12, 10, 2, 3, 8, 6, 2, 9, 11, 3, 0, 3, 2, 6, 8, 13, 7, 10, 2, 3, 1, 2, 7, 14, 15, 16, 17, 18, 5, 7, 4, 10, 13, 8, 15, 6, 6, 15, 2, 2, 6, 13, 19, 2, 10, 10, 0, 15, 10, 20, 3, 6, 10, 12, 8, 16, 15, 6, 20, 2, 5, 3, 2, 21, 8, 18, 7, 8, 22, 13, 23, 10, 7, 10, 13, 13, 2, 8, 7, 10, 14, 0, 0, 10, 3, 2, 1, 6, 2, 18, 11, 5, 6, 7, 2, 10, 18, 11, 7, 2, 13, 2, 1, 7, 10, 17, 2, 2, 2, 10, 10, 1, 7, 3, 15, 2, 3, 18, 6, 19, 6, 24, 6, 7, 2, 2, 8, 2, 8, 5, 13, 22, 18, 2, 15, 2, 7, 15, 6, 2, 10, 5, 20, 11, 22, 7, 1, 3, 18, 2, 7, 22, 9, 13, 5, 8, 7, 18, 2, 18, 3, 3, 10, 6, 11, 1, 13, 5, 2, 16, 22, 12, 25, 6, 21, 1, 10, 8, 12, 1, 2, 3, 11, 10, 10, 7, 8, 6, 12, 10, 22, 10, 18, 6, 6, 8, 2, 1, 7, 21, 2, 22, 2, 3, 12, 6, 8, 18, 12, 16, 7, 2, 2, 2, 6, 26, 7, 6, 2, 0, 19, 8, 3, 21, 3, 27, 10, 3, 0, 6, 2, 7, 8, 7, 3, 18, 1, 19, 5, 15, 6, 2, 16, 13, 16, 10, 6, 6, 12, 6, 13, 0, 26, 0, 14, 15, 8, 1, 18, 20, 2, 16, 8, 5, 15, 15, 6, 10, 15, 4, 8, 0, 2, 1

## Encode the trends

In [9]:
from keras.utils import to_categorical
import tensorflow as tf

encoded_trends = to_categorical(trends_sequences)
print(encoded_trends.shape)

(18771, 32)


## Prepare the pre-trained embeddings

In [10]:
from NLP.Embeddings.embeddings_matrix import get_embeddings_matrix

embeddings_index_path = "./../NLP/Embeddings/embeddings_index_object.pkl"
embeddings_matrix, hits, misses = get_embeddings_matrix(tweets_word_index, embeddings_index_path)

print(f"Hits: {hits}, Misses: {misses}")

Hits: 12811, Misses: 1968


## Split the data

In [11]:
training_split = 0.8
training_tweets_count = int(0.8 * len(tweets_sequences_padded))

In [12]:
train_data = tweets_sequences_padded[0:training_tweets_count]
train_labels = encoded_trends[0:training_tweets_count]
test_data = tweets_sequences_padded[training_tweets_count:]
test_labels = encoded_trends[training_tweets_count:]

print(f'we have {len(train_data)} tweets for training and {len(test_data)} for testing')

we have 15016 tweets for training and 3755 for testing


## Tune the hyper-parameters

In [13]:
# import keras_tuner as kt
# import keras
# from keras import initializers
# import tensorflow as tf

# no_of_tweets_words = len(tweets_word_index) + 1
# embedding_dimensions = 300

# def model_builder(hp):
#     model = keras.Sequential()

#     hp_conv_filters = hp.Int('conv_filters', min_value=32, max_value=256, step=16)
#     hp_kernel_size = hp.Int('conv_kernel_size', min_value=2, max_value=5, step=1)
#     hp_dropout = hp.Float('dropout', min_value=0.0, max_value=0.6, step=0.1)
#     hp_lstm = hp.Int('lstm_units', min_value=32, max_value=256, step=16)

#     model.add(keras.layers.Embedding(
#         no_of_tweets_words,
#         embedding_dimensions,
#         input_length=sequence_length,
#         embeddings_initializer=initializers.Constant(embeddings_matrix),
#         trainable=True
#     ))
#     model.add(keras.layers.Conv1D(hp_conv_filters, hp_kernel_size, padding='same'))
#     model.add(keras.layers.Dropout(hp_dropout))
#     model.add(keras.layers.Bidirectional(keras.layers.LSTM(hp_lstm)))
#     model.add(keras.layers.Dropout(hp_dropout))
#     model.add(keras.layers.Dense(no_of_trends))

#     # Tune the learning rate for the optimizer
#     # Choose an optimal value from 0.01, 0.001, or 0.0001
#     hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])

#     model.compile(optimizer=keras.optimizers.Adam(learning_rate=hp_learning_rate),
#                   loss=keras.losses.CategoricalCrossentropy(
#                       from_logits=True),
#                   metrics=['accuracy'])

#     return model


In [14]:
# tuner = kt.Hyperband(model_builder,
#                      objective='val_accuracy',
#                      max_epochs=6,
#                      factor=3,
#                      directory='parameters_tuning',
#                      project_name='trends_classifier')

In [15]:
# tuner.search(train_data, train_labels, epochs=50, validation_data=(test_data, test_labels))

In [16]:
# best_hps=tuner.get_best_hyperparameters(num_trials=1)[0]

# print(f"""
# The hyperparameter search is complete. Here are the optimal configurations:
#     conv_filters: {best_hps.get('conv_filters')}
#     conv_kernel_size: {best_hps.get('conv_kernel_size')}
#     lstm_units: {best_hps.get('lstm_units')}
#     dropout: {best_hps.get('dropout')}
#     learning_rate: {best_hps.get('learning_rate')}
# """)

In [17]:
# trends_classifier = tuner.hypermodel.build(best_hps)
# history = trends_classifier.fit(train_data, train_labels, epochs=20, validation_data=(test_data, test_labels))

# val_acc_per_epoch = history.history['val_accuracy']
# best_epoch = val_acc_per_epoch.index(max(val_acc_per_epoch)) + 1
# print('Best epoch: %d' % (best_epoch,))

In [18]:
# trends_classifier = tuner.hypermodel.build(best_hps)
# trends_classifier.fit(train_data, train_labels, epochs=best_epoch, validation_data=(test_data, test_labels))

## Build the model
After tuning the hyper-parameters, here are the optimal configurations: <br>
    conv_filters: 224 <br>
    conv_kernel_size: 2<br>
    lstm_units: 144<br>
    dropout: 0.1<br>
    learning_rate: 0.001<br>

In [19]:
#hyperparameters
embedding_dimensions = 300
lstm_units = 144
dropout_value = 0.1
conv_filters = 224
conv_kernel_size = 2

In [20]:
from keras import initializers
import tensorflow as tf

no_of_tweets_words = len(tweets_word_index) + 1

trends_classifier = tf.keras.Sequential([
    tf.keras.layers.Embedding(
        no_of_tweets_words,
        embedding_dimensions,
        input_length=sequence_length,
        embeddings_initializer=initializers.Constant(embeddings_matrix),
        trainable=True
    ),
    tf.keras.layers.Conv1D(conv_filters, conv_kernel_size),
    tf.keras.layers.AveragePooling1D(),
    tf.keras.layers.Dropout(dropout_value),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(lstm_units)),
    tf.keras.layers.Dropout(dropout_value),
    tf.keras.layers.Dense(no_of_trends, activation='softmax')
])

trends_classifier.compile(
    loss="categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"]
)

trends_classifier.summary()

# trends_classifier = tf.keras.models.load_model("trends_classifier")

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 15, 300)           4434000   
                                                                 
 conv1d (Conv1D)             (None, 14, 224)           134624    
                                                                 
 average_pooling1d (AverageP  (None, 7, 224)           0         
 ooling1D)                                                       
                                                                 
 dropout (Dropout)           (None, 7, 224)            0         
                                                                 
 bidirectional (Bidirectiona  (None, 288)              425088    
 l)                                                              
                                                                 
 dropout_1 (Dropout)         (None, 288)               0

In [22]:
epochs = 4
trends_classifier.fit(train_data, train_labels, epochs=epochs, validation_data=(test_data, test_labels))

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x285fb0782b0>

## Save the model

In [22]:
import pickle

trends_classifier.save("./trends_classifier/trends_classifier_model.h5")
with open('./trends_classifier/inv_trends_map.pkl', 'wb') as output:
    pickle.dump(inv_trends_map, output)
with open('./trends_classifier/tweet_tokenizer.pkl', 'wb') as output:
    pickle.dump(tweets_tokenizer, output)


In [23]:
import numpy as np

def predict(tweet, tweet_tokenizer, trends_map, inv_trends_map, pad_length, model):
    tweet_sequence = tweet_tokenizer.texts_to_sequences([tweet])[0]
    padded_tweet_sequence = pad_sequences([tweet_sequence], maxlen=pad_length, padding='post')
    prediction = (model.predict(padded_tweet_sequence))
    trends_indices = np.argsort(prediction, axis=-1)[0][-3:]
    return [inv_trends_map[trend_index] for trend_index in trends_indices]

In [None]:
tweet = "happy may day."

print(predict(tweet, tweets_tokenizer, trends_map, inv_trends_map, sequence_length, trends_classifier))