# Trends-based recommendation
In this notebook we will classify the tweets into trends and these trends will help us decide which hashtags to recommend 

In [1]:
from files_reader import *
import tensorflow as tf

## Get the data

In [2]:
tweets_and_trends = []
tweets = []
trends = []

# tweets_and_trends += (FilesReader.read_file(UK_tweets_file))
# tweets_and_trends += (FilesReader.read_file(US_tweets_file))
# tweets_and_trends += (FilesReader.read_file(AUS_tweets_file))
# tweets_and_trends += (FilesReader.read_file(IR_tweets_file))
# tweets_and_trends += (FilesReader.read_file(CAN_tweets_file))
tweets_and_trends += (FilesReader.read_file(new_US_file))
tweets_and_trends += (FilesReader.read_file(new_UK_file))
tweets_and_trends += (FilesReader.read_file(new_AUS_file))
tweets_and_trends += (FilesReader.read_file(new_CAN_file))
# tweets_and_trends += (FilesReader.read_file(new_IR_file))   
# tweets_and_trends += (FilesReader.read_file(new_SINGA_file))   
tweets_and_trends += (FilesReader.read_file(new_SA_file))

random.shuffle(tweets_and_trends)

tweets, trends = FilesReader.split_tweets_and_trends(tweets_and_trends)
print(f"We have {len(tweets)} tweets.")


We have 31685 tweets.


## Tokenize the text

In [3]:
from keras.preprocessing.text import Tokenizer

tweets_tokenizer = Tokenizer(oov_token="<OOV>")
tweets_tokenizer.fit_on_texts(tweets)
tweets_word_index = tweets_tokenizer.word_index
tweets_index_word = tweets_tokenizer.index_word

print(f"We have {len(tweets_word_index)} different words")
print(tweets_word_index)

We have 23242 different words


## Create the padded sequences

In [4]:
from keras.utils import pad_sequences
sequence_length = 20

tweets_sequences = tweets_tokenizer.texts_to_sequences(tweets)
tweets_sequences_padded = pad_sequences(tweets_sequences, padding="post", maxlen=sequence_length)

## Map the trends to numbers

In [5]:
trends_map = {}

counter = 0

for trend in trends:
    if not (trend in trends_map):
        trends_map[trend] = counter
        counter += 1

no_of_trends = len(trends_map)
inv_trends_map = {v: k for k, v in trends_map.items()}
print(f"We have {no_of_trends} different trends")
print(trends_map)

We have 57 different trends
{'Flames': 0, 'May Day': 1, 'New Month': 2, 'Panthers': 3, 'MondayMotivation': 4, 'ReaTsotellaMonday': 5, 'rufc': 6, 'FLAvsBOS': 7, 'MayDay': 8, 'WorkersDay': 9, 'Luca Brecel': 10, 'AccommodationsInFilmsAndSongs': 11, 'wunvsyd': 12, 'MondayMood': 13, 'Chevron': 14, 'aflcrowspies': 15, 'MentalHealthWeek': 16, 'Scotland': 17, 'Happy Beltane': 18, '1802LoveDefiesTime': 19, 'Labour Day': 20, 'Bruins': 21, 'BankHolidayMonday': 22, 'MetGala': 23, 'MentalHealthAwarenessMonth': 24, 'drokitowedi': 25, 'AAPIHeritageMonth': 26, 'konkhekuhambakahleClothing': 27, 'Swallows': 28, 'Kaizer Chiefs': 29, 'NationalPhysiciansDay': 30, 'Truro': 31, 'Prime': 32, 'Bergeron': 33, 'Steph': 34, 'Mitch Love': 35, 'Tony Abbott': 36, 'nrldragonsbulldogs': 37, 'Portugal': 38, 'Andrew Dillon': 39, 'Sutter': 40, 'Vardy': 41, 'Keith Stroud': 42, 'MasterChef': 43, '4corners': 44, 'Latham': 45, 'Mofaya': 46, 'Khune': 47, 'Snoop': 48, 'Jock Zonfrillo': 49, 'Luton': 50, 'JP Morgan': 51, 'Joanna

## Create the trends sequences

In [6]:
trends_sequences = [trends_map[trend] for trend in trends]
print(trends_sequences)

[0, 0, 1, 2, 3, 4, 5, 5, 6, 4, 7, 6, 8, 8, 8, 8, 9, 8, 10, 11, 12, 13, 14, 15, 15, 16, 13, 17, 8, 15, 18, 6, 19, 20, 21, 21, 10, 22, 6, 23, 16, 15, 7, 24, 8, 25, 15, 19, 26, 8, 15, 26, 27, 28, 16, 5, 10, 8, 24, 15, 15, 5, 9, 10, 6, 23, 18, 29, 0, 7, 30, 24, 19, 7, 31, 0, 23, 9, 26, 23, 25, 19, 7, 19, 24, 7, 7, 8, 9, 6, 23, 11, 13, 19, 4, 19, 8, 11, 0, 0, 32, 22, 15, 12, 10, 33, 5, 4, 7, 19, 34, 35, 8, 5, 13, 2, 29, 23, 36, 4, 22, 9, 22, 23, 6, 8, 37, 5, 37, 2, 7, 13, 8, 6, 13, 23, 6, 19, 5, 23, 26, 38, 37, 7, 6, 39, 15, 26, 13, 8, 23, 6, 19, 8, 19, 22, 21, 19, 40, 15, 20, 13, 15, 8, 6, 8, 7, 2, 41, 37, 22, 7, 19, 8, 24, 0, 26, 40, 9, 7, 13, 6, 8, 23, 35, 6, 42, 8, 19, 24, 23, 23, 43, 23, 16, 24, 7, 4, 15, 26, 38, 8, 7, 17, 13, 5, 12, 7, 7, 19, 28, 15, 15, 4, 8, 6, 25, 7, 24, 37, 1, 24, 9, 7, 24, 24, 11, 4, 6, 7, 43, 9, 8, 44, 0, 45, 8, 19, 24, 1, 26, 4, 5, 25, 19, 39, 6, 38, 5, 25, 19, 37, 8, 15, 37, 6, 23, 23, 19, 15, 4, 23, 19, 19, 15, 8, 0, 28, 45, 8, 37, 35, 30, 8, 9, 24, 16, 21, 8

## Encode the trends

In [7]:
from keras.utils import to_categorical
import tensorflow as tf

encoded_trends = to_categorical(trends_sequences)
print(encoded_trends.shape)

(31685, 57)


## Prepare the pre-trained embeddings

In [8]:
from Embeddings.embeddings_matrix import get_embeddings_matrix

embeddings_index_path = "./Embeddings/embeddings_index_object.pkl"
embeddings_matrix, hits, misses = get_embeddings_matrix(tweets_word_index, embeddings_index_path)

print(f"Hits: {hits}, Misses: {misses}")

Hits: 19282, Misses: 3960


## Split the data

In [9]:
training_split = 0.8
training_tweets_count = int(0.8 * len(tweets_sequences_padded))

In [10]:
train_data = tweets_sequences_padded[0:training_tweets_count]
train_labels = encoded_trends[0:training_tweets_count]
test_data = tweets_sequences_padded[training_tweets_count:]
test_labels = encoded_trends[training_tweets_count:]

print(f'we have {len(train_data)} tweets for training and {len(test_data)} for testing')

we have 25348 tweets for training and 6337 for testing


## Tune the hyper-parameters

In [11]:
# import keras_tuner as kt
# import keras


# def model_builder(hp):
#     model = keras.Sequential()

#     hp_conv_filters = hp.Int('conv_filters', min_value=32, max_value=256, step=16)
#     hp_kernel_size = hp.Int('conv_kernel_size', min_value=2, max_value=5, step=1)
#     hp_dropout = hp.Float('dropout', min_value=0.0, max_value=0.6, step=0.1)
#     hp_lstm = hp.Int('lstm_units', min_value=32, max_value=256, step=16)

#     model.add(keras.layers.Embedding(
#         no_of_tweets_words,
#         embedding_dimensions,
#         input_length=sequence_length,
#         embeddings_initializer=initializers.Constant(embeddings_matrix),
#         trainable=True
#     ))
#     model.add(keras.layers.Conv1D(hp_conv_filters, hp_kernel_size, padding='same'))
#     model.add(keras.layers.Dropout(hp_dropout))
#     model.add(keras.layers.Bidirectional(keras.layers.LSTM(hp_lstm)))
#     # model.add(keras.layers.AveragePooling1D())
#     model.add(keras.layers.Dropout(hp_dropout))
#     model.add(keras.layers.Dense(no_of_trends))

#     # Tune the learning rate for the optimizer
#     # Choose an optimal value from 0.01, 0.001, or 0.0001
#     hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])

#     model.compile(optimizer=keras.optimizers.Adam(learning_rate=hp_learning_rate),
#                   loss=keras.losses.CategoricalCrossentropy(
#                       from_logits=True),
#                   metrics=['accuracy'])

#     return model


In [12]:
# tuner = kt.Hyperband(model_builder,
#                      objective='val_accuracy',
#                      max_epochs=10,
#                      factor=3)

In [13]:
# tuner.search(train_data, train_labels, epochs=20, validation_data=(test_data, test_labels))

In [14]:
# best_hps=tuner.get_best_hyperparameters(num_trials=1)[0]

# print(f"""
# The hyperparameter search is complete. Here are the optimal configurations:
#     conv_filters: {best_hps.get('conv_filters')}
#     conv_kernel_size: {best_hps.get('conv_kernel_size')}
#     lstm_units: {best_hps.get('lstm_units')}
#     dropout: {best_hps.get('dropout')}
#     learning_rate: {best_hps.get('learning_rate')}
# """)

In [15]:
# trends_classifier = tuner.hypermodel.build(best_hps)
# history = trends_classifier.fit(train_data, train_labels, epochs=20, validation_data=(test_data, test_labels))

# val_acc_per_epoch = history.history['val_accuracy']
# best_epoch = val_acc_per_epoch.index(max(val_acc_per_epoch)) + 1
# print('Best epoch: %d' % (best_epoch,))

In [16]:
# trends_classifier = tuner.hypermodel.build(best_hps)
# trends_classifier.fit(train_data, train_labels, epochs=best_epoch, validation_data=(test_data, test_labels))

## Build the model
After tuning the hyper-parameters, here are the optimal configurations: <br>
    conv_filters: 112<br>
    conv_kernel_size: 3<br>
    lstm_units: 256<br>
    dropout: 0.0<br>
    learning_rate: 0.001<br>

In [17]:
#hyperparameters
embedding_dimensions = 300
lstm_units = 256
dropout_value = 0.0
conv_filters = 112
conv_kernel_size = 2

In [18]:
from keras import initializers
import tensorflow as tf

no_of_tweets_words = len(tweets_word_index) + 1

trends_classifier = tf.keras.Sequential([
    tf.keras.layers.Embedding(
        no_of_tweets_words,
        embedding_dimensions,
        input_length=sequence_length,
        embeddings_initializer=initializers.Constant(embeddings_matrix),
        trainable=True
    ),
    tf.keras.layers.Conv1D(conv_filters, conv_kernel_size),
    tf.keras.layers.AveragePooling1D(),
    tf.keras.layers.Dropout(dropout_value),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(lstm_units)),
    tf.keras.layers.Dropout(dropout_value),
    # tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(no_of_trends, activation='softmax')
])

trends_classifier.compile(
    loss="categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"]
)

trends_classifier.summary()

# trends_classifier = tf.keras.models.load_model("trends_classifier")

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 20, 300)           6972900   
                                                                 
 conv1d (Conv1D)             (None, 19, 112)           67312     
                                                                 
 average_pooling1d (AverageP  (None, 9, 112)           0         
 ooling1D)                                                       
                                                                 
 dropout (Dropout)           (None, 9, 112)            0         
                                                                 
 bidirectional (Bidirectiona  (None, 512)              755712    
 l)                                                              
                                                                 
 dropout_1 (Dropout)         (None, 512)               0

In [19]:
epochs = 4
trends_classifier.fit(train_data, train_labels, epochs=epochs, validation_data=(test_data, test_labels))

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x165865c3970>

## Save the model

In [25]:
import pickle

trends_classifier.save("./trends_classifier/trends_classifier_model.h5")
with open('./trends_classifier/inv_trends_map.pkl', 'wb') as output:
    pickle.dump(inv_trends_map, output)
with open('./trends_classifier/tweet_tokenizer.pkl', 'wb') as output:
    pickle.dump(tweets_tokenizer, output)


In [21]:
import numpy as np

def predict(tweet, tweet_tokenizer, trends_map, inv_trends_map, pad_length, model):
    tweet_sequence = tweet_tokenizer.texts_to_sequences([tweet])[0]
    padded_tweet_sequence = pad_sequences([tweet_sequence], maxlen=pad_length, padding='post')
    prediction = (model.predict(padded_tweet_sequence))
    trends_indices = np.argsort(prediction, axis=-1)[0][-3:]
    return [inv_trends_map[trend_index] for trend_index in trends_indices]

In [22]:
tweet = "who want to join me against the government ."

print(predict(tweet, tweets_tokenizer, trends_map, inv_trends_map, sequence_length, trends_classifier))

['WorkersDay', 'MondayMood', 'MayDay']
