# Trends-based recommendation
In this notebook we will classify the tweets into trends and these trends will help us decide which hashtags to recommend 

In [1]:
from files_reader import *
import tensorflow as tf

## Get the data

In [2]:
tweets_and_trends = []
tweets = []
trends = []

tweets_and_trends += (filesReader.read_file(UK_tweets_file))
tweets_and_trends += (filesReader.read_file(US_tweets_file))
tweets_and_trends += (filesReader.read_file(AUS_tweets_file))
tweets_and_trends += (filesReader.read_file(IR_tweets_file))
tweets_and_trends += (filesReader.read_file(CAN_tweets_file))
tweets_and_trends += (filesReader.read_file(new_US_file))
tweets_and_trends += (filesReader.read_file(new_UK_file))
tweets_and_trends += (filesReader.read_file(new_AUS_file))
tweets_and_trends += (filesReader.read_file(new_CAN_file))
tweets_and_trends += (filesReader.read_file(new_IR_file))

random.shuffle(tweets_and_trends)

tweets, trends = filesReader.split_tweets_and_trends(tweets_and_trends)
print(f"We have {len(tweets)} tweets.")


We have 111710 tweets.


## Tokenize the text

In [3]:
from keras.preprocessing.text import Tokenizer

tweets_tokenizer = Tokenizer(oov_token="<OOV>")
tweets_tokenizer.fit_on_texts(tweets)
tweets_word_index = tweets_tokenizer.word_index
tweets_index_word = tweets_tokenizer.index_word

print(f"We have {len(tweets_word_index)} different words")
print(tweets_word_index)

We have 45484 different words


## Create the padded sequences

In [4]:
from keras.utils import pad_sequences
sequence_length = 20

tweets_sequences = tweets_tokenizer.texts_to_sequences(tweets)
tweets_sequences_padded = pad_sequences(tweets_sequences, padding="post", maxlen=sequence_length)

## Map the trends to numbers

In [5]:
trends_map = {}

counter = 0

for trend in trends:
    if not (trend in trends_map):
        trends_map[trend] = counter
        counter += 1

no_of_trends = len(trends_map)
inv_trends_map = {v: k for k, v in trends_map.items()}
print(f"We have {no_of_trends} different trends")
print(trends_map)

We have 340 different trends
{'Rob Holding': 0, 'ArborDay': 1, 'lufc': 2, 'Brodie': 3, 'nrldolphinstitans': 4, 'brightfutures': 5, 'Jaylen Brown': 6, 'Good Wednesday': 7, 'Narinder': 8, 'Sudan': 9, 'Wordle 675 X': 10, 'LEELEI': 11, 'Tucker': 12, 'afldeestigers': 13, 'NFLDraft': 14, 'Keefe': 15, 'Daniel': 16, 'MCIARS': 17, 'WorldPenguinDay': 18, 'popmaster': 19, 'fridaymorning': 20, 'CashAppUK': 21, 'ASongOrMovieForFingers': 22, 'emergencyalert': 23, 'Holl': 24, 'Anzac Day': 25, 'Gnonto': 26, 'CM Punk': 27, 'Chelsea': 28, 'Evan Ferguson': 29, 'FridayFeeling': 30, 'DubNation': 31, 'Sainz': 32, 'WWERaw': 33, 'NRLStormWarriors': 34, 'NEWTOT': 35, 'Spurs': 36, 'WednesdayMotivation': 37, 'Calgary': 38, 'NRLEelsKnights': 39, 'COYG': 40, 'NRLRoostersDragons': 41, 'AzerbaijanGP': 42, 'HonkaiStarRail': 43, 'AMillionLittleThings': 44, 'Gus Johnson': 45, 'The Ditch': 46, 'Buchner': 47, 'NiallCollins': 48, 'Player of the Season': 49, 'AlienDay': 50, 'PoetryDayIRL': 51, 'ImACeleb': 52, 'Marina': 53,

## Create the trends sequences

In [6]:
trends_sequences = [trends_map[trend] for trend in trends]
print(trends_sequences)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 13, 19, 20, 21, 22, 23, 24, 22, 25, 26, 14, 27, 28, 29, 22, 30, 31, 32, 33, 34, 35, 36, 17, 37, 33, 30, 21, 38, 39, 40, 41, 42, 1, 2, 40, 43, 44, 45, 46, 47, 48, 39, 49, 1, 50, 51, 52, 53, 54, 41, 55, 56, 57, 58, 59, 60, 41, 54, 61, 57, 62, 63, 36, 48, 64, 11, 33, 48, 14, 65, 66, 67, 18, 51, 68, 69, 17, 70, 71, 2, 72, 73, 74, 56, 17, 75, 19, 76, 37, 74, 37, 77, 78, 60, 52, 79, 80, 81, 36, 82, 83, 84, 85, 23, 86, 33, 87, 88, 35, 89, 37, 90, 87, 91, 40, 61, 86, 37, 85, 17, 92, 93, 23, 21, 55, 41, 55, 94, 63, 23, 95, 96, 97, 10, 23, 59, 82, 82, 98, 99, 52, 100, 33, 82, 101, 102, 54, 85, 23, 13, 103, 74, 13, 61, 9, 104, 105, 49, 106, 51, 63, 107, 11, 108, 100, 103, 58, 101, 93, 85, 52, 63, 62, 109, 110, 4, 44, 111, 112, 30, 113, 114, 57, 75, 34, 11, 115, 31, 23, 59, 109, 57, 116, 13, 117, 112, 56, 118, 119, 22, 120, 33, 76, 121, 23, 90, 122, 34, 81, 123, 30, 82, 20, 2, 35, 52, 124, 33, 10, 26, 34, 34, 8, 63, 14, 52, 118, 20

## Encode the trends

In [7]:
from keras.utils import to_categorical
import tensorflow as tf

encoded_trends = to_categorical(trends_sequences)
print(encoded_trends.shape)

(111710, 340)


## Prepare the pre-trained embeddings

In [8]:
from Embeddings.embeddings_matrix import get_embeddings_matrix

embeddings_index_path = "./Embeddings/embeddings_index_object.pkl"
embeddings_matrix, hits, misses = get_embeddings_matrix(tweets_word_index, embeddings_index_path)

print(f"Hits: {hits}, Misses: {misses}")

Hits: 34626, Misses: 10858


## Split the data

In [9]:
training_split = 0.8
training_tweets_count = int(0.8 * len(tweets_sequences_padded))

In [10]:
train_data = tweets_sequences_padded[0:training_tweets_count]
train_labels = encoded_trends[0:training_tweets_count]
test_data = tweets_sequences_padded[training_tweets_count:]
test_labels = encoded_trends[training_tweets_count:]

print(f'we have {len(train_data)} tweets for training and {len(test_data)} for testing')

we have 89368 tweets for training and 22342 for testing


## Build the model

In [11]:
#hyperparameters
embedding_dimensions = 300
lstm_units = 128
dropout_value = 0.2
conv_filters = 64
conv_kernel_size = 5
dense_layers = 10000

In [12]:
from keras import initializers

no_of_tweets_words = len(tweets_word_index) + 1

trends_classifier = tf.keras.Sequential([
    tf.keras.layers.Embedding(
        no_of_tweets_words,
        embedding_dimensions,
        input_length=sequence_length,
        embeddings_initializer=initializers.Constant(embeddings_matrix),
        trainable=True
    ),
    # tf.keras.layers.Conv1D(conv_filters, conv_kernel_size),
    # tf.keras.layers.AveragePooling1D(),
    # tf.keras.layers.Dropout(dropout_value),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(lstm_units)),
    tf.keras.layers.Dropout(dropout_value),
    tf.keras.layers.Dense(no_of_trends, activation='softmax')
])

trends_classifier.compile(
    loss="categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"]
)

trends_classifier.summary()

# trends_classifier = tf.keras.models.load_model("trends_classifier")

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 20, 300)           13645500  
                                                                 
 bidirectional (Bidirectiona  (None, 256)              439296    
 l)                                                              
                                                                 
 dropout (Dropout)           (None, 256)               0         
                                                                 
 dense (Dense)               (None, 340)               87380     
                                                                 
Total params: 14,172,176
Trainable params: 14,172,176
Non-trainable params: 0
_________________________________________________________________


In [13]:
epochs = 6
trends_classifier.fit(train_data, train_labels, epochs=epochs, validation_data=(test_data, test_labels))

Epoch 1/6

In [None]:
# trends_classifier.save("trends_classifier")



INFO:tensorflow:Assets written to: trends_classifier\assets


INFO:tensorflow:Assets written to: trends_classifier\assets


KeyboardInterrupt: 

In [None]:
import numpy as np

def predict(tweet, tweet_tokenizer, trends_map, inv_trends_map, pad_length, model):
    tweet_sequence = tweet_tokenizer.texts_to_sequences([tweet])[0]
    padded_tweet_sequence = pad_sequences([tweet_sequence], maxlen=pad_length, padding='post')
    prediction = (model.predict(padded_tweet_sequence))
    trends_indices = np.argsort(prediction, axis=-1)[0][-3:]
    return [inv_trends_map[trend_index] for trend_index in trends_indices]

In [None]:
tweet = "i am starting to consider using bing."

print(predict(tweet, tweets_tokenizer, trends_map, inv_trends_map, sequence_length, trends_classifier))

['SNTA', 'BankHolidayWeekend', 'FridayFeeling']
