# Trends-based recommendation
In this notebook we will classify the tweets into trends and these trends will help us decide which hashtags to recommend 

In [1]:
from files_reader import *
import tensorflow as tf

## Get the data

In [2]:
tweets_and_trends = []
tweets = []
trends = []

tweets_and_trends += (filesReader.read_file(UK_tweets_file))
# tweets_and_trends += (filesReader.read_file(US_tweets_file))
# tweets_and_trends += (filesReader.read_file(AUS_tweets_file))
# tweets_and_trends += (filesReader.read_file(IR_tweets_file))
# tweets_and_trends += (filesReader.read_file(CAN_tweets_file))

random.shuffle(tweets_and_trends)

tweets, trends = filesReader.split_tweets_and_trends(tweets_and_trends)
print(f"We have {len(tweets)} tweets.")


We have 30773 tweets.


## Tokenize the text

In [3]:
from keras.preprocessing.text import Tokenizer

tweets_tokenizer = Tokenizer(oov_token="<OOV>")
tweets_tokenizer.fit_on_texts(tweets)
tweets_word_index = tweets_tokenizer.word_index
tweets_index_word = tweets_tokenizer.index_word

print(f"We have {len(tweets_word_index)} different word")
print(tweets_word_index)

We have 22970 different word


## Create the padded sequences

In [4]:
from keras.utils import pad_sequences
sequence_length = 20

tweets_sequences = tweets_tokenizer.texts_to_sequences(tweets)
tweets_sequences_padded = pad_sequences(tweets_sequences, padding="post", maxlen=sequence_length)

## Map the trends to numbers

In [5]:
trends_map = {}

counter = 0

for trend in trends:
    if not (trend in trends_map):
        trends_map[trend] = counter
        counter += 1

no_of_trends = len(trends_map)
inv_trends_map = {v: k for k, v in trends_map.items()}
print(f"We have {no_of_trends} different trends")
print(trends_map)

We have 125 different trends
{'NEWTOT': 0, 'emergencyalert': 1, 'PAKvNZ': 2, 'Ryan Mason': 3, 'Player of the Season': 4, 'Voter Authority Certificate': 5, 'McKennie': 6, 'mastermind': 7, 'NationalLingerieDay': 8, 'Malpractice': 9, 'CoronationConcert': 10, 'ImACeleb': 11, 'ASongOrMovieForFingers': 12, 'The CMA': 13, 'AncestryHour': 14, 'PMQs': 15, 'McGoldrick': 16, 'itfc': 17, 'popmaster': 18, 'Snapchat AI': 19, 'St George': 20, 'LEELEI': 21, 'Roman': 22, 'lufc': 23, 'Cloud': 24, 'MCIARS': 25, 'NationalCancerCNSDay': 26, 'BBUK': 27, 'Murray Gold': 28, 'Kings': 29, 'Ramsdale': 30, 'UniversityChallenge': 31, 'AlienDay': 32, 'Gnonto': 33, 'Warriors': 34, 'COYG': 35, 'Tucker Carlson': 36, 'Barnes': 37, 'Showman': 38, 'Flop of the Season': 39, 'Vince': 40, 'Spurs': 41, 'forgotten80s': 42, 'Ed Balls': 43, 'Paul Burrell': 44, 'Windass': 45, 'Brighton': 46, 'aurora': 47, 'Gallipoli': 48, 'Daniel': 49, 'tuesdayvibe': 50, 'Chacun Pour Soi': 51, 'AEWDynamite': 52, 'CHEBRE': 53, 'Sony': 54, 'Micros

## Create the trends sequences

In [6]:
trends_sequences = [trends_map[trend] for trend in trends]
print(trends_sequences)

[0, 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 0, 0, 1, 1, 10, 11, 12, 12, 13, 2, 11, 11, 14, 0, 1, 15, 16, 11, 9, 2, 9, 1, 11, 2, 15, 17, 18, 1, 0, 19, 20, 18, 21, 22, 11, 11, 23, 24, 25, 15, 26, 14, 20, 27, 17, 1, 11, 1, 11, 2, 28, 29, 30, 31, 1, 21, 1, 18, 32, 0, 21, 14, 33, 34, 35, 14, 36, 11, 18, 23, 36, 37, 38, 39, 40, 0, 1, 41, 2, 33, 25, 17, 2, 25, 11, 12, 42, 42, 43, 31, 7, 14, 44, 35, 0, 31, 45, 11, 46, 18, 47, 11, 11, 7, 48, 35, 32, 49, 26, 11, 2, 23, 17, 20, 17, 32, 14, 21, 25, 31, 50, 2, 18, 12, 35, 50, 11, 42, 24, 7, 42, 25, 0, 51, 23, 44, 52, 23, 11, 2, 41, 12, 53, 0, 11, 54, 1, 49, 10, 7, 38, 49, 27, 55, 9, 35, 56, 1, 11, 52, 57, 58, 48, 25, 42, 59, 47, 35, 1, 23, 60, 2, 61, 33, 18, 62, 52, 63, 10, 61, 2, 0, 18, 10, 64, 9, 65, 9, 17, 1, 7, 26, 11, 7, 3, 66, 9, 64, 11, 35, 11, 49, 64, 11, 2, 57, 64, 35, 67, 7, 23, 9, 68, 50, 52, 52, 2, 12, 69, 32, 15, 70, 2, 2, 38, 67, 41, 32, 71, 11, 42, 2, 17, 25, 57, 3, 35, 2, 10, 72, 5, 42, 12, 11, 2, 32, 73, 12, 17, 8, 48, 18, 52, 3, 18, 11, 

## Encode the trends

In [7]:
from keras.utils import to_categorical
import tensorflow as tf

encoded_trends = to_categorical(trends_sequences)
print(encoded_trends.shape)

(30773, 125)


## Prepare the pre-trained embeddings

In [8]:
from Embeddings.embeddings_matrix import get_embeddings_matrix

embeddings_index_path = "./Embeddings/embeddings_index_object.pkl"
embeddings_matrix, hits, misses = get_embeddings_matrix(tweets_word_index, embeddings_index_path)

print(f"Hits: {hits}, Misses: {misses}")

Hits: 19817, Misses: 3153


## Split the data

In [9]:
training_split = 0.8
training_tweets_count = int(0.8 * len(tweets_sequences_padded))

In [10]:
train_data = tweets_sequences_padded[0:training_tweets_count]
train_labels = encoded_trends[0:training_tweets_count]
test_data = tweets_sequences_padded[training_tweets_count:]
test_labels = encoded_trends[training_tweets_count:]

print(f'we have {len(train_data)} tweets for training and {len(test_data)} for testing')

we have 24618 tweets for training and 6155 for testing


## Build the model

In [17]:
#hyperparameters
embedding_dimensions = 300
lstm_units = 128
dropout_value = 0.2
conv_filters = 64
conv_kernel_size = 5
dense_layers = 10000

In [18]:
from keras import initializers

no_of_tweets_words = len(tweets_word_index) + 1

hashtag_recommender = tf.keras.Sequential([
    tf.keras.layers.Embedding(
        no_of_tweets_words,
        embedding_dimensions,
        input_length=sequence_length,
        embeddings_initializer=initializers.Constant(embeddings_matrix),
        trainable=True
    ),
    # tf.keras.layers.Dropout(dropout_value),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(lstm_units)),
    tf.keras.layers.Dropout(dropout_value),
    tf.keras.layers.Dense(no_of_trends, activation='softmax')
])

hashtag_recommender.compile(
    loss="categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"]
)

hashtag_recommender.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 20, 64)            1470144   
                                                                 
 bidirectional_2 (Bidirectio  (None, 128)              66048     
 nal)                                                            
                                                                 
 dropout_2 (Dropout)         (None, 128)               0         
                                                                 
 dense_2 (Dense)             (None, 125)               16125     
                                                                 
Total params: 1,552,317
Trainable params: 1,552,317
Non-trainable params: 0
_________________________________________________________________


In [19]:
epochs = 20
hashtag_recommender.fit(train_data, train_labels, epochs=epochs, validation_data=(test_data, test_labels))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20

KeyboardInterrupt: 

In [None]:
import numpy as np

def predict(tweet, tweet_tokenizer, trends_map, inv_trends_map, pad_length, model):
    tweet_sequence = tweet_tokenizer.texts_to_sequences([tweet])[0]
    padded_tweet_sequence = pad_sequences([tweet_sequence], maxlen=pad_length, padding='post')
    prediction = (model.predict(padded_tweet_sequence))
    trends_indices = np.argsort(prediction, axis=-1)[0][-3:]
    return [inv_trends_map[trend_index] for trend_index in trends_indices]

In [None]:
tweet = "this is so sad what happend to arsenal."

print(predict(tweet, tweets_tokenizer, trends_map, inv_trends_map, sequence_length, hashtag_recommender))

['MCIARS', 'COYG', 'PAKvNZ']
