## Get the data
csv headings: id, created_at, source, original_text, clean_text, favorite_count, retweet_count, hashtags, trend <br>
hashtags format: a single string of comma-separated hashtags, e.g. `hashtag1, hashtag2`

In [54]:
# File paths (both CSV files live in the same data directory)
DATA_DIR = './Data'
US_tweets_file = f'{DATA_DIR}/USTweets.csv'
UK_tweets_file = f'{DATA_DIR}/UKTweets.csv'

In [55]:
import csv

# Parallel accumulators, one entry per tweet (filled by the loading cells):
#   tweets           - the text taken from each CSV row
#   hashtags         - list of lists of hashtags, e.g. hashtags[0] = ["hashtag1", "hashtag2"]
#   hashtags_strings - raw hashtag strings,       e.g. hashtags_strings[0] = "hashtag1, hashtag2"
tweets, hashtags, hashtags_strings = [], [], []


In [56]:
# Load the UK tweets: column 4 holds the cleaned tweet text and column 7 the
# comma-separated hashtags (see the CSV headings listed above).
# newline='' is what the csv module expects so that quoted fields containing
# line breaks are parsed correctly.
with open(UK_tweets_file, newline='') as data_file:
    data = csv.reader(data_file)
    next(data, None)  # Skip the headings (tolerates a completely empty file)
    for row in data:
        tweets.append(row[4])
        # Split on the bare comma and strip each tag so that both "a, b" and
        # "a,b" work; empty entries are dropped so a tweet without hashtags
        # gets [] instead of [""] (which would later be encoded as <OOV>).
        hashtags.append([tag.strip() for tag in row[7].split(",") if tag.strip()])
        hashtags_strings.append(row[7])

In [57]:
# Disabled: uncomment to also append the US tweets to the same `tweets`,
# `hashtags` and `hashtags_strings` lists that the UK cell above fills.
# with open(US_tweets_file) as data_file:
#     data = csv.reader(data_file)
#     next(data) #To skip the headings
#     for row in data:
#         tweets.append(row[4])
#         hashtags.append(row[7].split(", "))
#         hashtags_strings.append(row[7])

## Initialize the tokenizers
A dedicated tokenizer is used for the hashtags because all of the hashtags need to be encoded. It does not matter whether the encoding of the tweets matches the encoding of the hashtags.

In [58]:
from keras.preprocessing.text import Tokenizer

# Vocabulary for the tweet texts; "<OOV>" is the placeholder token for words
# that are not part of the fitted vocabulary.
tweets_tokenizer = Tokenizer(oov_token="<OOV>")
tweets_tokenizer.fit_on_texts(tweets)

# Keep both lookup directions at hand: word -> id and id -> word.
tweets_word_index, tweets_index_word = (
    tweets_tokenizer.word_index,
    tweets_tokenizer.index_word,
)

In [59]:
# Vocabulary for the hashtags.
# Only the separator characters are filtered out, so every hashtag stays a
# single token. The default filters would also break hashtags apart on
# characters such as "_" or "-", while the per-tweet hashtag lists passed to
# texts_to_sequences later are looked up as whole hashtags; such hashtags
# would then never be found and would silently be encoded as <OOV>.
hashtags_tokenizer = Tokenizer(oov_token="<OOV>", filters=",\t\n")
hashtags_tokenizer.fit_on_texts(hashtags_strings)
hashtags_word_index = hashtags_tokenizer.word_index  # hashtag -> id
hashtags_index_word = hashtags_tokenizer.index_word  # id -> hashtag

In [60]:
# Summary of the loaded data and of both vocabularies.
summary_lines = [
    f'There are {len(tweets)} tweets, ',
    f'the tweets contain {len(tweets_tokenizer.word_index)} different words.',
    f'There are {len(hashtags_tokenizer.word_index)} different hashtags',
    'Here are the tokenized hashtags',
]
for summary_line in summary_lines:
    print(summary_line)

# Full hashtag -> id mapping.
print(hashtags_word_index)

There are 2154 tweets, 
the tweets contain 4316 different words.
There are 726 different hashtags
Here are the tokenized hashtags
{'<OOV>': 1, 'facup': 2, 'bhamun': 3, 'nufc': 4, 'mufc': 5, 'newtot': 6, 'emergencyalert': 7, 'thfc': 8, 'spurs': 9, 'coys': 10, 'emiratesfacup': 11, 'bhafc': 12, 'facupsemifinal': 13, 'premierleague': 14, 'tottenham': 15, 'levyout': 16, 'aberdeen': 17, 'rangersfc': 18, 'emergencyalerts': 19, 'newcastle': 20, 'manutd': 21, 'barçaatleti': 22, 'milanlecce': 23, 'sempremilan': 24, 'munbha': 25, 'brighton': 26, 'fpl': 27, 'manchesterunited': 28, 'newcastleunited': 29, 'brimun': 30, 'bbcfootball': 31, 'stgeorgesday': 32, 'davisgarcia': 33, 'bitcoin': 34, 'toon': 35, 'iphone': 36, 'smarty': 37, 'earthday': 38, 'ukraine': 39, 'hwtl': 40, 'enicout': 41, 'nufcfans': 42, 'rangers': 43, 'rfc': 44, 'aberan': 45, 'pl': 46, 'bhamnu': 47, 'amazon': 48, 'ucl': 49, 'kane': 50, 'howaythelads': 51, 'toonarmy': 52, 'ggmu': 53, 'russia': 54, 'whufc': 55, 'glazersout': 56, 'timel

## Create the sequences and pad them and one hot encode the hashtags
A binary (multi-hot) vector is used to encode the hashtags so that the model can categorize the tweets. E.g. if hashtags[0] = [tag1, tag2], tag1 has encoding 1 and tag2 has encoding 2, then the binary vector will be [0 1 1 0 0 ...], with a length of no_of_different_hashtags.

In [61]:
from keras.utils import pad_sequences

# Every tweet is fed to the model as exactly this many token ids.
sequence_length = 10

# `hashtags` holds one list of hashtags per tweet; convert each to its ids.
hashtags_sequences = hashtags_tokenizer.texts_to_sequences(hashtags)

# Tweets: text -> token ids -> fixed-length rows (padding goes at the end).
tweets_sequences = tweets_tokenizer.texts_to_sequences(tweets)
tweets_sequences_padded = pad_sequences(
    tweets_sequences,
    maxlen=sequence_length,
    padding="post",
)

In [62]:
from keras.utils import to_categorical
import tensorflow as tf
import numpy as np

# Build the multi-hot label matrix: one row per tweet, one column per hashtag id.
# The extra (+1) column accounts for id 0, which is not assigned to any entry
# of the hashtag word index.
no_of_hashtags = len(hashtags_sequences)
no_of_different_hashtags = len(hashtags_word_index) + 1

encoded_hashtags = np.zeros((no_of_hashtags, no_of_different_hashtags))

# Switch on the columns of every hashtag id that belongs to the tweet.
for row_number, tag_ids in enumerate(hashtags_sequences):
    encoded_hashtags[row_number, tag_ids] = 1


## Build the model

In [76]:
# Hyperparameters
embedding_dimensions = 512  # length of each word embedding vector
dropout_value = 0.5         # rate used by the Dropout layers
lstm_units = 100            # units of the LSTM wrapped in Bidirectional

# Settings for the Conv1D layer (that layer is commented out in the model cell).
conv_kernel_size = 5
conv_filters = 64

In [77]:
# Size of the embedding vocabulary: one slot per known word plus one, because
# id 0 is used for padding and is not part of the word index.
no_of_tweets_words = len(tweets_word_index) + 1

# Embedding -> two stacked bidirectional LSTMs (with dropout in between) ->
# one output unit per hashtag id.
hashtag_recommender_model = tf.keras.Sequential([
    tf.keras.layers.Embedding(no_of_tweets_words, embedding_dimensions, input_length=sequence_length),
    # tf.keras.layers.Conv1D(conv_filters, conv_kernel_size, activation='relu'),
    tf.keras.layers.Dropout(dropout_value),
    # Both LSTM layers take their size from `lstm_units`, so changing the
    # hyperparameter affects the whole stack (the first one used to be a
    # hard-coded 100).
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(lstm_units, return_sequences=True)),
    tf.keras.layers.Dropout(dropout_value),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(lstm_units)),
    tf.keras.layers.Dropout(dropout_value),
    tf.keras.layers.Dense(no_of_different_hashtags, activation='softmax')
])

# NOTE(review): the training targets are multi-hot rows (a tweet can carry
# several hashtags), while softmax + categorical_crossentropy models a single
# distribution over hashtags. That is workable for a top-1 recommendation, but
# sigmoid + binary_crossentropy is the usual multi-label setup - confirm intent.
hashtag_recommender_model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics = ['accuracy']
)

hashtag_recommender_model.summary()

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_7 (Embedding)     (None, 10, 512)           2210304   
                                                                 
 conv1d (Conv1D)             (None, 6, 64)             163904    
                                                                 
 dropout_14 (Dropout)        (None, 6, 64)             0         
                                                                 
 bidirectional_9 (Bidirectio  (None, 6, 200)           132000    
 nal)                                                            
                                                                 
 dropout_15 (Dropout)        (None, 6, 200)            0         
                                                                 
 bidirectional_10 (Bidirecti  (None, 200)              240800    
 onal)                                                

## Train the model

In [78]:
# Number of full passes over the training data.
epochs = 8

# Fit on the padded tweet sequences (inputs) against the multi-hot hashtag
# vectors (targets); the returned History object is the cell's output.
hashtag_recommender_model.fit(
    tweets_sequences_padded,
    encoded_hashtags,
    epochs=epochs,
)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x231e0faece0>

## Get hashtags!!

In [48]:
def predict(tweet, tweet_tokenizer, hashtag_tokenizer, pad_length, model):
    """Return the single most likely hashtag for a tweet.

    Args:
        tweet: The tweet text (a string).
        tweet_tokenizer: Fitted tokenizer used to turn the tweet into token ids.
        hashtag_tokenizer: Fitted tokenizer whose ``index_word`` maps the
            model's output positions back to hashtags.
        pad_length: Length the token sequence is padded/truncated to; should be
            the same length the model was trained with.
        model: Trained model producing one score per hashtag id.

    Returns:
        The hashtag (string) with the highest score.
    """
    tweet_sequence = tweet_tokenizer.texts_to_sequences([tweet])[0]
    padded_tweet_sequence = pad_sequences([tweet_sequence], maxlen=pad_length, padding='post')

    # Work on a float copy of the scores of the single tweet in the batch.
    scores = np.array(model.predict(padded_tweet_sequence)[0], dtype=float)

    # Position 0 is not a hashtag (it has no entry in index_word), so picking
    # it would raise a KeyError below; never let it win the argmax.
    scores[0] = -np.inf

    # The out-of-vocabulary placeholder is not a useful recommendation either.
    oov_index = hashtag_tokenizer.word_index.get(hashtag_tokenizer.oov_token)
    if oov_index is not None and oov_index < len(scores):
        scores[oov_index] = -np.inf

    hashtag_index = int(np.argmax(scores))
    return hashtag_tokenizer.index_word[hashtag_index]


In [74]:
# Use the same sequence length as in training instead of a hard-coded 10, so
# the example keeps working when `sequence_length` is changed.
print(predict("oh my god", tweets_tokenizer, hashtags_tokenizer, sequence_length, hashtag_recommender_model))

levyout
