<center>
    
# Hashtag Relevancy
    
</center>

#### Importing required libraries

In [1]:
# import libraries
import pandas as pd
import numpy as np
import re
from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity


In [2]:
# Read the preprocessed tweets workbook into the working DataFrame
PREPROCESSED_TWEETS_FILE = "preprocessed_tweets.xlsx"
data = pd.read_excel(PREPROCESSED_TWEETS_FILE)

In [3]:
# Pretrained checkpoint name shared by the encoder and its tokenizer
model_name = 'bert-base-uncased'
# Encoder that produces the hidden states used for the embeddings
model = BertModel.from_pretrained(model_name)
# Matching WordPiece tokenizer for the same checkpoint
tokenizer = BertTokenizer.from_pretrained(model_name)

In [4]:
# BERT embedding for text
def get_bert_embeddings(text):
    """Embed text with BERT by mean-pooling the last hidden layer.

    Args:
        text: A string, or a list of strings to embed together as one batch.
            Inputs are truncated to at most 128 tokens.

    Returns:
        A torch tensor with one pooled vector per input text
        (a single row when ``text`` is a single string).
    """
    tokens_text = tokenizer(text, padding=True, truncation=True, return_tensors='pt', max_length=128)
    with torch.no_grad():
        output = model(**tokens_text)
    hidden_states = output.last_hidden_state

    # Average over real tokens only. With a single string there is no padding,
    # so this equals a plain mean; with a batch, padded positions would
    # otherwise drag the shorter texts' averages towards the padding vectors.
    mask = tokens_text['attention_mask'].unsqueeze(-1).to(hidden_states.dtype)
    token_counts = mask.sum(dim=1).clamp(min=1)  # guard against division by zero
    embeddings = (hidden_states * mask).sum(dim=1) / token_counts
    return embeddings

def get_hashtag_embeddings(hashtags):
    """Embed a set of hashtags as a single BERT vector.

    Args:
        hashtags: Either a list of hashtag strings, which are joined with
            single spaces, or any other value, which is converted with ``str``.

    Returns:
        Whatever ``get_bert_embeddings`` returns for the combined hashtag
        string, so text and hashtag vectors are always produced the same way.
    """
    if isinstance(hashtags, list):
        combined_hashtags = ' '.join(hashtags)
    else:
        combined_hashtags = str(hashtags)

    # Delegate to the shared embedding routine instead of repeating the
    # tokenise / forward / pool steps here.
    return get_bert_embeddings(combined_hashtags)

def extract_hashtags(text):
    """Return every ``#word`` token found in ``text``, in order of appearance.

    Each returned item keeps its leading ``#``; an empty list is returned
    when the text contains no hashtags.
    """
    hashtag_matches = re.finditer(r'#\w+', text)
    return [match.group(0) for match in hashtag_matches]

In [5]:
# Embed every post's text and its hashtags, one row at a time
text_column = data['Text']
hashtag_column = data['Hashtags']
data['text_bert_embeddings'] = text_column.apply(get_bert_embeddings)
data['hashtags_bert_embeddings'] = hashtag_column.apply(get_hashtag_embeddings)

In [6]:
# Cosine similarity between two 1-D vectors
def cosine_similarity_vectors(a, b):
    """Return the cosine similarity of vectors ``a`` and ``b`` as a scalar.

    Each vector is wrapped as a one-row matrix for ``cosine_similarity``,
    and the single entry of the resulting matrix is returned.
    """
    similarity_matrix = cosine_similarity([a], [b])
    first_row = similarity_matrix[0]
    return first_row[0]

# Relevancy score between hashtag and text embeddings
def calculate_hashtag_relevancy(hashtag_embeddings, text_embeddings):
    """Return the cosine similarity between hashtag and text embeddings.

    Both inputs are flattened to one dimension before being compared.
    """
    flat_hashtags = hashtag_embeddings.reshape(-1)
    flat_text = text_embeddings.reshape(-1)
    return cosine_similarity_vectors(flat_hashtags, flat_text)


# Score each row; the stored tensors are converted to numpy arrays first
def _row_hashtag_relevancy(row):
    """Return the hashtag relevancy score for one DataFrame row."""
    hashtag_vector = row['hashtags_bert_embeddings'].numpy()
    text_vector = row['text_bert_embeddings'].numpy()
    return calculate_hashtag_relevancy(hashtag_vector, text_vector)


data['hashtag_relevancy_scores'] = data.apply(_row_hashtag_relevancy, axis=1)
data.head()

Unnamed: 0,Post,Text,Hashtags,Clean_Text,Cleaned_Hashtags,text_tokens,hashtags_tokens,input_tokens,numTags,text_bert_embeddings,hashtags_bert_embeddings,hashtag_relevancy_scores
0,Looking Ahead: 17 #Predicted #Marketing #Trend...,Looking Ahead: 17 Predicted Marketing Trends I...,#Predicted #Marketing #Trends #Mobile #AR #VR ...,looking ahead 17 predicted marketing trends in...,"['Predicted', 'Marketing', 'Trends', 'Mobile',...","[101, 2559, 3805, 1024, 2459, 10173, 5821, 128...","[[1001], [1052], [1054], [1041], [1040], [1045...","[101, 2559, 3805, 1024, 2459, 10173, 5821, 128...",9,"[[tensor(0.1858), tensor(-0.0076), tensor(0.45...","[[tensor(-0.0812), tensor(0.1149), tensor(0.46...",0.70919
1,RT @simonnash2017: These are the #Top20 #BigDa...,These are the Top20 BigData &amp; AI on twitte...,#Top20 #BigData #AI #twitter #2017 #ITjobs #Da...,these are the top20 bigdata amp ai on twitter ...,"['Top20', 'BigData', 'AI', 'twitter', '2017', ...","[101, 2122, 2024, 1996, 2327, 11387, 2502, 285...","[[1001], [1056], [1051], [1052], [1016], [1014...","[101, 2122, 2024, 1996, 2327, 11387, 2502, 285...",10,"[[tensor(0.1847), tensor(-0.2179), tensor(0.68...","[[tensor(0.0946), tensor(-0.0150), tensor(0.42...",0.802984
2,These are the #Top20 #BigData &amp; #AI on #tw...,These are the Top20 BigData &amp; AI on twitte...,#Top20 #BigData #AI #twitter #2017 #ITjobs #Da...,these are the top20 bigdata amp ai on twitter ...,"['Top20', 'BigData', 'AI', 'twitter', '2017', ...","[101, 2122, 2024, 1996, 2327, 11387, 2502, 285...","[[1001], [1056], [1051], [1052], [1016], [1014...","[101, 2122, 2024, 1996, 2327, 11387, 2502, 285...",9,"[[tensor(0.1032), tensor(-0.2423), tensor(0.67...","[[tensor(0.0765), tensor(-0.0382), tensor(0.43...",0.799164
3,RT @Azure: Data is evolving. Find out how to t...,Data is evolving. Find out how to transform yo...,#SQL #Data,data is evolving find out how to transform you...,"['SQL', 'Data']","[101, 2951, 2003, 20607, 1012, 2424, 2041, 212...","[[1001], [1055], [1053], [1048], [], [1001], [...","[101, 2951, 2003, 20607, 1012, 2424, 2041, 212...",2,"[[tensor(-0.1249), tensor(-0.2571), tensor(0.3...","[[tensor(0.0260), tensor(0.0383), tensor(0.253...",0.668773
4,These are the #Top20 #BigData &amp; #AI on #tw...,These are the Top20 BigData &amp; AI on twitte...,#Top20 #BigData #AI #twitter #2017 #ITjobs #Da...,these are the top20 bigdata amp ai on twitter ...,"['Top20', 'BigData', 'AI', 'twitter', '2017', ...","[101, 2122, 2024, 1996, 2327, 11387, 2502, 285...","[[1001], [1056], [1051], [1052], [1016], [1014...","[101, 2122, 2024, 1996, 2327, 11387, 2502, 285...",9,"[[tensor(0.1032), tensor(-0.2423), tensor(0.67...","[[tensor(0.0765), tensor(-0.0382), tensor(0.43...",0.799164


In [9]:
# Export the post, hashtag, embedding and score columns to an Excel workbook
selected_columns = [
    'Post',
    'Text',
    'Hashtags',
    'Clean_Text',
    'Cleaned_Hashtags',
    'text_bert_embeddings',
    'hashtags_bert_embeddings',
    'hashtag_relevancy_scores',
]
filtered_data = data[selected_columns]
filtered_data.to_excel('Posts_Hashtags.xlsx', index=False)

## Relevancy Score for an Input Twitter Post

In [10]:
# Read a post from the user and pull out its hashtags
twitter_post = input("Enter Twitter Post: \n")
hashtags = extract_hashtags(twitter_post)

# Embed the post and its hashtags with BERT
# NOTE(review): the raw post (including its '#' symbols) is embedded here,
# whereas the dataset's 'Text' column appears to have them stripped — confirm
# whether the same preprocessing should be applied before scoring.
bert = get_bert_embeddings(twitter_post)
hashtag = get_hashtag_embeddings(hashtags)

# Relevancy score between the hashtag vector and the post vector
hashtag_vector, text_vector = hashtag.numpy(), bert.numpy()
relevancy_score = calculate_hashtag_relevancy(hashtag_vector, text_vector)

Enter Twitter Post: 
Looking Ahead: 17 #Predicted #Marketing #Trends In 2017 - See more at: https://t.co/bnho4Dfuvd #Mobile #AR #VR #Advertising #Data #Chatbots


In [14]:
# Report the score, then label the hashtags using a 0.50 cut-off
print(f"---------- HASHTAG RELEVANCY SCORE ----------\nTwitter Post: {twitter_post}\nScore: {relevancy_score}")
relevancy_label = '\nNon-Relevant Hashtags' if relevancy_score <= 0.50 else '\nRelevant Hashtags'
print(relevancy_label)

---------- HASHTAG RELEVANCY SCORE ----------
Twitter Post: Looking Ahead: 17 #Predicted #Marketing #Trends In 2017 - See more at: https://t.co/bnho4Dfuvd #Mobile #AR #VR #Advertising #Data #Chatbots
Score: 0.8420144319534302

Relevant Hashtags
