#### Please install tensorflow_hub and top2vec before running the following code.

In [1]:
# Uncomment the two lines below to install the required libraries.
# ! pip install top2vec
# ! pip install --upgrade tensorflow-hub

In [2]:
import tensorflow_hub as hub

import pandas as pd
import re
from top2vec import Top2Vec

In [None]:
# Handle of the sentence-encoder module to load; the trailing #@param
# annotation lists the two selectable variants (standard /4 and large /5).
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4" #@param ["https://tfhub.dev/google/universal-sentence-encoder/4", "https://tfhub.dev/google/universal-sentence-encoder-large/5"]
# `model` is handed to Top2Vec as its embedding model in a later cell.
model = hub.load(module_url)
print("module %s loaded" % module_url)

In [None]:
# Load the two splits; a later cell reads the ABSTRACT column from the
# combined frame.
df1 = pd.read_csv('Train.csv')
df2 = pd.read_csv('Test.csv')

# Stack both splits into a single corpus. ignore_index=True renumbers the
# rows 0..n-1; without it each split keeps its own labels, so the combined
# frame would contain duplicate index values and a label lookup could match
# more than one row.
df = pd.concat([df1, df2], ignore_index=True)

# (rows, columns) of the combined frame
print(df.shape)

# Preview the first 5 rows (rendered by the notebook)
df.head(5)

In [None]:
# Pull the abstracts out of the frame as a plain Python list, one entry per row.
docs = df["ABSTRACT"].tolist()

In [None]:
# Clean every abstract in a single pass:
#   1. delete each literal "See " occurrence,
#   2. delete every parenthesised span that contains no nested parentheses,
#   3. close up the " ." gap that a deleted span can leave before a full stop.
paren_span = re.compile(r"\([^()]*\)")
docs = [
    paren_span.sub("", text.replace("See ", "")).replace(" .", ".")
    for text in docs
]

# Preview the first five cleaned abstracts (rendered by the notebook)
docs[:5]

In [None]:
# Fit the topic model on the cleaned abstracts, embedding them with the
# sentence encoder loaded earlier (see References [1, 2] for supported models).
# NOTE(review): the Top2Vec docs describe `speed` as a doc2vec training
# setting -- confirm it has any effect when a custom encoder is supplied.
semanticmodel = Top2Vec(
    docs,
    embedding_model=model,
    min_count=20,  # ignore words less frequent than this value
    speed="deep-learn",
    ngram_vocab=True,
)

In [None]:
# Number of entries in the fitted model's vocabulary (rendered by the notebook).
len(semanticmodel.vocab)

In [None]:
# Keep only the vocabulary entries made of exactly two whitespace-separated tokens.
bigrams = [entry for entry in semanticmodel.vocab if len(entry.split()) == 2]

In [None]:
# Report how many two-word entries were collected, then preview the first fifteen.
print(len(bigrams))
bigrams[:15]

In [None]:
# Ask the model for its topic sizes and the matching topic numbers.
topic_sizes, topic_nums = semanticmodel.get_topic_sizes()

# Topic numbers on the first line, sizes on the second.
print(topic_nums, topic_sizes, sep="\n")

In [None]:
# Retrieve up to 20 topics. get_topics raises ValueError when asked for more
# topics than the model discovered, so cap the request at the actual count.
num_topics = min(20, semanticmodel.get_num_topics())
topic_words, word_scores, topic_nums = semanticmodel.get_topics(num_topics)

In [None]:
# Print each topic number followed by its word list; the per-word scores are
# unpacked but intentionally not shown.
for topic_terms, _term_scores, topic_id in zip(topic_words, word_scores, topic_nums):
    print(topic_id)
    print(f"Words: {topic_terms}")

In [None]:
# Draw one word cloud for every topic number currently held in topic_nums.
for topic_id in topic_nums:
    semanticmodel.generate_topic_wordcloud(topic_id)  # optional: background_color="black"

In [None]:
# Fetch five documents for topic 0 and print each one between two rules,
# followed by a blank line.
documents, document_scores, document_ids = semanticmodel.search_documents_by_topic(
    topic_num=0,
    num_docs=5,
)
for text, similarity, identifier in zip(documents, document_scores, document_ids):
    print(f"Document: {identifier}, Score: {similarity}")
    print("-----------", text, "-----------", "", sep="\n")

In [None]:
# Fetch five documents for the keywords "svm" and "bayesian", then print
# every hit with its id and score, framed by rule lines.
hits = semanticmodel.search_documents_by_keywords(keywords=["svm", "bayesian"], num_docs=5)
documents, document_scores, document_ids = hits
rule = "-----------"
for body, relevance, ident in zip(documents, document_scores, document_ids):
    print(f"Document: {ident}, Score: {relevance}")
    print(rule)
    print(body)
    print(rule)
    print()

In [None]:
# Look up the 20 vocabulary entries closest to "face recognition" (no negative
# keywords) and print each one next to its score.
words, word_scores = semanticmodel.similar_words(
    keywords=["face recognition"],
    keywords_neg=[],
    num_words=20,
)
for term, similarity in zip(words, word_scores):
    print(f"{term} {similarity}")

## References:
1. [TensorFlow Hub: Universal Sentence Encoder tutorial](https://www.tensorflow.org/hub/tutorials/semantic_similarity_with_tf_hub_universal_encoder)
2. [Top2Vec](https://pypi.org/project/top2vec/)