In [1]:
!pip install emoji

Collecting emoji
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Downloading emoji-2.14.1-py3-none-any.whl (590 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m590.6/590.6 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji
Successfully installed emoji-2.14.1


In [2]:
!pip install git+https://github.com/LIAAD/yake

Collecting git+https://github.com/LIAAD/yake
  Cloning https://github.com/LIAAD/yake to /tmp/pip-req-build-ktkq0nxe
  Running command git clone --filter=blob:none --quiet https://github.com/LIAAD/yake /tmp/pip-req-build-ktkq0nxe
  Resolved https://github.com/LIAAD/yake to commit 0a9e2d39ed47f5013927df37427384dd8e75f662
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting jellyfish (from yake==0.6.0)
  Downloading jellyfish-1.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.6 kB)
Collecting segtok (from yake==0.6.0)
  Downloading segtok-1.5.11-py3-none-any.whl.metadata (9.0 kB)
Downloading jellyfish-1.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (356 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m356.9/356.9 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading segtok-1.5.11-py3-none-any.

In [3]:
import ast
import emoji
import pandas as pd
import numpy as np
import logging
from sentence_transformers import SentenceTransformer, util
import os

# Configure root logging; force=True drops any handlers already attached to the root logger
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', force=True)

# Obtain the sentence-embedding model: download and persist it on first use,
# otherwise reuse the copy previously saved under model_path
model_path = "saved_models/paraphrase-MiniLM-L6-v2"
if not os.path.exists(model_path):
    logging.info("Downloading model and saving...")
    model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
    os.makedirs(model_path, exist_ok=True)
    model.save(model_path)
else:
    logging.info("Loading model from saved path...")
    model = SentenceTransformer(model_path)

# Memo of keyword -> embedding vector, filled in by map_emojis_for_single_text
embedding_cache = {}

# Prepare emoji dataset
def prepare_emoji_data():
    """Build a DataFrame of every emoji with a text description and its embedding.

    Returns:
        DataFrame with columns 'emoji' (each key of emoji.EMOJI_DATA),
        'description' (the demojized name with surrounding colons stripped and
        underscores replaced by spaces) and 'embedding' (one entry per row,
        taken from model.encode over the descriptions).
    """
    logging.info("Preparing emoji descriptions and embeddings...")

    rows = []
    for symbol in emoji.EMOJI_DATA.keys():
        label = emoji.demojize(symbol).strip(":").replace("_", " ")
        rows.append((symbol, label))
    emoji_df = pd.DataFrame(rows, columns=['emoji', 'description'])

    # Encode all descriptions in one batched call, then store one vector per row.
    vectors = model.encode(emoji_df['description'].tolist(), batch_size=32, show_progress_bar=True)
    emoji_df['embedding'] = list(vectors)
    return emoji_df

# Map keywords to emojis
def map_emojis_for_single_text(keywords_str, emoji_df, threshold=0.3):
    """Pick the most similar emoji for each keyword by cosine similarity.

    Args:
        keywords_str: String containing a Python list literal of keywords
            (e.g. "['baby', 'boy']"). An already-parsed list/tuple of
            keywords is accepted as well.
        emoji_df: DataFrame with an 'emoji' column and an 'embedding' column
            holding one equal-length vector per row.
        threshold: Minimum cosine similarity the best emoji must reach for the
            keyword to be mapped.

    Returns:
        Tuple ``(emojis, mapping)``: the matched emojis joined by single
        spaces in keyword order, and a dict from each matched keyword to its
        emoji. Keywords whose best similarity is below ``threshold`` are
        omitted from both. An empty keyword list or empty ``emoji_df`` yields
        ``('', {})``.
    """
    if isinstance(keywords_str, str):
        keywords = ast.literal_eval(keywords_str)
    else:
        keywords = list(keywords_str)

    emojis = []
    mapping = {}
    # Nothing to match against (or nothing to match): avoid max()/argmax on empty data.
    if not keywords or len(emoji_df) == 0:
        return '', mapping

    # Stack the per-row vectors once so every keyword is scored with a single
    # matrix product instead of one similarity call per emoji row.
    emoji_matrix = np.vstack(emoji_df['embedding'].tolist())
    # Clamp norms so an all-zero vector scores 0 instead of producing NaN.
    row_norms = np.maximum(np.linalg.norm(emoji_matrix, axis=1), 1e-12)

    for keyword in keywords:
        if keyword in embedding_cache:
            keyword_vec = embedding_cache[keyword]
        else:
            # Single-string encode; the progress bar would only add one line of noise per keyword.
            keyword_vec = model.encode(keyword, show_progress_bar=False)
            embedding_cache[keyword] = keyword_vec

        query = np.asarray(keyword_vec).reshape(-1)
        query_norm = max(float(np.linalg.norm(query)), 1e-12)
        similarities = (emoji_matrix @ query) / (row_norms * query_norm)

        # argmax returns the first maximum, so ties resolve to the earliest row.
        top_index = int(np.argmax(similarities))
        if similarities[top_index] >= threshold:
            best_emoji = emoji_df.iloc[top_index]['emoji']
            emojis.append(best_emoji)
            mapping[keyword] = best_emoji

    return ' '.join(emojis), mapping




2025-06-03 16:27:20,444 - INFO - Downloading model and saving...
2025-06-03 16:27:20,449 - INFO - Use pytorch device_name: cpu
2025-06-03 16:27:20,449 - INFO - Load pretrained SentenceTransformer: paraphrase-MiniLM-L6-v2
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.51k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

2025-06-03 16:27:30,392 - INFO - Save model to saved_models/paraphrase-MiniLM-L6-v2


In [4]:
import yake
import numpy as np

def get_keywords(text):
    """Extract single-word keywords from ``text`` with YAKE.

    The text is lower-cased first. The number of keywords requested is 20% of
    the whitespace-separated word count (truncated), with a minimum of one.

    Returns:
        The ``str()`` of the list of extracted keywords, in the order the
        extractor returned them.
    """
    lowered = text.lower()

    # One keyword per five words, but always ask for at least one.
    keyword_count = max(1, int(len(lowered.split()) * 0.2))

    extractor = yake.KeywordExtractor(top=keyword_count, stopwords=None, n=1)
    predicted_keywords = []
    for pair in extractor.extract_keywords(lowered):
        # Each result is a 2-item pair; only the first item (the keyword) is kept.
        kw, _ = pair
        predicted_keywords.append(kw)

    return str(predicted_keywords)

In [5]:
import re
import ast

def sort_keywords(keywords, text):
    """Order keywords by where they first appear in ``text``.

    Args:
        keywords: String containing a Python list literal of keywords
            (e.g. "['wolves', 'baby']"). An already-parsed list/tuple is
            accepted as well.
        text: The text the keywords were taken from. Matching is
            case-insensitive and only counts whole-token occurrences.

    Returns:
        The ``str()`` of the keyword list sorted by first match position.
        Keywords that do not occur in ``text`` are placed last, keeping their
        original relative order (the sort is stable).
    """
    if isinstance(keywords, str):
        keywords = ast.literal_eval(keywords)
    else:
        keywords = list(keywords)
    haystack = text.lower()

    def first_position(kw):
        # Lookarounds instead of \b: \b requires a word character on one side,
        # so it can never match keywords that start or end with punctuation
        # (e.g. "c++"). For ordinary words the two forms match identically.
        match = re.search(r'(?<!\w)' + re.escape(kw.lower()) + r'(?!\w)', haystack)
        return match.start() if match else float('inf')

    positions = {kw: first_position(kw) for kw in keywords}
    sorted_keywords = sorted(keywords, key=positions.__getitem__)

    return str(sorted_keywords)


In [7]:
    # Sample story to annotate with emojis
    text = "A baby boy named Mowgli is found in the jungle and raised by wolves. He befriends Baloo the bear and Bagheera the panther. As he grows up, he faces dangers like the tiger Shere Khan and the snake Kaa."

    # Extract keywords, then reorder them by first appearance in the text
    keyword_string = sort_keywords(get_keywords(text), text)

    # Build the emoji lookup table (descriptions plus embeddings)
    emoji_df = prepare_emoji_data()

    # Pick one emoji per keyword; the keyword -> emoji mapping is not needed here
    emojis, _ = map_emojis_for_single_text(keyword_string, emoji_df)

    # Show the text, its ordered keywords and the chosen emojis on three lines
    result_line = f'{text}\n{keyword_string}\n{emojis}'
    print(result_line)

2025-06-03 16:29:01,881 - INFO - Preparing emoji descriptions and embeddings...


Batches:   0%|          | 0/158 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

A baby boy named Mowgli is found in the jungle and raised by wolves. He befriends Baloo the bear and Bagheera the panther. As he grows up, he faces dangers like the tiger Shere Khan and the snake Kaa.
['baby', 'boy', 'named', 'mowgli', 'found', 'jungle', 'wolves']
👶 👦 📛 🗿 🪨 🐒 🐺
