In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import string

In [3]:
# Load the emoji lookup table; later cells read it through the module-level `df`
# and use its 'emoji', 'name', 'group' and 'sub_group' columns.
# NOTE(review): the 'Unnamed: 0' column in the preview below suggests the CSV was
# written with its index; reading with index_col=0 would likely drop it — confirm.
df = pd.read_csv("emoji_df.csv")
# Bare last expression so the notebook renders a preview of the first rows.
df.head()

Unnamed: 0,emoji,name,group,sub_group,codepoints
0,😀,grinning face,Smileys & Emotion,face-smiling,1F600
1,😃,grinning face with big eyes,Smileys & Emotion,face-smiling,1F603
2,😄,grinning face with smiling eyes,Smileys & Emotion,face-smiling,1F604
3,😁,beaming face with smiling eyes,Smileys & Emotion,face-smiling,1F601
4,😆,grinning squinting face,Smileys & Emotion,face-smiling,1F606


In [4]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
# Fetch the NLTK data packages this notebook needs: 'stopwords' backs the
# stopwords.words('english') call in preprocess(); 'punkt' is presumably the
# tokenizer data required by word_tokenize — confirm against the installed NLTK version.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/megha/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/megha/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
# preprocess text
def preprocess(text):
    """Turn raw text into a list of lowercase content words.

    The text is split with NLTK's ``word_tokenize``. A token is kept only if
    it is purely alphabetic (``str.isalpha``), which discards punctuation as
    well as numbers and tokens containing apostrophes or digits. Kept tokens
    are lowercased, and any that appear in NLTK's English stop-word list are
    removed.

    Parameters
    ----------
    text : str
        The sentence or passage to clean.

    Returns
    -------
    list of str
        The surviving tokens, lowercased, in their original order.
    """
    raw_tokens = word_tokenize(text)
    ignored = frozenset(stopwords.words('english'))

    kept = []
    for token in raw_tokens:
        # alphabetic check runs on the token as written, before lowercasing
        if not token.isalpha():
            continue
        lowered = token.lower()
        if lowered in ignored:
            continue
        kept.append(lowered)
    return kept

In [6]:
# Distinct emoji categories, used later as candidate labels for the classifier:
# `groups` is passed to get_group(), `sub_groups` to the direct classifier call.
# Both are captured before the row filter below is applied.
groups = df['group'].unique()
sub_groups = df['sub_group'].unique()
# Drop every row whose name is exactly 'thread'.
# NOTE(review): the reason for this exclusion is not recorded in the notebook — confirm.
df = df[df['name'] != 'thread']

In [7]:
from transformers import pipeline
# Shared zero-shot-classification pipeline built on the 'facebook/bart-large-mnli'
# checkpoint; get_group() and the smoke-test cell below call it through this global.
classifier = pipeline('zero-shot-classification', model='facebook/bart-large-mnli')

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
# Smoke test: score the single word "apple" against every emoji sub-group.
word = "apple"
# Each candidate label is substituted into the `{}` of this template.
hypothesis_template = 'This text is about {}.'
# NOTE(review): multi_label=True presumably scores each label independently
# rather than normalising across labels — confirm in the transformers docs.
prediction = classifier(word, sub_groups, hypothesis_template=hypothesis_template, multi_label=True)
# Bare last expression so the notebook displays the raw prediction.
prediction

In [10]:
# get most likely group based on a word
def get_group(word, groups):
    """Return the first candidate label the classifier reports for ``word``.

    Calls the module-level zero-shot ``classifier`` with the hypothesis
    template ``'This text is about {}.'`` and ``multi_label=True``, then takes
    the first entry of the returned ``'labels'`` and ``'scores'`` lists. This
    relies on the pipeline returning its labels ordered from most to least
    likely.

    Parameters
    ----------
    word : str
        Text to classify.
    groups : sequence of str
        Candidate labels (emoji groups, sub-groups or emoji names).

    Returns
    -------
    list
        ``[label, score]`` for the first entry of the classifier's result.
    """
    result = classifier(
        word,
        groups,
        hypothesis_template='This text is about {}.',
        multi_label=True,
    )
    top_label = result['labels'][0]
    top_score = result['scores'][0]
    return [top_label, top_score]

# Example: choose the top-level emoji group for "apple"; the notebook displays
# the returned [label, score] pair.
get_group('apple',groups)

['Food & Drink', 0.3914111852645874]

In [39]:
def word_to_emoji(word):
    """Map a single word to the emoji whose name fits it best.

    Lookup strategy, in order:

    1. **Exact match** - some emoji name equals ``word``: return that emoji
       with confidence ``1``.
    2. **Substring match** - one or more emoji names contain ``word`` as a
       literal, case-insensitive substring. If several names match, the
       classifier (via ``get_group``) chooses among them. The confidence is
       reported as ``1`` regardless of the classifier score.
    3. **Classifier fallback** - the classifier first picks an emoji group
       from the module-level ``groups``, then an emoji name within that
       group. The confidence is the classifier score for the chosen name.
       (The author also tried narrowing to a sub-group first and left that
       variant disabled; it is not implemented here.)

    Parameters
    ----------
    word : str
        The word to translate, typically one token from ``preprocess``.

    Returns
    -------
    list
        ``[emoji, confidence]``. An empty ``word`` yields ``['', 0]``.
    """
    # An empty string is a substring of every name, so it would otherwise be
    # reported as a confident match; treat it as "no emoji" instead.
    if not word:
        return ['', 0]

    names = df['name']

    # 1. direct match
    exact = names == word
    if exact.any():
        return [df.loc[exact, 'emoji'].values[0], 1]

    # 2. indirect match: regex=False keeps words such as "c++" from being
    # parsed as a pattern, and na=False keeps rows with a missing name from
    # poisoning the boolean mask.
    partial = names.str.contains(word, case=False, regex=False, na=False)
    if partial.any():
        candidates = names[partial].unique().tolist()
        if len(candidates) == 1:
            # Only one option and the score is discarded in this branch, so
            # the expensive classifier call cannot change the result.
            best_name = candidates[0]
        else:
            best_name = get_group(word, candidates)[0]
        return [df.loc[names == best_name, 'emoji'].iloc[0], 1]

    # 3. fallback: choose a group, then choose directly among its emoji names
    group = get_group(word, groups)
    candidates = df.loc[df['group'] == group[0], 'name'].dropna().unique().tolist()
    best_name, score = get_group(word, candidates)
    return [df.loc[names == best_name, 'emoji'].iloc[0], score]

# print(word_to_emoji('happy'))

Sample text:

The house is on fire!

I'm so happy today!

The early bird catches the worm.

Just had the best pizza ever!

The dog is chasing the ball in the park.

The flower is blooming beautifully in the garden.

In [51]:
# convert input text to emojis
CONFIDENCE_CUTOFF = 0.95  # an emoji is kept only if its confidence exceeds this


def text_to_emojis(text, cutoff=CONFIDENCE_CUTOFF):
    """Translate a sentence into a string of emojis.

    The text is cleaned with ``preprocess``; each remaining word is looked up
    with ``word_to_emoji``, and the emoji is kept only when the reported
    confidence is strictly greater than ``cutoff``.

    Parameters
    ----------
    text : str
        Sentence to translate.
    cutoff : float, optional
        Confidence threshold; defaults to ``CONFIDENCE_CUTOFF``.

    Returns
    -------
    str
        The kept emojis, concatenated in word order (possibly empty).
    """
    matches = (word_to_emoji(token) for token in preprocess(text))
    return ''.join(symbol for symbol, confidence in matches if confidence > cutoff)


# Named `input_text` rather than `input` so the built-in input() is not shadowed
# for the rest of the kernel session.
input_text = "The house is on fire!"
print('input text:', input_text)

emoji_translation = text_to_emojis(input_text)
print('emoji translation:', emoji_translation)

input text: The house is on fire!
emoji translation: 🏠🔥
