In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib.pyplot import figure
from sklearn import feature_extraction
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.preprocessing import StandardScaler

## Preprocess data

In [2]:
# Load the tweets into a DataFrame
tweets_df = pd.read_csv("../project2/Data.Original.1900")
tweets_df

Unnamed: 0,City,Lang,Time,Text,Geohash,Country,Region
0,pruzhany,en,2021-12-06 15:13:17,@WarcraftDevs - demon spikes reflect x % of in...,u936uxwkybu2,by,europe_east
1,lumphat,und,2021-12-06 15:25:06,@phuonganhh21 e,w6khf3wghq2s,kh,asia_southeast
2,tsiombe,fr,2021-12-06 15:50:41,Eh le poto il est à l’aise avec sa question 💀💀...,m5824frby6pt,mg,africa_sub
3,boruny,und,2021-12-06 15:55:03,@SobolLubov #ОскарПутину #Оскар #Путинизм #Нав...,u9dt48j9wb99,by,europe_east
4,vilkaviskis,en,2021-12-06 17:15:42,Top rated project! @polygen_io and #Launchpad ...,u98quj5919n4,lt,europe_east
...,...,...,...,...,...,...,...
12744328,soka,und,2021-12-21 16:43:39,https://t.co/cDu892Apj6,xn77v2fkpxu8,jp,asia_east
12744329,soka,ja,2021-12-21 16:43:39,今日だけで4玉食べて冷凍庫を空にした。引越し前うどん。 https://t.co/zG57D...,xn77v2fkpxu8,jp,asia_east
12744330,pedro betancourt,ja,2021-12-21 16:43:39,是我了 https://t.co/KPTyf0hJCO,dhn1q1q7ttq1,cu,america_central
12744331,soka,und,2021-12-21 16:43:40,🙇‍♀️💕💕 https://t.co/orOVDvtL85,xn77v2fkpxu8,jp,asia_east


In [3]:
# Drop empty rows
tweets_df = tweets_df.dropna()
tweets_df

Unnamed: 0,City,Lang,Time,Text,Geohash,Country,Region
0,pruzhany,en,2021-12-06 15:13:17,@WarcraftDevs - demon spikes reflect x % of in...,u936uxwkybu2,by,europe_east
1,lumphat,und,2021-12-06 15:25:06,@phuonganhh21 e,w6khf3wghq2s,kh,asia_southeast
2,tsiombe,fr,2021-12-06 15:50:41,Eh le poto il est à l’aise avec sa question 💀💀...,m5824frby6pt,mg,africa_sub
3,boruny,und,2021-12-06 15:55:03,@SobolLubov #ОскарПутину #Оскар #Путинизм #Нав...,u9dt48j9wb99,by,europe_east
4,vilkaviskis,en,2021-12-06 17:15:42,Top rated project! @polygen_io and #Launchpad ...,u98quj5919n4,lt,europe_east
...,...,...,...,...,...,...,...
12744328,soka,und,2021-12-21 16:43:39,https://t.co/cDu892Apj6,xn77v2fkpxu8,jp,asia_east
12744329,soka,ja,2021-12-21 16:43:39,今日だけで4玉食べて冷凍庫を空にした。引越し前うどん。 https://t.co/zG57D...,xn77v2fkpxu8,jp,asia_east
12744330,pedro betancourt,ja,2021-12-21 16:43:39,是我了 https://t.co/KPTyf0hJCO,dhn1q1q7ttq1,cu,america_central
12744331,soka,und,2021-12-21 16:43:40,🙇‍♀️💕💕 https://t.co/orOVDvtL85,xn77v2fkpxu8,jp,asia_east


In [5]:
import re
import emoji


def clean_text(text):
    # Remove URLs
    text = re.sub(r"http\S+|www.\S+", "", text, flags=re.MULTILINE)

    # Remove mentions
    text = re.sub(r"@\w+", "", text)

    return text


def get_emojis(text):
    emojis = emoji.analyze(text)
    return list(x[0] for x in emojis)


# Only consider tweets which contain at least 1 emoji
tweets_with_emojis = tweets_df[
    tweets_df["Text"].apply(lambda tweet_text: emoji.emoji_count(tweet_text) > 0)
]

# Apply the cleaning function to the 'Text' column to generate 'CleanedText' column
tweets_with_emojis["CleanedText"] = tweets_with_emojis["Text"].apply(clean_text)

# Get just emojis and store in 'Emojis' column
tweets_with_emojis["Emojis"] = tweets_with_emojis["Text"].apply(
    lambda text: "".join(get_emojis(text))
)

tweets_with_emojis

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tweets_with_emojis["CleanedText"] = tweets_with_emojis["Text"].apply(clean_text)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tweets_with_emojis["Emojis"] = tweets_with_emojis["Text"].apply(


Unnamed: 0,City,Lang,Time,Text,Geohash,Country,Region,CleanedText,Emojis
2,tsiombe,fr,2021-12-06 15:50:41,Eh le poto il est à l’aise avec sa question 💀💀...,m5824frby6pt,mg,africa_sub,Eh le poto il est à l’aise avec sa question 💀💀,💀💀
11,hobyo,en,2021-12-06 23:04:04,@M0zark0 😭😭😭😭😭 ur ugly for that wallahi,t0fw2npnp7hn,so,africa_north,😭😭😭😭😭 ur ugly for that wallahi,😭😭😭😭😭
17,ayny,es,2021-12-06 23:19:23,Hacer cosas de novios sin ser novios ♥️,tx08psjenf21,tj,asia_central,Hacer cosas de novios sin ser novios ♥️,♥️
18,aflou,ar,2021-12-06 23:23:17,@beINSPORTS_news انا جزائري لا كن أرى أن جزائر...,sn16pc6ddver,dz,africa_north,انا جزائري لا كن أرى أن جزائري افضل في اسوء ح...,😂
21,villa yapacani,en,2021-12-06 23:28:34,@andalechuey Lmaooo i felt that shit 🤣💀,6sft52mvxsct,bo,america_south,Lmaooo i felt that shit 🤣💀,🤣💀
...,...,...,...,...,...,...,...,...,...
12744309,soka,ja,2021-12-21 16:43:36,@hima30823 ご参加ありがとうございます✨ 抽選結果は下部の画像をタップ😍 当選者...,xn77v2fkpxu8,jp,asia_east,ご参加ありがとうございます✨ 抽選結果は下部の画像をタップ😍 当選者には後ほどDMをお届...,✨😍📩🎯👇👇
12744312,soka,es,2021-12-21 16:43:37,@MilyDice Aquí no! Aquí hasta se asesora 😌 Y...,xn77v2fkpxu8,jp,asia_east,Aquí no! Aquí hasta se asesora 😌 Y si hay d...,😌
12744323,soka,en,2021-12-21 16:43:38,See you in the Solice metaverse https://t.co/H...,xn77v2fkpxu8,jp,asia_east,See you in the Solice metaverse #soliceio #so...,👍👍
12744327,archidona,ar,2021-12-21 16:43:39,الليالي عن هوانا مانهتنا البلا حنا على البعد ...,6rbdv95u08cn,ec,america_south,الليالي عن هوانا مانهتنا البلا حنا على البعد ...,😢


In [6]:
# Export to txt file
tweets_with_emojis.to_csv("Data/tweets_with_emojis_12M.txt", columns=["CleanedText"], header=False, index=False)

In [5]:
tweets_with_emojis = pd.read_csv('tweets_with_emojis_dataframe.csv', index_col=0)
tweets_with_emojis

Unnamed: 0,City,Lang,Time,Text,Geohash,Country,Region,CleanedText,Emojis
2,tsiombe,fr,2021-12-06 15:50:41,Eh le poto il est à l’aise avec sa question 💀💀...,m5824frby6pt,mg,africa_sub,Eh le poto il est à l’aise avec sa question 💀💀,💀💀
11,hobyo,en,2021-12-06 23:04:04,@M0zark0 😭😭😭😭😭 ur ugly for that wallahi,t0fw2npnp7hn,so,africa_north,😭😭😭😭😭 ur ugly for that wallahi,😭😭😭😭😭
17,ayny,es,2021-12-06 23:19:23,Hacer cosas de novios sin ser novios ♥️,tx08psjenf21,tj,asia_central,Hacer cosas de novios sin ser novios ♥️,♥️
18,aflou,ar,2021-12-06 23:23:17,@beINSPORTS_news انا جزائري لا كن أرى أن جزائر...,sn16pc6ddver,dz,africa_north,انا جزائري لا كن أرى أن جزائري افضل في اسوء ح...,😂
21,villa yapacani,en,2021-12-06 23:28:34,@andalechuey Lmaooo i felt that shit 🤣💀,6sft52mvxsct,bo,america_south,Lmaooo i felt that shit 🤣💀,🤣💀
...,...,...,...,...,...,...,...,...,...
12744309,soka,ja,2021-12-21 16:43:36,@hima30823 ご参加ありがとうございます✨ 抽選結果は下部の画像をタップ😍 当選者...,xn77v2fkpxu8,jp,asia_east,ご参加ありがとうございます✨ 抽選結果は下部の画像をタップ😍 当選者には後ほどDMをお届...,✨😍📩🎯👇👇
12744312,soka,es,2021-12-21 16:43:37,@MilyDice Aquí no! Aquí hasta se asesora 😌 Y...,xn77v2fkpxu8,jp,asia_east,Aquí no! Aquí hasta se asesora 😌 Y si hay d...,😌
12744323,soka,en,2021-12-21 16:43:38,See you in the Solice metaverse https://t.co/H...,xn77v2fkpxu8,jp,asia_east,See you in the Solice metaverse #soliceio #so...,👍👍
12744327,archidona,ar,2021-12-21 16:43:39,الليالي عن هوانا مانهتنا البلا حنا على البعد ...,6rbdv95u08cn,ec,america_south,الليالي عن هوانا مانهتنا البلا حنا على البعد ...,😢


## Train embeddings

In [7]:
import fasttext
import gensim
import os


# Print test results
def print_results(N, p, r):
    print("N\t" + str(N))
    print("P@{}\t{:.3f}".format(1, p))
    print("R@{}\t{:.3f}".format(1, r))


# Set data directory
data_dir = os.path.join(".", "Data")
data_file = os.path.join(data_dir, "tweets_with_emojis_12M.txt")

# SGNS model
model = fasttext.train_unsupervised(
    input=data_file,
    model="skipgram",
    ws=4,
    minCount=2,
    minn=0,
    maxn=0,
    neg=5,
)
# Save bin format for SGNS
model.save_model(os.path.join(data_dir, "model_tweets_with_emojis_12M_sgns.bin"))

# CBOW model
model = fasttext.train_unsupervised(
    input=data_file,
    model="cbow",
    ws=4,
    minCount=2,
    minn=0,
    maxn=0,
    neg=5,
)
# Save bin format for CBOW
model.save_model(os.path.join(data_dir, "model_tweets_with_emojis_12M_cbow.bin"))


# Convert SGNS from bin to vec format
model = gensim.models.fasttext.load_facebook_vectors(
    os.path.join(data_dir, "model_tweets_with_emojis_12M_sgns.bin")
)
model.save_word2vec_format(os.path.join(data_dir, "model_tweets_with_emojis_12M_sgns.vec"))

# Convert CBOW from bin to vec format
model = gensim.models.fasttext.load_facebook_vectors(
    os.path.join(data_dir, "model_tweets_with_emojis_12M_cbow.bin")
)
model.save_word2vec_format(os.path.join(data_dir, "model_tweets_with_emojis_12M_cbow.vec"))

Read 25M words
Number of words:  781763
Number of labels: 0
Progress: 100.0% words/sec/thread:  116152 lr:  0.000000 avg.loss:  1.065171 ETA:   0h 0m 0s
Read 25M words
Number of words:  781763
Number of labels: 0
Progress: 100.0% words/sec/thread:  327243 lr:  0.000000 avg.loss:  1.372916 ETA:   0h 0m 0s


## Analyze neighbors

In [29]:
all_emojis_by_category = pd.read_csv('all-emojis/all-emojis-by-category.csv', index_col=0)
all_emojis_by_category

Unnamed: 0,Emoji,Category
0,😀,Smileys
1,😃,Smileys
2,😄,Smileys
3,😁,Smileys
4,😆,Smileys
...,...,...
264,🇻🇨,Flags
265,🇻🇺,Flags
266,🇼🇫,Flags
267,🇼🇸,Flags


In [None]:
def get_emoji_sample_with_categories(total=30, n=20):
    emojis_sample = tweets_with_emojis['Emojis'].str[0].sample(total)
    emojis_sample_categories = []
    for emoji_from_sample in emojis_sample:
        possible_category = all_emojis_by_category[all_emojis_by_category['Emoji'] == emoji_from_sample]['Category']
        try:
            category = possible_category.values[0]
            emojis_sample_categories.append((emoji_from_sample, category))
        except:
            continue

    return pd.DataFrame(emojis_sample_categories[:n], columns=['Emoji', 'Category'])
get_emoji_sample_with_categories()

Unnamed: 0,Emoji,Category
0,😦,Smileys
1,🙄,Smileys
2,🤣,Smileys
3,🥺,Smileys
4,🏀,Activity
5,😍,Smileys
6,🤷,People
7,💯,Symbols
8,😂,Smileys
9,😉,Smileys


In [59]:
import os
import random
import fasttext
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib.pyplot import figure
figure(figsize=(12, 6), dpi=150)

# Get nearest neighbors for target word (i.e., target emoji)
def get_distances(model, target, vocab):
    # Get embedding for target emoji
    target_embedding = model[target].reshape(1, -1)

    # Get cosine similarity for each word in vocab
    distances = []
    for word in vocab:
        similarity = cosine_similarity(target_embedding, model[word].reshape(1, -1))[0][0]
        distances.append([similarity, word])

    # Make dataframe
    results_df = pd.DataFrame(distances, columns=["Similarity", "Emoji"])
    results_df.sort_values("Similarity", ascending=False, inplace=True)
    results_df = results_df.head(20)
    results_df.loc[:,"Target Emoji"] = target
    
    return results_df


# Set file names and location
data_dir = os.path.join(".", "Data")
sg_file = os.path.join(data_dir, "model_tweets_with_emojis_12M_sgns.bin")
cbow_file = os.path.join(data_dir, "model_tweets_with_emojis_12M_cbow.bin")


# Get 20 random emojis
# words = random.sample(tweets_with_emojis.loc[:,"Emojis"], 20)
# words = get_emoji_sample_with_categories(n=20) 
words = all_emojis_by_category[all_emojis_by_category['Category'] == 'Smileys'][:21]

# Generate id for file
tag = str(random.randint(1,1000))

# For each type of embedding
for embedding_file in [sg_file, cbow_file]:
    # Get model_type
    if "sgns" in embedding_file:
        model_type = "sg"
    elif "cbow" in embedding_file:
        model_type = "cbow"

    # Load embedding 
    model = fasttext.load_model(embedding_file)
    print(model)

    # Iterate over words (emojis from sample)
    stack = []
    for the_emoji, the_emoji_category in words.values:
        emojis_in_same_category = all_emojis_by_category[all_emojis_by_category['Category'] == the_emoji_category]
        nearest = get_distances(model, the_emoji, all_emojis_by_category['Emoji'])
        stack.append(nearest)

    # Concat
    nearest_df = pd.concat(stack)
    print(nearest_df)
    
    # Save
    file = f"attempt-3-1322/TweetsWithEmojis.Rand.Neighbors.{tag}.{model_type}"
    nearest_df.to_csv(file+".csv")
    
    # Graph
    ax = sns.stripplot(data=nearest_df, x="Target Emoji", hue="Target Emoji", y="Similarity", jitter=True, size=2)

    # Hide word labels
    frame = plt.gca()
    frame.axes.xaxis.set_ticklabels([f"{emoji.demojize(w).strip(":")} {w}" for w in words['Emoji']])
    plt.xticks(rotation=45, ha='right')


    plt.savefig(file+".neighborhoods.png", dpi = 150, bbox_inches = "tight")
    plt.clf()



<fasttext.FastText._FastText object at 0x213475ac0>
    Similarity Emoji Target Emoji
0     1.000000     😀            😀
3     0.670085     😁            😀
1     0.645044     😃            😀
5     0.613654     😅            😀
38    0.582888     😎            😀
..         ...   ...          ...
97    0.724475     😫            🥲
94    0.719519     😞            🥲
5     0.713465     😅            🥲
61    0.698503     🤒            🥲
62    0.693300     🤕            🥲

[420 rows x 3 columns]


  frame.axes.xaxis.set_ticklabels([f"{emoji.demojize(w).strip(":")} {w}" for w in words['Emoji']])
  plt.savefig(file+".neighborhoods.png", dpi = 150, bbox_inches = "tight")
  plt.savefig(file+".neighborhoods.png", dpi = 150, bbox_inches = "tight")
  plt.savefig(file+".neighborhoods.png", dpi = 150, bbox_inches = "tight")
  plt.savefig(file+".neighborhoods.png", dpi = 150, bbox_inches = "tight")
  plt.savefig(file+".neighborhoods.png", dpi = 150, bbox_inches = "tight")


<fasttext.FastText._FastText object at 0x2133b5670>
    Similarity Emoji Target Emoji
0     1.000000     😀            😀
1     0.847207     😃            😀
2     0.822434     😄            😀
3     0.803640     😁            😀
9     0.773559     😉            😀
..         ...   ...          ...
45    0.843719     😐            🥲
97    0.842031     😫            🥲
56    0.840559     😌            🥲
50    0.835012     😒            🥲
47    0.834108     😶            🥲

[420 rows x 3 columns]


  frame.axes.xaxis.set_ticklabels([f"{emoji.demojize(w).strip(":")} {w}" for w in words['Emoji']])
  plt.savefig(file+".neighborhoods.png", dpi = 150, bbox_inches = "tight")
  plt.savefig(file+".neighborhoods.png", dpi = 150, bbox_inches = "tight")
  plt.savefig(file+".neighborhoods.png", dpi = 150, bbox_inches = "tight")
  plt.savefig(file+".neighborhoods.png", dpi = 150, bbox_inches = "tight")
  plt.savefig(file+".neighborhoods.png", dpi = 150, bbox_inches = "tight")


<Figure size 1800x900 with 0 Axes>

### Nearest neighbors

In [35]:
def find_nearest_neighbors(model, target_emoji, n=3):
    # Check if given emoji is in vocabulary
    if target_emoji not in model:
        return None

    # Find nearest neighbors for given emoji
    nearest_neighbors = model.most_similar(positive=[target_emoji], topn=500)
    similar_emojis = []
    for neighbor in nearest_neighbors:
        if emoji.is_emoji(neighbor):
            similar_emojis.append(neighbor)
            if len(similar_emojis) == n:
                break

    if len(similar_emojis) == 0:
        return None

    return similar_emojis


nearest_neighbors_dfs = []
for embedding_file in [sg_file, cbow_file]:
    # Get model_type
    if "sgns" in embedding_file:
        model_type = "sg"
    elif "cbow" in embedding_file:
        model_type = "cbow"

    model = gensim.models.fasttext.load_facebook_vectors(embedding_file)

    nearest_neighbors_for_model = []
    for the_emoji in all_emojis_by_category['Emoji']:
        nearest_neighbors = find_nearest_neighbors(model, the_emoji, n=3)
        nearest_neighbors_for_model.append((the_emoji, nearest_neighbors, model_type))
    
    nearest_neighbors_dfs.append(pd.DataFrame(nearest_neighbors_for_model, columns=['Emoji', 'NearestNeighbors', 'ModelType']))

nearest_neighbors = pd.concat(nearest_neighbors_dfs)
nearest_neighbors

  dists = dot(self.vectors[clip_start:clip_end], mean) / self.norms[clip_start:clip_end]


Unnamed: 0,Emoji,NearestNeighbors,ModelType
0,😀,"[😁, 😃, 😅]",sg
1,😃,"[😁, 😊, 😀]",sg
2,😄,"[😨, 🤣, 😁]",sg
3,😁,"[😅, 😊, 😉]",sg
4,😆,"[🤣, 😂, 😅]",sg
...,...,...,...
1891,🇻🇨,"[🇿🇼, 🇪🇷, 🇫🇯]",cbow
1892,🇻🇺,,cbow
1893,🇼🇫,,cbow
1894,🇼🇸,"[🇲🇨, 🗝, 🗻]",cbow


In [36]:
nearest_neighbors[nearest_neighbors['Emoji'].isin(['😀', '😂', '😍', '😡', '👍', '👎', '🚀', '🍕', '🎉'])].sort_values(by='Emoji')

Unnamed: 0,Emoji,NearestNeighbors,ModelType
789,🍕,"[🍔, 🥘, 🍱]",sg
789,🍕,"[🍅, 🍔, 🥩]",cbow
872,🎉,"[🥳, 🎊, 🍾]",sg
872,🎉,"[🥳, 🎊, 🎂]",cbow
154,👍,"[👍🏼, 👍🏻, 👌🏼]",sg
154,👍,"[👌, 👍🏻, 👍🏼]",cbow
155,👎,"[👎🏻, 🤮, 😤]",sg
155,👎,"[🤥, 🤷🏻‍♂️, 👎🏻]",cbow
0,😀,"[😁, 😃, 😅]",sg
0,😀,"[😃, 😄, 😁]",cbow


In [76]:
tweets_with_emojis.to_csv('tweets_with_emojis_dataframe.csv')

## Run clustering

In [77]:
all_emojis_from_data = sorted(set([''.join(get_emojis(x)) for x in list(set(''.join(tweets_with_emojis.loc[:,"Emojis"].values)))]))
all_emojis_from_data

['',
 '©',
 '®',
 '‼',
 '⁉',
 '™',
 'ℹ',
 '↔',
 '↕',
 '↖',
 '↗',
 '↘',
 '↙',
 '↩',
 '↪',
 '⌚',
 '⌛',
 '⌨',
 '⏏',
 '⏩',
 '⏪',
 '⏫',
 '⏬',
 '⏭',
 '⏮',
 '⏯',
 '⏰',
 '⏱',
 '⏲',
 '⏳',
 '⏸',
 '⏹',
 '⏺',
 'Ⓜ',
 '▪',
 '▫',
 '▶',
 '◀',
 '◻',
 '◼',
 '◽',
 '◾',
 '☀',
 '☁',
 '☂',
 '☃',
 '☄',
 '☎',
 '☑',
 '☔',
 '☕',
 '☘',
 '☝',
 '☠',
 '☢',
 '☣',
 '☦',
 '☪',
 '☮',
 '☯',
 '☸',
 '☹',
 '☺',
 '♀',
 '♂',
 '♈',
 '♉',
 '♊',
 '♋',
 '♌',
 '♍',
 '♎',
 '♏',
 '♐',
 '♑',
 '♒',
 '♓',
 '♟',
 '♠',
 '♣',
 '♥',
 '♦',
 '♨',
 '♻',
 '♾',
 '♿',
 '⚒',
 '⚓',
 '⚔',
 '⚕',
 '⚖',
 '⚗',
 '⚙',
 '⚛',
 '⚜',
 '⚠',
 '⚡',
 '⚧',
 '⚪',
 '⚫',
 '⚰',
 '⚱',
 '⚽',
 '⚾',
 '⛄',
 '⛅',
 '⛈',
 '⛎',
 '⛏',
 '⛑',
 '⛓',
 '⛔',
 '⛩',
 '⛪',
 '⛰',
 '⛱',
 '⛲',
 '⛳',
 '⛴',
 '⛵',
 '⛷',
 '⛸',
 '⛹',
 '⛺',
 '⛽',
 '✂',
 '✅',
 '✈',
 '✉',
 '✊',
 '✋',
 '✌',
 '✍',
 '✏',
 '✒',
 '✔',
 '✖',
 '✝',
 '✡',
 '✨',
 '✳',
 '✴',
 '❄',
 '❇',
 '❌',
 '❎',
 '❓',
 '❔',
 '❕',
 '❗',
 '❣',
 '❤',
 '➕',
 '➖',
 '➗',
 '➡',
 '➰',
 '➿',
 '⤴',
 '⤵',
 '⬅',
 '⬆',
 '⬇',
 '⬛',
 '⬜',
 '⭐',
 '⭕',

In [79]:
import os
import gensim
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib.pyplot import figure
import emoji

figure(figsize=(12, 12), dpi=150)

# Set file names and location
root = os.path.join(".", "Data")
sg_file = os.path.join(root, "model_tweets_with_emojis_12M_sgns.bin")
cbow_file = os.path.join(root, "model_tweets_with_emojis_12M_cbow.bin")

# Load vocab features
vocab_df = pd.DataFrame({"Emoji": all_emojis_from_data}).replace("", np.nan).dropna()
print(vocab_df)

# Load embedding (change file to change type)
model = gensim.models.fasttext.load_facebook_vectors(sg_file)
model_type = "sgns"
print(model)

# Get embeddings for the vocab in the file
embeddings = []
emojis_with_embeddings = []
for the_emoji in vocab_df.loc[:, "Emoji"].values:
    try:
        embeddings.append(model[the_emoji])
        emojis_with_embeddings.append(the_emoji)
    except:
        continue

emojis_with_embeddings_df = pd.DataFrame({"Emoji": emojis_with_embeddings})
vocab_df = emojis_with_embeddings_df


# Now individual arrays into single matrix (note shifting types!)
embeddings = np.vstack(embeddings)
print(embeddings)

# Cluster words
cluster = KMeans(n_clusters=20, init="k-means++", max_iter=300)

# Fit cluster (i.e., run it)
cluster.fit(embeddings)

# Now get cluster labels
vocab_df.loc[:, "Topic"] = [str(x) for x in cluster.labels_]
vocab_df.sort_values("Topic", inplace=True)
print(vocab_df)

# Now save and visualize clusters
root = os.path.join(".", "clusters-2235")
for cluster, cluster_df in vocab_df.groupby("Topic"):
    # Get the vocab items
    vocab = cluster_df.loc[:, "Emoji"].values
    save = True

    # Now find the distance of each word from the centroid
    cluster_shape = model.rank_by_centrality(vocab, use_norm=True)

    cluster_shape = pd.DataFrame(cluster_shape, columns=["Similarity", "Emoji"])
    print(cluster_shape)
    cluster_shape.to_csv(
        os.path.join(root, "Clusters." + model_type + "." + str(cluster) + ".csv")
    )

    # Get distance from the center word
    try:
        distances_center = model.distances(
            cluster_shape.iloc[0, 1], other_words=tuple(vocab)
        )
    except:
        save = False
    try:
        distances_edge = model.distances(
            cluster_shape.iloc[-1, 1], other_words=tuple(vocab)
        )
    except:
        save = False

    # Just to catch any key errors for missing words
    if save == True:
        # Make dataframe
        cluster_graph = pd.DataFrame([vocab, distances_center, distances_edge]).T
        cluster_graph.columns = ["Word", "Distance from Center", "Distance from Edge"]

        # Make graph of the cluster
        ax = sns.scatterplot(
            data=cluster_graph, x="Distance from Center", y="Distance from Edge"
        )

        # Add words to label points
        for row in cluster_graph.itertuples():
            the_emoji = row[1]
            x = row[2]
            y = row[3]

            plt.annotate(
                emoji.demojize(the_emoji).strip(":"),
                (x, y),
                textcoords="offset points",
                xytext=(0, 10),
                ha="center",
            )

        # Save
        plt.savefig(
            os.path.join(root, "Clusters." + model_type + "." + str(cluster) + ".png"),
            bbox_inches="tight",
            dpi=150,
        )
        plt.clf()

     Emoji
1        ©
2        ®
3        ‼
4        ⁉
5        ™
...    ...
1333     🫢
1334     🫣
1335     🫱
1336     🫲
1337     🫶

[1337 rows x 1 columns]
FastTextKeyedVectors<vector_size=100, 781763 keys>
[[ 0.7504815  -0.12357643  0.2813078  ... -0.7362428  -0.21499234
   0.29283905]
 [ 0.100286   -0.14175165 -0.327772   ... -0.03339017 -0.4929159
   0.0641002 ]
 [ 0.748908    0.315675   -0.06812673 ... -0.14983422 -0.74549055
   0.04139285]
 ...
 [ 0.17411348  0.03496134 -0.03184183 ... -0.18819378  0.04994455
   0.00530196]
 [-0.17472786 -0.02215752  0.1227208  ... -0.14464186  0.19242333
   0.24125613]
 [-0.29211238 -0.27425668  0.16486563 ... -0.3110864   0.36513796
   0.0600051 ]]
     Emoji Topic
1269     🫖     0
508      👟     0
511      👢     0
512      👣     0
232      🌾     0
...    ...   ...
308      🎊     9
307      🎉     9
299      🎁     9
1026     🤶     9
306      🎈     9

[1270 rows x 2 columns]
     Similarity Emoji
0      0.854889     🪶
1      0.835376     🍱
2     

<Figure size 1800x1800 with 0 Axes>

## Analogy analysis

In [46]:
def find_analogy_result(model, e1, e2, e3, n_closest):
    # Check if the given emojis are in the vocabulary
    if e1 not in model or e2 not in model or e3 not in model:
        return None

    # Calculate the analogy result: e2 - e1 + e3
    e1_vector = model[e1]
    e2_vector = model[e2]
    e3_vector = model[e3]
    analogy_vector = e2_vector - e1_vector + e3_vector

    # Find the closest emoji embedding to the analogy vector
    nearest_emoji = [res[0] for res in model.similar_by_vector(analogy_vector, topn=500) if emoji.emoji_count(res) > 0]

    return nearest_emoji[:n_closest]

In [44]:
import os
import gensim
import pandas as pd

analogy_triplets = [
    ("🙂", "😄", "😢"),
    ("👍", "👎", "❤️"),
    ("👨", "🤴", "👩"),
]

root = os.path.join(".", "Data")
sg_file = os.path.join(root, "model_tweets_with_emojis_12M_sgns.bin")
cbow_file = os.path.join(root, "model_tweets_with_emojis_12M_cbow.bin")

analogies_dfs = []
for embedding_file in [sg_file, cbow_file]:
    # Get model_type
    if "sgns" in embedding_file:
        model_type = "sg"
    elif "cbow" in embedding_file:
        model_type = "cbow"

    model = gensim.models.fasttext.load_facebook_vectors(embedding_file)

    print(model_type)
    df_for_embedding_set = []
    for triplet in analogy_triplets:
        # e1, e2, e3 =
        analogy_result = find_analogy_result(
            model, triplet[0], triplet[1], triplet[2], 5
        )
        vector_math = f"{triplet[1]} - {triplet[0]} + {triplet[2]}"
        print(f"'{vector_math} = {analogy_result} (top 5)")
        df_for_embedding_set.append(
            (
                model_type,
                triplet[0],
                triplet[1],
                triplet[2],
                vector_math,
                analogy_result,
            )
        )
    analogies_dfs.append(
        pd.DataFrame(
            df_for_embedding_set,
            columns=["Model", "e1", "e2", "e3", "VectorOperation", "Top5Results"],
        )
    )
    print()

sg
'😄 - 🙂 + 😢 = ['😄', '😢', '😨', '😟', '😥'] (top 5)
'👎 - 👍 + ❤️ = ['💔', '😩', '👎', '🤰', '🫂'] (top 5)
'🤴 - 👨 + 👩 = ['🤴', '👸', '🩰', '🧟', '🥟'] (top 5)



  dists = dot(self.vectors[clip_start:clip_end], mean) / self.norms[clip_start:clip_end]


cbow
'😄 - 🙂 + 😢 = ['😄', '😢', '😟', '😔', '😥'] (top 5)
'👎 - 👍 + ❤️ = ['💘', '🖤', '💔', '🫂', '🥺'] (top 5)
'🤴 - 👨 + 👩 = ['🤴', '👸', '👩', '🍒', '🐻'] (top 5)



In [45]:
pd.concat(analogies_dfs)

Unnamed: 0,Model,e1,e2,e3,VectorOperation,Top5Results
0,sg,🙂,😄,😢,😄 - 🙂 + 😢,"[😄, 😢, 😨, 😟, 😥]"
1,sg,👍,👎,❤️,👎 - 👍 + ❤️,"[💔, 😩, 👎, 🤰, 🫂]"
2,sg,👨,🤴,👩,🤴 - 👨 + 👩,"[🤴, 👸, 🩰, 🧟, 🥟]"
0,cbow,🙂,😄,😢,😄 - 🙂 + 😢,"[😄, 😢, 😟, 😔, 😥]"
1,cbow,👍,👎,❤️,👎 - 👍 + ❤️,"[💘, 🖤, 💔, 🫂, 🥺]"
2,cbow,👨,🤴,👩,🤴 - 👨 + 👩,"[🤴, 👸, 👩, 🍒, 🐻]"


## Stats

In [38]:
# Unique languages
len(tweets_with_emojis['Lang'].unique())

66

In [34]:
# Smileys percentage
len(
    tweets_with_emojis[
        tweets_with_emojis["Emojis"].str.contains(
            '|'.join(all_emojis_by_category[all_emojis_by_category["Category"] == "Smileys"]['Emoji'].to_list())
        )
    ]
)/len(tweets_with_emojis)*100

63.14669785705335