In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib.pyplot import figure
from sklearn import feature_extraction
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the tweets into a DataFrame
tweets_df = pd.read_csv("../project2/Data.Original.1900")
tweets_df

Unnamed: 0,City,Lang,Time,Text,Geohash,Country,Region
0,pruzhany,en,2021-12-06 15:13:17,@WarcraftDevs - demon spikes reflect x % of in...,u936uxwkybu2,by,europe_east
1,lumphat,und,2021-12-06 15:25:06,@phuonganhh21 e,w6khf3wghq2s,kh,asia_southeast
2,tsiombe,fr,2021-12-06 15:50:41,Eh le poto il est à l’aise avec sa question 💀💀...,m5824frby6pt,mg,africa_sub
3,boruny,und,2021-12-06 15:55:03,@SobolLubov #ОскарПутину #Оскар #Путинизм #Нав...,u9dt48j9wb99,by,europe_east
4,vilkaviskis,en,2021-12-06 17:15:42,Top rated project! @polygen_io and #Launchpad ...,u98quj5919n4,lt,europe_east
...,...,...,...,...,...,...,...
12744328,soka,und,2021-12-21 16:43:39,https://t.co/cDu892Apj6,xn77v2fkpxu8,jp,asia_east
12744329,soka,ja,2021-12-21 16:43:39,今日だけで4玉食べて冷凍庫を空にした。引越し前うどん。 https://t.co/zG57D...,xn77v2fkpxu8,jp,asia_east
12744330,pedro betancourt,ja,2021-12-21 16:43:39,是我了 https://t.co/KPTyf0hJCO,dhn1q1q7ttq1,cu,america_central
12744331,soka,und,2021-12-21 16:43:40,🙇‍♀️💕💕 https://t.co/orOVDvtL85,xn77v2fkpxu8,jp,asia_east


In [3]:
# Drop empty rows
tweets_df = tweets_df.dropna()
tweets_df

Unnamed: 0,City,Lang,Time,Text,Geohash,Country,Region
0,pruzhany,en,2021-12-06 15:13:17,@WarcraftDevs - demon spikes reflect x % of in...,u936uxwkybu2,by,europe_east
1,lumphat,und,2021-12-06 15:25:06,@phuonganhh21 e,w6khf3wghq2s,kh,asia_southeast
2,tsiombe,fr,2021-12-06 15:50:41,Eh le poto il est à l’aise avec sa question 💀💀...,m5824frby6pt,mg,africa_sub
3,boruny,und,2021-12-06 15:55:03,@SobolLubov #ОскарПутину #Оскар #Путинизм #Нав...,u9dt48j9wb99,by,europe_east
4,vilkaviskis,en,2021-12-06 17:15:42,Top rated project! @polygen_io and #Launchpad ...,u98quj5919n4,lt,europe_east
...,...,...,...,...,...,...,...
12744328,soka,und,2021-12-21 16:43:39,https://t.co/cDu892Apj6,xn77v2fkpxu8,jp,asia_east
12744329,soka,ja,2021-12-21 16:43:39,今日だけで4玉食べて冷凍庫を空にした。引越し前うどん。 https://t.co/zG57D...,xn77v2fkpxu8,jp,asia_east
12744330,pedro betancourt,ja,2021-12-21 16:43:39,是我了 https://t.co/KPTyf0hJCO,dhn1q1q7ttq1,cu,america_central
12744331,soka,und,2021-12-21 16:43:40,🙇‍♀️💕💕 https://t.co/orOVDvtL85,xn77v2fkpxu8,jp,asia_east


In [166]:
tweets_df_10000 = tweets_df[:10000]
tweets_df_100k = tweets_df[:100000]

In [110]:
import re
import emoji


def clean_text(text):
    # Remove URLs
    text = re.sub(r"http\S+|www.\S+", "", text, flags=re.MULTILINE)

    # Remove mentions
    text = re.sub(r"@\w+", "", text)

    return text


def get_emojis(text):
    emojis = emoji.analyze(text)
    return list(x[0] for x in emojis)


In [None]:
# Only consider tweets which contain at least 1 emoji
tweets_with_emojis = tweets_df[
    tweets_df["Text"].apply(lambda tweet_text: emoji.emoji_count(tweet_text) > 0)
]

# Apply the cleaning function to the 'Text' column to generate 'CleanedText' column
tweets_with_emojis["CleanedText"] = tweets_with_emojis["Text"].apply(clean_text)

# Get just emojis and store in 'Emojis' column
tweets_with_emojis["Emojis"] = tweets_with_emojis["Text"].apply(
    lambda text: "".join(get_emojis(text))
)

tweets_with_emojis

In [168]:
tweets_with_emojis[:50]['CleanedText']

2        Eh le poto il est à l’aise avec sa question 💀💀 
11                        😭😭😭😭😭 ur ugly for that wallahi
17               Hacer cosas de novios sin ser novios ♥️
18      انا جزائري لا كن أرى أن جزائري افضل في اسوء ح...
21                            Lmaooo i felt that shit 🤣💀
29                                            おはようございます🕺
30                                                   ❤️ 
32                                                 😭😭😭😭 
36     Hello!  PaidTunes is a company that pays you w...
42                                               لصقوو 💙
43                                       صلي علي النبي 🤍
51     Знаменитый имбирный эль 🍺 из Англии 🇬🇧 сладкий...
52     До самого конца хочется избавить пиво 🍺 от это...
55        Me casually cleaning all my guns and bullets 😏
57                                        D-24 굿모닝 싸랑해💙 
77                                3 shifts left at unm 🤩
78      Yeah , we found out who lmfaoo but it threw u...
83                       Me que

In [3]:
def is_regional_indicator_or_zwj_or_gender(char):
    return char.encode("utf-8")[:3] == b"\xf0\x9f\x87" or char in [
        "\u200d",
        "\u2640",
        "\u2642",
    ]


def extract_emoji_sequences(text):
    emoji_sequences = []
    formatted_text = ""
    current_sequence = ""
    for i, char in enumerate(text):
        # Check if is emoji or is regional indicator like "🇦"
        if emoji.is_emoji(char) or is_regional_indicator_or_zwj_or_gender(char):
            current_sequence += char
        else:
            if current_sequence:
                emoji_sequences.append(current_sequence)
                if formatted_text and formatted_text[-1] != " ":
                    # print("adding space")
                    formatted_text += " "
                formatted_text += current_sequence
                current_sequence = ""

            if (
                formatted_text
                and formatted_text[-1] != " "
                and (
                    emoji.is_emoji(formatted_text[-1])
                    or is_regional_indicator_or_zwj_or_gender(char)
                )
            ):
                formatted_text += " "
            formatted_text += char

    if current_sequence:
        emoji_sequences.append(current_sequence)
        if formatted_text and formatted_text[-1] != " ":
            formatted_text += " "
        formatted_text += current_sequence

    return formatted_text.strip(), emoji_sequences

In [None]:
# Extract sequences
tweets_with_emojis["SeparatedText"], tweets_with_emojis["EmojiSequences"] = zip(
    *tweets_with_emojis["CleanedText"].apply(extract_emoji_sequences)
)

In [170]:
tweets_with_emojis

Unnamed: 0,City,Lang,Time,Text,Geohash,Country,Region,CleanedText,Emojis,SeparatedText,EmojiSequences
2,tsiombe,fr,2021-12-06 15:50:41,Eh le poto il est à l’aise avec sa question 💀💀...,m5824frby6pt,mg,africa_sub,Eh le poto il est à l’aise avec sa question 💀💀,💀💀,Eh le poto il est à l’aise avec sa question 💀💀,[💀💀]
11,hobyo,en,2021-12-06 23:04:04,@M0zark0 😭😭😭😭😭 ur ugly for that wallahi,t0fw2npnp7hn,so,africa_north,😭😭😭😭😭 ur ugly for that wallahi,😭😭😭😭😭,😭😭😭😭😭 ur ugly for that wallahi,[😭😭😭😭😭]
17,ayny,es,2021-12-06 23:19:23,Hacer cosas de novios sin ser novios ♥️,tx08psjenf21,tj,asia_central,Hacer cosas de novios sin ser novios ♥️,♥️,Hacer cosas de novios sin ser novios ♥ ️,[♥]
18,aflou,ar,2021-12-06 23:23:17,@beINSPORTS_news انا جزائري لا كن أرى أن جزائر...,sn16pc6ddver,dz,africa_north,انا جزائري لا كن أرى أن جزائري افضل في اسوء ح...,😂,انا جزائري لا كن أرى أن جزائري افضل في اسوء حا...,[😂]
21,villa yapacani,en,2021-12-06 23:28:34,@andalechuey Lmaooo i felt that shit 🤣💀,6sft52mvxsct,bo,america_south,Lmaooo i felt that shit 🤣💀,🤣💀,Lmaooo i felt that shit 🤣💀,[🤣💀]
...,...,...,...,...,...,...,...,...,...,...,...
100032,bojnurd,ar,2021-12-15 10:53:44,@heyfa64 ممنونم❤,tq8v0qw6k277,ir,asia_central,ممنونم❤,❤,ممنونم ❤,[❤]
100033,kaliganj,und,2021-12-15 10:53:44,@Milon10941303 ❤️❤️,tupmhqbgvphc,bd,asia_south,❤️❤️,❤️❤️,❤ ️ ❤ ️,"[❤, ❤]"
100035,tiszaujvaros,und,2021-12-15 10:53:44,@kyloliqq 🤍🤍🤍,u2wbxvbrvkcq,hu,europe_east,🤍🤍🤍,🤍🤍🤍,🤍🤍🤍,[🤍🤍🤍]
100039,kuusamo,und,2021-12-15 10:53:45,@kiyowyd 🥰🥵,uesz20fesvvy,fi,europe_west,🥰🥵,🥰🥵,🥰🥵,[🥰🥵]


In [184]:
# Export to txt file (used for embeddings later)
tweets_with_emojis.to_csv(
    "Data/2024-05-07-13-10/tweets_with_emojis.txt",
    columns=["SeparatedText"],
    header=False,
    index=False,
)

In [185]:
# Save preprocessed data to csv
tweets_with_emojis.to_csv("Data/2024-05-07-13-10/tweets_with_emojis_dataframe.csv")

In [4]:
import ast

data_dir = "./Data/2024-05-07-13-10"
# Load data from CSV
tweets_with_emojis = pd.read_csv(
    f"{data_dir}/tweets_with_emojis_dataframe.csv", index_col=0
)

# Convert string representation of lists to actual lists
tweets_with_emojis["EmojiSequences"] = tweets_with_emojis["EmojiSequences"].apply(
    ast.literal_eval
)

## Train embeddings

In [186]:
import fasttext
import gensim
import os


# Print test results
def print_results(N, p, r):
    print("N\t" + str(N))
    print("P@{}\t{:.3f}".format(1, p))
    print("R@{}\t{:.3f}".format(1, r))


# Set data directory
data_dir = os.path.join(".", "Data/2024-05-07-13-10")
data_file = os.path.join(data_dir, "tweets_with_emojis.txt")


# SGNS model
model = fasttext.train_unsupervised(
    input=data_file,
    model="skipgram",
    ws=4,
    minCount=2,
    minn=0,
    maxn=0,
    neg=5,
)

# Save bin format for SGNS
model.save_model(os.path.join(data_dir, "model_tweets_with_emojis_sgns.bin"))


# CBOW model
model = fasttext.train_unsupervised(
    input=data_file,
    model="cbow",
    ws=4,
    minCount=2,
    minn=0,
    maxn=0,
    neg=5,
)

# Save bin format for CBOW
model.save_model(os.path.join(data_dir, "model_tweets_with_emojis_cbow.bin"))


# Convert SGNS from bin to vec format
model = gensim.models.fasttext.load_facebook_vectors(
    os.path.join(data_dir, "model_tweets_with_emojis_sgns.bin")
)
model.save_word2vec_format(os.path.join(data_dir, "model_tweets_with_emojis_sgns.vec"))


# Convert CBOW from bin to vec format
model = gensim.models.fasttext.load_facebook_vectors(
    os.path.join(data_dir, "model_tweets_with_emojis_cbow.bin")
)
model.save_word2vec_format(os.path.join(data_dir, "model_tweets_with_emojis_cbow.vec"))

Read 26M words
Number of words:  796246
Number of labels: 0
Progress: 100.0% words/sec/thread:  108542 lr:  0.000000 avg.loss:  1.053137 ETA:   0h 0m 0s 40.4% words/sec/thread:  112825 lr:  0.029813 avg.loss:  1.497900 ETA:   0h 1m41s
Read 26M words
Number of words:  796246
Number of labels: 0
Progress: 100.0% words/sec/thread:  308815 lr:  0.000000 avg.loss:  1.347733 ETA:   0h 0m 0s  6.0% words/sec/thread:  302074 lr:  0.046991 avg.loss:  3.138034 ETA:   0h 0m59s1.347733 ETA:   0h 0m 0s


## Analysis 

In [95]:
import os
import gensim
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib.pyplot import figure
import emoji

figure(figsize=(12, 12), dpi=150)

# Set file names and location
root = os.path.join(".", "Data/2024-05-07-13-10")
sg_file = os.path.join(root, "model_tweets_with_emojis_sgns.bin")
cbow_file = os.path.join(root, "model_tweets_with_emojis_cbow.bin")

# Load vocab features (tweets )
vocab_df = tweets_with_emojis[
    tweets_with_emojis["EmojiSequences"].apply(
        lambda sequences: all(emoji.emoji_count(seq) > 1 for seq in sequences)
    )
]
print(vocab_df)

# Load embedding (change file to change type)
model = gensim.models.fasttext.load_facebook_vectors(sg_file)
model_type = "cbow"
print(model)

# Get embeddings for the vocab in the file
embeddings = []
emoji_sequences_with_embeddings = []
for emoji_sequences_list in vocab_df.loc[:, "EmojiSequences"].values:
    for emoji_sequence in emoji_sequences_list:
        try:
            embeddings.append(model[emoji_sequence])
            emoji_sequences_with_embeddings.append(emoji_sequence)
        except:
            continue

emoji_seq_with_embeddings_df = pd.DataFrame(
    {"EmojiSequences": emoji_sequences_with_embeddings}
)
vocab_df = emoji_seq_with_embeddings_df


# Now individual arrays into single matrix (note shifting types!)
embeddings = np.vstack(embeddings)
print(embeddings)

# Cluster words
cluster = KMeans(n_clusters=20, init="k-means++", max_iter=300)

# Fit cluster (i.e., run it)
cluster.fit(embeddings)

# Now get cluster labels
vocab_df.loc[:, "Topic"] = [str(x) for x in cluster.labels_]
vocab_df.sort_values("Topic", inplace=True)
print(vocab_df)

# Now save and visualize clusters
root = os.path.join(".", "Clusters/2024-05-08-13-20")
for cluster, cluster_df in vocab_df.groupby("Topic"):
    # Get vocab items in the model
    vocab = set(
        [
            seq
            for seq in cluster_df.loc[:, "EmojiSequences"].values
            if not np.all(model[seq] == 0)  # Make sure embedding is not 0
        ]
    )
    save = True

    if vocab == []:
        continue

    # Now find the distance of each word from the centroid
    cluster_shape = model.rank_by_centrality(vocab, use_norm=True)

    cluster_shape = pd.DataFrame(cluster_shape, columns=["Similarity", "EmojiSequence"])

    # print(cluster_shape)
    cluster_shape.to_csv(
        os.path.join(root, "Clusters." + model_type + "." + str(cluster) + ".csv")
    )

    # Get distance from the center word
    try:
        distances_center = model.distances(
            cluster_shape.iloc[0, 1], other_words=tuple(vocab)
        )
    except:
        save = False
    try:
        distances_edge = model.distances(
            cluster_shape.iloc[-1, 1], other_words=tuple(vocab)
        )
    except:
        save = False

    # Just to catch any key errors for missing words
    if save == True:
        # Make dataframe
        cluster_graph = pd.DataFrame([vocab, distances_center, distances_edge]).T
        cluster_graph.columns = ["Word", "Distance from Center", "Distance from Edge"]

        # Make graph of the cluster
        ax = sns.scatterplot(
            data=cluster_graph, x="Distance from Center", y="Distance from Edge"
        )

        # Add words to label points
        for row in cluster_graph.itertuples():
            emoji_sequence = row[1]
            x = row[2]
            y = row[3]

            # plt.annotate(
            #     emoji.demojize(emoji_sequence).strip(":"),
            #     (x, y),
            #     textcoords="offset points",
            #     xytext=(0, 10),
            #     ha="center",
            # )

        # Save
        plt.savefig(
            os.path.join(root, "Clusters." + model_type + "." + str(cluster) + ".png"),
            bbox_inches="tight",
            dpi=150,
        )
        plt.clf()

                    City Lang                 Time  \
2                tsiombe   fr  2021-12-06 15:50:41   
11                 hobyo   en  2021-12-06 23:04:04   
21        villa yapacani   en  2021-12-06 23:28:34   
32          vangaindrano  und  2021-12-06 23:40:46   
36         david-gorodok   en  2021-12-06 23:48:44   
...                  ...  ...                  ...   
12744226            soka   ja  2021-12-21 16:43:23   
12744230        rumuruti  und  2021-12-21 16:43:24   
12744299            soka   ja  2021-12-21 16:43:35   
12744306  klosterneuburg   en  2021-12-21 16:43:36   
12744323            soka   en  2021-12-21 16:43:38   

                                                       Text       Geohash  \
2         Eh le poto il est à l’aise avec sa question 💀💀...  m5824frby6pt   
11                  @M0zark0 😭😭😭😭😭 ur ugly for that wallahi  t0fw2npnp7hn   
21                  @andalechuey Lmaooo i felt that shit 🤣💀  6sft52mvxsct   
32                             😭😭😭😭 https:/

<Figure size 1800x1800 with 0 Axes>

### Clustering with both individual emojis and sequences

In [105]:
import os
import gensim
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib.pyplot import figure
import emoji

figure(figsize=(12, 12), dpi=150)

# Set file names and location
root = os.path.join(".", "Data/2024-05-07-13-10")
sg_file = os.path.join(root, "model_tweets_with_emojis_sgns.bin")
cbow_file = os.path.join(root, "model_tweets_with_emojis_cbow.bin")

# Load vocab features
vocab_df = tweets_with_emojis

# Load embedding (change file to change type)
model = gensim.models.fasttext.load_facebook_vectors(sg_file)
model_type = "cbow"
print(model)

# Get embeddings for the vocab in the file
embeddings = []
emoji_sequences_with_embeddings = []
for emoji_sequences_list in vocab_df.loc[:, "EmojiSequences"].values:
    for emoji_sequence in emoji_sequences_list:
        try:
            embeddings.append(model[emoji_sequence])
            emoji_sequences_with_embeddings.append(emoji_sequence)
        except:
            continue

emoji_seq_with_embeddings_df = pd.DataFrame(
    {"EmojiSequences": emoji_sequences_with_embeddings}
)
vocab_df = emoji_seq_with_embeddings_df


# Now individual arrays into single matrix (note shifting types!)
embeddings = np.vstack(embeddings)
print(embeddings)

# Cluster words
cluster = KMeans(n_clusters=20, init="k-means++", max_iter=300)

# Fit cluster (i.e., run it)
cluster.fit(embeddings)

# Now get cluster labels
vocab_df.loc[:, "Topic"] = [str(x) for x in cluster.labels_]
vocab_df.sort_values("Topic", inplace=True)
print(vocab_df)

# Now save and visualize clusters
root = os.path.join(".", "Clusters/2024-05-09-12-05")
for cluster, cluster_df in vocab_df.groupby("Topic"):
    # Get the vocab items that are in model
    vocab = set(
        [
            seq
            for seq in cluster_df.loc[:, "EmojiSequences"].values
            if not np.all(model[seq] == 0)  # Make sure embedding is not 0
        ]
    )
    save = True

    if vocab == []:
        continue

    # Now find the distance of each word from the centroid
    cluster_shape = model.rank_by_centrality(vocab, use_norm=True)

    cluster_shape = pd.DataFrame(cluster_shape, columns=["Similarity", "EmojiSequence"])

    # print(cluster_shape)
    cluster_shape.to_csv(
        os.path.join(
            root, "Clusters.IndivAndSeq." + model_type + "." + str(cluster) + ".csv"
        )
    )

    # Get distance from the center word
    try:
        distances_center = model.distances(
            cluster_shape.iloc[0, 1], other_words=tuple(vocab)
        )
    except:
        save = False
    try:
        distances_edge = model.distances(
            cluster_shape.iloc[-1, 1], other_words=tuple(vocab)
        )
    except:
        save = False

    # Just to catch any key errors for missing words
    if save == True:
        # Make dataframe
        cluster_graph = pd.DataFrame([vocab, distances_center, distances_edge]).T
        cluster_graph.columns = ["Word", "Distance from Center", "Distance from Edge"]

        # Make graph of the cluster
        ax = sns.scatterplot(
            data=cluster_graph, x="Distance from Center", y="Distance from Edge"
        )

        # Add words to label points
        for row in cluster_graph.itertuples():
            emoji_sequence = row[1]
            x = row[2]
            y = row[3]

        # Save
        plt.savefig(
            os.path.join(
                root, "Clusters.IndivAndSeq." + model_type + "." + str(cluster) + ".png"
            ),
            bbox_inches="tight",
            dpi=150,
        )
        plt.clf()

FastTextKeyedVectors<vector_size=100, 796246 keys>
[[ 0.41380402 -0.11446256 -0.7173146  ...  0.53497213  0.5812335
   0.32401615]
 [ 0.45882672  0.37555623 -0.5633557  ...  0.10746752  0.18964337
   0.48710862]
 [ 0.23250706  0.81500727  1.166513   ...  0.3877129  -0.26803163
  -1.1895267 ]
 ...
 [ 0.27002043 -1.1184332   0.01037823 ... -0.4062101  -0.22688912
   0.53178984]
 [-1.138427   -0.09576458  0.51226735 ... -0.24816284  1.3126827
  -1.0916843 ]
 [ 0.27747092  0.02784789  0.34089074 ... -0.24584675  0.31233457
  -0.3711467 ]]
        EmojiSequences Topic
2074828              👌     0
1498618              👌     0
3397247              👌     0
2909770              👌     0
878794               👌     0
...                ...   ...
3462411              💛     9
2093404              🧡     9
2902256              💜     9
3347091              🖤     9
1583919              💜     9

[3528540 rows x 2 columns]


<Figure size 1800x1800 with 0 Axes>

In [96]:
# Flatten list of sequences
all_sequences = [sequence for sequences in tweets_with_emojis['EmojiSequences'] for sequence in sequences]
all_sequences_at_least_2 = [sequence for sequence in all_sequences if emoji.emoji_count(sequence) > 1]

Average sequence length: 1.905306602743449


In [97]:
np.mean([emoji.emoji_count(sequence) for sequence in all_sequences_at_least_2])

3.339016947504073

In [98]:
np.median([emoji.emoji_count(sequence) for sequence in all_sequences_at_least_2])

3.0

In [112]:
# How many tweets used?
len(tweets_with_emojis)

2992375

In [107]:
# How many unique sequences of at least 2?
len(set(all_sequences_at_least_2))

244654

In [114]:
# How many unique individual emojis?
all_emojis_from_data = sorted([''.join(get_emojis(x)) for x in list(set(''.join(tweets_with_emojis.loc[:,"Emoji"].values)))])
all_unique_emojis_from_data = set(all_emojis_from_data)
len(all_unique_emojis_from_data)

1338

In [117]:
from collections import Counter

# Count occurrences of each sequence
sequence_counter = Counter(all_sequences_at_least_2)
most_common_sequences = sequence_counter.most_common(10)

print("10 most common sequences:")
for sequence, count in most_common_sequences:
    if emoji.emoji_count(sequence) >= 2:
        print(sequence, ":", count)


# Count occurrences of each emoji
sequence_counter = Counter([seq for seq in all_sequences if len(get_emojis(seq)) == 1])
most_common_emojis = sequence_counter.most_common(10)

print("10 most common emojis:")
for common_emoji, count in most_common_emojis:
    print(common_emoji, ":", count)

10 most common sequences:
😂😂😂 : 57871
😂😂 : 56719
🤣🤣🤣 : 33592
🤣🤣 : 24275
😂😂😂😂 : 24073
😭😭😭 : 20088
😭😭 : 19998
🤣🤣🤣🤣 : 15248
😂😂😂😂😂 : 11023
😭😭😭😭 : 8581
10 most common emojis:
❤ : 183941
😂 : 153454
😭 : 68989
🤣 : 68818
🥺 : 41917
😍 : 40050
😅 : 37134
♥ : 36839
😊 : 33679
🥰 : 31201


## Neighborhoods

In [217]:
import os
import random
import fasttext
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib.pyplot import figure
figure(figsize=(12, 6), dpi=150)

# Get nearest neighbors for target word (i.e., target emoji)
def get_distances(model, target, vocab):
    # Get embedding for target emoji
    target_embedding = model[target].reshape(1, -1)

    # Get cosine similarity for each word in vocab
    distances = []
    for word in vocab:
        similarity = cosine_similarity(target_embedding, model[word].reshape(1, -1))[0][0]
        distances.append([similarity, word])

    # Make dataframe
    results_df = pd.DataFrame(distances, columns=["Similarity", "Emoji Sequence"])
    results_df.sort_values("Similarity", ascending=False, inplace=True)
    results_df = results_df.head(20)
    results_df.loc[:,"Target Emoji Sequence"] = target
    
    return results_df


# Set file names and location
data_dir = os.path.join(".", "Data/2024-05-07-13-10")
sg_file = os.path.join(data_dir, "model_tweets_with_emojis_sgns.bin")
cbow_file = os.path.join(data_dir, "model_tweets_with_emojis_cbow.bin")


# Get 20 random emoji sequences
# words = random.sample(tweets_with_emojis.loc[:,"Emojis"], 20)
# words = get_emoji_sample_with_categories(n=20) 
seq_in_model = [w for w in model.get_words() if emoji.emoji_count(w) > 1]
words = random.sample(seq_in_model, 20)

# Generate id for file
tag = str(random.randint(1,1000))

# For each type of embedding
for embedding_file in [sg_file, cbow_file]:
    # Get model_type
    if "sgns" in embedding_file:
        model_type = "sg"
    elif "cbow" in embedding_file:
        model_type = "cbow"

    # Load embedding 
    model = fasttext.load_model(embedding_file)
    print(model)

    # Iterate over words (emojis from sample)
    stack = []
    for emoji_sequence in words:
        print(f"getting distances for {emoji_sequence}")
        # emojis_in_same_category = all_emojis_by_category[all_emojis_by_category['Category'] == the_emoji_category]
        nearest = get_distances(model, emoji_sequence, seq_in_model)
        stack.append(nearest)

    # Concat
    nearest_df = pd.concat(stack)
    print(nearest_df)
    
    # Save
    file = f"Neighbors/2024-05-08-07-10/TweetsWithEmojis.Neighbors.{tag}.{model_type}"
    nearest_df.to_csv(file+".csv")
    
    # Graph
    ax = sns.stripplot(data=nearest_df, x="Target Emoji Sequence", hue="Target Emoji Sequence", y="Similarity", jitter=True, size=2)

    # Hide word labels
    frame = plt.gca()
    # frame.axes.xaxis.set_ticklabels([f"{emoji.demojize(w).strip(":")} {w}" for w in words['Emoji']])
    frame.axes.xaxis.set_ticklabels([])
    # plt.xticks(rotation=45, ha='right')


    plt.savefig(file+".neighborhoods.png", dpi = 150, bbox_inches = "tight")
    plt.clf()



<fasttext.FastText._FastText object at 0x279b882c0>
getting distances for 🙌🙃
getting distances for 👏⚽
getting distances for 🇵🇹🇵🇹🇵🇹
getting distances for 🎂🍾🌲
getting distances for 💪🏼🤜🏼🤛🏼
getting distances for 😐👍
getting distances for 😭😭😢😢
getting distances for 🤪👌
getting distances for 👩🏾‍❤
getting distances for 😷🦠💉
getting distances for 😂😂😂💔💔💔
getting distances for 👍🥶
getting distances for 😂😂😭😭"
getting distances for 😬🤔"
getting distances for 🍿🍿🍿🍿🍿🍿🍿🍿
getting distances for 🥲💕💕
getting distances for 🌳🌷
getting distances for 🐶💩
getting distances for 🤩👌"
getting distances for 🤣🤣🙏
       Similarity Emoji Sequence Target Emoji Sequence
28784    1.000000             🙌🙃                    🙌🙃
29581    0.934571            ✨🙌"                    🙌🙃
54220    0.925216             😳🛐                    🙌🙃
38791    0.919492            🤗👏🏼                    🙌🙃
25509    0.918676            😍🙌"                    🙌🙃
...           ...            ...                   ...
44593    0.879122           🤩🎉🎁💵



<fasttext.FastText._FastText object at 0x277abf3e0>
getting distances for 🙌🙃
getting distances for 👏⚽
getting distances for 🇵🇹🇵🇹🇵🇹
getting distances for 🎂🍾🌲
getting distances for 💪🏼🤜🏼🤛🏼
getting distances for 😐👍
getting distances for 😭😭😢😢
getting distances for 🤪👌
getting distances for 👩🏾‍❤
getting distances for 😷🦠💉
getting distances for 😂😂😂💔💔💔
getting distances for 👍🥶
getting distances for 😂😂😭😭"
getting distances for 😬🤔"
getting distances for 🍿🍿🍿🍿🍿🍿🍿🍿
getting distances for 🥲💕💕
getting distances for 🌳🌷
getting distances for 🐶💩
getting distances for 🤩👌"
getting distances for 🤣🤣🙏
       Similarity Emoji Sequence Target Emoji Sequence
28784    1.000000             🙌🙃                    🙌🙃
6854     0.863498             ❤🎶                    🙌🙃
6872     0.859594            🥚🥚🥚                    🙌🙃
13799    0.859109            🤗😍"                    🙌🙃
10228    0.850270             😈🤤                    🙌🙃
...           ...            ...                   ...
19272    0.856314            🙏🏼🍀

<Figure size 1800x900 with 0 Axes>

In [101]:
len(set(all_sequences))

247621

In [205]:
model = fasttext.load_model(cbow_file)
# get_distances(model, 🥱🤢,🐶🐶😊)
word = "🥱🤢"
target = "🐶🐶😊"
target_embedding = model[target].reshape(1, -1)
cosine_similarity(target_embedding, model[word].reshape(1, -1))[0][0]



0.0

In [216]:
len([w for w in model.get_words() if emoji.emoji_count(w) > 1])

60231