In [1]:
import tweepy
import os.path as path
import numpy as np
import nltk.tokenize as tk
import math
import scipy.stats as stats
import pandas as pd
import huggingface_hub
from scipy.stats import pearsonr
from transformers import AutoTokenizer
from transformers import AutoModelForMaskedLM
import torch
from transformers import BertModel
import sklearn.cluster
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import random
import emoji

In [2]:
def get_all_examples(save_file='./twitter_emoji.csv'):
    """Load the rows of a tweet CSV file that have a non-missing ``Text`` value.

    Args:
        save_file: Path of the CSV file to read. Defaults to
            ``'./twitter_emoji.csv'`` (the location the notebook has always used).

    Returns:
        A list with one dict per CSV row (column name -> value), restricted to
        rows whose ``Text`` column is not missing. An empty list is returned,
        after printing a notice, when the file does not exist.
    """
    if not path.exists(save_file):
        print('Could not find tweets, so returning an empty list!')
        return []

    # lineterminator='\n' makes the parser treat only "\n" as the end of a row.
    df = pd.read_csv(save_file, lineterminator='\n')
    rows_with_text = df[df['Text'].notna()]
    return rows_with_text.to_dict(orient='records')

In [3]:
def get_tweets(raw_examples):
    """Flatten a sequence of dict records into one list of their values.

    Every value of every record is emitted, in record order and then in the
    record's own key order; the keys themselves are discarded.
    """
    return [
        field_value
        for record in raw_examples
        for field_value in record.values()
    ]

In [4]:
import re
import emoji

# Sample tweet used to sanity-check the tokenizer below. It mixes emojis glued
# to words, a mention, hashtags, punctuation, embedded newlines and a URL.
tweet = """🧡@KeplerHomes AirdropBox event for #Arbitrum ecological users is here. A total of 550,000 addresses are eligible for #airdrop, and 5 types of AirDropbox with different scarcity can be issued.
❤️
💙Invitation code: 52DC39
🏆Airdrop Portal:👉 https://t.co/fudohu97uV"""

def remove_gack(words):
    """Drop stand-alone punctuation tokens and the token "https" from a token list.

    Only tokens that are *exactly* one punctuation character (or exactly
    "https") are removed; punctuation attached to a word (e.g. "here.") is
    left untouched.

    Args:
        words: List of string tokens. It is filtered in place.

    Returns:
        The same list object that was passed in, after filtering.
    """
    # The backslash is doubled on purpose: the set must contain both "\" and
    # "-", and a bare "\-" is an invalid escape sequence that newer Python
    # versions warn about.
    unwanted = set(".,/';:[]\\-=`~!@#$%^&*()_+{}|<>?")
    unwanted.add("https")
    # Single pass with slice assignment keeps the in-place contract without
    # rescanning the list once per removed element.
    words[:] = [word for word in words if word not in unwanted]
    return words


def tweet_to_words_with_emojis(_tweet):
    """Split a tweet into whitespace-separated tokens, one token per emoji.

    Each emoji found by ``emoji.emoji_list`` is isolated as its own token even
    when it is glued to neighbouring text (``"hi🔥there"`` gives
    ``["hi", "🔥", "there"]``). Everything else is split on whitespace only:
    punctuation, hashtags, mentions and URLs are kept exactly as written.

    Args:
        _tweet: The tweet text. Non-string values (for example the NaN that
            pandas uses for a missing cell) are treated as an empty tweet.

    Returns:
        A list of string tokens in their original order.
    """
    if not isinstance(_tweet, str):
        return []

    # Splice by match position rather than str.replace() on the emoji value.
    # Replacing by value also rewrites occurrences *inside* longer emoji
    # sequences (skin-tone or ZWJ variants) and breaks them apart when the
    # shorter emoji appears elsewhere in the same tweet.
    pieces = []
    cursor = 0
    for match in emoji.emoji_list(_tweet):
        pieces.append(_tweet[cursor:match["match_start"]])
        pieces.append(" " + match["emoji"] + " ")
        cursor = match["match_end"]
    pieces.append(_tweet[cursor:])

    # str.split() with no arguments already collapses runs of whitespace, so
    # no separate whitespace-normalization pass is needed.
    return "".join(pieces).split()

# Tokenize the sample tweet defined above as a quick sanity check
words_array = tweet_to_words_with_emojis(tweet)

# Show the resulting tokens; each emoji should appear as its own list element
print(words_array)


['🧡', '@KeplerHomes', 'AirdropBox', 'event', 'for', '#Arbitrum', 'ecological', 'users', 'is', 'here.', 'A', 'total', 'of', '550,000', 'addresses', 'are', 'eligible', 'for', '#airdrop,', 'and', '5', 'types', 'of', 'AirDropbox', 'with', 'different', 'scarcity', 'can', 'be', 'issued.', '❤️', '💙', 'Invitation', 'code:', '52DC39', '🏆', 'Airdrop', 'Portal:', '👉', 'https://t.co/fudohu97uV']


In [5]:
# Folder containing one CSV file per emoji; each file has a "Text" column.
EMOJI_DATASET_DIR = "emoji_twitter_dataset2"

# File names (without ".csv") of the per-emoji datasets, in load order.
EMOJI_CSV_STEMS = [
    "backhand_index_pointing_right",
    "check_mark_button",
    "check_mark",
    "clown_face",
    "cooking",
    "egg",
    "enraged_face",
    "eyes",
    "face_holding_back_tears",
    "face_savoring_food",
    "face_with_steam_from_nose",
    "face_with_tears_of_joy",
    "fearful_face",
    "fire",
    "folded_hands",
    "ghost",
    "grinning_face_with_sweat",
    "hatching_chick",
    "hot_face",
    "loudly_crying_face",
    "melting_face",
    "middle_finger",
    "party_popper",
    "partying_face",
    "pile_of_poo",
    "rabbit_face",
    "rabbit",
    "red_heart",
    "rolling_on_the_floor_laughing",
    "saluting_face",
    "skull",
    "smiling_face_with_halo",
    "smiling_face_with_heart-eyes",
    "smiling_face_with_hearts",
    "smiling_face_with_sunglasses",
    "smiling_face_with_tear",
    "smiling_face",
    "sparkles",
    "sun",
    "thinking_face",
    "thumbs_up",
    "white_heart",
    "winking_face",
]

# Concatenate the "Text" column of every per-emoji CSV into one flat list.
# The files are read in the order listed above, so the order of `all_tweets`
# is the same as when each file was loaded by its own line of code.
all_tweets = []
for csv_stem in EMOJI_CSV_STEMS:
    emoji_df = pd.read_csv(f"{EMOJI_DATASET_DIR}/{csv_stem}.csv", lineterminator='\n')
    all_tweets.extend(list(emoji_df["Text"]))

In [6]:
# Sanity check: total number of tweets collected from all per-emoji CSV files
len(all_tweets)

860000

In [7]:
# Tokenize every loaded tweet into a list of words/emojis.
# A comprehension is used instead of a `for tweet in ...` loop so that the
# notebook-level `tweet` variable (the sample tweet used to demo the
# tokenizer) is not overwritten with the last tweet of the corpus.
tweets = all_tweets
token_tweets = [tweet_to_words_with_emojis(raw_tweet) for raw_tweet in tweets]

# Preview the first few tokenized tweets
token_tweets[:10]

[['🧡',
  '@KeplerHomes',
  'AirdropBox',
  'event',
  'for',
  '#Arbitrum',
  'ecological',
  'users',
  'is',
  'here.',
  'A',
  'total',
  'of',
  '550,000',
  'addresses',
  'are',
  'eligible',
  'for',
  '#airdrop,',
  'and',
  '5',
  'types',
  'of',
  'AirDropbox',
  'with',
  'different',
  'scarcity',
  'can',
  'be',
  'issued.',
  '💙',
  'Invitation',
  'code:',
  '52DC39',
  '🏆',
  'Airdrop',
  'Portal:',
  '👉',
  'https://t.co/fudohu97uV'],
 ['Remember,',
  'success',
  'in',
  'online',
  'business',
  'is',
  'a',
  'marathon,',
  'not',
  'a',
  'sprint.',
  'Keep',
  'at',
  'it,',
  'stay',
  'focused,',
  'and',
  'success',
  'will',
  'come."',
  '#patience',
  '#onlinebusiness',
  '#success',
  'For',
  'more',
  'tips',
  'and',
  'Strategies,',
  'follow',
  'me',
  '👉',
  '@coach_lawrence1',
  'https://t.co/IvtL9Om86J'],
 ['@occupied_9',
  '@Rhiannon_clare_',
  '@FightHaven',
  'Thanks',
  'for',
  'the',
  'update',
  'the',
  'sh*t',
  'country',
  'the',
  

In [8]:
# all_emojis = []
# for tweet in tweets:
#     for em in emoji.emoji_list(tweet):
#         em = em["emoji"]
#         all_emojis.append(em)
# all_emojis = list(set(all_emojis))

In [9]:
# tweets_array = [tweet_to_words_with_emojis(tweet) for tweet in tweets]
# tweets_array

In [10]:
# all_emojis = []
# for tweet in tweets:
#     for em in emoji.emoji_list(tweet):
#         em = em["emoji"]
#         all_emojis.append(em)
# all_emojis = list(set(all_emojis))

In [11]:
# Collect the distinct emoji tokens that occur anywhere in the tokenized corpus.
# The set comprehension de-duplicates while scanning, instead of first building
# a list of every single emoji occurrence; sorting gives the resulting list a
# reproducible order (iteration order of a set of strings varies between runs).
all_emojis = sorted({
    token
    for tweet_tokens in token_tweets
    for token in tweet_tokens
    if emoji.is_emoji(token)
})

In [12]:
from gensim.models import Word2Vec

# Create an untrained model and scan the corpus once to build its vocabulary.
#
# The corpus is deliberately NOT passed to the constructor: when Word2Vec is
# given sentences at construction time it builds the vocabulary and trains
# right away, so a following build_vocab(..., update=True) on the same corpus
# is a redundant full rescan and the explicit model.train(...) call in the
# next cell becomes a second training run.
# With this form, training happens exactly once, in that explicit train() call.
# NOTE(review): if the extra training pass was intentional, request it
# explicitly via the `epochs` argument instead.
#
# min_count=1 keeps every token, including ones that occur a single time.
model = Word2Vec(min_count=1, max_vocab_size=None)
model.build_vocab(token_tweets)

In [13]:
# Train the embeddings on the tokenized tweets, using the example count and
# epoch count already stored on the model. The tuple displayed below the cell
# is the value returned by train().
# NOTE(review): if `model` was constructed with the corpus passed in, it has
# already been trained once by the constructor — confirm whether this second
# pass is intended.
model.train(
    token_tweets,
    total_examples=model.corpus_count,
    epochs=model.epochs,
    report_delay=1,
)

(73685183, 82791480)

In [17]:
# Inspect the learned embedding vector for the red-heart emoji token
model.wv["❤️"]

array([ 5.0239772e-01, -5.3765707e+00, -3.1128863e-01,  2.2337623e+00,
       -1.5126020e+00,  3.8958380e+00, -3.4616716e+00,  9.3498915e-01,
        4.6467695e+00, -2.3549285e+00, -9.0483183e-01,  4.2565164e-01,
        4.2022276e+00, -2.7177212e+00, -6.4326340e-01,  6.0342914e-01,
        7.7761449e-02, -5.4054016e-01,  2.8076386e-01, -3.0399007e-01,
       -1.0196285e+00, -5.1881026e-02, -2.6516130e+00,  2.5474327e+00,
        9.6406955e-01, -2.0042887e+00,  1.7409179e+00, -9.2644918e-01,
        2.9051108e+00,  1.0117105e+00,  8.3809532e-02,  2.7588732e+00,
        1.5294102e+00,  4.5669273e-01, -3.3542471e+00,  7.7384019e-01,
       -2.2157575e-01, -3.4021541e-01,  3.5055488e-01, -4.5526454e-01,
       -4.0888429e-01,  9.9432194e-01,  1.5269815e+00, -4.6535573e+00,
        4.7304469e-01, -2.2350764e+00,  2.2937322e+00,  5.2155461e+00,
        1.3824332e+00,  2.5452430e+00, -1.3880554e+00,  1.2415099e+00,
        7.8836650e-01,  1.2881939e+00,  1.8563708e+00, -1.6374679e+00,
      

In [18]:
# Persist the model to disk (path is relative to the current working directory)
model.save("emoji2vec.model")

In [16]:
# Reload the previously saved model so the embeddings can be used without
# retraining. The import is repeated here so this cell also works on a fresh
# kernel where the training cells were not run.
# NOTE(review): gensim model files are pickle-based — only load files you trust.
from gensim.models import Word2Vec
model = Word2Vec.load("emoji2vec.model")