# Training word2vec

## Data

Loading the data from the GitHub repository and reading it in :)

In [9]:
!wget https://github.com/AdiShirsath/Emoji_Word2Vec/raw/main/dataset/corpus.zip

--2022-12-12 20:41:27--  https://github.com/AdiShirsath/Emoji_Word2Vec/raw/main/dataset/corpus.zip
Resolving github.com (github.com)... 140.82.121.4
Connecting to github.com (github.com)|140.82.121.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/AdiShirsath/Emoji_Word2Vec/main/dataset/corpus.zip [following]
--2022-12-12 20:41:28--  https://raw.githubusercontent.com/AdiShirsath/Emoji_Word2Vec/main/dataset/corpus.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.109.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 22026968 (21M) [application/zip]
Saving to: ‚Äòcorpus.zip‚Äô


2022-12-12 20:41:29 (14.9 MB/s) - ‚Äòcorpus.zip‚Äô saved [22026968/22026968]



In [10]:
!unzip corpus.zip

Archive:  corpus.zip
  inflating: corpus.txt              


In [3]:
with open('corpus.txt', 'r', encoding='utf-8') as f:
    lines = f.read().strip().split('\n')
    data = [line.strip().split() for line in lines]

In [4]:
len(data), data[:5]

(1000000,
 [['they', 'know', 'what', 'i', 'love', 'üòä'],
  ['lemme', 'get', 'to', 'school', 'üôÑ'],
  ['watching', 'the', 'ellen', 'degeneres', 'show', 'üòÉ'],
  ['i',
   'spent',
   'twenty',
   'minutes',
   'herding',
   'turkeys',
   'by',
   'myself',
   'so',
   "that's",
   'how',
   'my',
   'days',
   'going',
   'ü¶É'],
  ['hello', 'seattle', 'cityscape', 'üèó']])

### Preprocessing

In [5]:
import myunicode

In [6]:
def download_all_emojis() -> list[str]:
    emojis = []
    for emoji in myunicode.emoji_iterator():
        try:
            if emoji != myunicode.ens_normalize(emoji):
                continue
        except ValueError:
            continue

        emojis.append(emoji)
    return emojis

In [8]:
all_emojis = download_all_emojis()

In [9]:
all_emojis

['¬©',
 '¬Æ',
 '‚Üî',
 '‚Üï',
 '‚Üñ',
 '‚Üó',
 '‚Üò',
 '‚Üô',
 '‚Ü©',
 '‚Ü™',
 '‚åö',
 '‚åõ',
 '‚å®',
 '‚éà',
 '‚èè',
 '‚è©',
 '‚è™',
 '‚è´',
 '‚è¨',
 '‚è≠',
 '‚èÆ',
 '‚èØ',
 '‚è∞',
 '‚è±',
 '‚è≤',
 '‚è≥',
 '‚è∏',
 '‚èπ',
 '‚è∫',
 '‚ñ™',
 '‚ñ´',
 '‚ñ∂',
 '‚óÄ',
 '‚óª',
 '‚óº',
 '‚óΩ',
 '‚óæ',
 '‚òÄ',
 '‚òÅ',
 '‚òÇ',
 '‚òÉ',
 '‚òÑ',
 '‚òÖ',
 '‚òá',
 '‚òà',
 '‚òâ',
 '‚òä',
 '‚òã',
 '‚òç',
 '‚òé',
 '‚òè',
 '‚òê',
 '‚òë',
 '‚òí',
 '‚òî',
 '‚òï',
 '‚òñ',
 '‚òó',
 '‚òò',
 '‚òô',
 '‚òö',
 '‚òõ',
 '‚òú',
 '‚òù',
 '‚òû',
 '‚òü',
 '‚ò†',
 '‚ò¢',
 '‚ò£',
 '‚ò§',
 '‚ò•',
 '‚ò¶',
 '‚òß',
 '‚ò®',
 '‚ò©',
 '‚ò™',
 '‚ò´',
 '‚ò¨',
 '‚ò≠',
 '‚òÆ',
 '‚òØ',
 '‚ò∏',
 '‚òπ',
 '‚ò∫',
 '‚òª',
 '‚òº',
 '‚òΩ',
 '‚òæ',
 '‚òø',
 '‚ôÄ',
 '‚ôÅ',
 '‚ôÇ',
 '‚ôÉ',
 '‚ôÑ',
 '‚ôÖ',
 '‚ôÜ',
 '‚ôá',
 '‚ôà',
 '‚ôâ',
 '‚ôä',
 '‚ôã',
 '‚ôå',
 '‚ôç',
 '‚ôé',
 '‚ôè',
 '‚ôê',
 '‚ôë',
 '‚ôí',
 '‚ôì',
 '‚ôî',
 '‚ôï',
 '‚ôñ',
 '‚ôó',
 '‚ôò',
 '‚ôô',
 '‚ôö',
 '‚ôõ',
 '‚ôú',
 '‚ôù',
 '‚ôû',
 '‚ôü',
 '‚ô†',
 '‚ô£',
 '‚ô•',
 '‚ô¶',
 '

In [10]:
unique_emojis = set()
for doc in data:
    for token in doc:
        if myunicode.is_emoji(token):
            unique_emojis.add(token)

In [14]:
len(unique_emojis), len(all_emojis)

(1143, 5508)

In [16]:
len(unique_emojis & set(all_emojis))

1082

In [17]:
unique_emojis - set(all_emojis)

{'‚Äº',
 '‚Åâ',
 '‚É£',
 '‚Ñ¢',
 '‚Ñπ',
 '‚òå',
 '‚ò°',
 '‚ò∞',
 '‚ô°',
 '‚ô¢',
 '‚ô§',
 '‚ôß',
 '‚ô≠',
 '‚ö¨',
 '‚õ¶',
 '„äó',
 '„äô',
 'Ô∏è',
 'üá¶',
 'üáß',
 'üá®',
 'üá©',
 'üá™',
 'üá´',
 'üá¨',
 'üá≠',
 'üáÆ',
 'üáØ',
 'üá∞',
 'üá±',
 'üá≤',
 'üá≥',
 'üá¥',
 'üáµ',
 'üá∂',
 'üá∑',
 'üá∏',
 'üáπ',
 'üá∫',
 'üáª',
 'üáº',
 'üáΩ',
 'üáæ',
 'üáø',
 'üàÅ',
 'üàÇ',
 'üàö',
 'üàØ',
 'üà≤',
 'üà≥',
 'üà¥',
 'üàµ',
 'üàπ',
 'üà∫',
 'üâê',
 'üâë',
 'üèª',
 'üèº',
 'üèΩ',
 'üèæ',
 'üèø'}

In [18]:
set(all_emojis) - unique_emojis

{'üßü',
 'ü•†',
 'üÇ±',
 'üßëüèª\u200d‚ù§\u200düßëüèº',
 'ü§õüèΩ',
 'üßëüèæ\u200d‚ù§\u200düíã\u200düßëüèº',
 'üë®\u200düîß',
 'üë∑üèΩ',
 'üë∞\u200d‚ôÄ',
 'üáßüá±',
 'üßëüèΩ\u200dü¶±',
 'üë©üèΩ\u200d‚ù§\u200düë®üèΩ',
 'üßèüèª\u200d‚ôÇ',
 'üë±üèº',
 'üë≤üèª',
 'üë©üèæ\u200dü§ù\u200düë®üèº',
 'üïµüèº\u200d‚ôÇ',
 'ü§πüèø\u200d‚ôÄ',
 'üè≥\u200d‚öß',
 'üë®üèø\u200düåæ',
 'üßë',
 'üßö\u200d‚ôÄ',
 '‚öë',
 'üôéüèΩ\u200d‚ôÄ',
 'üèÑüèº',
 'üèå\u200d‚ôÇ',
 'üë∏üèø',
 'üëÜüèø',
 'üë®\u200düë®\u200düë¶\u200düë¶',
 'üö¥üèº\u200d‚ôÄ',
 'üÅû',
 'üë©üèΩ\u200dü§ù\u200düë©üèø',
 '\U0001f77e',
 'üÅ≠',
 'üë®üèº\u200düç≥',
 'üÄ©',
 'üßùüèæ',
 'üë®üèΩ\u200d‚ù§\u200düíã\u200düë®üèæ',
 'üèÉüèº\u200d‚ôÇ',
 'üõà',
 'ü§µüèΩ\u200d‚ôÇ',
 '\U0001fae2',
 'ü¶∏üèæ',
 'üñæ',
 'üíÅüèº',
 'üá®üá¶',
 'üë®üèª\u200d‚ù§\u200düíã\u200düë®üèø',
 'üßëüèæ\u200dü§ù\u200düßëüèº',
 'üë©üèæ\u200dü§ù\u200düë©üèº',
 'üë©üèª\u20

## Model

In [54]:
import gensim.downloader

import numpy as np
from gensim.models import Word2Vec, KeyedVectors

In [87]:
external_model_path = gensim.downloader.load('glove-twitter-200', return_path=True)
external_model = KeyedVectors.load_word2vec_format(external_model_path)

binary = False
epochs = 4

external_model_path

'/home/goader/gensim-data/glove-twitter-200/glove-twitter-200.gz'

In [24]:
# external_model_path = gensim.downloader.load('word2vec-google-news-300', return_path=True)
# external_model = KeyedVectors.load_word2vec_format(external_model_path, binary=True)

# binary = True
# epochs = 10

# external_model_path

'/home/goader/gensim-data/word2vec-google-news-300/word2vec-google-news-300.gz'

### Defining our model

In [88]:
model = Word2Vec(vector_size=external_model.vector_size, min_count=3, workers=6)
model.build_vocab(data)
model.wv.vectors_lockf = np.ones(len(model.wv))

In [89]:
vocab = list(model.wv.key_to_index.keys())

In [90]:
model.build_vocab([list(external_model.key_to_index.keys())], update=True)
model.wv.intersect_word2vec_format(external_model_path, binary=binary, lockf=1.0)

In [91]:
model.train(data, total_examples=len(data), epochs=epochs)

(32626377, 39956068)

### Predictions

In [92]:
model.wv.similarity('cat', 'üê±')

0.6172112

In [93]:
model.wv.most_similar('cat')

[('kitten', 0.6844383478164673),
 ('kitty', 0.6647198796272278),
 ('animal', 0.6421106457710266),
 ('meow', 0.6401075720787048),
 ('cats', 0.6254410147666931),
 ('pup', 0.6253268718719482),
 ('üê±', 0.6172111630439758),
 ('hamster', 0.6100335121154785),
 ('little', 0.6067391633987427),
 ('puppy', 0.6030903458595276)]

In [94]:
model.wv.most_similar('üê±')

[('kitty', 0.6242503523826599),
 ('cat', 0.6172113418579102),
 ('meow', 0.5820024013519287),
 ('kitten', 0.5706236958503723),
 ('cats', 0.557283341884613),
 ('üçÜ', 0.5394867658615112),
 ('puss', 0.5174140930175781),
 ('pup', 0.5162878632545471),
 ('paws', 0.5158326029777527),
 ('üêï', 0.5149833559989929)]

In [95]:
model.wv.most_similar('üçÜ')

[('dick', 0.6780715584754944),
 ('cock', 0.6641056537628174),
 ('pussy', 0.6268385052680969),
 ('lick', 0.6217989325523376),
 ('üåΩ', 0.6094993948936462),
 ('üçê', 0.6067872047424316),
 ('titties', 0.5962030291557312),
 ('booty', 0.5714117288589478),
 ('sucking', 0.56900954246521),
 ('üçå', 0.5650728344917297)]

In [96]:
model.wv.most_similar('üçë')

[('crappiest', 0.2948598563671112),
 ('stripclubs', 0.28915873169898987),
 ('fillies', 0.2740689218044281),
 ('toff', 0.26720932126045227),
 ('fek', 0.2579253017902374),
 ('‚õ≥', 0.25544315576553345),
 ('drankin', 0.24922415614128113),
 ('guhhhh', 0.24273085594177246),
 ('üîé', 0.24092388153076172),
 ('golds', 0.23822413384914398)]

In [97]:
model.wv.most_similar('üá∫üá¶')

KeyError: "Key 'üá∫üá¶' not present in vocabulary"

In [98]:
model.wv.most_similar('üëç')

[('üàµ', 0.2506566047668457),
 ('ahhhhhhh', 0.2484702318906784),
 ('ao3', 0.24594947695732117),
 ('_‡§ï‡§§', 0.23157167434692383),
 ('myeongdong', 0.2291068285703659),
 ('wolverhampton', 0.22218447923660278),
 ('macarena', 0.2173740118741989),
 ('zxxx', 0.21516813337802887),
 ('a7', 0.20381024479866028),
 ('0.17', 0.20367883145809174)]

In [52]:
model.wv.save_word2vec_format('emoji_w2v')