# Embedding structure through mentions

In [None]:
# required imports to access api_db, misc, misc.CONFIG, ...
import sys
sys.path = ['.', '..', '../..'] + sys.path
from collection import *

<hr>
<h1 align="center">driver code</h1>

1. Extract the data from the DB and organize it in a local file
2. Train the embeddings
3. Export the embeddings (and maybe the model)
4. Upload them for the tensor viewer
5. Delete the local file

In [None]:
# Output locations for this run: every artefact lives under FOLDER and uses NAME as its prefix.
FOLDER = "../embeddings/"
NAME = "tweet_relations_mentions"
# MODEL and TF_OUT are both FOLDER + NAME plus a suffix, resolved through abs_path.
MODEL, TF_OUT = (
    abs_path(FOLDER + NAME + suffix) for suffix in (".model", "_tf_out")
)
# JOBLIB = abs_path(FOLDER + NAME + ".joblib")
# CSV = abs_path("embeddings/" + NAME + ".csv") # done dynamically

In [None]:
# Make sure the output folder exists before anything is written into it.
# exist_ok=True replaces the separate os.path.exists check, which could race
# with another process creating the folder between the check and the call.
FOLDER_FULL_PATH = abs_path(FOLDER)
os.makedirs(FOLDER_FULL_PATH, exist_ok=True)

In [None]:
# MongoDB filter: keep only tweets for which `user_mentions.1` exists.
# NOTE(review): if `user_mentions` is an array this means "at least two
# mentions" — confirm against the collection schema.
filter_options = {'user_mentions.1': {"$exists": True}}

In [None]:
# Count the matching tweets up front; `total` is passed to DynamicParallelism
# and also determines its batch size.
total = api_db.col_tweets.count_documents(filter_options)
print("Total to process: %d" % total)

In [None]:
def task_mentions(skip, limit):
    """Print the mentions of one slice of tweets, one comma-separated line per tweet.

    The tweets matching the mentions filter are read from `api_db.col_tweets`,
    restricted to the window given by `skip` and `limit`.  For every tweet the
    items of its `user_mentions` field are converted with `str` and joined
    with commas; the resulting lines are written to stdout in batches.

    Args:
        skip: number of matching tweets to skip (passed to the cursor's `.skip`).
        limit: maximum number of tweets to process (passed to the cursor's `.limit`).

    Tweets whose `user_mentions` field is missing or not iterable are skipped.
    """
    filter_options = {'user_mentions.1': {"$exists": True}}
    tweets = api_db.col_tweets.find(filter_options, {"user": True, "user_mentions": True}).skip(skip).limit(limit)
    batch_size = 10000
    lines = []

    def flush():
        # One print per batch; nothing is printed for an empty batch so that
        # no blank line ends up in the output.
        if lines:
            print("\n".join(lines))
            del lines[:]

    for tweet in tweets:
        # Format the line right away: a malformed tweet is then dropped on its
        # own instead of failing later in the middle of a batch flush, which
        # would leave the batch uncleared and print its lines again.
        try:
            line = ",".join(map(str, tweet["user_mentions"]))
        except (KeyError, TypeError):
            continue
        lines.append(line)
        if len(lines) >= batch_size:
            flush()
    flush()

In [None]:
# Set up the parallel extraction: `task_mentions` is the task, NAME labels the
# run, the batch size is a quarter of the total and max_threads is 4.
# NOTE(review): total // 4 is 0 when fewer than 4 tweets match — confirm that
# DynamicParallelism copes with batch_size=0.
dp = DynamicParallelism(total, task_mentions, NAME, batch_size=total//4, max_threads=4)

In [None]:
# Run the extraction. The value returned by reduce() is kept as CSV and is
# later opened as the training corpus file.
# NOTE(review): presumably reduce() merges the per-batch outputs into one file — confirm.
CSV = dp.run().reduce()

In [None]:
# NOTE(review): presumably removes DynamicParallelism's temporary files —
# confirm it leaves the CSV in place, since the corpus iterator reads it later.
dp.clean()

---
# train embeddings

In [None]:
class RelationsTweetsCorpus:
    """A restartable iterable that yields sentences (lists of str).

    Every non-empty line of the file at the module-level ``CSV`` path is one
    sentence whose tokens are separated by commas.  The file is re-opened on
    each ``__iter__`` call, so the corpus can be traversed more than once, as
    gensim's Word2Vec requires of its ``sentences`` argument.
    """

    def __iter__(self):
        with DoneMessage("Iterating mentions corpus"):
            # `with` closes the file even when the consumer stops iterating early.
            with open(CSV, encoding="utf-8") as corpus_file:
                for line in corpus_file:
                    line = line.strip()
                    # A blank line would otherwise become the bogus sentence [''].
                    if not line:
                        continue
                    # One document per line, tokens separated by comma.
                    yield line.split(",")

In [None]:
import gensim.models

In [None]:
# EMBEDDINGS CONFIGS
MIN_COUNT = 25  # passed as Word2Vec `min_count` when the model is trained
SIZE = 64  # passed as Word2Vec `size` (embedding dimensionality) when the model is trained

In [None]:
# Train the embeddings on the mentions corpus (each sentence is the list of mentions of one tweet).
# NOTE(review): window=1000 is presumably meant to exceed any mention list so
# that all mentions of a tweet share one context — confirm.
# NOTE(review): `size` is the pre-4.0 gensim keyword (renamed `vector_size` in
# gensim 4) — confirm the gensim version this notebook is pinned to.
model = gensim.models.Word2Vec(sentences=RelationsTweetsCorpus(), compute_loss=True, min_count=MIN_COUNT, size=SIZE, window=1000) # default alpha=0.025

In [None]:
import os

In [None]:
# Save only the word vectors (model.wv) in word2vec format at the MODEL path.
# check https://medium.com/@aakashchotrani/visualizing-your-own-word-embeddings-using-tensorflow-688b3a7750ee
# to use with `python -m gensim.scripts.word2vec2tensor -i INPUT_FILE_PATH -o OUTPUT_FILE_PATH`
with DoneMessage("Saving model locally"):
    model.wv.save_word2vec_format(MODEL)

In [None]:
# Convert the saved word2vec file into the TSV files used for visualization.
import subprocess
import sys

with DoneMessage("Output embeddings for visualization"):
    # An argument list (no shell) keeps paths with spaces or shell
    # metacharacters intact; sys.executable runs the script with the same
    # interpreter, and therefore the same gensim, as this notebook.
    conversion = subprocess.run(
        [sys.executable, "-m", "gensim.scripts.word2vec2tensor", "-i", MODEL, "-o", TF_OUT]
    )
    # os.system ignored the exit status; report a failed conversion instead.
    if conversion.returncode != 0:
        print("word2vec2tensor failed with exit code %d" % conversion.returncode)

---
# converting _ids to screen_names

In [None]:
# METADATA_IDS is read as input when converting ids to handles (one _id per line).
# NOTE(review): presumably this is the metadata file word2vec2tensor writes
# next to the TF_OUT prefix — confirm the suffix.
METADATA_IDS = TF_OUT + "_metadata.tsv"
# METADATA_HANDLES is the output file of that conversion.
METADATA_HANDLES = TF_OUT + "_metadata_handles.tsv"

In [None]:
# Cache mapping an _id (as str) to a screen name, persisted in CACHE_FILE.
CACHE_FILE = abs_path("../_cache_id_screen_name.json")
CACHE = {}
if os.path.isfile(CACHE_FILE):
    # Loading is best effort: on failure keep the empty cache and go on.
    # `except Exception` (not a bare except) lets KeyboardInterrupt and
    # SystemExit through, and the cause is reported.
    try:
        CACHE = json_to_dict(CACHE_FILE)
    except Exception as exc:
        print("Failed to load cache: %s" % exc)

In [None]:
# Rewrite the metadata file line by line, replacing each _id with its screen
# name; a line keeps the _id itself when no screen name can be resolved, so
# the output always has exactly one line per input line.
CACHE_SAVE_EVERY = 10  # persist the cache after this many newly resolved ids


def _lookup_screen_name(_id):
    """Return the screen name for `_id` (a str), or None if it cannot be resolved."""
    try:
        numeric_id = int(_id)
    except ValueError:
        # blank or non-numeric line: nothing to look up
        return None
    # check if in db
    account = api_db.col_users.find_one({"_id": numeric_id, "screen_name": {"$exists": True}}, {"screen_name": True})
    # query api
    if not account:
        temp_account = get_account_details(user_id=_id)
        if temp_account: account = user_to_db_format(temp_account)
    return account["screen_name"] if account else None


counter = 0
with DoneMessage("Converting _ids to handles"):
    with open(METADATA_IDS, "r", encoding="utf-8") as inf, \
            open(METADATA_HANDLES, "w", encoding="utf-8") as outf:
        for _id in inf:
            _id = _id.strip()
            # check cache
            if _id in CACHE:
                outf.write("%s\n" % CACHE[_id])
                continue

            screen_name = _lookup_screen_name(_id)
            if screen_name is None:
                outf.write("%s\n" % _id) # defaults to the _id value
                continue

            CACHE[_id] = screen_name
            outf.write("%s\n" % screen_name)
            counter += 1
            # Save only right after the counter changed; checking on every
            # line rewrote the cache file for each unresolved id while the
            # counter sat on a multiple of 10 (including 0).
            if counter % CACHE_SAVE_EVERY == 0: dict_to_json(CACHE, CACHE_FILE)

dict_to_json(CACHE, CACHE_FILE)

In [None]:
# remove the MODEL
with DoneMessage("removing the model"):
    # Removal is best effort; only file-system errors are expected here, so
    # catch OSError rather than everything, and report the cause.
    try:
        os.remove(MODEL)
    except OSError as exc:
        print("failed to remove the model: %s" % exc)

In [None]:
# Final marker printed once the last cell of the notebook is reached.
print("DONE")