In [None]:
import gensim
from heapq import nlargest
from load_data import get_hashtags
from load_data import generate_embeddings
from load_data import load_media
from load_data import load_users
from load_data import prep_train_test_bert
from load_data import train_test_bert
from matplotlib import pyplot as plt
import random
from scipy.cluster.hierarchy import dendrogram
from scipy.cluster.hierarchy import linkage

# Seed Python's built-in `random` module so that code drawing from it is repeatable.
# NOTE(review): this call only affects the stdlib generator; if the load_data helpers
# draw from other RNGs (e.g. NumPy), those need their own seed -- confirm.
random.seed(4)

# Load data
Call the matching loader with the path to the respective file:
- `load_users` to load the user graph.
- `load_media` to load the media data.
- `gensim.models.Word2Vec.load` to load the trained embeddings.

In [None]:
# Optional inputs -- uncomment and point at your local copies when they are needed:
# graph = load_users('../users.csv')
# media, hashtags = load_media('../media.csv')

# Hashtag embedding model that the following cells query through `model.wv`.
embedding_model_path = './tag2vec64dIterator.model'
model = gensim.models.Word2Vec.load(embedding_model_path)

# Embedding clustering
Below we generate a dendrogram of the hierarchical clustering of the `plot_N` most common hashtags.

In [None]:
plt.rcParams['figure.figsize'] = [10, 15]
plot_N = 100

# Pick the plot_N hashtags with the highest corpus count. nlargest compares the
# (count, tag) tuples, so equal counts are ordered by the tag text.
most_frequent_tags = [
    tag for _count, tag in nlargest(
        plot_N,
        ((model.wv.vocab[tag].count, tag) for tag in model.wv.vocab),
    )
]

# Look vectors up through `model.wv`, as the rest of the notebook does; indexing
# the Word2Vec model object directly is deprecated in gensim.
tag_vectors = [model.wv[tag] for tag in most_frequent_tags]

# Despite its name, `linkage_method` holds the linkage matrix returned by
# average-linkage clustering on cosine distance (name kept for later cells).
linkage_method = linkage(tag_vectors, method='average', metric='cosine')

# color_threshold: subtrees whose links all lie below this distance share a colour.
dendrogram(linkage_method, labels=most_frequent_tags, orientation='right',
           leaf_font_size=8., color_threshold=0.43)
print('Clustering complete.')

# Hashtag similarity
Use `model.wv.similarity` with two hashtags to compute the cosine similarity between them.

In [None]:
# Hashtag pairs to score against each other.
hashtag_pairs = [
    ("instagood", "instamood"),
    ("christmas", "xmas"),
    ("rap", "rnb"),
    ("dad", "father"),
    ("netflix", "cats"),
    ("nofilter", "sanfrancisco"),
    ("instagood", "garden"),
]

# One similarity score per pair, in the order listed above (displayed as the cell output).
[model.wv.similarity(w1=first, w2=second) for first, second in hashtag_pairs]

# Similar hashtags
Use `model.wv.most_similar` with a single hashtag to find the hashtags most similar to it. Set `topn` to the number of similar hashtags you want returned.

In [None]:
# The 30 nearest neighbours of #christmas as (hashtag, similarity) pairs.
christmas_neighbours = model.wv.most_similar(positive=['christmas'], topn=30)

# Render each neighbour as "#tag - score" with ten decimals (displayed as the cell output).
['#{0} - {1:.10f}'.format(tag, score) for tag, score in christmas_neighbours]


# Arithmetic operations
Use `model.wv.most_similar` with multiple hashtags to do arithmetic operations on hashtag vectors.
Hashtags given in the `negative` parameter are subtracted.

In [None]:
 # Analogy query: halloween + christmas - pumpkin, i.e. "what is to Christmas as
 # pumpkin is to Halloween?". The hashtags are spelled correctly here; the earlier
 # 'helloween' / 'pumkin' spellings would only match rare typo tags, or raise
 # KeyError if those tokens are missing from the vocabulary.
 model.wv.most_similar(
     positive=['halloween', 'christmas'],
     negative=['pumpkin'],
     topn=10,
 )

# Calculating post distance
Given two image posts, you can calculate the distance between them.
Extract each post's hashtags with `get_hashtags` and pass the two results to `model.wv.wmdistance` (Word Mover's Distance).

In [None]:
# Sample post captions (user name, free text and hashtags) to compare pairwise.
posts = [
'''altimqa Takie tam 🏝️🌞🌊
#vacation #holiday #holidays #chill #chillout #fuertaventura #spain #nikon #nikonphotography #corralejo #ocean #goodday #happy #happytime #instatravel #travel #worldtraveler #traveler #adventure #adventuretime #traveling #d5500''',
'''marianamonkey #beach #ocean #portugal #travel #europe #instapic #instagram #instadaily #beautifuldestinations #nature #landscape #photo #instafashion #photographer #view #amazing #instalike#smile #girl #hair #beachbody #bikini #fitness #instafit #sun #dress #naturalbeauty #blue #jeans''',
'''rbariquelo #city #cwb #curitiba #curitibacool #citygram #city_explore #architecture #urbanxplore #skyline''',
'''#parquecascavel #buildings #jardimatlantico #towers #cityscape #landscape #urban #urbanphotography #nightfall #dusk #cityphotography #streetphotography #goiania #architectureporn #arquitetura #architecturephotography #architecture #engenhariacivil #civilengineering #brazil #igersbrasil #igersgoiania'''
]

# Extract each post's hashtags once instead of re-parsing the caption for every pair.
# NOTE(review): assumes get_hashtags is deterministic and returns a reusable
# sequence of tags (wmdistance is handed the same object several times) -- confirm.
post_tags = [get_hashtags(post, model) for post in posts]

for i, tags_i in enumerate(post_tags):
    for j, tags_j in enumerate(post_tags):
        # Hashtags of post i that also occur in post j. These were previously computed
        # but silently dropped because the format string had no placeholder for them.
        shared_tags = [t for t in tags_i if t in tags_j]
        print('post {0} and post {1}: {2:.10f} (shared hashtags: {3})'.format(
            i, j, model.wv.wmdistance(tags_i, tags_j), shared_tags))

# Generating embeddings and testing
Generate embeddings using `generate_embeddings`. You can also do a train and test run with `prep_train_test_bert` followed by `train_test_bert`.

In [None]:
# Example invocations kept for reference (disabled):
# generate_embeddings('../mini_media.csv', './m_tag2vec64dIterator.model', threads=2)
# train_test_hashtag2vec('../mini_media.csv', './m_dist.dat', './models/256dModelRef', 10, # N-way split
#                     result_path='./result256dRef.txt', dim=256, check=10)
# NOTE(review): train_test_hashtag2vec is not among this notebook's imports.

# Optional profiling -- enable together with the matching lines at the end of this cell:
# import cProfile
# pr = cProfile.Profile()
# pr.enable()

# The preparation step and the train/test step receive exactly the same arguments,
# so they are written once and unpacked into both calls.
bert_run_args = (
    '../mini_dataset/mini_media.csv',
    '../mini_dataset/m_dist.dat',
    '../artifacts/models/768dBertModel',
    10,  # N-way split
)
bert_run_options = {
    'result_path': './result768dBert.txt',
    'check': 1,
    'pretrained_weights': 'distilbert-base-uncased',
}

prep_train_test_bert(*bert_run_args, **bert_run_options)
train_test_bert(*bert_run_args, **bert_run_options)

# pr.disable()
# pr.print_stats(sort='time')