great tutorial:
    https://www.kaggle.com/pierremegret/gensim-word2vec-tutorial
        

In [None]:
import re  # For preprocessing
import pandas as pd  # For data handling
from time import time  # To time our operations
from collections import defaultdict  # For word frequency
import spacy  # For preprocessing


import logging  # Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

# user_id to screen_name conversion

In [None]:
# Load the per-user details table; the lookups below read its 'user_id' and
# 'screen_name' columns.
# NOTE: unpickling can execute arbitrary code — only load pickle files from a trusted source.
ud_df = pd.read_pickle('users_with_over_200_DETAILS.pkl')

In [None]:
def _lookup_user_field(key_column, key, value_column, users=None):
    """Return ``value_column`` from the first row whose ``key_column`` equals ``key``.

    Parameters
    ----------
    key_column : str
        Column to match against.
    key : object
        Value to look for in ``key_column``.
    value_column : str
        Column whose value is returned.
    users : pandas.DataFrame, optional
        Table to search; defaults to the notebook-level ``ud_df``.

    Returns
    -------
    str
        The matching value, converted to ``str`` and stripped of surrounding whitespace.

    Raises
    ------
    KeyError
        If no row matches. (Rendering an empty selection with ``to_string``
        would silently return the literal text ``'Series([], )'``.)
    """
    table = ud_df if users is None else users
    matches = table.loc[table[key_column] == key, value_column]
    if matches.empty:
        raise KeyError('{}={!r} not found in user details'.format(key_column, key))
    # Take the first match only, so duplicate rows cannot produce a multi-line string.
    return str(matches.iloc[0]).strip()


def id_to_name(uid, users=None):
    """Return the screen_name for ``uid``; raises KeyError when the id is unknown."""
    return _lookup_user_field('user_id', uid, 'screen_name', users)


def name_to_id(name, users=None):
    """Return the user_id (as a string) for ``name``; raises KeyError when the name is unknown."""
    return _lookup_user_field('screen_name', name, 'user_id', users)

In [None]:
name_to_id('Starbucks')

In [None]:
# Sanity check: resolve a known user_id through the shared helper instead of
# repeating the DataFrame lookup inline.
id_to_name('30973')

# Read train data

In [None]:
# Load the training rows. Later cells iterate over train_data, shuffle each
# row in place and join each row's items into one space-separated string.
# NOTE: unpickling can execute arbitrary code — only load pickle files from a trusted source.
train_data = pd.read_pickle('train_data.pkl')

In [None]:
import random

# Seed the generator so every shuffle in this notebook is reproducible
# under Restart Kernel -> Run All.
random.seed(42)

# NOTE(review): this shuffles only the first row of train_data, in place, before
# df_clean is built — it looks like a leftover experiment; confirm it is intended.
random.shuffle(train_data[0])

In [None]:
len(train_data)

In [None]:
# One space-separated string per training row, stored in a single column (named 0).
joined_rows = []
for line in train_data:
    joined_rows.append(' '.join(str(item) for item in line))
df_clean = pd.DataFrame(joined_rows)

In [None]:
df_clean.shape

# Create permutations of the followed users

In [None]:
number_of_shuffles = 5

# Augment the corpus with extra permutations of every row.
# The frames are collected first and concatenated once: DataFrame.append was
# removed in pandas 2.0, and growing a frame inside a loop copies it every time.
# NOTE: re-running this cell adds another batch of permutations to df_clean.
augmented_frames = [df_clean]
for _ in range(number_of_shuffles):
    for line in train_data:
        random.shuffle(line)  # shuffles the row in place
    augmented_frames.append(
        pd.DataFrame([' '.join(map(str, line)) for line in train_data])
    )
df_clean = pd.concat(augmented_frames)

print( "augemented train data length: " , df_clean.shape)

In [None]:
# Give the single text column (created as column 0) the name the later cells use.
df_clean = df_clean.rename(columns={0: 'clean'})

In [None]:
df_clean.head(5)

### Bigrams - not needed for item2vec

In [None]:
from gensim.models.phrases import Phrases, Phraser
# Split every space-joined row of df_clean['clean'] back into a list of tokens.
sent = [row.split() for row in df_clean['clean']]
# Fit the phrase (bigram) detector on the tokenised rows.
phrases = Phrases(sent, min_count=30, threshold =10000 ,progress_per=10000)
# Wrap the fitted detector in a Phraser and apply it to the corpus; `sentences`
# is what the word-frequency, build_vocab and train cells below consume.
bigram = Phraser(phrases)
sentences = bigram[sent]

In [None]:
# Count how often each token occurs across the whole corpus.
# The loop names deliberately avoid `sent`, which is the tokenised corpus built
# in the bigram cell; reusing it as a loop variable would rebind that name to
# the last sentence.
word_freq = defaultdict(int)
for sentence in sentences:
    for token in sentence:
        word_freq[token] += 1
len(word_freq)

In [None]:
sorted(word_freq, key=word_freq.get, reverse=True)[:10]

## Train the model

In [None]:
import multiprocessing

from gensim.models import Word2Vec

In [None]:
cores = multiprocessing.cpu_count()
cores

In [None]:
# Configure the (still untrained) Word2Vec model.
# NOTE(review): `size` is the gensim 3.x keyword; gensim 4 renamed it to
# `vector_size` — confirm which gensim version this notebook targets.
w2v_model = Word2Vec(min_count=20,
                     window=2,
                     size=300,
                     sample=6e-5,
                     alpha=0.03,
                     min_alpha=0.0007,
                     negative=20,
                     # Leave one core free, but never ask for zero worker
                     # threads on a single-core machine.
                     workers=max(1, cores - 1))

### Build the vocabulary table

In [None]:
# Build the vocabulary from the corpus and report how long it took.
t = time()
w2v_model.build_vocab(sentences, progress_per=10000)
vocab_minutes = round((time() - t) / 60, 2)

print('Time to build vocab: {} mins'.format(vocab_minutes))

### Train the model

In [None]:
w2v_model.corpus_count

In [None]:
# time to train 72606 items: 35.25 mins

In [None]:
# Train on the whole corpus for 30 epochs and report how long it took.
t = time()
w2v_model.train(
    sentences,
    total_examples=w2v_model.corpus_count,
    epochs=30,
    report_delay=1,
)
training_minutes = round((time() - t) / 60, 2)

print('Time to train the model: {} mins'.format(training_minutes))

### Compacting the model - only if not going to train it further

In [None]:
#w2v_model.init_sims(replace=True)

### save the model for later usage

In [None]:
w2v_model.save("word2vec_v2.model")

In [None]:
w2v_model.wv.most_similar(positive=["180505807"])

## t-SNE visualization

In [None]:
import numpy as np
import matplotlib.pyplot as plt


%matplotlib inline

import seaborn as sns
sns.set_style("darkgrid")

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

In [None]:
def tsnescatterplot(model, word, list_names):
    """Plot a 2-D t-SNE projection of a query token, its neighbours and extra tokens.

    The query token is drawn in red, the tokens returned by
    ``model.wv.most_similar([word])`` in blue and the tokens in ``list_names``
    in green. Each point is labelled with the ``screen_name`` that ``ud_df``
    holds for the token's ``user_id`` (the raw token is used when there is no
    matching row).

    Parameters
    ----------
    model : object
        Trained model exposing ``model.wv`` with ``most_similar`` and vector
        lookup by token.
    word : str
        Query token; must be in the model vocabulary.
    list_names : iterable of str
        Additional tokens to plot; each must be in the model vocabulary.

    Returns
    -------
    None
        The scatter plot is drawn on a new 9x9 inch matplotlib figure.

    Raises
    ------
    KeyError
        Propagated from the model when a token is not in its vocabulary.
    """
    def _label_for(token):
        # Resolve a token (user_id) to its screen_name; keep the token itself
        # when ud_df has no matching row, rather than showing 'Series([], )'.
        matches = ud_df.loc[ud_df['user_id'] == token, 'screen_name']
        return str(token) if matches.empty else str(matches.iloc[0]).strip()

    # Query token first, then its nearest neighbours, then the caller's extras.
    close_words = [entry[0] for entry in model.wv.most_similar([word])]
    extra_words = list(list_names)
    tokens = [word] + close_words + extra_words
    color_list = ['red'] + ['blue'] * len(close_words) + ['green'] * len(extra_words)

    # One lookup for all vectors: no hard-coded embedding width and no
    # repeated np.append copies.
    arrays = np.asarray(model.wv[tokens], dtype='f')
    word_labels = [_label_for(token) for token in tokens]

    # Reduce the embedding to at most 20 dimensions with PCA before t-SNE.
    # PCA cannot produce more components than min(n_samples, n_features).
    n_samples, n_features = arrays.shape
    reduc = PCA(n_components=min(20, n_samples, n_features)).fit_transform(arrays)

    # Kept from the original: switches numpy printing to non-scientific
    # notation (a process-wide setting).
    np.set_printoptions(suppress=True)

    # Find t-SNE coordinates for 2 dimensions; perplexity has to stay below
    # the number of points.
    Y = TSNE(n_components=2,
             random_state=0,
             perplexity=min(15, n_samples - 1)).fit_transform(reduc)

    # Set everything up to plot.
    df = pd.DataFrame({'x': Y[:, 0],
                       'y': Y[:, 1],
                       'words': word_labels,
                       'color': color_list})

    fig, ax = plt.subplots()
    fig.set_size_inches(9, 9)

    # Basic scatter plot, drawn explicitly on this figure's axes.
    sns.regplot(data=df,
                x="x",
                y="y",
                fit_reg=False,
                marker="o",
                scatter_kws={'s': 40, 'facecolors': df['color']},
                ax=ax)

    # Annotate every point with its label.
    # NOTE: .title() re-capitalises the label (behaviour kept from the original).
    for row in df.itertuples(index=False):
        ax.text(row.x,
                row.y,
                '  ' + row.words.title(),
                horizontalalignment='left',
                verticalalignment='bottom',
                size=15,
                color=row.color,
                weight='normal')

    # Pad the axes so the labels of the outermost points stay visible.
    ax.set_xlim(Y[:, 0].min() - 50, Y[:, 0].max() + 50)
    ax.set_ylim(Y[:, 1].min() - 50, Y[:, 1].max() + 50)

    ax.set_title('t-SNE visualization for {}'.format(word.title()))

### 10 Most similar words vs. 10 Most dissimilar

In [None]:
def ten_most_similar(wrd):
    """Plot the t-SNE neighbourhood of the user whose screen name is ``wrd``.

    The screen name is resolved to a user id with ``name_to_id``. The plot is
    produced by ``tsnescatterplot`` on ``w2v_model``; the extra tokens passed
    to it are the ones returned by ``most_similar(negative=[uid])``.
    """
    uid = name_to_id(wrd)
    opposite_tokens = []
    for entry in w2v_model.wv.most_similar(negative=[uid]):
        opposite_tokens.append(entry[0])
    tsnescatterplot(w2v_model, uid, opposite_tokens)

ten_most_similar('GalGadot')


Harvard
Target
CNN
Apple
BarackObama
united
ABC
ArianaGrande
NYGovCuomo
KimKardashian
Starbucks


Check these ids:

- 25073877 => @realDonaldTrump
- 180505807 => @instagram
- 25365536 => @KimKardashian
- 79293791 => @rihanna
- 28603812 => @Royals

- @billgates => 50393960
- @cnn => 759251

In [None]:
# Same plot for a hard-coded user_id ('759251', listed in the notes above as @cnn):
# the query token, its nearest neighbours, and the tokens returned by
# most_similar(negative=...).
tsnescatterplot(w2v_model, '759251', [i[0] for i in w2v_model.wv.most_similar(negative=["759251"])])

# Load from drive

In [None]:
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec

In [None]:
# Reload a previously saved model from disk.
# NOTE(review): the save cell above writes "word2vec_v2.model" while this loads
# "word2vec_v1.model" — confirm which version is intended.
model = Word2Vec.load("word2vec_v1.model")