# Word2Vec Analysis of User Bios
References:
    - http://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model/
    - https://radimrehurek.com/gensim/models/word2vec.html
    - https://radimrehurek.com/gensim/auto_examples/core/run_core_concepts.html#sphx-glr-auto-examples-core-run-core-concepts-py

## Library Imports

In [1]:
# Packages for PostgreSQL Import and Export
from sqlalchemy_utils import database_exists, create_database
from sqlalchemy import create_engine
import psycopg2
import os

# Data Management and Visualization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Word2Vec
import pprint
from collections import defaultdict
import gensim

# Visualize embeddings
from sklearn.manifold import TSNE

# Clustering Embedding Vectors for Documents
from sklearn.cluster import DBSCAN

# NLTK for working with text data (cleaning and processing)
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

# Other
from itertools import chain
import string

## Data Import from SQL

In [2]:
dbname = "freelance_db"
username = os.environ['USER']
pswd = os.environ['SQLPSWD']

# Connect to Database
con = None
con = psycopg2.connect(database=dbname, user=username,
                       host='localhost', password=pswd)

# Extracting table with user profile and bio
sql_query = """SELECT profile_url, bio from user_details_table;"""
bio_table = pd.read_sql_query(sql_query, con)
bio_table = bio_table.dropna() # Removing users with no bio
bio_table = bio_table.loc[bio_table.bio != "NA", :]

## TF-IDF Modeling

### Processing Data

In [None]:
text_corpus = bio_table.bio.tolist()

# Create a set of frequent words
stoplist = stopwords.words('english')

# Tokenize each document
texts = [text.lower().split() for text in text_corpus]

# Removing punctuation
texts = [[s.translate(str.maketrans('', '', string.punctuation)) for s in word] for word in texts]

# Removing stop words
texts = [[x for x in words if x not in stoplist] for words in texts]

# Count word frequencies
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

# Only keep words that appear more than once
processed_corpus = [
    [token for token in text if frequency[token] > 1] for text in texts]

# Creating Word Dictionary
dictionary = corpora.Dictionary(processed_corpus)
num_features = len(dictionary.keys())

### Modeling with tf-idf

In [24]:
# Convert entire corpus to vectors
bow_corpus = [dictionary.doc2bow(text) for text in processed_corpus]

In [25]:
# Train Model
tfidf = models.TfidfModel(bow_corpus)

# Create similarity index
index = similarities.SparseMatrixSimilarity(tfidf[bow_corpus], num_features = num_features)

### Testing against a new example

In [26]:
# Testing new example
query_document = 'human interaction and engineering data c# analysis'.split()

# Converting to vector
query_bow = dictionary.doc2bow(query_document)

# Checking similarity of tfidif conversion of vector
sims = index[tfidf[query_bow]]

['human', 'interaction', 'and', 'engineering', 'data', 'c#', 'analysis']


In [None]:
# Printing similarity scores
for document_number, score in sorted(enumerate(sims), key=lambda x: x[1], reverse=True):
    print(document_number, score)

# Obtaining a Word2Vec Embedding

The idea here is to take the word vector I previously built and reduce it's dimensionality. Then I can insert that into the model as a new feature. I made a realization that I don't have to have it by users. I can simply put all the sentences together. The point is to train a model.

Reference: https://towardsdatascience.com/understanding-feature-engineering-part-4-deep-learning-methods-for-text-data-96c44370bbfa

## Pre-processing

In [19]:
# Extracting sentences from bios
data = bio_table.bio.tolist()
sentences = [sent_tokenize(text) for text in data]
sentences = list(chain.from_iterable(sentences))

# Lowercasing each sentence
sentences = [w.lower() for w in sentences]

# Splitting sentences into words
words = [s.split(' ') for s in sentences]

# Removing punctuation from each word
words = [[w.translate(str.maketrans('', '', string.punctuation)) for w in x] for x in words]

# Removing stop words
stoplist = stopwords.words('english')
words = [[x for x in s if x not in stoplist] for s in words]

## Training Word2Vec

In [20]:
# Train word2vec
size = 100 # Don't go higher than 10% 100 or 200 will be sufficient; Check overfitting drop between train and test.
model = gensim.models.Word2Vec(words, min_count=5, size = size, workers = 4)

In [21]:
model.wv.most_similar(['web'], topn=5)

[('development', 0.9999358057975769),
 ('software', 0.9999181628227234),
 ('website', 0.9999127388000488),
 ('design', 0.9999101161956787),
 ('application', 0.9999095797538757)]

In [6]:
words = model.wv.index2word
wvs = model.wv[words]

## Extracting Embeddings

In [7]:
# Extracting sentences from bios
data = bio_table.bio.tolist()

tokenized_corpus = [word_tokenize(s) for s in data]

In [8]:
# This extracts the embeddings at the sentence level and then averages across sentences within
# a document in order to obtain document (user) level embedding vector
def average_word_vectors(words, model, vocabulary, num_features):
    
    feature_vector = np.zeros((num_features,),dtype="float64")
    nwords = 0.
    
    for word in words:
        if word in vocabulary: 
            nwords = nwords + 1.
            feature_vector = np.add(feature_vector, model[word])
    
    if nwords:
        feature_vector = np.divide(feature_vector, nwords)
        
    return feature_vector

def averaged_word_vectorizer(corpus, model, num_features):
    vocabulary = set(model.wv.index2word)
    features = [average_word_vectors(tokenized_sentence, model, vocabulary, num_features)
                    for tokenized_sentence in corpus]
    return np.array(features)

# Extracting Datafrom 
words = model.wv.index2word
wvs = model.wv[words]

# get document level embeddings
w2v_feature_array = averaged_word_vectorizer(corpus=tokenized_corpus, model=model,
                                             num_features=size)
embeddings = pd.DataFrame(w2v_feature_array)

  # This is added back by InteractiveShellApp.init_path()


## Exporting Embeddings to SQL

In [None]:
# Adding profile_urls for merging later
embeddings['profile_url'] = bio_table.profile_url

# Connect to the database and save data to it
engine = create_engine('postgresql://%s:%s@localhost/%s' %
                       (username, pswd, dbname))
embeddings.to_sql("embeddings_table", engine, if_exists='replace')

## Visualizing Word Embeddings

In [473]:
# tsne = TSNE(n_components=2, random_state=0, n_iter=5000, perplexity=2)
# np.set_printoptions(suppress=True)
# T = tsne.fit_transform(wvs)
# labels = words

In [474]:
# plt.figure(figsize=(12, 6))
# plt.scatter(T[:, 0], T[:, 1], c='orange', edgecolors='r')
# # for label, x, y in zip(labels, T[:, 0], T[:, 1]):
# #     plt.annotate(label, xy=(x+1, y+1), xytext=(0, 0), textcoords='offset points')

## Clustering Text Documents Using Word Embeddings

This really doesn't seem to be working.

Reference: https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html#sklearn.cluster.KMeans

Notes from Office Hours with Rachel WK:
    - Evaluate with intra-cluster similiarity scores (also between-cluster)
    - Manually inspect clusters and get an idea if the things go together

In [477]:
# Cluster using K Means and Extract Labels
# elbow method (look it up)
# Assumes cluster sizes are the same (try DBSCAN)
dbscan_model = DBSCAN(eps=2, min_samples=2, n_jobs = 4).fit(w2v_feature_array)

In [478]:
# Extracting labels and inertia 
labels = dbscan_model.labels_

In [479]:
# Adding cluster labels to data
bio_table['cluster'] = labels
bio_table.groupby(['cluster']).cluster.count()

cluster
0    785
Name: cluster, dtype: int64

In [480]:
a = bio_table[bio_table.cluster == 19].bio