# Word2Vec Analysis of User Bios
References:
    - http://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model/
    - https://radimrehurek.com/gensim/models/word2vec.html
    - https://radimrehurek.com/gensim/auto_examples/core/run_core_concepts.html#sphx-glr-auto-examples-core-run-core-concepts-py

## Library Imports

In [1]:
# Packages for PostgreSQL Import and Export
from sqlalchemy_utils import database_exists, create_database
from sqlalchemy import create_engine
import psycopg2
import os

# Data Management and Visualization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Word2Vec
import pprint
from collections import defaultdict
import gensim

# Further Reducing the Dimensionality of Word Embeddings
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# NLTK for working with text data (cleaning and processing)
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

# Other
from itertools import chain
import string
import pickle

## Data Import from SQL

In [2]:
dbname = "freelance_db"
username = os.environ['USER']
pswd = os.environ['SQLPSWD']

# Connect to Database
con = None
con = psycopg2.connect(database=dbname, user=username,
                       host='localhost', password=pswd)

# Extracting table with user profile and bio
sql_query = """SELECT profile_url, bio from user_details_table;"""
bio_table = pd.read_sql_query(sql_query, con)
bio_table = bio_table.dropna() # Removing users with no bio
bio_table = bio_table.loc[bio_table.bio != "NA", :]

# Obtaining a Word2Vec Embedding

The idea here is to take the word vector I previously built and reduce it's dimensionality. Then I can insert that into the model as a new feature. I made a realization that I don't have to have it by users. I can simply put all the sentences together. The point is to train a model.

Reference: https://towardsdatascience.com/understanding-feature-engineering-part-4-deep-learning-methods-for-text-data-96c44370bbfa

## Pre-processing

In [3]:
# Extracting sentences from bios
data = bio_table.bio.tolist()
sentences = [sent_tokenize(text) for text in data]
sentences = list(chain.from_iterable(sentences))

# Lowercasing each sentence
sentences = [w.lower() for w in sentences]

# Splitting sentences into words
words = [s.split(' ') for s in sentences]

# Removing punctuation from each word
words = [[w.translate(str.maketrans('', '', string.punctuation)) for w in x] for x in words]

# Removing stop words
stoplist = stopwords.words('english')
words = [[x for x in s if x not in stoplist] for s in words]

## Training Word2Vec

In [4]:
# Train word2vec
size = 50 # Don't go higher than 10% 100 or 200 will be sufficient; Check overfitting drop between train and test.
model = gensim.models.Word2Vec(words, min_count=5, size = size, workers = 1)

In [5]:
model.wv.most_similar(['web'], topn=5)

[('developer', 0.9998610615730286),
 ('services', 0.9998580813407898),
 ('application', 0.9998360872268677),
 ('development', 0.9998316168785095),
 ('including', 0.9998306035995483)]

In [6]:
# Exporting Model with pickle
filename = os.environ['PWD'] + '/scripts/models/model_w2v.sav'
pickle.dump(model, open(filename, 'wb'))

## Extracting Embeddings

In [7]:
# Extracting sentences from bios
data = bio_table.bio.tolist()

tokenized_corpus = [word_tokenize(s) for s in data]

In [8]:
# This extracts the embeddings at the sentence level and then averages across sentences within
# a document in order to obtain document (user) level embedding vector
def average_word_vectors(words, model, vocabulary, num_features):
    
    feature_vector = np.zeros((num_features,),dtype="float64")
    nwords = 0.
    
    for word in words:
        if word in vocabulary: 
            nwords = nwords + 1.
            feature_vector = np.add(feature_vector, model[word])
    
    if nwords:
        feature_vector = np.divide(feature_vector, nwords)
        
    return feature_vector

def averaged_word_vectorizer(corpus, model, num_features):
    vocabulary = set(model.wv.index2word)
    features = [average_word_vectors(tokenized_sentence, model, vocabulary, num_features)
                    for tokenized_sentence in corpus]
    return np.array(features)

In [11]:
# Extracting Datafrom 
words = model.wv.index2word
wvs = model.wv[words]

# get document level embeddings
w2v_feature_array = averaged_word_vectorizer(corpus=tokenized_corpus, model=model,
                                             num_features=size)
embeddings = pd.DataFrame(w2v_feature_array)

  # This is added back by InteractiveShellApp.init_path()


## Exporting Embeddings to SQL

In [12]:
# Adding profile_urls for merging later
embeddings['profile_url'] = "https://www.guru.com" + bio_table.profile_url
embeddings

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,41,42,43,44,45,46,47,48,49,profile_url
0,-0.473687,-0.083848,0.146523,-0.094585,-0.364188,0.158576,-0.071422,-0.367279,-0.087792,0.526227,...,0.423684,0.511746,-0.046579,-0.475470,-0.171319,-0.202593,0.092216,-0.391020,-0.543387,https://www.guru.com/freelancers/scopic
1,-0.397739,-0.072081,0.125606,-0.082030,-0.310165,0.133467,-0.059300,-0.313602,-0.073510,0.443927,...,0.359313,0.432702,-0.040519,-0.403511,-0.146243,-0.171523,0.074763,-0.329972,-0.459600,https://www.guru.com/freelancers/top-guru-assi...
2,-0.536638,-0.097036,0.173734,-0.108803,-0.413958,0.174294,-0.078517,-0.427603,-0.097882,0.596463,...,0.478170,0.583258,-0.058174,-0.539831,-0.194414,-0.231520,0.101226,-0.439445,-0.621933,https://www.guru.com/freelancers/eden-programm...
3,-0.467405,-0.085745,0.147209,-0.094812,-0.362905,0.156062,-0.068085,-0.366998,-0.087990,0.517890,...,0.418161,0.507364,-0.048871,-0.471296,-0.169405,-0.197554,0.089430,-0.383217,-0.538900,https://www.guru.com/freelancers/avion-technol...
4,-0.458545,-0.082751,0.145163,-0.093410,-0.351569,0.151399,-0.066140,-0.360898,-0.086942,0.508418,...,0.409719,0.498466,-0.046255,-0.460647,-0.165392,-0.194842,0.087352,-0.374891,-0.527440,https://www.guru.com/freelancers/j-consulting
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
780,-0.437466,-0.078369,0.139276,-0.088720,-0.335110,0.143663,-0.068917,-0.344429,-0.080906,0.480431,...,0.394473,0.469618,-0.043726,-0.436772,-0.159378,-0.186742,0.087153,-0.357306,-0.498007,https://www.guru.com/freelancers/leslie-mestrow
781,-0.389108,-0.071778,0.123694,-0.079138,-0.302635,0.129613,-0.059151,-0.306527,-0.072863,0.431356,...,0.351334,0.422329,-0.038835,-0.393794,-0.142296,-0.165632,0.073958,-0.320322,-0.448946,https://www.guru.com/freelancers/four-winds-gr...
782,-0.263986,-0.051583,0.084650,-0.051157,-0.205489,0.086467,-0.041054,-0.205224,-0.049693,0.294936,...,0.239662,0.286001,-0.029092,-0.267428,-0.098488,-0.115194,0.049589,-0.217621,-0.305413,https://www.guru.com/freelancers/robert-bennear
783,-0.514803,-0.094894,0.162922,-0.104494,-0.399490,0.170641,-0.077902,-0.401780,-0.095346,0.570083,...,0.460974,0.557570,-0.052579,-0.517442,-0.188584,-0.218128,0.098148,-0.418702,-0.588495,https://www.guru.com/freelancers/david-cas


In [13]:
# Adding profile_urls for merging later
embeddings['profile_url'] = "https://www.guru.com" + bio_table.profile_url

# Connect to the database and save data to it
engine = create_engine('postgresql://%s:%s@localhost/%s' %
                       (username, pswd, dbname))
embeddings.to_sql("embeddings_table", engine, if_exists='replace')

# TF-IDF Modeling

### Processing Data

In [None]:
text_corpus = bio_table.bio.tolist()

# Create a set of frequent words
stoplist = stopwords.words('english')

# Tokenize each document
texts = [text.lower().split() for text in text_corpus]

# Removing punctuation
texts = [[s.translate(str.maketrans('', '', string.punctuation)) for s in word] for word in texts]

# Removing stop words
texts = [[x for x in words if x not in stoplist] for words in texts]

# Count word frequencies
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

# Only keep words that appear more than once
processed_corpus = [
    [token for token in text if frequency[token] > 1] for text in texts]

# Creating Word Dictionary
dictionary = corpora.Dictionary(processed_corpus)
num_features = len(dictionary.keys())

## Modeling with tf-idf

In [None]:
# Convert entire corpus to vectors
bow_corpus = [dictionary.doc2bow(text) for text in processed_corpus]

In [None]:
# Train Model
tfidf = models.TfidfModel(bow_corpus)

# Create similarity index
index = similarities.SparseMatrixSimilarity(tfidf[bow_corpus], num_features = num_features)

## Testing against a new example

In [None]:
# Testing new example
query_document = 'human interaction and engineering data c# analysis'.split()

# Converting to vector
query_bow = dictionary.doc2bow(query_document)

# Checking similarity of tfidif conversion of vector
sims = index[tfidf[query_bow]]

In [None]:
# Printing similarity scores
for document_number, score in sorted(enumerate(sims), key=lambda x: x[1], reverse=True):
    print(document_number, score)

# Dimensionality Reduction on Embeddings

Reference: https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html

In [None]:
# Fitting PCA Model to aggregated embeddings (or should I do this to the fully embedding then aggregate?)
x = StandardScaler().fit_transform(w2v_feature_array)
pca = PCA(.95)

pca.fit(x)
out = pd.DataFrame(pca.fit_transform(x))

In [None]:
print(pca.explained_variance_ratio_)
print(pca.singular_values_)