# Word2Vec Analysis of User Bios
References:
    - http://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model/
    - https://radimrehurek.com/gensim/models/word2vec.html
    - https://radimrehurek.com/gensim/auto_examples/core/run_core_concepts.html#sphx-glr-auto-examples-core-run-core-concepts-py

## Library Imports

In [299]:
# Packages for PostgreSQL Import and Export
from sqlalchemy_utils import database_exists, create_database
from sqlalchemy import create_engine
import psycopg2
import os

# Data Management and Visualization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Word2Vec
import pprint
from collections import defaultdict
import gensim

# Visualize embeddings
from sklearn.manifold import TSNE

# Clustering Embedding Vectors for Documents
from sklearn.cluster import AffinityPropagation
from sklearn.cluster import KMeans

# NLTK for working with text data (cleaning and processing)
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

# Other
from itertools import chain
import string

## Data Import from SQL

In [239]:
dbname = "freelance_db"
username = os.environ['USER']
pswd = os.environ['SQLPSWD']

# Connect to Database
con = None
con = psycopg2.connect(database=dbname, user=username,
                       host='localhost', password=pswd)

# Extracting table with user profile and bio
sql_query = """SELECT profile_url, bio from user_details_table;"""
bio_table = pd.read_sql_query(sql_query, con)
bio_table = bio_table.dropna() # Removing users with no bio
bio_table = bio_table[bio_table.bio != "NA"]

## Processing Data

In [None]:
text_corpus = bio_table.bio.tolist()

# Create a set of frequent words
stoplist = stopwords.words('english')

# Tokenize each document
texts = [text.lower().split() for text in text_corpus]

# Removing punctuation
texts = [[s.translate(str.maketrans('', '', string.punctuation)) for s in word] for word in texts]

# Removing stop words
texts = [[x for x in words if x not in stoplist] for words in texts]

# Count word frequencies
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

# Only keep words that appear more than once
processed_corpus = [
    [token for token in text if frequency[token] > 1] for text in texts]

# Creating Word Dictionary
dictionary = corpora.Dictionary(processed_corpus)
num_features = len(dictionary.keys())

## Modeling with tf-idf

In [24]:
# Convert entire corpus to vectors
bow_corpus = [dictionary.doc2bow(text) for text in processed_corpus]

In [25]:
# Train Model
tfidf = models.TfidfModel(bow_corpus)

# Create similarity index
index = similarities.SparseMatrixSimilarity(tfidf[bow_corpus], num_features = num_features)

## Testing against a new example

In [26]:
# Testing new example
query_document = 'human interaction and engineering data c# analysis'.split()

# Converting to vector
query_bow = dictionary.doc2bow(query_document)

# Checking similarity of tfidif conversion of vector
sims = index[tfidf[query_bow]]

['human', 'interaction', 'and', 'engineering', 'data', 'c#', 'analysis']


In [None]:
# Printing similarity scores
for document_number, score in sorted(enumerate(sims), key=lambda x: x[1], reverse=True):
    print(document_number, score)

# Training Word2Vec Neural Network

The idea here is to take the word vector I previously built and reduce it's dimensionality. Then I can insert that into the model as a new feature. I made a realization that I don't have to have it by users. I can simply put all the sentences together. The point is to train a model.

Reference: https://towardsdatascience.com/understanding-feature-engineering-part-4-deep-learning-methods-for-text-data-96c44370bbfa

In [301]:
# Extracting sentences from bios
data = bio_table.bio.tolist()
sentences = [sent_tokenize(text) for text in data]
sentences = list(chain.from_iterable(sentences))

# Lowercasing each sentence
sentences = [w.lower() for w in sentences]

# Splitting sentences into words
words = [s.split(' ') for s in sentences]

# Removing punctuation from each word
words = [[w.translate(str.maketrans('', '', string.punctuation)) for w in x] for x in words]

# Removing stop words
words = [[x for x in s if x not in stoplist] for s in words]

In [302]:
# Train word2vec on the two sentences
size = 50
model = gensim.models.Word2Vec(words, min_count=5, size = size, workers = 4)

In [303]:
words = model.wv.index2word
wvs = model.wv[words]

## Visualizing Word Embeddings

In [304]:
# tsne = TSNE(n_components=2, random_state=0, n_iter=5000, perplexity=2)
# np.set_printoptions(suppress=True)
# T = tsne.fit_transform(wvs)
# labels = words

In [305]:
# plt.figure(figsize=(12, 6))
# plt.scatter(T[:, 0], T[:, 1], c='orange', edgecolors='r')
# # for label, x, y in zip(labels, T[:, 0], T[:, 1]):
# #     plt.annotate(label, xy=(x+1, y+1), xytext=(0, 0), textcoords='offset points')

## Clustering Text Documents Using Word Embeddings

In [306]:
# Extracting sentences from bios
data = bio_table.bio.tolist()

tokenized_corpus = [word_tokenize(s) for s in data]

In [307]:
def average_word_vectors(words, model, vocabulary, num_features):
    
    feature_vector = np.zeros((num_features,),dtype="float64")
    nwords = 0.
    
    for word in words:
        if word in vocabulary: 
            nwords = nwords + 1.
            feature_vector = np.add(feature_vector, model[word])
    
    if nwords:
        feature_vector = np.divide(feature_vector, nwords)
        
    return feature_vector

def averaged_word_vectorizer(corpus, model, num_features):
    vocabulary = set(model.wv.index2word)
    features = [average_word_vectors(tokenized_sentence, model, vocabulary, num_features)
                    for tokenized_sentence in corpus]
    return np.array(features)


# get document level embeddings
w2v_feature_array = averaged_word_vectorizer(corpus=tokenized_corpus, model=model,
                                             num_features=size)
pd.DataFrame(w2v_feature_array)

  if __name__ == '__main__':


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
0,-0.211775,0.082670,-0.155556,-0.187576,0.527085,0.027952,-0.373018,0.375317,-0.158288,0.287524,...,-0.887868,0.503113,-0.621936,-0.605516,-0.458044,0.360395,0.230850,0.544939,0.089387,0.189030
1,-0.182241,0.072193,-0.133667,-0.160288,0.446433,0.022265,-0.320216,0.321246,-0.137784,0.243430,...,-0.756186,0.430652,-0.532264,-0.518236,-0.389219,0.307597,0.196172,0.466388,0.074990,0.163425
2,-0.247685,0.091381,-0.178052,-0.212091,0.605762,0.029905,-0.435115,0.435463,-0.184536,0.330945,...,-1.028574,0.587998,-0.718808,-0.705244,-0.528510,0.411946,0.265667,0.635704,0.102416,0.220111
3,-0.212179,0.081244,-0.152066,-0.184273,0.519975,0.028526,-0.371839,0.371196,-0.157696,0.279662,...,-0.873660,0.497988,-0.613011,-0.599375,-0.449393,0.352545,0.227080,0.541230,0.086431,0.189738
4,-0.209419,0.080943,-0.148384,-0.184141,0.508552,0.025275,-0.366254,0.366537,-0.154483,0.276676,...,-0.859970,0.493028,-0.602487,-0.589957,-0.441599,0.347658,0.223794,0.531442,0.086779,0.183669
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
780,-0.198885,0.075583,-0.139745,-0.176276,0.486588,0.024890,-0.344290,0.349625,-0.147055,0.265392,...,-0.821497,0.467302,-0.579851,-0.562865,-0.419650,0.335559,0.215934,0.501857,0.084282,0.179124
781,-0.177181,0.067949,-0.127467,-0.158215,0.435342,0.022111,-0.311666,0.315038,-0.133195,0.234625,...,-0.732457,0.419066,-0.514790,-0.503966,-0.376644,0.296298,0.189689,0.453673,0.071469,0.156430
782,-0.116557,0.045219,-0.083370,-0.108826,0.291851,0.015119,-0.207591,0.211861,-0.090320,0.155634,...,-0.491826,0.279252,-0.346432,-0.339466,-0.254675,0.199058,0.128540,0.302518,0.050538,0.106629
783,-0.234010,0.089930,-0.167795,-0.206796,0.575158,0.028459,-0.410356,0.414230,-0.176367,0.312884,...,-0.966784,0.550767,-0.678243,-0.661895,-0.499026,0.389655,0.252732,0.596285,0.097499,0.206240


In [339]:
# Cluster using K Means and Extract Labels
kmeans = KMeans(n_clusters=5, random_state=0).fit(w2v_feature_array)
labels = kmeans.labels_.tolist()

In [345]:
bio_table['cluster'] = labels
bio_table.groupby(['cluster']).cluster.count()

Unnamed: 0,profile_url,bio,Cluster Labels,cluster
0,/freelancers/scopic,Scopic is a U.S. based company specializing in...,4,4
1,/freelancers/top-guru-assistants,Top Guru Assistants is a team of 100+ professi...,0,0
2,/freelancers/eden-programming-village,Our team has many software developers and desi...,1,1
3,/freelancers/avion-technology-inc-chicago,"A Chicago-based company provides web design, w...",4,4
4,/freelancers/j-consulting,Experienced solutions developer with Masters D...,4,4
