# Word embeddings:
In this notebook, we explore how to convert preprocessed textual data into word embeddings.

Import libraries

In [1]:
import concurrent.futures
import time
import numpy as np
import pandas as pd
import pickle
import spacy
from tqdm import tqdm
from sklearn.base import BaseEstimator, TransformerMixin

## Load preprocessed data:

In [2]:
# Load the preprocessed documents that an earlier step saved as a pickle.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only load this file if it comes from a trusted source.
with open('../data/spacy_preprocessed.pickle', 'rb') as file:

    preprocessed_data= pickle.load(file)

In [3]:
# Number of preprocessed documents that were loaded.
len(preprocessed_data)

972406

In [4]:
# Peek at the first document to see what the preprocessed text looks like.
preprocessed_data[0]

'<s> green and blue </s> <s> very thick and soft </s> <s> perfect for layer on cold day </s> <s> like new condition </s> <s> free shipping </s>'

## Load spacy model:

In [5]:
# Load the spaCy pipeline once; `nlp` is used as a global by get_vector and the cells below.
nlp= spacy.load('en_core_web_lg')

Let's try getting some vectors

In [6]:
# Run the pipeline on a single word and check whether a vector is available for it.
tree= nlp('tree')
tree.has_vector

True

In [7]:
# Display the vector spaCy produced for the one-word text 'tree'.
tree.vector

array([ 3.2919e-01, -1.2684e+00, -7.2209e+00, -1.6487e+00,  1.7131e+00,
       -3.1834e+00,  5.6145e-01,  3.6603e+00, -9.6616e-02,  3.2192e+00,
        2.4644e+00,  1.8412e+00, -5.7259e+00,  3.0492e+00, -8.4666e-01,
       -1.7067e+00,  2.3521e+00, -3.3682e-01,  5.7802e+00, -3.7691e+00,
        4.3651e+00,  7.4848e+00,  2.4060e-03,  2.9580e-01, -1.1311e+00,
       -5.1604e+00, -5.8515e+00, -1.5563e+00,  7.7758e-01,  5.4768e+00,
       -4.9496e+00, -1.8279e+00,  1.4919e+00, -5.6514e+00,  3.3848e+00,
       -4.3007e+00, -2.1703e+00,  6.1267e+00,  2.7119e+00, -3.4822e-01,
        2.8005e+00, -2.0020e+00,  3.3536e+00,  4.1416e+00,  3.0407e+00,
        1.2859e+01,  3.3048e+00,  1.0003e-01,  4.2711e-01, -2.8046e+00,
        3.8545e+00,  6.4176e+00, -9.4765e-02, -2.7159e+00, -2.4363e+00,
        2.2896e+00, -1.4956e-01, -3.0021e+00,  5.0707e+00, -3.0722e+00,
       -3.9061e-01, -3.0551e+00,  4.8163e+00,  2.6263e+00, -5.7205e+00,
        3.2005e+00,  1.6785e+00, -1.9029e+00, -3.0395e+00,  1.68

Let's try to get a single vector for a complete document (here, the first preprocessed entry, which contains several `<s> ... </s>` segments)

In [8]:
# Doc.vector gives one fixed-length vector for the whole text
# (per the spaCy docs it defaults to the average of the token vectors).
# NOTE(review): the `<s>` / `</s>` markers are part of the text, so presumably
# they are tokenized and enter that average too -- confirm this is intended.
sent= nlp(preprocessed_data[0]).vector

In [9]:
# Turn the 1-D vector into a single-row 2-D array so that several of them
# can later be stacked into a (n_documents, n_features) matrix.
sent = sent.reshape(1,-1)
sent.shape

(1, 300)

## Convert preprocessed text into feature matrix:

In [10]:
def get_vector(doc):
    """Embed one text with the global spaCy pipeline ``nlp``.

    Parameters
    ----------
    doc
        The text handed to ``nlp``.

    Returns
    -------
    numpy.ndarray
        The document vector reshaped to a single row, i.e. shape ``(1, dim)``.
    """
    doc_vector = nlp(doc).vector
    # One row per document, so callers can concatenate rows into a matrix.
    return doc_vector.reshape(1, -1)

Get the vectors for the documents (the cell below processes only the first 100)

In [11]:
# Embed the first 100 documents using a pool of worker threads.
#
# executor.map returns results in the same order as its inputs, so
# vector_list[i] is the vector of preprocessed_data[i]. Collecting results with
# as_completed instead would append them in *completion* order, which silently
# breaks the row <-> document alignment that any later labels rely on.
# If get_vector raises for a document, the exception is re-raised here while
# the results are being collected.
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    vector_list = list(executor.map(get_vector, preprocessed_data[:100]))

print("List of vectors created.")

List of vectors created.


Concatenate all the vectors

In [12]:
# Stack the single-row arrays along axis 0 into one 2-D matrix, one row per document.
vectors = np.concatenate(vector_list)

In [13]:
# sequential code
# data_list = [nlp(doc).vector.reshape(1,-1) for doc in tqdm(preprocessed_data)]
# data = np.concatenate(data_list)

## Sklearn integration:

In [14]:
class WordVectorTransformer(TransformerMixin, BaseEstimator):
    """Scikit-learn transformer that turns texts into spaCy document vectors.

    The transformer is stateless: ``fit`` learns nothing, and ``transform``
    maps every input text to the ``Doc.vector`` of the configured spaCy
    pipeline.

    Parameters
    ----------
    model : str, default="en_core_web_lg"
        Name of the spaCy pipeline handed to ``spacy.load``.
    """

    def __init__(self, model="en_core_web_lg"):
        # Only the model *name* is stored, unmodified, so that scikit-learn's
        # get_params / set_params / clone work as usual.
        self.model = model

    def fit(self, X, y=None):
        """Do nothing and return ``self``; present for pipeline compatibility."""
        return self

    def transform(self, X):
        """Embed every text in ``X``.

        Parameters
        ----------
        X : iterable of str
            The texts to embed.

        Returns
        -------
        numpy.ndarray
            Array of shape ``(n_texts, vector_dim)`` whose i-th row is the
            document vector of the i-th text. An empty ``X`` yields an array
            of shape ``(0, vector_dim)`` instead of raising.
        """
        # The pipeline is loaded on every call, as before, so the estimator
        # itself never holds the (large) model object.
        nlp = spacy.load(self.model)

        # nlp.pipe streams the texts through the pipeline in batches and yields
        # the Docs in input order; only the vectors are kept, not the Docs.
        rows = [doc.vector for doc in nlp.pipe(X)]

        if not rows:
            # Stacking an empty list raises ValueError; return a well-formed
            # empty feature matrix so downstream estimators see a 2-D array.
            return np.zeros((0, nlp.vocab.vectors_length), dtype=np.float32)

        return np.vstack(rows)

In [16]:
# Try the transformer on the first 100 documents; fit is a no-op, so
# fit_transform simply returns transform's output for this slice.
transformer = WordVectorTransformer()
vs=transformer.fit_transform(preprocessed_data[:100])

In [17]:
# Expect one row per input document and one column per vector dimension.
vs.shape

(100, 300)