In [1]:
%load_ext autoreload
%autoreload 2

In [21]:
from dask_jobqueue import SLURMCluster
from dask.distributed import Client
import numpy as np
from time import sleep
import os
import pandas as pd
from tqdm import tqdm
import json
from typing import Union, List

In [3]:
import nltk
# Make sure the NLTK "stopwords" corpus is available locally before it is imported from
# nltk.corpus; the call reports "already up-to-date" when the package is already installed.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/mittal.nit/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
from nltk.corpus import stopwords 

In [5]:
# English stop words from the NLTK corpus, materialized as a set.
stop_words = set(stopwords.words('english')) 

In [6]:
# Project root: the parent of the "notebooks" directory when the kernel runs from there,
# otherwise the working directory itself. Only the last path component is inspected, so a
# "notebooks" substring elsewhere in the path (e.g. "/home/me/notebooks_old/proj") is kept.
_cwd = os.getcwd()
if os.path.basename(_cwd) == "notebooks":
    # Joining with "" keeps a trailing separator, so plain string concatenation onto
    # PROJECT_PATH still yields a valid path.
    PROJECT_PATH = os.path.join(os.path.dirname(_cwd), "")
else:
    PROJECT_PATH = _cwd
# All input files (reviews, GloVe vectors, word expansions) are read from <project>/data.
DATA_PATH = os.path.join(PROJECT_PATH, "data")

In [7]:
os.listdir(DATA_PATH)

['yelp_academic_dataset_review.json',
 'glove.6B.zip',
 'glove.6B.200d.txt',
 'glove.6B.50d.txt',
 'word_expansion.json',
 'glove.6B.300d.txt',
 'glove.6B.100d.txt']

In [8]:
# Load the word-expansion table.
# The file is not strict JSON as stored: the cleanup below removes line breaks and tabs and
# then drops a comma that sits directly before a closing brace/bracket (presumably trailing
# commas left by hand editing) so that json.loads accepts the text.
with open(os.path.join(DATA_PATH, "word_expansion.json")) as f:
    word_expansions_text = f.read()

# Order matters: newlines/tabs go first so that ",\n}" collapses to ",}" before the
# trailing-comma replacements run.
for old, new in (("\n", ""), ("\t", ""), (",}", "}"), (",]", "]")):
    word_expansions_text = word_expansions_text.replace(old, new)

# Parsed result; the raw text stays under its own name so this name only ever holds the
# decoded JSON value.
word_expansions = json.loads(word_expansions_text)

76it [00:00, 158748.56it/s]


In [9]:
# Read the raw JSON lines of the first `max_reviews` reviews and extract their text.
data = []
max_reviews = 100
with open(os.path.join(DATA_PATH, "yelp_academic_dataset_review.json")) as f:
    for i, line in tqdm(enumerate(f)):
        # Test the limit before appending so that exactly `max_reviews` lines are kept
        # (checking `i > max_reviews` after the append kept two extra lines).
        if i >= max_reviews:
            break
        data.append(line)
# One JSON object per line; json.loads tolerates the trailing newline.
temp = [json.loads(line)["text"] for line in data]

101it [00:00, 378236.34it/s]


In [11]:
# Location of "glove.6B.100d.txt" inside DATA_PATH; handed to process_review as
# embedding_matrix_path by the batching cells.
glove_100d_path = os.path.join(DATA_PATH, "glove.6B.100d.txt")

CPU times: user 4 µs, sys: 5 µs, total: 9 µs
Wall time: 14.8 µs


In [12]:
# Settings forwarded to SLURMCluster: number of workers to start plus the cores / memory
# figures used for the job request (see the dask-jobqueue docs for exact semantics).
slurm_job_spec = {
    "n_workers": 10,
    "cores": 50,
    "memory": "100GB",
}
cluster = SLURMCluster(**slurm_job_spec)
# Last expression of the cell: shows the cluster's rich representation.
cluster

In [13]:
# Distributed client bound to the SLURM cluster created above in this notebook; the batching
# cells call client.submit / client.gather on it.
client = Client(cluster)

In [14]:
def process_review(
    reviews: List[str], 
    embedding_matrix_path: str) -> List[List[np.ndarray]]:
    
    """
    Turn a batch of raw review records into per-word embedding vectors.

    Parameters
    ----------
    
    reviews: List[str]
        Raw JSON lines, one review per entry. Each entry must decode to an object that
        has a "text" field.
    
    embedding_matrix_path: str
        Path to a UTF-8 text file with one embedding per line, formatted as
        "<word> <v1> <v2> ... <vn>" with whitespace separators.
    
    Returns
    -------
    embeddings: List[List[np.ndarray]]
        One list per review, containing one float vector per whitespace-separated token
        of the review text, in order. Tokens that are not present in the embedding file
        are mapped to an all-zero vector of the same dimension.

    Raises
    ------
    ValueError
        If the embedding file holds no vectors, or a vector component is not a number.
    """
    
    texts = [json.loads(raw_review)["text"] for raw_review in reviews]
    
    # NOTE: the embedding file is parsed from scratch on every call, i.e. once per batch.
    embedding_matrix = {}
    
    with open(embedding_matrix_path, encoding="utf-8") as f:
        for line in f:
            parts = line.split(maxsplit=1)
            if len(parts) != 2:
                # Blank line or a word without any coefficients: nothing to store.
                continue
            word, values = parts
            # Store the parsed numeric vector (not the raw text of the line).
            embedding_matrix[word] = np.array(values.split(), dtype=float)
            
    if not embedding_matrix:
        raise ValueError(f"no embeddings found in {embedding_matrix_path!r}")
    
    # The zero vector for out-of-vocabulary tokens takes its size from the first vector.
    embedding_dim = next(iter(embedding_matrix.values())).shape[0]
    unknown_word_embedding = np.zeros(embedding_dim)
    
    return [
        [embedding_matrix.get(word, unknown_word_embedding) for word in text.split()]
        for text in texts
    ]

# Sequential

In [15]:
%%time
# Sequential baseline: embed the first `total_size` reviews in batches of `batch_size`,
# calling process_review directly in this process.
current_batch_no = 0  # incremented before each batch is stored, so keys run 1..N
batch_size = 10
total_size = 100
results_futures = {}  # batch number -> embeddings (plain values here, not futures)
reviews = []
with open(os.path.join(DATA_PATH, "yelp_academic_dataset_review.json")) as f:
    for i, line in tqdm(enumerate(f), leave=False):

        if i >= total_size:
            break

        reviews.append(line)
        if len(reviews) == batch_size:
            current_batch_no += 1
            results_futures[current_batch_no] = process_review(reviews=reviews, embedding_matrix_path=glove_100d_path)
            reviews = []

# Flush the last, partially filled batch under its own key; reusing the current key would
# overwrite the batch stored just before it.
if reviews:
    current_batch_no += 1
    results_futures[current_batch_no] = process_review(reviews=reviews, embedding_matrix_path=glove_100d_path)

# Everything was computed locally, so there is nothing to gather from the cluster.
results = results_futures

                      

CPU times: user 21.9 s, sys: 974 ms, total: 22.9 s
Wall time: 22.6 s




# On HPC

In [20]:
%%time
# Distributed run: submit the first `total_size` reviews to the cluster in batches of
# `batch_size`, one process_review task per batch, then collect all results.
current_batch_no = 0  # incremented before each batch is submitted, so keys run 1..N
batch_size = 1000
total_size = 10000
results_futures = {}  # batch number -> future returned by client.submit
reviews = []
with open(os.path.join(DATA_PATH, "yelp_academic_dataset_review.json")) as f:
    for i, line in tqdm(enumerate(f), leave=False):

        if i >= total_size:
            break

        reviews.append(line)
        if len(reviews) == batch_size:
            current_batch_no += 1
            results_futures[current_batch_no] = client.submit(process_review, reviews=reviews, embedding_matrix_path=glove_100d_path)
            # Rebind instead of clearing: the submitted list must not be emptied afterwards.
            reviews = []

# Submit the last, partially filled batch under its own key; reusing the current key would
# drop the future of the batch submitted just before it.
if reviews:
    current_batch_no += 1
    results_futures[current_batch_no] = client.submit(process_review, reviews=reviews, embedding_matrix_path=glove_100d_path)

# Blocks until every submitted batch has finished; returns a dict with the same keys.
results = client.gather(results_futures)

                            

CPU times: user 21 s, sys: 3.28 s, total: 24.2 s
Wall time: 32.7 s


In [17]:
# results_futures.keys()

In [18]:
# [len(result) for result in results_futures.values()]

In [19]:
# list(results_futures.values())[0].status