In [49]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from collections import Counter 
import os
from sys import getsizeof

## [1] Fit Method

In [50]:
def fit(dataset):
    
    unique_words = set()
    vocab = dict()
    
    for sentence in tqdm(dataset):
        
        for word in sentence.split():
            if len(word) < 2:
                continue

            unique_words.add(word)
        
    unique_words_list = sorted(list(unique_words))
    
    vocab = {word:index for index, word in enumerate(unique_words_list)}
    
    return vocab

In [51]:
vocab = fit(["abc def aaa prq", "lmn pqr aaaaaaa aaa abbb baaa"])
print(vocab)

100%|██████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 1703.96it/s]


{'aaa': 0, 'aaaaaaa': 1, 'abbb': 2, 'abc': 3, 'baaa': 4, 'def': 5, 'lmn': 6, 'pqr': 7, 'prq': 8}


<ol>
    <li> Matrix matrix 
<pre>
[[1, 0, 0, 0, 0], 
[0, 0, 0, 1, 0], 
[0, 0, 4, 0, 0]]

</pre>
    </li>


</ol>

In [52]:
dense_mtx = np.array([[1, 0, 0, 0, 0], [0, 0, 0, 1, 0], [0, 0, 4, 0, 0]])
dense_mtx = getsizeof(dense_mtx)
print(dense_mtx)

172


In [53]:
sparse_mtx = [(0,0,1), (1,3,1), (2,2,2)]
sparse_mtx = getsizeof(sparse_mtx)
print(sparse_mtx)

88


## [2] Transform Method

### [2.1] Using this method we will compute sparse matrix

In [54]:
def get_unique_words_with_freq(sentence):

    unique_word_with_freq = dict(Counter(sentence.split()))
    
    return unique_word_with_freq

In [55]:
test = 'abc def abc def zzz zzz pqr'
get_unique_words_with_freq(test)

{'abc': 2, 'def': 2, 'zzz': 2, 'pqr': 1}

In [56]:
from scipy.sparse import csr_matrix

In [57]:
strings = ["the method of lagrange multipliers is the economists workhorse for solving optimization problems",
           "the technique is a centerpiece of economic theory but unfortunately its usually taught poorly"]
print("Dataset : ".format(strings))

Dataset : 


In [58]:
vocabulary_i = fit(strings)
print("Vocabulary : {}".format(vocabulary_i))

100%|██████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 2457.84it/s]


Vocabulary : {'but': 0, 'centerpiece': 1, 'economic': 2, 'economists': 3, 'for': 4, 'is': 5, 'its': 6, 'lagrange': 7, 'method': 8, 'multipliers': 9, 'of': 10, 'optimization': 11, 'poorly': 12, 'problems': 13, 'solving': 14, 'taught': 15, 'technique': 16, 'the': 17, 'theory': 18, 'unfortunately': 19, 'usually': 20, 'workhorse': 21}


In [59]:
def transform(dataset, vocabulary):
    """
    Compute sparse marix given dataset and vocabulary
    
    Parameter:
    
    dataset: list()
        List of sentences
    vocabulary: dict()
        {word: frequency}
        
    Return: sparse_mtx
        2-d Matrix
    """
    rows, cols, frequencies = list(), list(), list()
    sparse_mtx = ""
    if isinstance(dataset, list):

        for idx, sentence in enumerate(dataset):

            word_with_freq = get_unique_words_with_freq(sentence)

            for word, frequency in word_with_freq.items():

                if len(word) < 2:
                    continue

                col_index = vocabulary.get(word, -1)
                if col_index != -1:

                    rows.append(idx)
                    cols.append(col_index)
                    frequencies.append(frequency)

        sparse_mtx = csr_matrix((frequencies, (rows,cols)), shape = (len(strings), len(vocabulary)))
        
    return sparse_mtx

In [60]:
sparse_mtx = transform(strings, vocabulary_i)
print(sparse_mtx)

  (0, 3)	1
  (0, 4)	1
  (0, 5)	1
  (0, 7)	1
  (0, 8)	1
  (0, 9)	1
  (0, 10)	1
  (0, 11)	1
  (0, 13)	1
  (0, 14)	1
  (0, 17)	2
  (0, 21)	1
  (1, 0)	1
  (1, 1)	1
  (1, 2)	1
  (1, 5)	1
  (1, 6)	1
  (1, 10)	1
  (1, 12)	1
  (1, 15)	1
  (1, 16)	1
  (1, 17)	1
  (1, 18)	1
  (1, 19)	1
  (1, 20)	1
