# https://docs.python.org/3/library/functools.html

Least Recently Used (LRU) is a cache eviction policy that keeps elements ordered by how recently they were used. As the name suggests, when the cache is full, the element that has gone unused for the longest time is evicted from the cache.

# https://numpy.org/doc/stable/reference/generated/numpy.vectorize.html

In [44]:
from nltk.corpus import stopwords
import nltk
import os
import pathlib
import numpy as np
import pandas as pd
from functools import lru_cache , cache

In [45]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [46]:
# English stop-word list from the NLTK stopwords corpus.
# NOTE(review): stop_words is not referenced by any cell visible in this notebook.
stop_words = stopwords.words('english')
# Porter stemmer instance; the preprocess functions call prt.stem on each token.
prt = nltk.stem.PorterStemmer()

In [82]:
def preprocess(document_path):
    """Read a text file and return its normalized, stemmed content.

    This is the uncached baseline; it produces the same output as the
    memoized variants preprocess_lru and preprocess_cache.

    Args:
        document_path: Path of a UTF-8 encoded text file.

    Returns:
        The stemmed terms joined by single spaces.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(document_path, 'r', encoding="utf-8") as file:
        document = file.read()

    # Splitting on a single space only: newlines and tabs stay attached to
    # their neighbouring tokens, which then fail the isalnum() test below.
    tokens = document.split(" ")
    # Keep purely alphanumeric tokens, lower-cased.
    lowered = [token.lower() for token in tokens if token.isalnum()]
    # Drop one-character tokens. No stop-word filtering is applied here.
    kept = [token for token in lowered if len(token) > 1]
    terms = [prt.stem(token) for token in kept]

    # Return the stemmed terms (previously the unstemmed tokens were returned
    # and the stemming work was thrown away).
    return " ".join(terms)
    
# Least Recently Used
@lru_cache(maxsize=128)
def preprocess_lru(document_path):
    """Return the stemmed, space-joined content of a text file (LRU-memoized).

    Results are cached per document_path, keeping at most 128 entries; a
    repeated call with the same path returns the cached string without
    re-reading the file.

    Args:
        document_path: Path of a UTF-8 encoded text file (must be hashable).

    Returns:
        The Porter stems of the kept tokens, joined by single spaces.
    """
    with open(document_path, 'r', encoding="utf-8") as handle:
        raw_text = handle.read()

    stemmed = []
    for piece in raw_text.split(" "):
        # Only purely alphanumeric pieces survive.
        if not piece.isalnum():
            continue
        lowered = piece.lower()
        # One-character tokens are discarded.
        if len(lowered) > 1:
            stemmed.append(prt.stem(lowered))

    return " ".join(stemmed)

@cache
def preprocess_cache(document_path):
    """Return the stemmed, space-joined content of a text file (memoized).

    Results are cached per document_path with no size limit; a repeated call
    with the same path returns the cached string without re-reading the file.

    Args:
        document_path: Path of a UTF-8 encoded text file (must be hashable).

    Returns:
        The Porter stems of the kept tokens, joined by single spaces.
    """
    with open(document_path, 'r', encoding="utf-8") as source:
        words = source.read().split(" ")

    # Lazily lower-case the purely alphanumeric words ...
    lowered = (word.lower() for word in words if word.isalnum())
    # ... then stem every one longer than a single character.
    return " ".join(prt.stem(word) for word in lowered if len(word) > 1)

In [83]:
filename = "D:/repos/custom_classifier/texto.txt"

In [99]:
# Load the raw text of the sample file so its size can be inspected.
with open(filename, 'r', encoding = "utf-8") as file:
    document = file.read()

In [100]:
len(document)

4541

In [84]:
%timeit documents = preprocess(filename)

14.6 ms ± 154 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [85]:
# preprocess_lru is memoized on the path, so after the first call every
# %timeit iteration is a cache hit: this measures the cache lookup, not the
# file read and stemming.
%timeit documents = preprocess_lru(filename)

195 ns ± 3.29 ns per loop (mean ± std. dev. of 7 runs, 10,000,000 loops each)


In [86]:
%timeit documents = preprocess_cache(filename)

190 ns ± 1.66 ns per loop (mean ± std. dev. of 7 runs, 10,000,000 loops each)


In [101]:
# Ratio of the uncached timing (14.6 ms) to the lru_cache timing (195 ns).
# NOTE(review): 14.6 ms is 14,600,000 ns, so the true ratio is roughly
# 74,900x; multiplying by 1000 only converts ms to µs, so this expression
# understates the speedup by a factor of 1000.
(14.6 * 1000) / 195

74.87179487179488

In [87]:
df = pd.read_csv("documents_cv.csv")

In [88]:
df.tail()

Unnamed: 0,Title,Document,Class
15073,Zorah Noar.txt,evaluation the document was created with for z...,non-cv
15074,Zorina Yarranton.txt,evaluation the document was created with for z...,non-cv
15075,Zsa zsa Feldmark.txt,evaluation the document was created with for z...,non-cv
15076,Zulema Pedden.txt,evaluation the document was created with for z...,non-cv
15077,Zulema Road.txt,evaluation the document was created with for z...,non-cv


In [89]:
len(df)

15078

In [90]:
# Ten references to the same DataFrame, used to build a 10x larger frame.
dfs = [df] * 10

In [91]:
dataframe =  pd.concat(dfs, ignore_index=True)

In [92]:
len(dataframe)

150780

In [93]:
def count_words(text, n=2):
    """Count the space-separated tokens whose length is n, 2*n or 3*n.

    Args:
        text: Value to analyse; it is converted with str() before splitting
            on single spaces.
        n: Base token length. Each of the three target lengths is counted
            independently, so equal targets (n == 0) count a token repeatedly.

    Returns:
        The total number of matches as an int.
    """
    words = str(text).split(" ")
    target_lengths = (n, n * 2, n * 3)
    return sum(
        1
        for length in target_lengths
        for word in words
        if len(word) == length
    )
    
# typed=True: the result depends on str(text), so arguments that compare equal
# but have different types (10 vs 10.0, 1.0 vs True) must not share an entry.
@lru_cache(maxsize=128, typed=True)
def count_words_lru(text, n=2):
    """Count the space-separated tokens whose length is n, 2*n or 3*n (LRU-memoized).

    Results are cached per (text, n) argument combination, keeping at most
    128 entries; the least recently used entry is evicted when the cache is
    full.

    Args:
        text: Hashable value to analyse; it is converted with str() before
            splitting on single spaces.
        n: Base token length. Each of the three target lengths is counted
            independently.

    Returns:
        The total number of matches as an int.
    """
    text_list = str(text).split(" ")
    total_count = 0
    for m in (n, n * 2, n * 3):
        total_count += sum(1 for x in text_list if len(x) == m)
    return total_count

# Unbounded cache, equivalent to @cache but with typed=True: the result depends
# on str(text), so arguments that compare equal but have different types
# (10 vs 10.0, 1.0 vs True) must not share an entry.
@lru_cache(maxsize=None, typed=True)
def count_words_cache(text, n=2):
    """Count the space-separated tokens whose length is n, 2*n or 3*n (memoized).

    Results are cached per (text, n) argument combination with no size limit,
    so memory grows with the number of distinct inputs seen.

    Args:
        text: Hashable value to analyse; it is converted with str() before
            splitting on single spaces.
        n: Base token length. Each of the three target lengths is counted
            independently.

    Returns:
        The total number of matches as an int.
    """
    text_list = str(text).split(" ")
    total_count = 0
    for m in (n, n * 2, n * 3):
        total_count += sum(1 for x in text_list if len(x) == m)
    return total_count

In [94]:
%timeit dataframe["count"] = dataframe['Document'].apply(count_words)

10.3 s ± 93.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [95]:
# Wrap count_words so it can be called on a whole column at once.
# np.vectorize is documented as a convenience wrapper (essentially a Python
# for loop), not a performance tool, so no large speedup should be expected.
countwords_vectorized= np.vectorize(count_words)

In [96]:
%timeit dataframe["count"] = countwords_vectorized(dataframe['Document'])

9.98 s ± 59.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [97]:
# count_words_lru keeps only 128 results, while dataframe is ten back-to-back
# copies of the 15,078-row df, so a repeated row recurs 15,078 rows later.
# NOTE(review): if the Document values are mostly distinct, each entry is
# evicted before it can be reused, which would explain a timing close to the
# uncached runs; confirm with count_words_lru.cache_info().
%timeit dataframe["count"] = dataframe['Document'].apply(count_words_lru)

9.15 s ± 61.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [98]:
%timeit dataframe["count"] = dataframe['Document'].apply(count_words_cache)

132 ms ± 2.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
