In [None]:
from pathlib import Path

# Crawling
# Build a mapping of human-readable document title -> full document text for
# every `.txt` file found directly inside `data/documents`.
data = {}
for doc_path in Path('data/documents').iterdir():
    # Only plain-text documents are indexed; skip everything else.
    if doc_path.suffix != '.txt':
        continue

    # e.g. "my_first_doc.txt" -> "My First Doc"
    doc_name = doc_path.stem.replace('_', ' ').title()
    # Store the text itself. Previously the bound method `f.read` was stored
    # (missing call parentheses), which broke `content.split()` when indexing.
    # `read_text` opens, reads and closes the file in one step.
    data[doc_name] = doc_path.read_text()

In [None]:
# Indexing
# NOTE: this cell needs `data` (built by the crawling cell) and `pipe`
# (built by the text pre-processing cell) to exist, so run those cells first.

# Load the stop words and normalise them with the same pipeline that is applied
# to document words, so that membership checks compare like with like.
with open('data/stop_words.txt') as stop_words_file:
    stop_words = {pipe.transform(line.strip()) for line in stop_words_file}


# Inverted index: normalised word -> set of document names containing it.
index = {}
for doc_name, content in data.items():
    for raw_word in content.split():
        word = pipe.transform(raw_word)

        # empty words (e.g. tokens made only of digits/punctuation)
        if not word:
            continue
        # Ignoring stop words
        if word in stop_words:
            continue

        # adding to index: create the set on first sight, then record the doc
        index.setdefault(word, set()).add(doc_name)

In [None]:
# Search
from termcolor import colored
from collections import Counter

def print_success(text):
    """Print ``text`` to standard output, coloured green via ``colored``."""
    green_text = colored(text, 'green')
    print(green_text)

# Interactive search loop: keep asking for queries until the user enters "q".
# NOTE: relies on `pipe` and `index` being defined by other cells.
while True:
    search_input = input('Search to find a doc (q to quit):')
    if search_input.strip().lower() == 'q':
        break

    # for multiple inputs: split on whitespace FIRST, then normalise each token
    # with the same pipeline used for indexing. Transforming the whole query at
    # once would merge all tokens into one, because the pipeline removes spaces.
    search_tokens = [pipe.transform(token) for token in search_input.split()]

    # Collect every document matching any token (a document appears once per
    # matching token).
    docs = []
    for token in search_tokens:
        docs.extend(index.get(token, []))

    print(docs)


In [None]:
# Text Pre-processing

from abc import ABC, abstractmethod
import string


class TextProcessor(ABC):
    """Abstract base class for a single text-transformation step.

    Concrete subclasses must override :meth:`transform`; the base class itself
    cannot be instantiated.
    """

    @abstractmethod
    def transform(self, text):
        """Transform ``text``; the base implementation does nothing and returns None."""

class ConvertCase(TextProcessor):
    """Transformer that changes the letter case of a text.

    Args:
        casing: Target case; one of ``'lower'`` (default), ``'upper'`` or
            ``'title'``.

    Raises:
        ValueError: If ``casing`` is not one of the supported modes.
    """

    # Modes understood by `transform`.
    SUPPORTED_CASINGS = ('lower', 'upper', 'title')

    def __init__(self, casing='lower'):
        # Fail fast on a bad mode instead of letting `transform` silently
        # return None and crash a later pipeline step with an unrelated error.
        if casing not in self.SUPPORTED_CASINGS:
            raise ValueError(
                f'Unsupported casing {casing!r}; expected one of {self.SUPPORTED_CASINGS}'
            )
        self.casing = casing

    def transform(self, text):
        """Return ``text`` converted to the configured case.

        Raises:
            ValueError: If ``self.casing`` was changed to an unsupported mode
                after construction.
        """
        if self.casing == 'lower':
            return text.lower()
        if self.casing == 'upper':
            return text.upper()
        if self.casing == 'title':
            return text.title()
        raise ValueError(
            f'Unsupported casing {self.casing!r}; expected one of {self.SUPPORTED_CASINGS}'
        )

class RemoveDigit(TextProcessor):
    """Transformer that drops every digit character from the text."""

    def transform(self, text):
        """Return ``text`` with all characters for which ``isdigit()`` is true removed."""
        kept = (ch for ch in text if not ch.isdigit())
        return ''.join(kept)

class RemoveSpace(TextProcessor):
    """Transformer that strips all whitespace out of the text."""

    def transform(self, text):
        """Return the whitespace-separated pieces of ``text`` glued back together."""
        chunks = text.split()
        return ''.join(chunks)

class RemovePunkt(TextProcessor):
    """Transformer that removes the characters listed in ``string.punctuation``."""

    def transform(self, text):
        """Return ``text`` without any character found in ``string.punctuation``."""
        kept = (ch for ch in text if ch not in string.punctuation)
        return ''.join(kept)

# Next, define the pipeline class that chains all the transformers together.
class TextPipeline:
    """Chain of transformers applied one after another.

    Args:
        *args: Objects exposing a ``transform(text)`` method, in the order
            they should be applied.
    """

    def __init__(self, *args):
        # Kept as the tuple of positional arguments, in application order.
        self.transformers = args

    def transform(self, text):
        """Feed ``text`` through every transformer in order and return the result.

        With no transformers configured, ``text`` is returned unchanged.
        """
        result = text
        for step in self.transformers:
            result = step.transform(result)
        return result

# This is the final product: one shared pipeline that upper-cases the text and
# then strips punctuation, digits and whitespace, in that order.
pipe = TextPipeline(
    ConvertCase('upper'),
    RemovePunkt(),
    RemoveDigit(),
    RemoveSpace(),
)

# Quick demonstration. The previous call used a misspelled method name
# (`transfor`) and an undefined variable, so it could never run.
sample_text = 'Hello, World! 123'
pipe.transform(sample_text)  # -> 'HELLOWORLD'
