# 1. Crawling

## 1.1 Load from file

In [1]:
from pathlib import Path
import re
from abc import ABC, abstractmethod
import string
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
from src.utils import print_success_plain, print_grey
from IPython.display import clear_output


In [2]:
data = {}

for doc_path in list(Path('../src/data/football_teams').iterdir()):
    if doc_path.suffix != '.txt':
        continue
    with open(doc_path) as f:
        data[doc_path.stem] = f.read()

In [3]:
data

{'Juventus FC': 'Juventus Football Club (from Latin: iuventūs, \'youth\'; Italian pronunciation: [juˈvɛntus]), commonly known as Juventus or colloquially as Juve (pronounced [ˈjuːve]),[5] is an Italian professional football club based in Turin, Piedmont, who compete in Serie A, the top tier of the Italian football league system. Founded in 1897 by a group of Torinese students, the club played in different grounds around the city, and plays now in Juventus Stadium.\n\nNicknamed la Vecchia Signora ("the Old Lady"), it has won 36 official league titles, 15 Coppa Italia trophies and nine Italian Super Cups, being the record holder for all these competitions; they also hold two Intercontinental Cups, two European Cup / UEFA Champions Leagues, one European Cup Winners\' Cup, three UEFA Cups (Italian record), two UEFA Super Cups and one UEFA Intertoto Cup (Italian record).[6][7] Consequently, the side leads the historical Federazione Italiana Giuoco Calcio (FIGC) classification,[c] whilst on 

## 1.2 Pre-processing

In [4]:
# Thanks to pytopia.ai for providing the pre-processing code
class TextProcessor(ABC):
    @abstractmethod
    def transform(self, text):
        pass


class ConvertCase(TextProcessor):
    def __init__(self, casing='lower'):
        self.casing = casing

    def transform(self, text):
        if self.casing == 'lower':
            return text.lower()
        elif self.casing == 'upper':
            return text.upper()
        elif self.casing == 'title':
            return text.title()


class RemoveDigit:
    def transform(self, text):
        return ''.join(char if not char.isdigit() else ' ' for char in text)


class RemovePunkt:
    def transform(self, text):
        return ''.join(' ' if char in string.punctuation else char for char in text)


class RemoveSpace:
    def transform(self, text):
        return ' '.join(text.split())


class TextPipeline:
    def __init__(self, *args):
        self.transformers = args

    def transform(self, text):
        for tf in self.transformers:
            text = tf.transform(text)
        return text

    def __str__(self):
        transformers = ' -> '.join([tf.__class__.__name__ for tf in self.transformers])
        return f'Pipeline: [{transformers}]'
    
pipe = TextPipeline(
    ConvertCase('lower'),
    RemoveDigit(),
    RemovePunkt(),
    RemoveSpace()
)

In [5]:
pipe.transform('Hello Worl    fasf asfa    ! 1234')

'hello worl fasf asfa'

# 2. Indexing

In [6]:
index = {
    # 'invincible': set(),
    # 'unbeaten': set(),
    # 'three times': set(),
    # 'la decima': set(),
    # 'triple': set(),
    # 'double': set(),
    # 'treble': set(),
    # 'champions league': set(),
    # 'premier league': set(),
    # 'la liga': set(),
    # 'bundesliga': set(),
    # 'serie a': set(),
    # 'ligue 1': set(),
}

# Read stop words
stop_words = open('../src/data/stop_words.txt').read().splitlines()
stop_words = set(map(pipe.transform, stop_words))

for doc_name, doc_content in data.items():
    words = re.split('\W+', doc_content)
    for word in words:
        word = pipe.transform(word)

        # ignore if empty
        if not word:
            continue

        # Remove stop words
        if word in stop_words:
            continue

        # Check and add to index
        if word.lower() in index:
            index[word.lower()].add(doc_name)
        else:
            index[word.lower()] = {doc_name}

# 3.Search

In [None]:
while True:
    search_input = input('Enter search term (or "exit" to quit): ')
    clear_output(wait=True)
    search_input = pipe.transform(search_input)
    if search_input == 'exit':
        break
    search_tokens = re.split('\W+', search_input)
    results = []
    for token in search_tokens:
        results.extend(index.get(token, []))

    print_grey(f'Search results for "{search_input}":\n')

    if not results:
        print_success_plain('No results found.')
        continue
    
    # Print unique results
    for result in set(results):
        print_success_plain(f'- {result}', end = '\n')

[40m[97mSearch results for "santiago":
[0m[1m[32m- Read Madrid CF[0m
[1m[32m- FC Barcelona[0m
[1m[32m- Paris Saint-Germain F.C.[0m
[1m[32m- Inter Milan[0m


In [9]:
from faker import Faker

In [17]:
[Faker().name() for _ in range(3)]

['Joshua Jacobs', 'Tony Gomez', 'James Price']