# Data retrieval - inverted index
## Skip lists ( and positional indexes?)
**Paweł Kruczkiewicz**

In [1]:
# All imports
from pathlib import Path
from Lib import traceback
import os
from functools import reduce

from time import time
from IPython.display import Video

from math import sqrt, ceil

# all_folder_names
images = Path("dataset/images")
sw_video = Path("dataset/star_wars_video")
sw_scripts = Path("dataset/star_wars_scripts")
all_scripts = Path("dataset/all_scripts")

sw_lemmas_dir = Path("dataset/lemmas_star_wars")
all_lemmas_dir = Path("dataset/lemmas_all_scripts")
#time measure function
def measure_time(func, message):
    t_start = time()
    result = func()
    t_end = time()
    print(f"{message}: {t_end - t_start} [s]")
    return result


In [2]:
#source: https://www.youtube.com/watch?v=5GFW-eEWXlc
Video(sw_video / "arst_arsw_edit.mp4")

## Ale po co to wszystko?

### Nie wiem...

### Zatem znajdźmy sobie jakiś własny cel!

Przypuśćmy, że chcemy wiedzieć dokładnie, w której filmowej części *Gwiezdnych Wojen* zostały zawarte słowa "Luke" oraz "Leia" (jest to tzw. *Boolean retrieval*)

#### I sposób - *grepping*
Liniowe przejście po tekście

1. Wczytanie potrzebnych dokumentów i ponumerowanie ich (tzw. *indeksacja*)
2. Dla każdego dokumentu: Iteracja po każdym wyrazie i sprawdzenie, czy wyraz zawiera się w dokumecie.
3. Przecięcie zbiorów jest szukanym wyrażeniem.

In [3]:
# 1. wczytanie dokumentów 
def open_document(path):
    with open(path, "r", encoding="UTF-8") as file:
        text = file.read()
    words = list(map(lambda word: word.strip(), text.split()))
    return words

def get_words_in_docs_from_dir(dir_name):
    cwd = os.getcwd()
    os.chdir(dir_name)
    words_in_docs = []
    try:
        for doc_path in os.listdir():
            words_in_docs.append(open_document(doc_path))
    except FileNotFoundError:
        traceback.print_exc()
    finally:
        os.chdir(cwd)
    
    return words_in_docs

sw_words = get_words_in_docs_from_dir(sw_scripts)
doc_indexes= [1,2,3,4,5,6,7]  # from Phantom Menace to Force Awakenes

no_words = sum(list(map(len, sw_words)))   # not neccessarly unique words
print(f"\nTotal number of words: {no_words}") 


Total number of words: 194102


In [4]:
# 2. iteracja po dokumentac i wyszukiwanie słów

def is_word_in_doc(query_word, doc):
    for word in doc:  # można zapisac to w jednej linii, ale tak widac, że jest to liniowe
        if query_word == word:
            return True
    return False

def docs_with_query_word(query_word, docs):
    result = set()
    for i, doc in enumerate(docs):
        if is_word_in_doc(query_word, doc):
            result.add(doc_indexes[i])
    return result

episodes_with_luke = docs_with_query_word("Luke", sw_words)
episodes_with_leia = docs_with_query_word("Leia", sw_words)

print(f"Luke Skywalker was mentioned in episodes {episodes_with_luke}")
print(f"Leia Organa was mentioned in episodes {episodes_with_leia}")

Luke Skywalker was mentioned in episodes {3, 4, 5, 6, 7}
Leia Organa was mentioned in episodes {4, 5, 6, 7}


In [5]:
# 3. Część wspólna

luke_and_leia_eps = episodes_with_luke.intersection(episodes_with_leia) 
print(f"They were both mentioned in episodes {luke_and_leia_eps}")

They were both mentioned in episodes {4, 5, 6, 7}


### 2 zasadnicze problemy z **greppingiem**

### 1. Źle wybrane wyrazy

In [6]:
episodes_with_motivator = docs_with_query_word("motivator", sw_words)
print(f"Episodes with 'motivartor': {episodes_with_motivator}")

Episodes with 'motivartor': {5}


### Chyba jest dobrze ...
<br>

![Fragment części 5](dataset/images/motivator_ep_5.png)

**Fragment *Gwiezdnych Wojen* - części 5**

### ... albo jednak nie

![fragment_ep_4](dataset/images/motivator_ep_4.png)

**Fragment *Gwiezdnych Wojen* - części 4**  - nieznalezione przez naszą wyszukiwarkę

Nasz algorytm posiada **wysoki precision**, ale **niski recall**

![precisionrecall](dataset/images/Precisionrecall.svg.png)

By Walber - Own work, CC BY-SA 4.0, https://commons.wikimedia.org/w/index.php?curid=36926283

### Rozwiązanie - odpowiednia *tokenizacja* i *lematyzacja*

**tokenizacja** - podział tekst na słowa tak, by zachować ich znaczenia ("and." => "and", "\lightsabers..." => "lightsabers")

**lematyzacja** - pozbycie się końcówek fleksyjnych ("lightsabers" => "lightsaber")

Wówczas wszsytkie zapytania również nalezy stokenizować i zlematyzować.

Dalej będziemy używać jedynie lematyzacji.

In [7]:
# !python -m spacy download en

import spacy
def lematize_document(path, nlp):
    with open(path, "r", encoding="UTF-8") as file:
        text = file.read()
    tokens = nlp(text)
    tokens = list(filter(lambda token: not (token.is_punct or token.is_space), tokens))
    
    for token in tokens:
        print(f"Token: {token.text:15} Lemma: {token.lemma_}") 
    
    lemmas = list(map(lambda token: token.lemma_, tokens))
    return lemmas

In [8]:
#prezentacja    
nlp = spacy.load('en_core_web_sm')
lematize_document(sw_scripts / "1Script_Star Wars_ The Phantom Menace.txt", nlp)

Token: Star            Lemma: Star
Token: Wars            Lemma: war
Token: Episode         Lemma: episode
Token: 1               Lemma: 1
Token: The             Lemma: the
Token: Phantom         Lemma: Phantom
Token: Menace          Lemma: Menace
Token: TITLE           Lemma: TITLE
Token: CARD            Lemma: card
Token: A               Lemma: a
Token: long            Lemma: long
Token: time            Lemma: time
Token: ago             Lemma: ago
Token: in              Lemma: in
Token: a               Lemma: a
Token: galaxy          Lemma: galaxy
Token: far             Lemma: far
Token: far             Lemma: far
Token: away            Lemma: away
Token: A               Lemma: a
Token: vast            Lemma: vast
Token: sea             Lemma: sea
Token: of              Lemma: of
Token: stars           Lemma: star
Token: serves          Lemma: serve
Token: as              Lemma: as
Token: the             Lemma: the
Token: backdrop        Lemma: backdrop
Token: for             Lemma:

Token: back            Lemma: back
Token: to              Lemma: to
Token: see             Lemma: see
Token: the             Lemma: the
Token: monstrous       Lemma: monstrous
Token: troop           Lemma: troop
Token: transports      Lemma: transport
Token: emerging        Lemma: emerge
Token: from            Lemma: from
Token: the             Lemma: the
Token: mist            Lemma: mist
Token: Animals         Lemma: animal
Token: begin           Lemma: begin
Token: to              Lemma: to
Token: run             Lemma: run
Token: past            Lemma: past
Token: him             Lemma: he
Token: in              Lemma: in
Token: a               Lemma: a
Token: panic           Lemma: panic
Token: An              Lemma: an
Token: odd             Lemma: odd
Token: frog            Lemma: frog
Token: like            Lemma: like
Token: Gungan          Lemma: Gungan
Token: JAR             Lemma: JAR
Token: JAR             Lemma: JAR
Token: INKS            Lemma: INKS
Token: squats        

Token: good            Lemma: good
Token: Hey             Lemma: hey
Token: OBI             Lemma: OBI
Token: WAN             Lemma: WAN
Token: What            Lemma: what
Token: is              Lemma: be
Token: it              Lemma: it
Token: JAR             Lemma: JAR
Token: JAR             Lemma: JAR
Token: looks           Lemma: look
Token: back            Lemma: back
Token: to              Lemma: to
Token: where           Lemma: where
Token: they            Lemma: they
Token: 're             Lemma: be
Token: drifting        Lemma: drift
Token: He              Lemma: he
Token: sees            Lemma: see
Token: they            Lemma: they
Token: are             Lemma: be
Token: headed          Lemma: head
Token: for             Lemma: for
Token: a               Lemma: a
Token: huge            Lemma: huge
Token: waterfall       Lemma: waterfall
Token: JAR             Lemma: JAR
Token: JAR             Lemma: JAR
Token: What            Lemma: what
Token: Oie             Lemma: Oie
Tok

Token: Cant'd          Lemma: cant'd
Token: The             Lemma: the
Token: Hyperdrive      Lemma: hyperdrive
Token: generator       Lemma: generator
Token: is              Lemma: be
Token: gone            Lemma: go
Token: We              Lemma: we
Token: will            Lemma: will
Token: need            Lemma: need
Token: a               Lemma: a
Token: new             Lemma: new
Token: one             Lemma: one
Token: QUI             Lemma: QUI
Token: GON             Lemma: GON
Token: moves           Lemma: move
Token: closer          Lemma: close
Token: to              Lemma: to
Token: OBI             Lemma: OBI
Token: WAN             Lemma: WAN
Token: and             Lemma: and
Token: speaks          Lemma: speak
Token: quietly         Lemma: quietly
Token: to              Lemma: to
Token: him             Lemma: he
Token: QUI             Lemma: QUI
Token: GON             Lemma: GON
Token: Do              Lemma: do
Token: n't             Lemma: n't
Token: let             Lemma: 

Token: the             Lemma: the
Token: ground          Lemma: ground
Token: The             Lemma: the
Token: Gungan          Lemma: Gungan
Token: desperately     Lemma: desperately
Token: tries           Lemma: try
Token: to              Lemma: to
Token: scramble        Lemma: scramble
Token: to              Lemma: to
Token: safety          Lemma: safety
Token: JAR             Lemma: JAR
Token: JAR             Lemma: JAR
Token: to              Lemma: to
Token: himself         Lemma: himself
Token: Why             Lemma: why
Token: mesa            Lemma: mesa
Token: always          Lemma: always
Token: da              Lemma: da
Token: one             Lemma: one
Token: ANAKIN          Lemma: ANAKIN
Token: V.O             Lemma: V.O
Token: Because         Lemma: because
Token: you             Lemma: you
Token: 're             Lemma: be
Token: afraid          Lemma: afraid
Token: JAR             Lemma: JAR
Token: JAR             Lemma: JAR
Token: turns           Lemma: turn
Token: to   

Token: want            Lemma: want
Token: to              Lemma: to
Token: put             Lemma: put
Token: your            Lemma: your
Token: son             Lemma: son
Token: in              Lemma: in
Token: danger          Lemma: danger
Token: We              Lemma: we
Token: will            Lemma: will
Token: find            Lemma: find
Token: another         Lemma: another
Token: way             Lemma: way
Token: SHMI            Lemma: shmi
Token: No              Lemma: no
Token: Annie           Lemma: Annie
Token: 's              Lemma: 's
Token: right           Lemma: right
Token: there           Lemma: there
Token: is              Lemma: be
Token: no              Lemma: no
Token: other           Lemma: other
Token: way             Lemma: way
Token: I               Lemma: I
Token: may             Lemma: may
Token: not             Lemma: not
Token: like            Lemma: like
Token: it              Lemma: it
Token: but             Lemma: but
Token: he              Lemma: he
Toke

Token: get             Lemma: get
Token: me              Lemma: I
Token: onto            Lemma: onto
Token: one             Lemma: one
Token: of              Lemma: of
Token: those           Lemma: those
Token: dreadful        Lemma: dreadful
Token: starships       Lemma: starship
Token: KITSTER         Lemma: kitster
Token: to              Lemma: to
Token: Anakin          Lemma: Anakin
Token: This            Lemma: this
Token: is              Lemma: be
Token: so              Lemma: so
Token: wizard          Lemma: wizard
Token: I               Lemma: I
Token: 'm              Lemma: be
Token: sure            Lemma: sure
Token: you             Lemma: you
Token: 'll             Lemma: 'll
Token: do              Lemma: do
Token: it              Lemma: it
Token: this            Lemma: this
Token: time            Lemma: time
Token: Annie           Lemma: Annie
Token: PADME           Lemma: PADME
Token: Do              Lemma: do
Token: what            Lemma: what
Token: KITSTER         Lemma

Token: the             Lemma: the
Token: pits            Lemma: pit
Token: for             Lemma: for
Token: some            Lemma: some
Token: attention       Lemma: attention
Token: ODY             Lemma: ODY
Token: Droids          Lemma: droid
Token: TERTER          Lemma: terter
Token: is              Lemma: be
Token: getting         Lemma: get
Token: close           Lemma: close
Token: to              Lemma: to
Token: SEBULBA         Lemma: SEBULBA
Token: who             Lemma: who
Token: purposely       Lemma: purposely
Token: breaks          Lemma: break
Token: a               Lemma: a
Token: small           Lemma: small
Token: part            Lemma: part
Token: off             Lemma: off
Token: his             Lemma: his
Token: Pod             Lemma: Pod
Token: sending         Lemma: send
Token: it              Lemma: it
Token: into            Lemma: into
Token: Terter          Lemma: Terter
Token: 's              Lemma: 's
Token: engine          Lemma: engine
Token: causing   

Token: him             Lemma: he
Token: around          Lemma: around
Token: so              Lemma: so
Token: he              Lemma: he
Token: is              Lemma: be
Token: facing          Lemma: face
Token: QUI             Lemma: QUI
Token: GON             Lemma: GON
Token: and             Lemma: and
Token: off             Lemma: off
Token: he              Lemma: he
Token: marches         Lemma: marche
Token: like            Lemma: like
Token: the             Lemma: the
Token: brave           Lemma: brave
Token: little          Lemma: little
Token: trooper         Lemma: trooper
Token: that            Lemma: that
Token: he              Lemma: he
Token: is              Lemma: be
Token: He              Lemma: he
Token: marches         Lemma: march
Token: right           Lemma: right
Token: past            Lemma: past
Token: QUI             Lemma: QUI
Token: GON             Lemma: GON
Token: starring        Lemma: star
Token: right           Lemma: right
Token: ahead           Lemma: 

Token: to              Lemma: to
Token: OBI             Lemma: OBI
Token: WAN             Lemma: WAN
Token: and             Lemma: and
Token: QUI             Lemma: QUI
Token: GON             Lemma: GON
Token: YODA            Lemma: YODA
Token: Trained         Lemma: train
Token: as              Lemma: as
Token: a               Lemma: a
Token: Jedi            Lemma: Jedi
Token: you             Lemma: you
Token: request         Lemma: request
Token: for             Lemma: for
Token: him             Lemma: he
Token: QUI             Lemma: QUI
Token: GON             Lemma: GON
Token: Finding         Lemma: find
Token: him             Lemma: he
Token: was             Lemma: be
Token: the             Lemma: the
Token: will            Lemma: will
Token: of              Lemma: of
Token: the             Lemma: the
Token: Force           Lemma: Force
Token: I               Lemma: I
Token: have            Lemma: have
Token: no              Lemma: no
Token: doubt           Lemma: doubt
Token: of 

Token: LANDING         Lemma: landing
Token: PLATFORM        Lemma: PLATFORM
Token: NIGHT           Lemma: NIGHT
Token: QUI             Lemma: QUI
Token: GON             Lemma: GON
Token: OBI             Lemma: OBI
Token: WAN             Lemma: WAN
Token: and             Lemma: and
Token: ANAKIN          Lemma: ANAKIN
Token: stand           Lemma: stand
Token: on              Lemma: on
Token: the             Lemma: the
Token: landing         Lemma: landing
Token: platform        Lemma: platform
Token: outside         Lemma: outside
Token: the             Lemma: the
Token: ship            Lemma: ship
Token: ARTOO           Lemma: artoo
Token: whistles        Lemma: whistle
Token: a               Lemma: a
Token: happy           Lemma: happy
Token: tune            Lemma: tune
Token: as              Lemma: as
Token: he              Lemma: he
Token: leans           Lemma: lean
Token: over            Lemma: over
Token: the             Lemma: the
Token: edge            Lemma: edge
Token: of  

Token: cities          Lemma: city
Token: We              Lemma: we
Token: can             Lemma: can
Token: enter           Lemma: enter
Token: the             Lemma: the
Token: city            Lemma: city
Token: using           Lemma: use
Token: the             Lemma: the
Token: secret          Lemma: secret
Token: passages        Lemma: passage
Token: on              Lemma: on
Token: the             Lemma: the
Token: waterfall       Lemma: waterfall
Token: side            Lemma: side
Token: Once            Lemma: once
Token: we              Lemma: we
Token: get             Lemma: get
Token: to              Lemma: to
Token: the             Lemma: the
Token: main            Lemma: main
Token: entrance        Lemma: entrance
Token: Captain         Lemma: Captain
Token: Pnaka           Lemma: Pnaka
Token: will            Lemma: will
Token: create          Lemma: create
Token: a               Lemma: a
Token: diversion       Lemma: diversion
Token: so              Lemma: so
Token: that   

Token: hill            Lemma: hill
Token: JAR             Lemma: JAR
Token: JAR             Lemma: JAR
Token: scrambles       Lemma: scramble
Token: to              Lemma: to
Token: avoid           Lemma: avoid
Token: being           Lemma: be
Token: hit             Lemma: hit
Token: by              Lemma: by
Token: one             Lemma: one
Token: of              Lemma: of
Token: the             Lemma: the
Token: balls           Lemma: ball
Token: FOUR            Lemma: four
Token: DESTROYER       Lemma: DESTROYER
Token: DROIDS          Lemma: DROIDS
Token: are             Lemma: be
Token: n't             Lemma: n't
Token: so              Lemma: so
Token: lucky           Lemma: lucky
Token: They            Lemma: they
Token: get             Lemma: get
Token: blasted         Lemma: blast
Token: by              Lemma: by
Token: the             Lemma: the
Token: energy          Lemma: energy
Token: balls           Lemma: ball
Token: The             Lemma: the
Token: GUNGANS         Lemm

Token: to              Lemma: to
Token: the             Lemma: the
Token: Senate          Lemma: Senate
Token: and             Lemma: and
Token: explain         Lemma: explain
Token: all             Lemma: all
Token: this            Lemma: this
Token: CAPT            Lemma: CAPT
Token: PANAKA          Lemma: panaka
Token: I               Lemma: I
Token: think           Lemma: think
Token: you             Lemma: you
Token: can             Lemma: can
Token: kiss            Lemma: kiss
Token: your            Lemma: your
Token: Trade           Lemma: Trade
Token: franchise       Lemma: franchise
Token: goodbye         Lemma: goodbye
Token: The             Lemma: the
Token: main            Lemma: main
Token: ramp            Lemma: ramp
Token: of              Lemma: of
Token: the             Lemma: the
Token: cruiser         Lemma: cruiser
Token: is              Lemma: be
Token: lowered         Lemma: lower
Token: as              Lemma: as
Token: OBI             Lemma: OBI
Token: WAN        

['Star',
 'war',
 'episode',
 '1',
 'the',
 'Phantom',
 'Menace',
 'TITLE',
 'card',
 'a',
 'long',
 'time',
 'ago',
 'in',
 'a',
 'galaxy',
 'far',
 'far',
 'away',
 'a',
 'vast',
 'sea',
 'of',
 'star',
 'serve',
 'as',
 'the',
 'backdrop',
 'for',
 'the',
 'main',
 'title',
 'follow',
 'by',
 'a',
 'roll',
 'up',
 'which',
 'crawl',
 'up',
 'into',
 'infinity',
 'EPISODE',
 '1',
 'the',
 'phantom',
 'MENACE',
 'Turmoil',
 'have',
 'engulf',
 'the',
 'Galactic',
 'Republic',
 'the',
 'taxation',
 'of',
 'trade',
 'route',
 'to',
 'outlaye',
 'star',
 'system',
 'be',
 'in',
 'dispute',
 'hope',
 'to',
 'resolve',
 'the',
 'matter',
 'with',
 'a',
 'blockade',
 'of',
 'deadly',
 'battleship',
 'the',
 'greedy',
 'Trade',
 'Federation',
 'have',
 'stop',
 'all',
 'ship',
 'to',
 'the',
 'small',
 'planet',
 'of',
 'Naboo',
 'while',
 'the',
 'congress',
 'of',
 'the',
 'Republic',
 'endlessly',
 'debate',
 'this',
 'alarming',
 'chain',
 'of',
 'event',
 'the',
 'Supreme',
 'Chancellor

In [9]:
def load_lemmas(lemmas_dir):
    cwd = os.getcwd()
    os.chdir(lemmas_dir)
    lemmas_in_docs = []
    try:
        for doc_path in os.listdir():
            with open(doc_path, "r", encoding="UTF-8") as file:
                lemmas = file.readlines()
            
            lemmas = list(map(lambda lemma: lemma.strip(), lemmas))
            lemmas_in_docs.append(lemmas)
            
    except FileNotFoundError:
        traceback.print_exc()
    finally:
        os.chdir(cwd)
    
    return lemmas_in_docs
        


In [45]:
def docs_with_query_lemma(query_word, docs, doc_indexes, nlp):
    query_lemma_doc = nlp(query_word)
    query_lemma = query_lemma_doc[0].lemma_
    
    result = set()
    for i, doc in enumerate(docs):
        if query_word in doc:
            result.add(doc_indexes[i])
    return result


sw_lemmas = load_lemmas(sw_lemmas_dir)
no_lemmas = sum(list(map(len, sw_lemmas)))
print(f'Total number of words = {no_lemmas}')

doc_indexation = [1,2,3,4,5,6,7]

luke_eps_lem = docs_with_query_lemma("Luke", sw_lemmas, doc_indexation, nlp)
leia_eps_lem = docs_with_query_lemma("Leia", sw_lemmas, doc_indexation, nlp)

luke_and_leia_eps_lem = leia_eps_lem.intersection(luke_eps_lem) 

Total number of words = 199216


In [46]:
print(f"Luke Skywalker was mentioned in episodes {luke_eps_lem}")
print(f"Leia Organa was mentioned in episodes {leia_eps_lem}")
print()
print(f"They were both mentioned in episodes {luke_and_leia_eps_lem}")

Luke Skywalker was mentioned in episodes {3, 4, 5, 6, 7}
Leia Organa was mentioned in episodes {3, 4, 5, 6, 7}

They were both mentioned in episodes {3, 4, 5, 6, 7}


#### 2. Czas działania

In [49]:
nlp = spacy.load('en_core_web_sm')

def brute_force_intersection(word_1, word_2, doc_indexes, docs):
    set_1 = docs_with_query_lemma(word_1, docs, doc_indexes, nlp)
    set_2 = docs_with_query_lemma(word_2, docs, doc_indexes, nlp)
    return set_1.intersection(set_2)

    
doc_indexation = [1,2,3,4,5,6,7]
measure_time(lambda: brute_force_intersection("Luke", "Leia", doc_indexation, sw_lemmas),
             "Search 'Luke' and 'Leia'")
measure_time(lambda: brute_force_intersection("Dalek", "Doctor", doc_indexation, sw_lemmas),
             "Search 'Dalek' and 'Doctor'")

Search 'Luke' and 'Leia': 0.01011 [s]
Search 'Dalek' and 'Doctor': 0.00805 [s]


set()

### II sposób - *macierz incydencji*

1. Wczytanie plików jak w sposobie I
2. Tokenizacja i lematyzacja.
3. Stworzenie macierzy A o wym. T x N, gdzie T to liczba wyrazów, a N, to takiej, że:
    a. każdy rząd odpowiada jednemu słowu, a kazdy dokument - jednej kolumnie (zachowany porządek lesykograficzny)
    b. pole równa się 1, jesli słowo odpowiadające danemu rzędowi znajduje sie w dokumencie odpowiadajace danemu dokumentowi lub 0 wpp 
4. Znajdujemy odpowiadające szukanym słowom wiersze, które traktujemy jak binarne wektory. Dokonujemy na nich operacji logicznych i otrzymujemy wynik.

![matrix](dataset/images/incidence_matrix.png)

In [50]:
class IncidenceMatrix:
    def __init__(self, words_in_docs, docs_indexation):
        self.docs_indexation = docs_indexation

        sorted_words = self.__sort_without_dup(reduce(lambda l1, l2: l1 + l2, words_in_docs, []))
        self.words_indexation = {sorted_words[i]: i for i in range(len(sorted_words))}
        self.matrix = [[1 if word in doc_words else 0
                        for doc_words in words_in_docs]
                       for word in sorted_words]
        self.__nlp = spacy.load('en_core_web_sm')

    def __sort_without_dup(self, list_with_dup):  # list_with_dup has hashable elements
        return sorted(list(set(list_with_dup)))

    def intersect(self, word_1, word_2):
        def word_to_lemma(word):
            lemma_doc = self.__nlp(word)
            return lemma_doc[0].lemma_

        lem_1, lem_2 = word_to_lemma(word_1), word_to_lemma(word_2)
        ind_1, ind_2 = self.words_indexation.get(lem_1), self.words_indexation.get(lem_2)
        
        if ind_1 is None or ind_2 is None:
            return set()
        
        vec_1, vec_2 = self.matrix[ind_1], self.matrix[ind_2]
        intersection = [min(vec_1[i], vec_2[i]) for i in range(len(vec_1))]
        result = set()

        for i in range(len(intersection)):
            if intersection[i] == 1:
                result.add(self.docs_indexation[i])
        return result

In [51]:
def test_class_with_intersection(class_constructor, docs, doc_indexation, queries):
    class_obj = measure_time(lambda: class_constructor(docs, doc_indexation),
                            f"{class_constructor.__name__} construction time")
    for word1, word2, message in queries:
        measure_time(lambda: class_obj.intersect(word1, word2), message)

# time tests

test_class_with_intersection(IncidenceMatrix, sw_lemmas, [1,2,3,4,5,6,7], 
                            [("Luke", "Leia", "'Luke' and 'Leia' search time"),
                              ("Doctor", "Dalek", "'Doctor' and 'Dalek' search time")])


IncidenceMatrix construction time: 22.59337 [s]
'Luke' and 'Leia' search time: 0.01 [s]
'Doctor' and 'Dalek' search time: 0.01 [s]


### III sposób - odwrócona macierz incydencji

Macierz incydencji to macierz rzadka => lepiej zapamiętywać tylko dokumenty w których słowo występuje.



**Zasadnicza idea**

Trzymamy słownik ze słowami zawartymi w dokumentach.

Każda wartość w słowniku to lista z indeksami dokumentów, w których występuje **uporzadkowana rosnąco**

![img](dataset/images/inv_ind_schema.png)

In [52]:
class InvertedIndex:
    def __init__(self, words_in_docs, docs_indexation):
        self.docs_indexation = docs_indexation
        unsorted_tuples = reduce(lambda l1, l2: l1 + self.__give_word_index(l2),
                                 zip(words_in_docs, range(0, len(words_in_docs))),
                                 [])
        sorted_tuples = sorted(unsorted_tuples)  #automatic radix sort
        self.dictionary = self.__create_dictionary(sorted_tuples)
        self.__nlp = spacy.load("en_core_web_sm")

    def __give_word_index(self, info_tuple):  # info_tuple =  (["the", "great", "game"...],  5)
        words_in_doc, no_doc = info_tuple
        return list(set(map(lambda word: (word, no_doc), words_in_doc)))  # [("the", 5), ("great", 5), ("game", 5) ...]

    def __create_dictionary(self, sorted_tuples_list):  # sorted_tuples_list => [("a", 3), ("a", 5)
        result_dict = {}
        for word, no_document in sorted_tuples_list:
            if word not in result_dict:
                result_dict[word] = [no_document]
            else:
                posting = result_dict[word]
                if posting[-1] != no_document:
                    result_dict[word].append(no_document)

        return result_dict

    def intersect(self, word_1, word_2):
        def word_to_lemma(word):
            lemma_doc = self.__nlp(word)
            return lemma_doc[0].lemma_

        result_doc_ids = set()
        p1, p2 = 0, 0
        docs_1, docs_2 = self.dictionary.get(word_to_lemma(word_1)), \
                         self.dictionary.get(word_to_lemma(word_2))
        if docs_1 is None or docs_2 is None:
            return set()
        n1, n2 = len(docs_1), len(docs_2)
        while p1 < n1 and p2 < n2:
            if docs_1[p1] == docs_2[p2]:
                result_doc_ids.add(docs_1[p1])
                p1 += 1
                p2 += 1
            elif docs_1[p1] < docs_2[p2]:
                p1 += 1
            else:
                p2 += 1
        return {self.docs_indexation[doc_id] for doc_id in result_doc_ids}

![inverted_index_creation](dataset/images/inverted_indexes_construction.png)

In [53]:
    def intersect(self, word_1, word_2):
        def word_to_lemma(word):
            lemma_doc = self.__nlp(word)
            return lemma_doc[0].lemma_

        result_doc_ids = set()
        p1, p2 = 0, 0
        docs_1, docs_2 = self.dictionary.get(word_to_lemma(word_1)), \
                         self.dictionary.get(word_to_lemma(word_2))
        if docs_1 is None or docs_2 is None:
            return set()
        n1, n2 = len(docs_1), len(docs_2)
        while p1 < n1 and p2 < n2:
            if docs_1[p1] == docs_2[p2]:
                result_doc_ids.add(docs_1[p1])
                p1 += 1
                p2 += 1
            elif docs_1[p1] < docs_2[p2]:
                p1 += 1
            else:
                p2 += 1
        return {self.docs_indexation[doc_id] for doc_id in result_doc_ids}

In [54]:
# time tests
query = [("Luke", "Leia", "'Luke' and 'Leia' search time"),
         ("Doctor", "Dalek", "'Doctor' and 'Dalek' search time")]
test_class_with_intersection(InvertedIndex, sw_lemmas, [1,2,3,4,5,6,7], query)

InvertedIndex construction time: 0.49017 [s]
'Luke' and 'Leia' search time: 0.00808 [s]
'Doctor' and 'Dalek' search time: 0.0 [s]


In [56]:
all_lemmas = load_lemmas(all_lemmas_dir)   # zbiór ponad tysiaca scenariuszy 
query = [("Luke", "Leia", "'Luke' and 'Leia' search time"),
         ("the", "lightsaber", "'the' and 'lightsaber' search time")]
docs_indexation = list(range(len(all_lemmas)))

print("Brute force aproach:")
for word1, word2, message in query:
    measure_time(lambda: brute_force_intersection(word1, word2, docs_indexation, all_lemmas),
             f"Search '{word1}' and '{word2}'")

print()
print("Inverted indexation: ")
test_class_with_intersection(InvertedIndex, all_lemmas, docs_indexation, query)

Search 'Luke' and 'Leia': 0.89913 [s]
Search 'the' and 'lightsaber': 0.31617 [s]
InvertedIndex construction time: 122.12126 [s]
'Luke' and 'Leia' search time: 0.00806 [s]
'the' and 'lightsaber' search time: 0.00996 [s]


### Sposób III z modyfikacją - użycie skiplisty
Najefektywniejsze: dla równorozdzielonych skip_linków co sqrt(p), gdzie p to długość listy.

![skip_list](dataset/images/skip_list.png)

In [58]:
class SkipListNode:
    def __init__(self, value, skip_index=None):
        self.value = value
        self.skip_index = skip_index

In [59]:
class InvertedIndexWithSkipLists:
    def __init__(self, words_in_docs, docs_indexation):
        self.docs_indexation = docs_indexation
        unsorted_tuples = reduce(lambda l1, l2: l1 + self.__give_word_index(l2),
                                 zip(words_in_docs, range(0, len(words_in_docs))),
                                 [])
        sorted_tuples = sorted(unsorted_tuples)  # automatic radix sort
        self.dictionary = self.__create_dictionary(sorted_tuples)
        self.__add_skip_links()
        self.__nlp = spacy.load("en_core_web_sm")

    def __give_word_index(self, info_tuple):  # info_tuple =  (["the", "great", "game"...],  5)
        words_in_doc, no_doc = info_tuple
        return list(
            set(map(lambda word: (word, no_doc), words_in_doc)))  # [("the", 5), ("great", 5), ("game", 5) ...]

    def __create_dictionary(self, sorted_tuples):
        result_dict = {}
        for word, no_document in sorted_tuples:
            if word not in result_dict:
                result_dict[word] = [SkipListNode(no_document)]
            else:
                posting = result_dict[word]
                if posting[-1] != no_document:
                    result_dict[word].append(SkipListNode(no_document))
        return result_dict

    def __add_skip_links(self):
        for skip_list in self.dictionary.values():
            n = len(skip_list)
            step = ceil(sqrt(n))
            i = 0
            while i < n:
                skip_list[i].skip_index = min(i + step, n - 1)
                i += step

    def intersect(self, word_1, word_2):
        def word_to_lemma(word):
            lemma_doc = self.__nlp(word)
            return lemma_doc[0].lemma_

        result_doc_ids = set()
        p1, p2 = 0, 0
        docs_1, docs_2 = self.dictionary[word_to_lemma(word_1)], \
                         self.dictionary[word_to_lemma(word_2)]
        if docs_1 is None or docs_2 is None:
            return set()
        n1, n2 = len(docs_1), len(docs_2)

        while p1 < n1 and p2 < n2:
            if docs_1[p1].value == docs_2[p2].value:
                result_doc_ids.add(docs_1[p1].value)
                p1 += 1
                p2 += 1
            elif docs_1[p1].value < docs_2[p2].value:
                skip_ind_1 = docs_1[p1].skip_index
                if skip_ind_1 is not None and \
                        docs_1[skip_ind_1].value < docs_2[p2].value:
                    while skip_ind_1 is not None and \
                            docs_1[skip_ind_1].value < docs_2[p2].value:
                        p1 = skip_ind_1
                        skip_ind_1 = docs_1[p1].skip_index
                else:
                    p1 += 1
            else:
                skip_ind_2 = docs_2[p2].skip_index
                if skip_ind_2 is not None and \
                        docs_2[skip_ind_2].value < docs_1[p1].value:
                    while skip_ind_2 is not None and \
                            docs_2[skip_ind_2].value < docs_1[p1].value:
                        p2 = skip_ind_2
                        skip_ind_2 = docs_2[p2].skip_index
                else:
                    p2 += 1
        return {self.docs_indexation[doc_id] for doc_id in result_doc_ids}


In [60]:
    def intersect(self, word_1, word_2):
        def word_to_lemma(word):
            lemma_doc = self.__nlp(word)
            return lemma_doc[0].lemma_

        result_doc_ids = set()
        p1, p2 = 0, 0
        docs_1, docs_2 = self.dictionary[word_to_lemma(word_1)], \
                         self.dictionary[word_to_lemma(word_2)]
        if docs_1 is None or docs_2 is None:
            return set()
        n1, n2 = len(docs_1), len(docs_2)

        while p1 < n1 and p2 < n2:
            if docs_1[p1].value == docs_2[p2].value:
                result_doc_ids.add(docs_1[p1].value)
                p1 += 1
                p2 += 1
            elif docs_1[p1].value < docs_2[p2].value:
                skip_ind_1 = docs_1[p1].skip_index
                if skip_ind_1 is not None and \
                        docs_1[skip_ind_1].value < docs_2[p2].value:
                    while skip_ind_1 is not None and \
                            docs_1[skip_ind_1].value < docs_2[p2].value:
                        p1 = skip_ind_1
                        skip_ind_1 = docs_1[p1].skip_index
                else:
                    p1 += 1
            else:
                skip_ind_2 = docs_2[p2].skip_index
                if skip_ind_2 is not None and \
                        docs_2[skip_ind_2].value < docs_1[p1].value:
                    while skip_ind_2 is not None and \
                            docs_2[skip_ind_2].value < docs_1[p1].value:
                        p2 = skip_ind_2
                        skip_ind_2 = docs_2[p2].skip_index
                else:
                    p2 += 1
        return {self.docs_indexation[doc_id] for doc_id in result_doc_ids}

In [63]:
query = [("the", "a", "'the' and 'a' search time"),
        ("Luke", "Leia", "'Luke' and 'Leia' search time"),
         ("the", "lightsaber", "'the' and 'lightsaber' search time")]

docs_indexation = list(range(len(all_lemmas)))
test_class_with_intersection(InvertedIndex, all_lemmas, docs_indexation, query)
print()
test_class_with_intersection(InvertedIndexWithSkipLists, all_lemmas, docs_indexation, query)


InvertedIndex construction time: 168.21737 [s]
'the' and 'a' search time: 0.08 [s]
'Luke' and 'Leia' search time: 0.01 [s]
'the' and 'lightsaber' search time: 0.0 [s]
InvertedIndexWithSkipLists construction time: 173.20083 [s]
'the' and 'a' search time: 0.05 [s]
'Luke' and 'Leia' search time: 0.0 [s]
'the' and 'lightsaber' search time: 0.01 [s]


## Czego nam brakuje?

1. Odporności na literówki
2. Wyszukiwanie słów w danej odległości od siebie (np. ("May", "Force", 2))
3. Niebinarność:
     1. wartościowanie słów (np. na bazie częstości wystąpień)
     2. wartościowanie wyników (lepsze dopasowania powinny być wyżej)


## Indeksy pozycyjne (ang. positional indexes)
**Rozwiązanie 2 problemu**

![positional_index](dataset/images/positional_index.png)

In [66]:
class PositionalIndexes:
    def __init__(self, words_in_docs, docs_indexation):
        self.docs_indexation = docs_indexation
        unsorted_tuples = reduce(lambda l1, l2: l1 + self.__give_word_index_and_position(l2),
                                 zip(words_in_docs, range(0, len(words_in_docs))),
                                 [])

        sorted_tuples = sorted(unsorted_tuples)
        self.dictionary = self.__create_dictionary(sorted_tuples)
        self.__nlp = spacy.load("en_core_web_sm")

    def __give_word_index_and_position(self, info_tuple):  # info_tuple =  (["the", "great", "game"...],  5)
        words_in_doc, no_doc = info_tuple
        words_with_position = zip(words_in_doc, range(len(words_in_doc)))
        return list(
            map(lambda x: (x[0], no_doc, x[1]),
                words_with_position))

    def __create_dictionary(self, sorted_tuples):
        def add_no_occurrences(dictionary):
            no_occurrences = sum(map(len, dictionary.values()))
            return no_occurrences, dictionary

        result_dict = {}
        for word, no_doc, pos in sorted_tuples:
            result_dict.setdefault(word, dict())
            result_dict[word].setdefault(no_doc, [])
            result_dict[word][no_doc].append(pos)

        result_dict = {word: add_no_occurrences(word_dict)
                       for word, word_dict in result_dict.items()}
        # result_dict = dict(map(add_no_occurrences, result_dict.values()))
        return result_dict

    def positional_intersect(self, word_1, word_2, k):  # k - odległość miedzy słowami jest mniejsza lub równa k
        def word_to_lemma(word):
            lemma_doc = self.__nlp(word)
            return lemma_doc[0].lemma_

        result = set()
        p1, p2 = 0, 0
        (_, doc_1) = self.dictionary.get(word_to_lemma(word_1))  # we omit frequency
        (_, doc_2) = self.dictionary.get(word_to_lemma(word_2))
        n1, n2 = len(doc_1), len(doc_2)
        while p1 < n1 and p2 < n2:
            doc_ids_1, doc_ids_2 = list(doc_1.keys()), list(doc_2.keys())
            doc_id_1, doc_id_2 = doc_ids_1[p1], doc_ids_2[p2]
            if doc_id_1 == doc_id_2:
                retrieved_positions = self.__check_positions(doc_id_1,
                                                             doc_1[doc_id_1], doc_2[doc_id_2], k)
                result.update(retrieved_positions)
                p1 += 1
                p2 += 1
            elif doc_id_1 < doc_id_2:
                p1 += 1
            else:
                p2 += 1
        return result

    def __check_positions(self, doc_id, pos_list_1, pos_list_2, k):
        result, tmp = set(), []
        p1, p2 = 0, 0
        n1, n2 = len(pos_list_1), len(pos_list_2)
        while p1 < n1:
            while p2 < n2:
                if abs(pos_list_1[p1] - pos_list_2[p2]) <= k:
                    tmp.append(pos_list_2[p2])
                elif pos_list_2[p2] > pos_list_1[p1]:
                    break
                p2 += 1
                while tmp != [] and abs(tmp[0] - pos_list_1[p1]) > k:
                    del tmp[0]
                for pos_2 in tmp:
                    result.add((self.docs_indexation[doc_id], pos_list_1[p1], pos_2))

            p1 += 1

        return result

In [None]:
    def positional_intersect(self, word_1, word_2, k):  # k - odległość miedzy słowami jest mniejsza lub równa k
        def word_to_lemma(word):
            lemma_doc = self.__nlp(word)
            return lemma_doc[0].lemma_

        result = set()
        p1, p2 = 0, 0
        (_, doc_1) = self.dictionary.get(word_to_lemma(word_1))  # we omit frequency
        (_, doc_2) = self.dictionary.get(word_to_lemma(word_2))
        n1, n2 = len(doc_1), len(doc_2)
        while p1 < n1 and p2 < n2:
            doc_ids_1, doc_ids_2 = list(doc_1.keys()), list(doc_2.keys())
            doc_id_1, doc_id_2 = doc_ids_1[p1], doc_ids_2[p2]
            if doc_id_1 == doc_id_2:
                retrieved_positions = self.__check_positions(doc_id_1,
                                                             doc_1[doc_id_1], doc_2[doc_id_2], k)
                result.update(retrieved_positions)
                p1 += 1
                p2 += 1
            elif doc_id_1 < doc_id_2:
                p1 += 1
            else:
                p2 += 1
        return result

In [None]:
    def __check_positions(self, doc_id, pos_list_1, pos_list_2, k):
        result, tmp = set(), []
        p1, p2 = 0, 0
        n1, n2 = len(pos_list_1), len(pos_list_2)
        while p1 < n1:
            while p2 < n2:
                if abs(pos_list_1[p1] - pos_list_2[p2]) <= k:
                    tmp.append(pos_list_2[p2])
                elif pos_list_2[p2] > pos_list_1[p1]:
                    break
                p2 += 1
                while tmp != [] and abs(tmp[0] - pos_list_1[p1]) > k:
                    del tmp[0]
                for pos_2 in tmp:
                    result.add((self.docs_indexation[doc_id], pos_list_1[p1], pos_2))

            p1 += 1

        return result

In [None]:
words_in_docs = load_lemmas(sw_lemmas)
positional_ind = PositionalIndexes(words_in_docs, [1, 2, 3, 4, 5, 6, 7])

print(positional_ind.positional_intersect("Master", "Yoda", 1))

may_the_force = positional_ind.positional_intersect("may", "you", 5)

print(sorted(list(filter(lambda x: x[2] - x[1] == 5, may_the_force))))