# Druhé cvičení - nejbližší sousedé

## 1) Nalezení nejlépe asociované dvojice slov

- Vytvořte funkci, která načte soubor TEXTEN1.txt
- Odstraňte interpunkci z textu
- Nalezněte nejlépe asociovanou dvojici pomocí bodové vzájemné informace: 
$pmi(x,y) \equiv log{_2}{p(x,y) \over p(x)p(y)}$

- Uvažujte pouze slova, která mají výskyt větší než 9

In [1]:
import glob, os, re, string
from math import log
from collections import Counter
import pandas as pd

In [2]:
def compute_pointwise_mutual_information(file, min_occ=10):
    
    # Načtení souboru, odfiltrování interpunkce a prázdných řádků
    with open(file, "r") as f:
        # words = ['<s>'] + [
        words = [
            i for i in f.read().splitlines()
            if re.match(r'[\w\-]+', i)
        ]
    word_counts = Counter(words)
    words = [i if word_counts[i] >= min_occ else None for i in words]
    pair_counts = Counter((
        (prev, this) for prev, this in zip(words[:-1], words[1:])
        if prev is not None and this is not None
    ))
    word_total, pair_total = sum(i is not None for i in words), pair_counts.total()
    
    pmi = pd.DataFrame([
        (prev, this, log((pair_count / pair_total) / (word_counts[prev] * word_counts[this] / (word_total ** 2)), 2))
        for (prev, this), pair_count in pair_counts.items()
    ], columns = ['Prev', 'Word', 'PMI']).set_index(['Prev', 'Word']).sort_values(by='PMI', ascending=False)

    print(f'Pair count: {pair_counts.total()}')
    print(f'Unique pair count: {len(pair_counts)}')
    
    return pmi
    

In [3]:
computed_result_en = compute_pointwise_mutual_information("TEXTEN1.txt")
print(computed_result_en.loc[('La', 'Plata')])
computed_result_en[:10]

# Očekávané výsledky
## pairs_count = 197358
## len(pairs_set) = 78426
## Nejlépe asociovaná dvojice: La Plata 14.005507594503369



Pair count: 162489
Unique pair count: 49640
PMI    13.998753
Name: (La, Plata), dtype: float64


Unnamed: 0_level_0,Unnamed: 1_level_0,PMI
Prev,Word,Unnamed: 2_level_1
La,Plata,13.998753
Asa,Gray,13.86125
Fritz,Muller,13.191398
worth,while,13.162252
faced,tumbler,13.091863
lowly,organised,13.046281
Malay,Archipelago,12.939859
Alph,de,12.883276
shoulder,stripe,12.883276
E,Forbes,12.776361


## 2) Nalezněte nejlépe asociovanou dvojici s okolím
- Upravte vytvořenou funkci tak, aby nejlépe asociované dvojice slov hledala s využitím okolí 50 slov (+- 25)

In [4]:
def compute_pointwise_mutual_information(file, k, min_occ=10):
    
    # Načtení souboru a odfiltrování interpunkce
    with open(file, "r") as f:
        # words = ['<s>'] + [
        words = [
            i for i in f.read().splitlines()
            if re.match(r'[\w\-]+', i)
        ]
    word_counts = Counter(words)
    words = [i if word_counts[i] >= min_occ else None for i in words]
    word_total = len(words) # sum(i is not None for i in words)
    pair_total = 0
    pair_counts = Counter()
    window = Counter(words[:2 * k - 1])
    
    for n in range(k, len(words) - k - 1):
        window[words[n + k - 1]] += 1
        if words[n] is not None:
            for prev, count in window.items():
                if prev != words[n] and prev is not None:
                    pair_counts[(prev, words[n])] += 1 # counts
                    pair_total += 1 # counts
        window[words[n - k]] -= 1
        if window[words[n - k]] == 0:
            del window[words[n - k]]
    
    pmi = pd.DataFrame([
        (prev, this, log((pair_count / pair_total) / (word_counts[prev] * word_counts[this] / (word_total ** 2)), 2))
        for (prev, this), pair_count in pair_counts.items()
        if prev < this
    ], columns = ['Prev', 'Word', 'PMI']).set_index(['Prev', 'Word']).sort_values(by='PMI', ascending=False)
    
    return pmi

In [5]:
computed_result_en = compute_pointwise_mutual_information("TEXTEN1.txt", 25)
print(computed_result_en.loc[('La', 'Plata')])
computed_result_en[:10]

# Očekávané výsledky
## Nejlépe asociovaná dvojice: La Plata 9.361742783133026



PMI    9.017609
Name: (La, Plata), dtype: float64


Unnamed: 0_level_0,Unnamed: 1_level_0,PMI
Prev,Word,Unnamed: 2_level_1
La,Plata,9.017609
Asa,Gray,8.880105
crop,pouter,8.628567
fittest,survival,8.602571
dimorphic,trimorphic,8.52362
fantail,pouter,8.503036
faced,tumbler,8.432646
Old,Worlds,8.432646
dried,floated,8.432646
lowly,organised,8.354644
