In [1]:
from math import log
from collections import Counter, OrderedDict, deque

In [2]:
def read_file(path):
    with open(path, 'r') as file:
        return file.read()


In [3]:
def compute_entropy(prob_list,log_base=2):
    h=0
    for prob in prob_list:
        if prob == 0 or prob == 1:
            continue
        h -= prob * log(prob,2)
    return h

In [4]:
def _compute_probability_of_next_element_after_ngram(ngrams_with_next_element,ngram_size): 
    temp_name = {}
    for ngram_with_next_element in ngrams_with_next_element:
        ngram = str(ngram_with_next_element[:-1])
        next_element = str(ngram_with_next_element[-1])
        if ngram in temp_name:
            temp_name[ngram].append(next_element)
        else:
            temp_name[ngram] = [next_element]

    omega = len(ngrams_with_next_element) + ngram_size -1  
    result = {}       
    for ngram,next_elements in temp_name.items():
        counter_dict = Counter(next_elements)
        prob_dict = {key:value/omega for key, value in counter_dict.items()}
        result[ngram] = prob_dict
    return result

def compute_probabilty_of_char(text_corpus,ngram_size):
    ngrams_with_letter = []

    for i,_ in enumerate(text_corpus[:-ngram_size]):
        ngram_with_letter = text_corpus[i:i+ngram_size]
        ngrams_with_letter.append(ngram_with_letter)
    else:
        #Last iteration is executed in sepcial way. iterable[:-0] return []
        ngram_with_letter = text_corpus[-ngram_size:]
        ngrams_with_letter.append(ngram_with_letter)
    return _compute_probability_of_next_element_after_ngram(ngrams_with_letter,ngram_size)   

In [5]:
def compute_probabilty_of_word(text_corpus,ngram_size):
    ngrams_with_next_word = []
    text_corpus = text_corpus.split()
    for i,_ in enumerate(text_corpus[:-ngram_size]):
        ngram_with_next_word = text_corpus[i:i+ngram_size]
        ngrams_with_next_word.append(ngram_with_next_word)
    else:
        #Last iteration is executed in sepcial way. iterable[:-0] return []
        ngram_with_next_word = text_corpus[-ngram_size:]
        ngrams_with_next_word.append(ngram_with_next_word)
    return _compute_probability_of_next_element_after_ngram(ngrams_with_next_word,ngram_size)  

In [6]:
def compute_entropy_order(text,prob_function,order,log_base=2):
    h = 0
    prob_dict =  prob_function(text,order)
    for key,value in prob_dict.items():
        cum_prob = sum(value.values())
        for k in value.keys():
            h -= cum_prob * (value[k]/cum_prob) * log(value[k]/cum_prob,log_base)
    return h  

In [15]:
def compute_entropy_statistics_for_language(text_corpus,start_order=1,end_order=5):
    char_entropy = [compute_entropy_order(text_corpus,compute_probabilty_of_char,i)
                    for i in range(start_order,end_order+1)]
    word_entropy = [compute_entropy_order(text_corpus,compute_probabilty_of_word,i)
                    for i in range(start_order,end_order+1)]

    return char_entropy,word_entropy

# Strefa testowa

In [8]:
compute_entropy([0,1])

0

In [9]:
test_text = read_file("lab3/norm_wiki_en.txt")

In [10]:
test_prob_dict =  compute_probabilty_of_char(test_text,1)

In [11]:
compute_entropy(list(test_prob_dict[''].values()))

4.288221453845133

In [12]:
compute_entropy_order(test_text,compute_probabilty_of_char,1)

4.288221453845132

Pierwsza lista to entropia kolejnych rzedów dla pojedyńczych znaków. 

Druga lista to entropia kolejnych rzedów dla słów.

# Język Angielski

In [16]:
compute_entropy_statistics_for_language(read_file("lab3/norm_wiki_en.txt"),end_order=5)

([4.288221453845132,
  3.5166044715516596,
  3.018318628714029,
  2.481565226392995,
  2.0211841795425207],
 [11.54399377363079,
  6.389171499464897,
  2.1764580263700775,
  0.484678057213775,
  0.10965290888646144])

# Język Łaciński

In [19]:
compute_entropy_statistics_for_language(read_file("lab3/norm_wiki_la.txt"),end_order=5)

([4.228247465746813,
  3.4501258348186434,
  2.8234916445972984,
  2.1520309167691716,
  1.6427633479948576],
 [11.969194044361334,
  4.400023372765311,
  1.1668819429215571,
  0.3880341890458278,
  0.2064678318752358])

# Język Esperanto

In [20]:
compute_entropy_statistics_for_language(read_file("lab3/norm_wiki_eo.txt"),end_order=5)

([4.176788490262166,
  3.3400049969313046,
  2.8718264902832717,
  2.39262461467093,
  1.9915096242290702],
 [11.56052995031191,
  6.557676609634665,
  2.4847179452059187,
  0.6336176581141884,
  0.16165807248059008])

# Język Estoński

In [21]:
compute_entropy_statistics_for_language(read_file("lab3/norm_wiki_et.txt"),end_order=5)

([4.169833224728489,
  3.506957782603394,
  3.1344054450799406,
  2.6108648187647634,
  2.111441373570926],
 [13.746243545094805,
  5.424178340802329,
  0.9047388662911572,
  0.11619672771532127,
  0.02364722325582635])

# Język Somalijski

In [22]:
compute_entropy_statistics_for_language(read_file("lab3/norm_wiki_so.txt"),end_order=5)

([4.04011386038252,
  3.299565271291083,
  2.8443705862755846,
  2.374309293922122,
  1.9450310652105032],
 [11.73110473724335,
  5.398731838327208,
  1.608620151841968,
  0.40960509009235724,
  0.11661039835065375])

# Język Haitański

In [23]:
compute_entropy_statistics_for_language(read_file("lab3/norm_wiki_ht.txt"),end_order=5)

([4.146385764101037,
  3.1138598527812307,
  2.27352391448577,
  1.4921515557255713,
  1.0521423905850413],
 [8.166919505034997,
  3.193109648790236,
  1.3113227039865785,
  0.8122242421284019,
  0.6205694737101665])

# Język Navaho

In [24]:
compute_entropy_statistics_for_language(read_file("lab3/norm_wiki_nv.txt"),end_order=5)

([3.8749372588209434,
  2.947264130515973,
  2.3675747761828796,
  1.7952574012820592,
  1.3415909575617972],
 [9.154011830801968,
  3.8639055133499074,
  1.7187812157113613,
  0.8992338755635442,
  0.5385556537930066])

# Sample 0

In [25]:
compute_entropy_statistics_for_language(read_file("lab3/sample0.txt"),end_order=5)

([4.273001240566633,
  2.9158935760366673,
  2.0003586572754375,
  1.5392811525798737,
  1.4385809174668112],
 [7.748741386140157,
  7.486385697485466,
  4.406696389524785,
  0.5950072737259781,
  0.01206208897985021])

**Nie jest to język** - Entropia dla pojdyńczych słów i par słów jest podobna. W przypadku języków zauważalny jest duży spadek wartośći.

# Sample 1

In [26]:
compute_entropy_statistics_for_language(read_file("lab3/sample1.txt"),end_order=5)

([4.12700613554972,
  3.2391495355451823,
  2.8612788781571723,
  2.326683756681927,
  1.8135089866130725],
 [11.500698483659221,
  5.372240059108988,
  1.574736219727881,
  0.5075098044042741,
  0.2934569177422313])

**Język** - Entropia kolejnych rzędów dla znaków i słów wykazuje podobną charakterystykę do języków.

# Sample 2

In [27]:
compute_entropy_statistics_for_language(read_file("lab3/sample2.txt"),end_order=5)

([3.993311800232584,
  3.0504387292006654,
  2.46765934010102,
  1.9397712700800276,
  1.7020313384458878],
 [8.023869815826421,
  7.348616622680935,
  3.781929363769915,
  0.8595036671136258,
  0.08199094379393915])

**Nie jest to język** - Entropia dla pojdyńczych słów i par słów jest podobna. W przypadku języków zauważalny jest duży spadek wartośći.

# Sample 3

In [28]:
compute_entropy_statistics_for_language(read_file("lab3/sample3.txt"),end_order=5)

([3.930297834157987,
  3.184466603089769,
  2.627894943734949,
  2.0239906040647146,
  1.534242461186765],
 [9.061119324694518,
  5.950215682736722,
  2.630802850325793,
  1.2640881316354724,
  0.4143259561699921])

**Język** - Entropia kolejnych rzędów dla znaków i słów wykazuje podobną charakterystykę jak w językach.

# Sample 4

In [29]:
compute_entropy_statistics_for_language(read_file("lab3/sample4.txt"),end_order=5)

([4.2538095673790135,
  4.22910114996374,
  4.226828376195623,
  4.178534315372819,
  3.7661305149811053],
 [17.129669110962844,
  3.4442512402777354,
  0.23407576260104365,
  0.0032274223495286193,
  7.608877603594868e-06])

**Nie jest to język** - Entropia kolejnych rzędów dla znaków maleje zbyt wolno.

# Sample 5

In [30]:
compute_entropy_statistics_for_language(read_file("lab3/sample5.txt"),end_order=5)

([4.441688018481797,
  3.52309786820294,
  3.2506203787744012,
  2.8342708632659854,
  2.172440153984199],
 [16.509527607380036, 0.0, 0.0, 0.0, 0.0])

**Nie jest to język** - Entropia dla słów rzędów biorących pod uwagę przynajmniej jednego poprzednika jest zerowa. Jest to prawdopodobnie sekwenjca wyrazów w której panuje determinizm.