# heartdisease corpus

In [1]:
txtfile=open('heartdisease.txt')
text = txtfile.readlines()

## persian text normalization

In [2]:
from parsivar import Normalizer
normalizer = Normalizer()
text_norm = []
for sent in text:
    sent_norm = normalizer.normalize(sent)
    text_norm.append(sent_norm)

## tokenization

In [3]:
from parsivar import Tokenizer 
tokenizer = Tokenizer()
sentences = []
for sent in text_norm:
    sent_norm = tokenizer.tokenize_sentences(sent)
    sentences+=sent_norm

In [4]:
from parsivar import Tokenizer 
tokenizer = Tokenizer()
tokenized_sentences= []
for sent in sentences:
    sent_norm = tokenizer.tokenize_words(sent)
    tokenized_sentences.append(sent_norm)

## remove punctuations

In [5]:
from string import punctuation
word_counter = 0
for sent in tokenized_sentences:
    for word in sent:
        word_counter += 1
        if word in punctuation:
            sent.remove(word)
            word_counter -= 1
len(tokenized_sentences)

2696

## split into train and test sets of size >= 20000 and 2000 respectively

In [6]:
def counter(words):
    word_counter = 0
    for sent in words:
        for word in sent:
            word_counter += 1
    return word_counter

In [7]:
import random

test_size = 200
test_text = random.sample(tokenized_sentences, test_size)
train_text = []

for sent in tokenized_sentences:
    if sent not in test_text:
        train_text.append(sent)

In [8]:
train_count = counter(train_text)
test_count = counter(test_text)

print("train word count :")
print(train_count)
print("test word count :")
print(test_count)

train word count :
31271
test word count :
2458


In [9]:
print(train_text[0])

['واژه', 'بیماری', 'قلبی', 'معمولا', 'برای', 'اشاره', 'به', 'حمله', 'قلبی', 'اشاره', 'می\u200cشود', '،', 'با', 'اینحال', 'بیماری', 'قلبی', 'شامل', 'سایر', 'بیماری\u200cهای', 'قلب', 'از', 'جمله', 'بیماری', 'عروق', 'کرونر', '،', 'نارسایی', 'قلبی', '،', 'سکته', 'قلبی', '،', 'آریتمی', 'قلبی', 'و', 'کاردیومیوپاتی', 'می\u200cباشد']


# without smoothing

In [10]:
from nltk.lm.preprocessing import pad_both_ends
from nltk.util import everygrams
from nltk.lm.preprocessing import flatten
from nltk.lm.preprocessing import padded_everygram_pipeline
from nltk.lm import MLE

from nltk.util import bigrams
from nltk.util import trigrams

In [12]:
def n_gramModel(n,test_data,train_text):
    model = MLE(n)
    train, vocab = padded_everygram_pipeline(n, train_text)
    #train_unigrams = everygrams(flatten(pad_both_ends(sent, n=1) for sent in train_text), max_len=1)

    model.fit(train, vocab)
    
    print("model vocabulary :")
    print(model.vocab)
    print(len(model.vocab))
    
    print("model perplexity on test sentences :")
    print(model.perplexity(test_data))
    print("model entropy on test sentences :")
    print(model.entropy(test_data))
    
    return model

### unigram

In [13]:
test_padded = list(flatten(pad_both_ends(sent, n=1) for sent in test_text))
test_unigrams = list(everygrams(test_padded, max_len=1))

model1 = n_gramModel(1,test_unigrams,train_text)

model vocabulary :
<Vocabulary with cutoff=1 unk_label='<UNK>' and 3614 items>
3614
model perplexity on test sentences :
inf
model entropy on test sentences :
inf


### bigram

In [14]:
test_padded = list(flatten(pad_both_ends(sent, n=2) for sent in test_text))
test_bigrams = list(bigrams(test_padded))

model2 = n_gramModel(2,test_bigrams,train_text)

model vocabulary :
<Vocabulary with cutoff=1 unk_label='<UNK>' and 3616 items>
3616
model perplexity on test sentences :
inf
model entropy on test sentences :
inf


### trigram

In [15]:
test_padded = list(flatten(pad_both_ends(sent, n=3) for sent in test_text))
test_trigrams = list(trigrams(test_padded))

model3 = n_gramModel(3,test_trigrams,train_text)

model vocabulary :
<Vocabulary with cutoff=1 unk_label='<UNK>' and 3616 items>
3616
model perplexity on test sentences :
inf
model entropy on test sentences :
inf


# smoothing method : Lidstone

In [16]:
from nltk.lm import Lidstone

In [17]:
def n_gramModelLidstone(n,test_data,train_text):
    model = Lidstone(n,0.01)
    train, vocab = padded_everygram_pipeline(n, train_text)
    model.fit(train,vocab)
    
    print("model vocabulary :")
    print(model.vocab)
    print(len(model.vocab))
    
    print("model perplexity on test sentences :")
    print(model.perplexity(test_data))
    print("model entropy on test sentences :")
    print(model.entropy(test_data))
    
    return model

### unigram

In [18]:
model1 = n_gramModelLidstone(1,test_unigrams,train_text)

model vocabulary :
<Vocabulary with cutoff=1 unk_label='<UNK>' and 3614 items>
3614
model perplexity on test sentences :
656.3968742108017
model entropy on test sentences :
9.358424558468958


### bigram

In [19]:
model2 = n_gramModelLidstone(2,test_bigrams,train_text)

model vocabulary :
<Vocabulary with cutoff=1 unk_label='<UNK>' and 3616 items>
3616
model perplexity on test sentences :
1552.8289630599802
model entropy on test sentences :
10.60068321694101


### trigram

In [20]:
model3 = n_gramModelLidstone(3,test_trigrams,train_text)

model vocabulary :
<Vocabulary with cutoff=1 unk_label='<UNK>' and 3616 items>
3616
model perplexity on test sentences :
2714.131278852394
model entropy on test sentences :
11.40627478831084


# smoothing method : add-one (Laplace)

In [21]:
from nltk.lm import Laplace

In [22]:
def n_gramModelLaplace(n,test_data,train_text):
    model = Laplace(n)
    train, vocab = padded_everygram_pipeline(n, train_text)
    model.fit(train,vocab)
    
    print("model vocabulary :")
    print(model.vocab)
    print(len(model.vocab))
    
    print("model perplexity on test sentences :")
    print(model.perplexity(test_data))
    print("model entropy on test sentences :")
    print(model.entropy(test_data))
    
    return model

### unigram

In [23]:
model1 = n_gramModelLaplace(1,test_unigrams,train_text)

model vocabulary :
<Vocabulary with cutoff=1 unk_label='<UNK>' and 3614 items>
3614
model perplexity on test sentences :
656.3968742108017
model entropy on test sentences :
9.358424558468958


### bigram

In [24]:
model2 = n_gramModelLaplace(2,test_bigrams,train_text)

model vocabulary :
<Vocabulary with cutoff=1 unk_label='<UNK>' and 3616 items>
3616
model perplexity on test sentences :
1219.9018900413228
model entropy on test sentences :
10.25254940913677


### trigram

In [25]:
model3 = n_gramModelLaplace(3,test_trigrams,train_text)

model vocabulary :
<Vocabulary with cutoff=1 unk_label='<UNK>' and 3616 items>
3616
model perplexity on test sentences :
2302.051541073207
model entropy on test sentences :
11.168704419269634


# smoothing method : KneserNey

In [26]:
from nltk.lm import KneserNeyInterpolated

In [27]:
def n_gramModelKneserNey(n,test_data,train_text):
    model = KneserNeyInterpolated(n)
    train, vocab = padded_everygram_pipeline(n, train_text)
    model.fit(train,vocab)
    
    print("model vocabulary :")
    print(model.vocab)
    print(len(model.vocab))
    
    print("model perplexity on test sentences :")
    print(model.perplexity(test_data))
    print("model entropy on test sentences :")
    print(model.entropy(test_data))
    
    return model

### unigram

In [28]:
model1 = n_gramModelKneserNey(1,test_unigrams,train_text)

model vocabulary :
<Vocabulary with cutoff=1 unk_label='<UNK>' and 3614 items>
3614
model perplexity on test sentences :


ZeroDivisionError: division by zero

### bigram

In [29]:
model2 = n_gramModelKneserNey(2,test_bigrams,train_text)

model vocabulary :
<Vocabulary with cutoff=1 unk_label='<UNK>' and 3616 items>
3616
model perplexity on test sentences :
inf
model entropy on test sentences :
inf


### trigram

In [30]:
model3 = n_gramModelKneserNey(3,test_trigrams,train_text)

model vocabulary :
<Vocabulary with cutoff=1 unk_label='<UNK>' and 3616 items>
3616
model perplexity on test sentences :
inf
model entropy on test sentences :
inf


# smoothing method : WittenBell

In [31]:
from nltk.lm import WittenBellInterpolated

In [32]:
def n_gramModelWittenBell(n,test_data,train_text):
    model = WittenBellInterpolated(n)
    train, vocab = padded_everygram_pipeline(n, train_text)
    model.fit(train,vocab)
    
    print("model vocabulary :")
    print(model.vocab)
    print(len(model.vocab))
    
    print("model perplexity on test sentences :")
    print(model.perplexity(test_data))
    print("model entropy on test sentences :")
    print(model.entropy(test_data))
    
    return model

### unigram

In [33]:
model1 = n_gramModelWittenBell(1,test_unigrams,train_text)

model vocabulary :
<Vocabulary with cutoff=1 unk_label='<UNK>' and 3614 items>
3614
model perplexity on test sentences :
inf
model entropy on test sentences :
inf


### bigram

In [34]:
model2 = n_gramModelWittenBell(2,test_bigrams,train_text)

model vocabulary :
<Vocabulary with cutoff=1 unk_label='<UNK>' and 3616 items>
3616
model perplexity on test sentences :
inf
model entropy on test sentences :
inf


### trigram

In [35]:
model3 = n_gramModelWittenBell(3,test_trigrams,train_text)

model vocabulary :
<Vocabulary with cutoff=1 unk_label='<UNK>' and 3616 items>
3616
model perplexity on test sentences :
inf
model entropy on test sentences :
inf


# smoothing method : backoff (stupid backoff)

In [36]:
from nltk.lm import StupidBackoff

In [37]:
def n_gramModelbackoff(n,test_data,train_text):
    model = StupidBackoff(n,0.4)
    train, vocab = padded_everygram_pipeline(n, train_text)
    model.fit(train,vocab)
    
    print("model vocabulary :")
    print(model.vocab)
    print(len(model.vocab))
    
    print("model perplexity on test sentences :")
    print(model.perplexity(test_data))
    print("model entropy on test sentences :")
    print(model.entropy(test_data))
    
    return model

### unigram

In [38]:
model1 = n_gramModelbackoff(1,test_unigrams,train_text)

model vocabulary :
<Vocabulary with cutoff=1 unk_label='<UNK>' and 3614 items>
3614
model perplexity on test sentences :
inf
model entropy on test sentences :
inf


### bigram

In [39]:
model2 = n_gramModelbackoff(2,test_bigrams,train_text)

model vocabulary :
<Vocabulary with cutoff=1 unk_label='<UNK>' and 3616 items>
3616
model perplexity on test sentences :
inf
model entropy on test sentences :
inf


### trigram

In [40]:
model3 = n_gramModelbackoff(3,test_trigrams,train_text)

model vocabulary :
<Vocabulary with cutoff=1 unk_label='<UNK>' and 3616 items>
3616
model perplexity on test sentences :
inf
model entropy on test sentences :
inf


# smoothing method : AbsoluteDiscounting

In [41]:
from nltk.lm import AbsoluteDiscountingInterpolated

In [42]:
def n_gramModelAbsoluteDisc(n,test_data,train_text):
    model = AbsoluteDiscountingInterpolated(n)
    train, vocab = padded_everygram_pipeline(n, train_text)
    model.fit(train,vocab)
    
    print("model vocabulary :")
    print(model.vocab)
    print(len(model.vocab))
    
    print("model perplexity on test sentences :")
    print(model.perplexity(test_data))
    print("model entropy on test sentences :")
    print(model.entropy(test_data))
    
    return model

### unigram

In [43]:
model1 = n_gramModelAbsoluteDisc(1,test_unigrams,train_text)

model vocabulary :
<Vocabulary with cutoff=1 unk_label='<UNK>' and 3614 items>
3614
model perplexity on test sentences :
inf
model entropy on test sentences :
inf


### bigram

In [44]:
model2 = n_gramModelAbsoluteDisc(2,test_bigrams,train_text)

model vocabulary :
<Vocabulary with cutoff=1 unk_label='<UNK>' and 3616 items>
3616
model perplexity on test sentences :
inf
model entropy on test sentences :
inf


### trigram

In [45]:
model3 = n_gramModelAbsoluteDisc(3,test_trigrams,train_text)

model vocabulary :
<Vocabulary with cutoff=1 unk_label='<UNK>' and 3616 items>
3616
model perplexity on test sentences :
inf
model entropy on test sentences :
inf
