In [2]:
import glob
import os

from hazm import Normalizer, sent_tokenize, word_tokenize
# Shared hazm Normalizer instance; News.__init__ reads this module-level name.
normalizer = Normalizer()


In [3]:
class News:
    """One news article, normalized and tokenized at word and letter level.

    Relies on the module-level ``normalizer`` and on hazm's ``sent_tokenize``
    and ``word_tokenize`` being in scope.

    Attributes set by ``__init__``:
        index: the identifier passed by the caller, stored unchanged.
        normalized_text: ``text`` after ``normalizer.normalize``.
        sentences: one list of word tokens per sentence.
        letter_sentences: one list of single characters per sentence
            (``list(sentence)``), so spaces and punctuation are kept as symbols.
    """

    def __init__(self, text, index):
        """Normalize ``text`` and build the word- and letter-level views.

        Args:
            text: raw article text.
            index: identifier of the article; not interpreted here.
        """
        self.index = index
        self.normalized_text = normalizer.normalize(text)
        # Sentence-split exactly once and derive both views from the same
        # list, so they stay aligned and the tokenizer is not run twice.
        raw_sentences = sent_tokenize(self.normalized_text)
        self.sentences = [word_tokenize(sent) for sent in raw_sentences]
        self.letter_sentences = [list(sent) for sent in raw_sentences]
        

In [4]:
categories = ('culture', 'finance', 'politic', 'social', 'sport', 'technology')


def load_category(category, root='data/train'):
    """Load every ``*.txt`` article of one category as ``News`` objects.

    Args:
        category: name of the sub-directory under ``root`` to read.
        root: directory that holds one sub-directory per category.

    Returns:
        dict mapping news id -> ``News``. The id is the file's base name up to
        its first ``.``; entries are inserted in sorted path order.
    """
    loaded = dict()
    pattern = os.path.join(root, category, '*.txt')
    # glob yields paths in arbitrary, filesystem-dependent order; sorting keeps
    # the dict order (and anything shuffled from it later) stable across runs.
    for path in sorted(glob.glob(pattern)):
        # Take the id from the base name rather than a fixed '/'-split index,
        # so it does not depend on directory depth or the path separator.
        news_id = os.path.basename(path).split('.')[0]
        # NOTE(review): assumes the corpus is UTF-8 encoded — confirm. Without
        # an explicit encoding, open() falls back to the platform default.
        with open(path, encoding='utf-8') as handle:
            raw_text = handle.read()
        loaded[news_id] = News(raw_text, news_id)
    return loaded


# category name -> {news id -> News}
data = dict()
for category in categories:
    data[category] = load_category(category)
        

In [5]:
import random

# Fixed seed so the train/validation split is identical on every run.
SPLIT_SEED = 42
# Share of each category's articles that goes to the training set.
TRAIN_FRACTION = 0.8


def split_train_validation(items, train_fraction=TRAIN_FRACTION, rng=None):
    """Shuffle ``items`` and cut them into a training and a validation list.

    Args:
        items: iterable of items to split; it is copied, not modified.
        train_fraction: share of items placed in the first returned list; the
            cut index is ``int(len(items) * train_fraction)``, i.e. rounded down.
        rng: object with a ``shuffle`` method (e.g. ``random.Random``); when
            ``None`` the module-level ``random.shuffle`` is used.

    Returns:
        ``(training_items, validation_items)`` as two lists.
    """
    shuffler = random if rng is None else rng
    shuffled = list(items)
    shuffler.shuffle(shuffled)
    cut_index = int(len(shuffled) * train_fraction)
    return shuffled[:cut_index], shuffled[cut_index:]


split_rng = random.Random(SPLIT_SEED)

training_data = dict()
validation_data = dict()

for category in data.keys():
    # Order by news id before shuffling so the split depends only on the seed,
    # not on the order in which the files happened to be loaded.
    ordered_news = [data[category][news_id] for news_id in sorted(data[category])]
    training_data[category], validation_data[category] = split_train_validation(
        ordered_news, rng=split_rng
    )


In [68]:
from nltk.lm.models import WittenBellInterpolated, MLE
from nltk.lm.preprocessing import padded_everygram_pipeline, flatten
from nltk.lm import NgramCounter

# Word-level language models, one per category, keyed by category name.
word_unigram_models = {}
word_bigram_models = {}

for category in categories:
    # Gather the tokenized sentences of every training article in this category.
    word_sentences = []
    for article in training_data[category]:
        word_sentences.extend(article.sentences)

    # Fit one Witten-Bell interpolated model per n-gram order; a fresh
    # pipeline is built for each order from the same sentence list.
    for order, registry in ((1, word_unigram_models), (2, word_bigram_models)):
        train, vocab = padded_everygram_pipeline(order, word_sentences)
        registry[category] = WittenBellInterpolated(order)
        registry[category].fit(train, vocab)

In [7]:
# Letter-level language models, one per category, keyed by category name.
letter_unigram_models = {}
letter_bigrams_models = {}

for category in categories:
    # Gather the per-character sentences of every training article in this category.
    char_sentences = []
    for article in training_data[category]:
        char_sentences.extend(article.letter_sentences)

    # Fit one maximum-likelihood model per n-gram order; a fresh pipeline is
    # built for each order from the same sentence list.
    for order, registry in ((1, letter_unigram_models), (2, letter_bigrams_models)):
        train, vocab = padded_everygram_pipeline(order, char_sentences)
        registry[category] = MLE(order)
        registry[category].fit(train, vocab)



In [72]:
from nltk import ngrams
from nltk.lm.preprocessing import pad_both_ends


def category_entropies(news, models, order=2):
    """Compute the entropy of one article under each per-category model.

    Args:
        news: object whose ``sentences`` attribute is a list of token lists
            and whose ``index`` identifies it in error messages.
        models: dict mapping category name -> fitted model exposing
            ``entropy(ngrams)``.
        order: n-gram order to score with; should match the order the models
            were trained with.

    Returns:
        dict mapping category name -> entropy of the article under that model.

    Raises:
        ValueError: if the article yields no n-grams, since entropy over an
            empty sequence is undefined.
    """
    # Pad each sentence the same way the training pipeline does, so that
    # sentence-boundary n-grams are scored and one-word sentences still count.
    test_ngrams = [
        ngram
        for sentence in news.sentences
        for ngram in ngrams(pad_both_ends(sentence, n=order), order)
    ]
    if not test_ngrams:
        raise ValueError('news item %r has no tokens to score' % (news.index,))
    # The n-gram list does not depend on the category, so it is built once.
    return {name: model.entropy(test_ngrams) for name, model in models.items()}


# Score the first validation article of the 'culture' category.
news = validation_data['culture'][0]
res = category_entropies(news, word_bigram_models)
