In [200]:
import os
import nltk
import random
from nltk.util import ngrams
from collections import defaultdict, Counter
from nltk.probability import FreqDist, ConditionalFreqDist

# Path to the dataset folder
DATASET_PATH = "dataset/"

# Load all diary entries from files
def load_corpus():
    corpus = []
    for filename in os.listdir(DATASET_PATH):
        if filename.endswith(".txt"):  # Assuming files are .txt
            with open(os.path.join(DATASET_PATH, filename), "r", encoding="utf-8") as file:
                corpus.append(file.read().lower())  # Convert to lowercase for consistency
    return " ".join(corpus)  # Merge all diaries into a single corpus

# Load dataset
text_corpus = load_corpus()


In [201]:
import re
import string

def tokenize_text(text):
    text = text.lower()  # Convert to lowercase
    text = text.translate(str.maketrans("", "", string.punctuation.replace(":", "")))  # Remove punctuation but keep ':'
    tokens = nltk.word_tokenize(text)  # Tokenize words
    
    # Remove standalone numbers (1, 2, etc.) but keep time formats like '8:45'
    tokens = [word for word in tokens if not re.match(r'^\d+$', word)]  

    return tokens

# Tokenized words after preprocessing
tokens = tokenize_text(text_corpus)
print("Sample Tokens:", tokens[:20])  # Print first 20 tokens


Sample Tokens: ['aj', 'subha', 'main', '8:15', 'pa', 'utha', 'aur', 'phir', 'main', 'washroom', 'fresh', 'hona', 'chala', 'gya', 'fresh', 'hona', 'ka', 'baad', 'main', 'kapra']


In [202]:
# Unigram Model (Word Frequency Distribution)
unigram_freq = FreqDist(tokens)


In [203]:
# Bigram Model (Conditional Frequency Distribution)
bigrams = list(nltk.bigrams(tokens))
bigram_freq = ConditionalFreqDist(bigrams)


In [204]:
# Trigram Model (Conditional Frequency Distribution)
trigrams = list(nltk.trigrams(tokens))
trigram_freq = ConditionalFreqDist(((w1, w2), w3) for w1, w2, w3 in trigrams)


In [205]:
# Extract realistic sentence beginnings from the dataset
starting_phrases = []
for i in range(len(tokens) - 1):
    if tokens[i] == "." or tokens[i] == "!":  # Detect sentence boundaries
        starting_phrases.append(tokens[i+1])  # Next word is likely a sentence start

# Use these starting words instead of random selection
starting_words = list(set(starting_phrases))  # Remove duplicates



In [206]:

### UNIGRAM GENERATION ###
def generate_better_unigram_sentence():
    sentence_length = random.randint(7, 12)
    
    # Select a first word from common starting words
    first_word = random.choice(starting_words) if starting_words else random.choice(list(unigram_freq.keys()))
    sentence = [first_word]

    # Pick the remaining words based on unigram frequency
    sentence += random.choices(list(unigram_freq.keys()), weights=unigram_freq.values(), k=sentence_length - 1)

    return " ".join(sentence).capitalize()

print("Unigram Sentence:", generate_better_unigram_sentence())

Unigram Sentence: Raat main gya shuru b baad nashta apni ami


In [207]:
def generate_better_bigram_sentence():
    sentence_length = random.randint(7, 12)
    first_word = random.choice(starting_words) if starting_words else random.choice(list(bigram_freq.keys()))
    sentence = [first_word]

    for _ in range(sentence_length - 1):
        last_word = sentence[-1]
        if last_word in bigram_freq:
            next_word = random.choices(list(bigram_freq[last_word].keys()), 
                                       weights=bigram_freq[last_word].values())[0]
            sentence.append(next_word)
        else:
            break

    return " ".join(sentence).capitalize() + "."


print("Bigram Sentence:", generate_better_bigram_sentence())

Bigram Sentence: Shuru hogya tha ami na apna room main unhein mila aur.


In [208]:
### TRIGRAM GENERATION ###
def generate_better_trigram_sentence(prev_last_word=None):
    sentence_length = random.randint(7, 12)

    # If there's a previous sentence, try to start with its last word
    if prev_last_word and prev_last_word in bigram_freq:
        first_two_words = (prev_last_word, random.choice(list(bigram_freq[prev_last_word].keys())))
    else:
        first_two_words = random.choice(list(trigram_freq.keys()))  # Pick random bigram

    sentence = list(first_two_words)

    for _ in range(sentence_length - 2):
        last_two_words = tuple(sentence[-2:])
        if last_two_words in trigram_freq:
            next_word = random.choices(list(trigram_freq[last_two_words].keys()), 
                                       weights=trigram_freq[last_two_words].values())[0]
            sentence.append(next_word)
        else:
            break

    return " ".join(sentence).capitalize()


print("Trigram Sentence:", generate_better_trigram_sentence())

Trigram Sentence: Aj sham meri bari behn ki call ayi kuch kaam


In [214]:
# Improved perplexity function
def calculate_perplexity(model, sentence_tokens, ngram_type="unigram"):
    """
    Calculates the perplexity of a sentence given an n-gram model.
    """
    n = len(sentence_tokens)
    prob = 1.0
    
    if ngram_type == "unigram":
        total_count = sum(model.values())  # Total words in unigram model
        vocab_size = len(model)
        for word in sentence_tokens:
            word_prob = (model.get(word, 0) + 1) / (total_count + vocab_size)  # Laplace Smoothing
            prob *= 1 / word_prob
    
    elif ngram_type == "bigram":
        for i in range(len(sentence_tokens) - 1):
            w1, w2 = sentence_tokens[i], sentence_tokens[i+1]
            if w1 in model and w2 in model[w1]:
                word_prob = (model[w1][w2] + 1) / (sum(model[w1].values()) + len(model))
            else:
                word_prob = 1e-5  # Small smoothing factor for unseen bigrams
            prob *= 1 / word_prob

    elif ngram_type == "trigram":
        for i in range(len(sentence_tokens) - 2):
            w1, w2, w3 = sentence_tokens[i], sentence_tokens[i+1], sentence_tokens[i+2]
            if (w1, w2) in model and w3 in model[(w1, w2)]:
                word_prob = (model[(w1, w2)][w3] + 1) / (sum(model[(w1, w2)].values()) + len(model))
            else:
                word_prob = 1e-5
            prob *= 1 / word_prob

    return math.pow(prob, 1/n)

# Example usage:
sample_unigram_sentence = generate_better_unigram_sentence().split()
sample_bigram_sentence = generate_better_bigram_sentence().split()
sample_trigram_sentence = generate_better_trigram_sentence().split()

print("Perplexity (Unigram):", calculate_perplexity(unigram_freq, sample_unigram_sentence, "unigram"))
print("Perplexity (Bigram):", calculate_perplexity(bigram_freq, sample_bigram_sentence, "bigram"))
print("Perplexity (Trigram):", calculate_perplexity(trigram_freq, sample_trigram_sentence, "trigram"))


Perplexity (Unigram): 370.0237931408215
Perplexity (Bigram): 185.66215786911124
Perplexity (Trigram): 266.9431531773597


In [215]:
def generate_smooth_diary_entry(num_sentences=5):
    diary_entry = []
    prev_last_word = None  

    for _ in range(num_sentences):
        # If we have a previous word, try generating a sentence based on it
        if prev_last_word and prev_last_word in bigram_freq:
            sentence = generate_better_trigram_sentence(prev_last_word)
        else:
            sentence = generate_better_bigram_sentence()

        diary_entry.append(sentence)
        prev_last_word = sentence.split()[-1]  # Update last word for the next sentence

    return " ".join(diary_entry)

# Generate and print a smooth diary entry
print("Smooth Diary Entry:")
print(generate_smooth_diary_entry())


Smooth Diary Entry:
Jumma ka liya nikal ay tha to time guzara bazar. Bnaye aur phir sb bike nikala tyar hona. Reh gya raat ko wapis aa ga khana khaya aur phir main. Mai soo gya aj late soya tha main. Khala b jana tha wo sb bhot khush rha phir nicha.


In [216]:
import random
from nltk.probability import ConditionalFreqDist

# Create reversed bigrams (right-to-left)
reversed_bigrams = [(w2, w1) for w1, w2 in bigrams]
backward_bigram_freq = ConditionalFreqDist(reversed_bigrams)

def generate_backward_bigram_sentence():
    sentence_length = random.randint(7, 12)
    
    # Select a random last word from frequent words
    last_word = random.choice(starting_words) if starting_words else random.choice(list(unigram_freq.keys()))
    sentence = [last_word]

    for _ in range(sentence_length - 1):
        current_word = sentence[-1]
        if current_word in backward_bigram_freq:
            prev_word = random.choices(list(backward_bigram_freq[current_word].keys()), 
                                       weights=backward_bigram_freq[current_word].values())[0]
            sentence.append(prev_word)
        else:
            break

    # Reverse the generated words to make the sentence readable
    return " ".join(sentence[::-1]).capitalize() + "."

# Generate a backward bigram sentence
print("Backward Bigram Sentence:", generate_backward_bigram_sentence())


Backward Bigram Sentence: Or kuch der baad main apna kamroon.


In [218]:
def generate_bidirectional_bigram_sentence():
    sentence_length = random.randint(7, 12)

    # Select a random middle word to start the sentence
    middle_word = random.choice(starting_words) if starting_words else random.choice(list(unigram_freq.keys()))
    sentence = [middle_word]

    # Generate words forward (right)
    for _ in range(sentence_length // 2):
        last_word = sentence[-1]
        if last_word in bigram_freq:
            next_word = random.choices(list(bigram_freq[last_word].keys()), 
                                       weights=bigram_freq[last_word].values())[0]
            sentence.append(next_word)
        else:
            break

    # Generate words backward (left)
    for _ in range(sentence_length // 2):
        first_word = sentence[0]
        if first_word in backward_bigram_freq:
            prev_word = random.choices(list(backward_bigram_freq[first_word].keys()), 
                                       weights=backward_bigram_freq[first_word].values())[0]
            sentence.insert(0, prev_word)
        else:
            break

    return " ".join(sentence).capitalize() + "."

# Generate a bidirectional bigram sentence
print("Bidirectional Bigram Sentence:", generate_bidirectional_bigram_sentence())


Bidirectional Bigram Sentence: Aya to hum na tasweerein bnayi aur phir main.
