In [12]:
from collections import Counter, defaultdict
import math

# Tokenize the text and prepare trigrams

In [13]:
# Location of the plain-text corpus the trigram model is built from.
# NOTE(review): this is an absolute, machine-specific Windows path; anyone else
# running the notebook has to point it at their own copy of the corpus.
file_path = r"C:\Users\mostafa\AI&ML\assiment-nlp\AI-work.txt"

def collect_and_tokenize_corpus(file_path):
    """Load a UTF-8 text corpus and derive its tokens and trigrams.

    The text is lower-cased and split on whitespace only, so punctuation
    stays attached to the word it touches (e.g. ``"trading:"``).

    Args:
        file_path: Path of the text file to read.

    Returns:
        A ``(tokens, trigram_model)`` pair: the list of tokens in reading
        order, and a list with one 3-tuple per run of three consecutive
        tokens (empty when the corpus has fewer than three tokens).
    """
    with open(file_path, 'r', encoding='utf-8') as corpus_file:
        lowered_text = corpus_file.read().lower()

    tokens = lowered_text.split()

    # Zipping three views of the token list, each shifted by one position,
    # yields every window of three consecutive tokens.
    trigram_model = list(zip(tokens, tokens[1:], tokens[2:]))

    return tokens, trigram_model
def extract_trigrams(file_path, num_samples=10):
    """Return the first ``num_samples`` trigrams of the corpus at ``file_path``.

    Args:
        file_path: Path of the text file to read.
        num_samples: Maximum number of leading trigrams to return.

    Returns:
        A list of at most ``num_samples`` 3-tuples, in corpus order.
    """
    corpus = collect_and_tokenize_corpus(file_path)
    all_trigrams = corpus[1]  # element 0 holds the tokens, which are not needed here
    return all_trigrams[:num_samples]

# Sanity check: pull the first 20 trigrams from the corpus so the tokenization
# can be inspected by eye.
trigram_sample = extract_trigrams(file_path, num_samples=20)

# Print one trigram tuple per line under a short heading.
print("Sample Trigrams:")
for trigram in trigram_sample:
    print(trigram)

Sample Trigrams:
('artificial', 'intelligence', 'techniques')
('intelligence', 'techniques', 'in')
('techniques', 'in', 'financial')
('in', 'financial', 'trading:')
('financial', 'trading:', 'a')
('trading:', 'a', 'systematic')
('a', 'systematic', 'artificial')
('systematic', 'artificial', 'intelligence')
('artificial', 'intelligence', 'financial')
('intelligence', 'financial', 'technology')
('financial', 'technology', 'artificial')
('technology', 'artificial', 'intelligence')
('artificial', 'intelligence', '(ai)')
('intelligence', '(ai)', 'approaches')
('(ai)', 'approaches', 'have')
('approaches', 'have', 'been')
('have', 'been', 'increasingly')
('been', 'increasingly', 'used')
('increasingly', 'used', 'in')
('used', 'in', 'financial')


## Build the trigram frequency model with Laplace smoothing


In [14]:
class _NonStoringDefaultDict(defaultdict):
    """A ``defaultdict`` whose lookups of missing keys do not insert them.

    A plain ``defaultdict`` stores the default value under every key that is
    merely looked up. For the trigram model this meant that scoring unseen
    trigrams silently added them to the model, where ``autocomplete`` (which
    iterates over ``.items()``) could later offer them as suggestions.
    """

    def __missing__(self, key):
        # Mirror defaultdict's contract when no factory is configured.
        if self.default_factory is None:
            raise KeyError(key)
        # Return the default WITHOUT storing it, keeping the model read-only
        # under lookup.
        return self.default_factory()


def build_trigram_model(trigram_model, tokens, alpha=1):
    """Count trigrams and assign each an add-alpha smoothed probability.

    Every observed trigram gets
    ``(count + alpha) / (total_trigrams + alpha * vocabulary_size)``;
    any trigram that was not observed evaluates to
    ``alpha / (total_trigrams + alpha * vocabulary_size)``.

    NOTE(review): the denominator uses the total number of trigrams rather
    than the count of the trigram's leading bigram, so these values are not
    conditional probabilities P(w3 | w1, w2). Ranking candidates that share
    the same bigram context is unaffected. The formula is kept as written.

    Args:
        trigram_model: Iterable of hashable trigrams (3-tuples of tokens).
        tokens: Iterable of tokens; only its number of distinct values is used.
        alpha: Smoothing constant added to every count (1 = Laplace smoothing).

    Returns:
        A ``(trigram_counts, smoothed_probs)`` pair: a ``Counter`` of raw
        trigram frequencies, and a ``defaultdict`` subclass mapping trigrams
        to smoothed probabilities. Looking up an unseen trigram returns the
        default probability but does not add the trigram to the mapping.
    """
    trigram_counts = Counter(trigram_model)  # how many times each trigram appears
    total_trigrams = sum(trigram_counts.values())
    vocabulary_size = len(set(tokens))  # distinct tokens only

    # Shared by every probability, so compute it once.
    denominator = total_trigrams + alpha * vocabulary_size

    # The default is evaluated lazily, so an empty corpus only fails if an
    # unseen trigram is actually looked up.
    smoothed_probs = _NonStoringDefaultDict(lambda: alpha / denominator)

    for trigram, count in trigram_counts.items():
        smoothed_probs[trigram] = (count + alpha) / denominator

    return trigram_counts, smoothed_probs

# Autocomplete function


In [15]:
def autocomplete(input_text, smoothed_probs):
    """Suggest up to five next words for the last two words of ``input_text``.

    The input is lower-cased and split on whitespace. Every trigram in
    ``smoothed_probs`` whose first two words equal the last two input words
    contributes its third word as a candidate; candidates are returned in
    order of decreasing probability.

    Args:
        input_text: Text typed so far.
        smoothed_probs: Mapping from trigram tuples to probabilities.

    Returns:
        A list of at most five suggested words, or a single-element list
        holding a message when fewer than two words were typed or no trigram
        matches.
    """
    words = input_text.lower().split()
    if len(words) < 2:
        return ["Please type at least two words."]

    context = (words[-2], words[-1])

    # Collect the final word of every trigram that starts with the context.
    candidates = {}
    for gram, probability in smoothed_probs.items():
        if gram[:2] == context:
            candidates[gram[-1]] = probability

    if not candidates:
        return ["No suggestions available."]

    # The sort is stable, so equally likely words keep their model order.
    ranked_words = sorted(candidates, key=candidates.get, reverse=True)
    return ranked_words[:5]

# Function to calculate perplexity


In [16]:
def calculate_perplexity(smoothed_probs, test_trigrams):
    """Compute the base-2 perplexity of ``test_trigrams`` under the model.

    Perplexity is ``2 ** (-(1/N) * sum(log2 P(trigram)))`` where ``N`` is the
    number of test trigrams.

    Args:
        smoothed_probs: Mapping from trigram tuples to probabilities. If it
            is a ``defaultdict``, its default factory supplies the
            probability of unseen trigrams.
        test_trigrams: Sequence of trigrams to score.

    Returns:
        The perplexity as a float.

    Raises:
        ValueError: If ``test_trigrams`` is empty (perplexity is undefined),
            or if a probability is not positive (raised by ``math.log2``).
        KeyError: If a trigram is missing and the mapping has no default.
    """
    N = len(test_trigrams)
    if N == 0:
        raise ValueError("Cannot compute perplexity of an empty trigram list.")

    log_prob_sum = 0
    for trigram in test_trigrams:
        if trigram in smoothed_probs:
            prob = smoothed_probs[trigram]
        else:
            # Indexing a defaultdict with a missing key would store that key,
            # silently adding unseen test trigrams to the model. Ask the
            # factory for the default value directly instead.
            default_factory = getattr(smoothed_probs, "default_factory", None)
            if default_factory is None:
                prob = smoothed_probs[trigram]  # plain mapping: raises KeyError
            else:
                prob = default_factory()
        # log(P1*P2*P3) = log(P1) + log(P2) + log(P3); log2 is more accurate
        # than log(x, 2).
        log_prob_sum += math.log2(prob)

    return 2 ** (-log_prob_sum / N)


# Load and prepare the corpus data

In [17]:
from collections import Counter, defaultdict

# Read the corpus once and keep both the token list and the full trigram list
# as notebook-level variables for the cells below.
tokens, trigram_model = collect_and_tokenize_corpus(file_path)
# Build the raw trigram counts and the smoothed probabilities (default alpha=1).
trigram_counts, smoothed_probs = build_trigram_model(trigram_model, tokens)


# Streamlit Interface


In [18]:
import streamlit as st

# NOTE(review): the recorded output of this cell points at `streamlit run`, so
# this UI code presumably has to live in a script launched that way rather than
# be executed inside the notebook kernel — confirm.
st.title("Trigram Model Autocomplete and Perplexity Calculator")

# Corpus statistics, taken from the tokens and trigrams loaded earlier.
st.write(f"Total Words in Corpus: {len(tokens)}")
st.write(f"Total Trigrams in Corpus: {len(trigram_model)}")
st.write(f"Vocabulary Size: {len(set(tokens))}")

# Autocomplete Section
st.header("Autocomplete Suggestions")
user_input = st.text_input("Type your text here:")

# Only query the model once something has been typed; autocomplete() itself
# returns a message when fewer than two words are present or nothing matches.
if user_input:
    suggestions = autocomplete(user_input, smoothed_probs)
    st.write("Suggestions:")
    for suggestion in suggestions:
        st.write(suggestion)

2024-11-19 09:26:23.494 
  command:

    streamlit run C:\ProgramData\anaconda3\Lib\site-packages\ipykernel_launcher.py [ARGUMENTS]
