In [1]:
from functions.preprocessing import remove_accents, preprocess_text, identify_ambiguous_words, clean_text
from functions.feature_extraction import extract_features, extract_position_features
from collections import defaultdict
from functions.decision_list import classify_word, measure_collocations, calculate_log_likelihood, build_decision_list 

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/onkars/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
# Load the data from the datasets folder
def load_data(file_path, encoding="utf-8"):
    """
    Read a dataset file and return its contents as one flattened string.

    Every newline and tab character is replaced by a single space, so the
    whole file comes back as one line of text. No tokenisation happens here;
    callers split the string themselves if they need tokens.

    Args:
        file_path: Path of the dataset file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        str: The file contents with newlines and tabs turned into spaces.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid in the given encoding.
    """
    # Explicit encoding: the default for open() varies between platforms.
    with open(file_path, 'r', encoding=encoding) as file:
        return file.read().replace("\n", " ").replace("\t", " ")

# Read the train/test split for each target word (bass first, then sake)
bass_train, bass_test = load_data('datasets/bass.trn'), load_data('datasets/bass.tst')
sake_train, sake_test = load_data('datasets/sake.trn'), load_data('datasets/sake.tst')

In [3]:
# Preview the first 100 characters of the raw bass test text (a single flattened string)
bass_test[:100]

'bass: the frantic drums and pulsing bass unleashed by Roni Size and bass: timbres into a single mudd'

In [4]:
# Display the whole raw sake test text; load_data has already turned newlines/tabs into spaces.
# In the recorded output each instance starts with a `sake:` or `*sake:` marker.
sake_test

'sake: had been done for the sake of knowledge and he had sake: Herrera, we assumed for the sake of argument ``that in a sake: happily grow them for the sake of my garden. <DOC id="NYT20000825.0173" sake: undergo relentless persecution for the sake of upholding deeply held beliefs *sake: been the hallmarks of the sake industry in the 20th century. sake: of our people, for the sake of other parties, too, and sake: and immigration facilities for the sake of business, according to Bangladeshi sake: and the law for the sake of their own interests,\'\' said sake: on Tuesday night. ``For the sake of our children and our sake: disturb ``faithful people\'\' for the sake of fighting fundamentalism. Yilmaz also sake: her sake and my mom\'s sake and my other sister\'s sake. sake: to gas-fueled stations for the sake of environmental protection. <DOC id="XIE20000130.0153" sake: do bring beer, for the sake of the rest of us sake: But, ``size for its own sake is not what we are sake: refusing all coo

In [5]:
# Display the whole raw bass training text (one flattened string from load_data)
bass_train



In [6]:
# Run clean_text over the raw bass training text and display the result
# (the recorded output is a list of lowercase tokens).
# NOTE(review): a later cell rebinds bass_train_clean to the output of preprocess_text,
# which displays as a plain string — re-running cells out of order changes this variable's type.
bass_train_clean = clean_text(bass_train)
bass_train_clean

['bass',
 'stephan',
 'weidner',
 'the',
 'composer',
 'and',
 'bass',
 'player',
 'for',
 'boehse',
 'onkelz',
 'a',
 'bass',
 'valued',
 'at',
 '250000',
 'another',
 'double',
 'bass',
 'trapped',
 'in',
 'the',
 'room',
 'is',
 'bass',
 'portion',
 'of',
 'shrimp',
 'mussels',
 'sea',
 'bass',
 'and',
 'whatnot',
 'in',
 'a',
 'spicy',
 'bass',
 'optional',
 'material',
 'follows',
 'striped',
 'bass',
 'are',
 'also',
 'being',
 'spotted',
 'in',
 'bass',
 'source',
 'of',
 'entertainment',
 'is',
 'payperview',
 'bass',
 'fishing',
 'yet',
 'this',
 'is',
 'still',
 'bass',
 'herring',
 'and',
 'the',
 'enormous',
 'striped',
 'bass',
 'that',
 'feed',
 'on',
 'them',
 'it',
 'bass',
 'incorporates',
 'the',
 'entire',
 'trio',
 'the',
 'bass',
 'and',
 'drums',
 'arent',
 'simply',
 'supporting',
 'bass',
 'an',
 'immediate',
 'increase',
 'in',
 'rumbling',
 'bass',
 'tones',
 'associated',
 'with',
 'explosions',
 'but',
 'bass',
 '_',
 'complete',
 'with',
 'horn',
 'section'

In [5]:


# Normalise every split with preprocess_text.
# NOTE(review): the displayed results appear to be single lowercase strings without
# punctuation, not token lists — confirm that the downstream helpers expect a string.
bass_train_clean, bass_test_clean = preprocess_text(bass_train), preprocess_text(bass_test)
sake_train_clean, sake_test_clean = preprocess_text(sake_train), preprocess_text(sake_test)

# Look for ambiguous words in each training split and print what was found.
# NOTE(review): both prints showed {} in the recorded run, so every later step
# received an empty set of ambiguous words — verify the input identify_ambiguous_words expects.
ambiguous_words_bass = identify_ambiguous_words(bass_train_clean)
print(ambiguous_words_bass)

ambiguous_words_sake = identify_ambiguous_words(sake_train_clean)
print(ambiguous_words_sake)

{}
{}


In [6]:
# bass_train_clean holds the preprocess_text output here, which displays as a str,
# so this slice yields the first 20 characters rather than the first 20 tokens.
bass_train_clean[:20]

'bass stephan weidner'

In [7]:
# Display the whole preprocessed sake training text
sake_train_clean

'sake of the society for the sake of their reemployment at an sake sacrifice their souls for the sake of their country doc idafe sake however making controversy for controversys sake isnt what hes about he sake branches of government for the sake of suing cardinale said and sake hurt taking experience for the sake of taking experience despite that sake commitment to art for arts sake and a commitment to the sake and coarsely chopped  cup sake  cup chicken stock  sake to make sacrifices for the sake economic stability she said she sake stay together indefinitely for the sake of the kids in a sake to killing prostitutes for the sake of god the killings have sake to change them for the sake of pragmatic political gains what sake we must play for the sake of the game kottan said sake a major concession for the sake of stability dropping a demand sake of history for the future sake of the world the forces sake sacrificing your life for the sake or work or they believe sake moralminded will 

In [8]:


# Feature-extraction settings, named once so both datasets always use the same values.
CONTEXT_WINDOW_SIZE = 21  # passed as window_size — TODO confirm whether this is the total span or per side
POSITION_OFFSET_K = 1     # passed as k for the positional (-k, +k) features

# Extract context features from the training data (bass and sake datasets).
# NOTE(review): both prints showed [] in the recorded run; the ambiguous-word
# sets passed in were empty at that point, which presumably explains it — verify.
bass_train_features = extract_features(
    bass_train_clean, ambiguous_words_bass, window_size=CONTEXT_WINDOW_SIZE
)
print(bass_train_features)

sake_train_features = extract_features(
    sake_train_clean, ambiguous_words_sake, window_size=CONTEXT_WINDOW_SIZE
)
print(sake_train_features)

# Optionally, extract positional features (-k, +k)
bass_train_positional_features = extract_position_features(
    bass_train_clean, ambiguous_words_bass, k=POSITION_OFFSET_K
)
sake_train_positional_features = extract_position_features(
    sake_train_clean, ambiguous_words_sake, k=POSITION_OFFSET_K
)

[]
[]


In [9]:


# Count the collocations found in the extracted training features (bass, then sake)
bass_collocation_freq, sake_collocation_freq = (
    measure_collocations(bass_train_features),
    measure_collocations(sake_train_features),
)

# Placeholder totals: an empty defaultdict(int), i.e. no real frequencies are loaded yet.
# NOTE(review): replace with actual totals before trusting the log-likelihoods below.
total_collocations = defaultdict(int)

# Turn the collocation counts into log-likelihood scores
bass_log_likelihoods, sake_log_likelihoods = (
    calculate_log_likelihood(bass_collocation_freq, total_collocations),
    calculate_log_likelihood(sake_collocation_freq, total_collocations),
)

# Build one decision list per target word from those scores
bass_decision_list, sake_decision_list = (
    build_decision_list(bass_log_likelihoods),
    build_decision_list(sake_log_likelihoods),
)

In [10]:


# Testing on the bass and sake test datasets
def _evaluate_decision_list(test_text, decision_list, window=10):
    """
    Classify every token of a test set and count how many predictions match.

    Args:
        test_text: The preprocessed test data. A string is split on whitespace
            into tokens (iterating a str directly would visit single
            characters); any other iterable is used as a token sequence as-is.
        decision_list: The decision list handed to classify_word.
        window: Number of tokens taken on each side of the current token as
            its context. NOTE(review): 10 per side gives a 21-token span,
            chosen to mirror window_size=21 used for the training features —
            confirm how extract_features interprets window_size.

    Returns:
        tuple: (correct, total) — the number of tokens whose prediction
        equalled the compared value, and the number of tokens evaluated.
    """
    tokens = test_text.split() if isinstance(test_text, str) else list(test_text)
    correct = 0
    for position, word in enumerate(tokens):
        # Context = up to `window` tokens before and after, excluding the token itself
        context = tokens[max(0, position - window):position] + tokens[position + 1:position + 1 + window]
        # NOTE(review): comparing against the token itself is carried over from the
        # original loop. The raw test files mark instances with `word:` / `*word:`
        # prefixes, and the `*` no longer appears in the preprocessed text, so the gold
        # sense labels are not available here — replace `word` with the gold label
        # once it is parsed from the raw data.
        if classify_word(context, decision_list) == word:
            correct += 1
    return correct, len(tokens)


correct_predictions_bass, total_bass = _evaluate_decision_list(bass_test_clean, bass_decision_list)
correct_predictions_sake, total_sake = _evaluate_decision_list(sake_test_clean, sake_decision_list)

# Calculate accuracy (an empty test set yields 0.0 instead of a ZeroDivisionError)
bass_accuracy = correct_predictions_bass / total_bass if total_bass else 0.0
sake_accuracy = correct_predictions_sake / total_sake if total_sake else 0.0

print(f"Bass dataset accuracy: {bass_accuracy}")
print(f"Sake dataset accuracy: {sake_accuracy}")

Bass dataset accuracy: 0.0
Sake dataset accuracy: 0.0


In [11]:
# Show the raw count of correct bass predictions behind the accuracy printed above
correct_predictions_bass

0

In [12]:
# Display the preprocessed sake test text.
# NOTE(review): in the recorded output the `*` that prefixes some `sake:` markers in the
# raw test text no longer appears, so the two kinds of instance cannot be told apart here.
sake_test_clean

'sake had been done for the sake of knowledge and he had sake herrera we assumed for the sake of argument that in a sake happily grow them for the sake of my garden doc idnyt sake undergo relentless persecution for the sake of upholding deeply held beliefs sake been the hallmarks of the sake industry in the th century sake of our people for the sake of other parties too and sake and immigration facilities for the sake of business according to bangladeshi sake and the law for the sake of their own interests said sake on tuesday night for the sake of our children and our sake disturb faithful people for the sake of fighting fundamentalism yilmaz also sake her sake and my moms sake and my other sisters sake sake to gasfueled stations for the sake of environmental protection doc idxie sake do bring beer for the sake of the rest of us sake but size for its own sake is not what we are sake refusing all cookies for the sake of privacy im more selective sake hope really just for the sake of it