In [1]:
# Load the data from the datasets folder
def load_data(file_path, encoding='utf-8'):
    """
    Read a dataset file and return its whitespace-separated tokens.

    Args:
        file_path: Path of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        list[str]: The file content split on any run of whitespace (spaces,
        tabs, newlines). An empty file yields an empty list.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid in the given encoding.
    """
    # The context manager closes the handle even if reading fails.
    with open(file_path, 'r', encoding=encoding) as file:
        # str.split() with no argument collapses consecutive whitespace, so
        # line boundaries are not preserved in the returned token list.
        return file.read().split()

# Read the train/test split for each target word ("bass" and "sake").
# Each name ends up holding the flat token list returned by load_data.
bass_train, bass_test = (
    load_data('datasets/bass.trn'),
    load_data('datasets/bass.tst'),
)
sake_train, sake_test = (
    load_data('datasets/sake.trn'),
    load_data('datasets/sake.tst'),
)

In [2]:
# Display the raw bass test tokens: a flat list of whitespace-separated
# strings as returned by load_data.
# NOTE(review): this shows the entire list; a slice such as bass_test[:20]
# would keep the rendered output short.
bass_test

['bass:',
 'the',
 'frantic',
 'drums',
 'and',
 'pulsing',
 'bass',
 'unleashed',
 'by',
 'Roni',
 'Size',
 'and',
 'bass:',
 'timbres',
 'into',
 'a',
 'single',
 'muddy',
 'bass',
 'line.',
 '``The',
 'treble',
 'fared',
 'no',
 'bass:',
 'but',
 'instead',
 'of',
 'playing',
 'his',
 'bass',
 'with',
 'a',
 'pick,',
 'as',
 'Avery',
 '*bass:',
 'presentation',
 'of',
 'a',
 'whole',
 'black',
 'bass',
 '_',
 'slit',
 'and',
 'sprinkled',
 'with',
 'bass:',
 'compare',
 'themselves',
 'to',
 'one',
 'another,',
 'bass',
 'player',
 'Milt',
 'Hinton,',
 'who',
 'gained',
 'bass:',
 'song',
 'with',
 'a',
 'hauntingly',
 'deep',
 'bass',
 'and',
 'monotone',
 'refrain:',
 'One',
 'more',
 '*bass:',
 'the',
 'rocks',
 'or',
 'wreck',
 'with',
 'bass',
 'fishing',
 'gear.',
 'On',
 'Saturday,',
 'the',
 'bass:',
 'Laurent,',
 'movingly',
 'portrayed',
 'by',
 'the',
 'bass',
 'Robert',
 'Lloyd,',
 'secretly',
 'marries',
 'them,',
 'bass:',
 'dramatic',
 'licks',
 'of',
 'the',
 'electr

In [3]:
# Display the raw sake test tokens: a flat list of whitespace-separated
# strings as returned by load_data.
# NOTE(review): this shows the entire list; a slice such as sake_test[:20]
# would keep the rendered output short.
sake_test

['sake:',
 'had',
 'been',
 'done',
 'for',
 'the',
 'sake',
 'of',
 'knowledge',
 'and',
 'he',
 'had',
 'sake:',
 'Herrera,',
 'we',
 'assumed',
 'for',
 'the',
 'sake',
 'of',
 'argument',
 '``that',
 'in',
 'a',
 'sake:',
 'happily',
 'grow',
 'them',
 'for',
 'the',
 'sake',
 'of',
 'my',
 'garden.',
 '<DOC',
 'id="NYT20000825.0173"',
 'sake:',
 'undergo',
 'relentless',
 'persecution',
 'for',
 'the',
 'sake',
 'of',
 'upholding',
 'deeply',
 'held',
 'beliefs',
 '*sake:',
 'been',
 'the',
 'hallmarks',
 'of',
 'the',
 'sake',
 'industry',
 'in',
 'the',
 '20th',
 'century.',
 'sake:',
 'of',
 'our',
 'people,',
 'for',
 'the',
 'sake',
 'of',
 'other',
 'parties,',
 'too,',
 'and',
 'sake:',
 'and',
 'immigration',
 'facilities',
 'for',
 'the',
 'sake',
 'of',
 'business,',
 'according',
 'to',
 'Bangladeshi',
 'sake:',
 'and',
 'the',
 'law',
 'for',
 'the',
 'sake',
 'of',
 'their',
 'own',
 "interests,''",
 'said',
 'sake:',
 'on',
 'Tuesday',
 'night.',
 '``For',
 'the',
 '

In [4]:
from functions.preprocessing import preprocess_text, identify_ambiguous_words

# Re-join every raw token list into a single space-separated string and run
# it through preprocess_text (described by its author as lowercasing and
# tokenizing). The four splits are processed in the order bass train/test,
# sake train/test.
(bass_train_clean,
 bass_test_clean,
 sake_train_clean,
 sake_test_clean) = [
    preprocess_text(' '.join(raw_tokens))
    for raw_tokens in (bass_train, bass_test, sake_train, sake_test)
]

# Ambiguous words are identified from the cleaned training splits only.
ambiguous_words_bass, ambiguous_words_sake = (
    identify_ambiguous_words(bass_train_clean),
    identify_ambiguous_words(sake_train_clean),
)

In [7]:
from functions.feature_extraction import extract_features, extract_position_features
from functions.preprocessing import remove_accents

# Features for the training splits (bass first, then sake), extracted with
# window_size=10.
bass_train_features, sake_train_features = (
    extract_features(clean_tokens, target_words, window_size=10)
    for clean_tokens, target_words in (
        (bass_train_clean, ambiguous_words_bass),
        (sake_train_clean, ambiguous_words_sake),
    )
)

# Optional positional features (-k, +k), here with k=1.
bass_train_positional_features, sake_train_positional_features = (
    extract_position_features(bass_train_clean, ambiguous_words_bass, k=1),
    extract_position_features(sake_train_clean, ambiguous_words_sake, k=1),
)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/onkars/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [8]:
from collections import defaultdict
from functions.decision_list import measure_collocations, calculate_log_likelihood, build_decision_list

# Collocation measurements from the training features (bass, then sake).
bass_collocation_freq, sake_collocation_freq = (
    measure_collocations(bass_train_features),
    measure_collocations(sake_train_features),
)

# Placeholder totals: an empty defaultdict(int), so every lookup yields 0.
# The same object is handed to both calculate_log_likelihood calls below.
# NOTE(review): replace with real totals; confirm how calculate_log_likelihood
# behaves when every total is 0.
total_collocations = defaultdict(int)

# Log-likelihood scores for each word, computed against the shared totals.
bass_log_likelihoods, sake_log_likelihoods = (
    calculate_log_likelihood(bass_collocation_freq, total_collocations),
    calculate_log_likelihood(sake_collocation_freq, total_collocations),
)

# One decision list per target word, built from its log-likelihood scores.
bass_decision_list, sake_decision_list = (
    build_decision_list(bass_log_likelihoods),
    build_decision_list(sake_log_likelihoods),
)

In [9]:
from functions.decision_list import classify_word

# Evaluate each decision list on its held-out test set.
#
# The previous version passed an empty context to classify_word for every
# token and compared the prediction with the token itself, so no context was
# ever used and the score was not a per-instance accuracy. This version scores
# one prediction per test instance against that instance's marker.


def split_instances(raw_tokens, target):
    """
    Group a flat token list into (gold_sense, context_tokens) pairs.

    A new instance starts at every token equal to ``'<target>:'`` or
    ``'*<target>:'``; the tokens that follow, up to the next marker, form its
    context. Tokens appearing before the first marker are dropped.

    Args:
        raw_tokens: Tokens as returned by load_data (not preprocessed, so the
            marker tokens are still intact).
        target: The target word, e.g. ``'bass'``.

    Returns:
        list[tuple[str, list[str]]]: One ``(gold_sense, context_tokens)`` pair
        per instance, where gold_sense is the marker without its trailing
        colon (``'bass'`` or ``'*bass'``).
    """
    # NOTE(review): a context token that happens to equal a marker would start
    # a spurious instance; the flat token list has no line boundaries to
    # disambiguate this.
    markers = (f'{target}:', f'*{target}:')
    instances = []
    for token in raw_tokens:
        if token in markers:
            instances.append((token.rstrip(':'), []))
        elif instances:
            instances[-1][1].append(token)
    return instances


def count_correct(raw_tokens, target, decision_list):
    """
    Classify every test instance and count matches with its gold marker.

    Args:
        raw_tokens: Raw test tokens as returned by load_data.
        target: The target word, e.g. ``'bass'``.
        decision_list: Decision list passed through to classify_word.

    Returns:
        tuple[int, int]: ``(number_correct, number_of_instances)``.
    """
    instances = split_instances(raw_tokens, target)
    correct = 0
    for gold_sense, context_tokens in instances:
        # Apply the same join + preprocess_text step used on the training data.
        context = preprocess_text(' '.join(context_tokens))
        # NOTE(review): assumes classify_word returns the sense in the same
        # form as the marker without its colon ('bass' / '*bass') -- confirm
        # against functions.decision_list and adjust the mapping if needed.
        if classify_word(context, decision_list) == gold_sense:
            correct += 1
    return correct, len(instances)


# Testing on bass test dataset
correct_predictions_bass, total_instances_bass = count_correct(
    bass_test, 'bass', bass_decision_list
)

# Testing on sake test dataset
correct_predictions_sake, total_instances_sake = count_correct(
    sake_test, 'sake', sake_decision_list
)

# Calculate accuracy (guarding against an empty test set)
bass_accuracy = correct_predictions_bass / total_instances_bass if total_instances_bass else 0.0
sake_accuracy = correct_predictions_sake / total_instances_sake if total_instances_sake else 0.0

print(f"Bass dataset accuracy: {bass_accuracy}")
print(f"Sake dataset accuracy: {sake_accuracy}")

Bass dataset accuracy: 0.0
Sake dataset accuracy: 0.0
