In [1]:
from utils.preprocessing import load_data, clean_text
from utils.feature_extraction import extract_all_features
from utils.log_likelihood import (
    count_feature_frequencies,
    calculate_probabilities,
    calculate_log_likelihood,
    rank_features_by_log_likelihood
)
from utils.decision_list import build_decision_list, classify_test_data, get_default_sense
from utils.evaluation import evaluate_classifier_with_metrics

ImportError: cannot import name 'classify_test_instance' from 'utils.decision_list' (/Users/onkars/Documents/PSYC681/Problem Set/nlp1_rit_course/section2/utils/decision_list.py)

## Data Ingestion

In [16]:
# All four corpus files sit in the same directory; only the file name differs
# per target word (bass / sake) and split (trn / tst).
# NOTE(review): this is an absolute, machine-specific path — anyone else running
# the notebook has to edit it here.
DATASET_DIR = '/Users/onkars/Documents/PSYC681/Problem Set/nlp1_rit_course/section2/datasets'

train_file_bass = DATASET_DIR + '/bass_trn.txt'
train_file_sake = DATASET_DIR + '/sake_trn.txt'
test_file_bass = DATASET_DIR + '/bass_tst.txt'
test_file_sake = DATASET_DIR + '/sake_tst.txt'

# Training split for each target word.
train_data_bass = load_data(train_file_bass)
train_data_sake = load_data(train_file_sake)

# Held-out test split for each target word.
test_data_bass = load_data(test_file_bass)
test_data_sake = load_data(test_file_sake)



In [None]:
# Peek at the first three training instances of each word, one list per line.
print(train_data_bass[:3], train_data_sake[:3], sep="\n")

## Feature Extraction

In [18]:
def apply_feature_extraction(data):
    """
    Build one labelled feature mapping per instance in ``data``.

    :param data: Iterable of instances. Each instance is handed to
        ``extract_all_features`` and must be indexable by ``'sense'``.
    :return: List with one entry per instance, in input order: the object
        returned by ``extract_all_features`` with the instance's sense label
        stored under the ``'sense'`` key.
    """
    def labelled(instance):
        # Attach the gold label to the extracted features so later stages
        # can read features and label from the same mapping.
        extracted = extract_all_features(instance)
        extracted['sense'] = instance['sense']
        return extracted

    return [labelled(instance) for instance in data]


In [19]:
# Extract features for each target word and each split.
# The test features must be built from the held-out test instances; building
# them from the training instances would make them a copy of the training set.
train_features_bass = apply_feature_extraction(train_data_bass)
test_features_bass = apply_feature_extraction(test_data_bass)

train_features_sake = apply_feature_extraction(train_data_sake)
test_features_sake = apply_feature_extraction(test_data_sake)



In [None]:
# Show the first extracted feature set of each word as a sanity check
display(train_features_bass[0], train_features_sake[0])

## Log-Likelihood Calculation

In [None]:
# Per-sense feature counts for each target word.
# The call returns four values: the two per-sense frequency tables followed by
# the two per-sense totals, in the order sense_label1, sense_label2.
# NOTE(review): the variable names imply "*bass" is the fish sense and "*sake"
# the beer sense — confirm against the dataset's labelling convention.
(
    freq_bass_fish,
    freq_bass_music,
    total_bass_fish,
    total_bass_music,
) = count_feature_frequencies(train_features_bass, sense_label1="*bass", sense_label2="bass")

(
    freq_sake_beer,
    freq_sake_cause,
    total_sake_beer,
    total_sake_cause,
) = count_feature_frequencies(train_features_sake, sense_label1="*sake", sense_label2="sake")



In [22]:
# Calculate probabilities (apply Laplace smoothing)
# The smoothing vocabulary size has to come from the same word's training data:
# using the bass feature count for sake would smooth sake with the wrong
# denominator. Each count is the number of distinct feature keys seen in that
# word's training features, excluding the 'sense' label key.
# NOTE(review): presumably these keys are the same units that
# count_feature_frequencies counts — confirm in utils.log_likelihood.
num_unique_features_bass = len({f for data in train_features_bass for f in data if f != 'sense'})
num_unique_features_sake = len({f for data in train_features_sake for f in data if f != 'sense'})

# Kept so that any cell still reading the old name sees the same (bass) value as before.
num_unique_features = num_unique_features_bass

prob_bass_fish = calculate_probabilities(freq_bass_fish, total_bass_fish, num_unique_features=num_unique_features_bass)
prob_bass_music = calculate_probabilities(freq_bass_music, total_bass_music, num_unique_features=num_unique_features_bass)

prob_sake_beer = calculate_probabilities(freq_sake_beer, total_sake_beer, num_unique_features=num_unique_features_sake)
prob_sake_cause = calculate_probabilities(freq_sake_cause, total_sake_cause, num_unique_features=num_unique_features_sake)



In [23]:
# Calculate log-likelihood ratios
# One result per target word, built from that word's two per-sense probability tables.
# NOTE(review): the argument order is (fish, music) for bass and (beer, cause) for sake;
# presumably this fixes the direction/sign of the score — confirm in utils.log_likelihood.
log_likelihood_bass = calculate_log_likelihood(prob_bass_fish, prob_bass_music)
log_likelihood_sake = calculate_log_likelihood(prob_sake_beer, prob_sake_cause)



In [None]:
# Rank each word's features by their log-likelihood score
ranked_bass_features = rank_features_by_log_likelihood(log_likelihood_bass)
ranked_sake_features = rank_features_by_log_likelihood(log_likelihood_sake)

# Print the first ten (feature, score) pairs of each ranking under its own header
for header, ranked in (
    ("Top 10 features for 'bass':", ranked_bass_features),
    ("\nTop 10 features for 'sake':", ranked_sake_features),
):
    print(header)
    for feature, score in ranked[:10]:
        print(f"{feature}: {score}")

## Decision List Classifier

In [None]:
# Step 1: turn each ranked feature list into a decision list
decision_list_bass = build_decision_list(
    ranked_bass_features,
    sense_label1="*bass",
    sense_label2="bass",
)
decision_list_sake = build_decision_list(
    ranked_sake_features,
    sense_label1="*sake",
    sense_label2="sake",
)

# Step 2: pick a fallback sense per word from the training instances
default_sense_bass = get_default_sense(
    train_data_bass,
    sense_label1="*bass",
    sense_label2="bass",
)
default_sense_sake = get_default_sense(
    train_data_sake,
    sense_label1="*sake",
    sense_label2="sake",
)

# Step 3: label the held-out test instances
# NOTE(review): the raw test instances (not the extracted test features) are
# passed here — presumably classify_test_data extracts features itself; verify.
predictions_bass = classify_test_data(
    test_data_bass,
    decision_list_bass,
    default_sense_bass,
)
predictions_sake = classify_test_data(
    test_data_sake,
    decision_list_sake,
    default_sense_sake,
)


In [38]:
# Score each classifier's predictions against the gold sense labels of its test set.
# The evaluator returns five values, unpacked in the order shown below.
(
    accuracy_bass,
    conf_matrix_bass,
    precision_bass,
    recall_bass,
    f1_bass,
) = evaluate_classifier_with_metrics(predictions_bass, [inst['sense'] for inst in test_data_bass])

(
    accuracy_sake,
    conf_matrix_sake,
    precision_sake,
    recall_sake,
    f1_sake,
) = evaluate_classifier_with_metrics(predictions_sake, [inst['sense'] for inst in test_data_sake])

In [None]:
# Report the headline metrics and the confusion matrix for each classifier.
# Accuracy is scaled to a percentage; the other metrics are printed as returned, to two decimals.
print(
    f"Bass Classifier - Accuracy: {accuracy_bass * 100:.2f}%, "
    f"Precision: {precision_bass:.2f}, Recall: {recall_bass:.2f}, F1-Score: {f1_bass:.2f}"
)
print("Bass Confusion Matrix:", conf_matrix_bass, sep="\n")

print(
    f"Sake Classifier - Accuracy: {accuracy_sake * 100:.2f}%, "
    f"Precision: {precision_sake:.2f}, Recall: {recall_sake:.2f}, F1-Score: {f1_sake:.2f}"
)
print("Sake Confusion Matrix:", conf_matrix_sake, sep="\n")