In [1]:
import math

In [3]:
# Feature matrix: one sample per line, integers separated by whitespace.
# `str.split()` with no argument tolerates repeated spaces, tabs and trailing
# whitespace; blank lines (e.g. a trailing newline at end of file) are skipped
# instead of failing on int('').
with open('spectX-1.txt', 'r') as f:
    features = [[int(token) for token in line.split()] for line in f if line.strip()]

# Labels: one integer per line, matched by position with the feature rows.
with open('spectY-1.txt', 'r') as f:
    labels = [int(line) for line in f if line.strip()]

# Later cells index both lists with the same sample index, so they must line up.
assert len(features) == len(labels), "feature rows and labels must have the same length"

In [5]:
# Run settings.
iterations = 256                # parameter updates; metrics are kept for 0..iterations
num_features = 23               # number of columns used from each feature row
num_samples = len(features)

# Mutable state filled in / updated by later cells.
feature_counts = [0 for _ in range(num_features)]     # per-feature count of samples with value 1
probabilities = [0.05 for _ in range(num_features)]   # initial per-feature parameter

In [7]:
# Number of samples in which each feature equals 1.
# Rebuilt from scratch (rather than incremented in place) so that re-running
# this cell does not double the counts.
feature_counts = [
    sum(1 for i in range(num_samples) if features[i][j] == 1)
    for j in range(num_features)
]

In [9]:
# Per-iteration metrics: slot k holds the value measured at iteration k,
# for k = 0 .. iterations inclusive.
mistakes = [0 for _ in range(iterations + 1)]
log_likelihood = [0 for _ in range(iterations + 1)]

In [11]:
# EM-style fitting of a noisy-OR model:
#     P(y = 1 | x) = 1 - prod_j (1 - p_j) ** x_j
# Each pass (a) records the average log-likelihood and the number of
# misclassified samples under the current `probabilities`, then (b) re-estimates
# `probabilities` from the per-sample posteriors.
#
# NOTE: `probabilities` is updated in place, so re-running this cell without
# re-running the configuration cell continues from the previous estimates.
for iteration in range(iterations + 1):
    # Accumulate locally and assign at the end, so re-running the cell never
    # adds on top of values left over from a previous run.
    total_log_likelihood = 0
    mistake_count = 0
    posteriors = []

    for sample in range(num_samples):
        row = features[sample]
        label = labels[sample]

        # P(y = 0 | x): computed once per sample and shared by the
        # log-likelihood, the mistake check and the posterior below.
        prob_of_0 = 1
        for feature in range(num_features):
            prob_of_0 *= (1 - probabilities[feature]) ** row[feature]
        prob_of_1 = 1 - prob_of_0

        # Log-likelihood contribution (labels are assumed to be 0 or 1).
        # Only the branch matching the label is evaluated, so a log(0) in the
        # irrelevant term can no longer raise; a genuinely impossible
        # observation contributes -inf instead of raising ValueError.
        if label == 1:
            sample_log_likelihood = math.log(prob_of_1) if prob_of_1 > 0 else float("-inf")
        else:
            sample_log_likelihood = 0
            for feature in range(num_features):
                if row[feature] == 1:
                    p = probabilities[feature]
                    sample_log_likelihood += math.log(1 - p) if p < 1 else float("-inf")
        total_log_likelihood += sample_log_likelihood

        # A prediction exactly at 0.5 counts as a mistake for either label.
        if (label == 1 and prob_of_1 <= 0.5) or (label == 0 and prob_of_1 >= 0.5):
            mistake_count += 1

        # Posterior that feature j "fired", given the sample:
        #     y * x_j * p_j / P(y = 1 | x)
        # When P(y = 1 | x) is exactly 0 every numerator is 0 as well
        # (no active feature has p_j > 0), so the posterior is defined as 0.
        if prob_of_1 != 0:
            posteriors.append([
                label * row[feature] * probabilities[feature] / prob_of_1
                for feature in range(num_features)
            ])
        else:
            posteriors.append([0.0] * num_features)

    log_likelihood[iteration] = total_log_likelihood / num_samples
    mistakes[iteration] = mistake_count

    # Parameter update: expected number of times feature j fired, divided by
    # the number of samples in which feature j is on.
    for feature in range(num_features):
        if feature_counts[feature] == 0:
            # Feature never active in the data: no evidence, keep the estimate.
            continue
        probabilities[feature] = sum(posteriors[sample][feature] for sample in range(num_samples)) / feature_counts[feature]

In [12]:
# Report metrics at iteration 0 and at every power of two up to `iterations`.
# The checkpoints are derived from `iterations` so that changing the run
# length cannot index past the end of the metric lists.
checkpoints = [0]
power_of_two = 1
while power_of_two <= iterations:
    checkpoints.append(power_of_two)
    power_of_two *= 2

for iteration in checkpoints:
    print(f"Iteration: {iteration} \t Mistakes: {mistakes[iteration]} \t Log Likelihood: {round(log_likelihood[iteration], 5)}")


Iteration: 0 	 Mistakes: 175 	 Log Likelihood: -0.95809
Iteration: 1 	 Mistakes: 56 	 Log Likelihood: -0.49592
Iteration: 2 	 Mistakes: 43 	 Log Likelihood: -0.40822
Iteration: 4 	 Mistakes: 42 	 Log Likelihood: -0.36461
Iteration: 8 	 Mistakes: 44 	 Log Likelihood: -0.3475
Iteration: 16 	 Mistakes: 40 	 Log Likelihood: -0.33462
Iteration: 32 	 Mistakes: 37 	 Log Likelihood: -0.32258
Iteration: 64 	 Mistakes: 37 	 Log Likelihood: -0.31483
Iteration: 128 	 Mistakes: 36 	 Log Likelihood: -0.31116
Iteration: 256 	 Mistakes: 36 	 Log Likelihood: -0.31016
