# Lab 6

In [1]:
import numpy as np
import sys

In [2]:
def _read_stripped_lines(path):
    """Return every line of the text file at ``path`` with surrounding whitespace removed."""
    # The context manager closes the file even if reading or decoding raises.
    with open(path, encoding="ISO-8859-1") as f:
        return [line.strip() for line in f]


def load_data(data_dir='data'):
    """Load the three cantiche as lists of stripped text lines.

    Parameters
    ----------
    data_dir : str, optional
        Directory containing ``inferno.txt``, ``purgatorio.txt`` and
        ``paradiso.txt``. Defaults to ``'data'``, i.e. the same relative paths
        that were previously hard-coded.

    Returns
    -------
    tuple of (list of str, list of str, list of str)
        ``(lInf, lPur, lPar)``: one list per file, one element per line of the
        file, in file order. Files are decoded as ISO-8859-1.

    Raises
    ------
    OSError
        If one of the files cannot be opened.
    """
    lInf = _read_stripped_lines(data_dir + '/inferno.txt')
    lPur = _read_stripped_lines(data_dir + '/purgatorio.txt')
    lPar = _read_stripped_lines(data_dir + '/paradiso.txt')
    return lInf, lPur, lPar

In [3]:
def split_data(l, n):
    """Split a list into a training part and a test part.

    Elements whose index is a multiple of ``n`` (0, n, 2n, ...) go to the test
    list; every other element goes to the training list. The relative order of
    the elements is kept in both lists.

    Returns
    -------
    (lTrain, lTest) : tuple of two lists
    """
    lTest = [item for pos, item in enumerate(l) if pos % n == 0]
    lTrain = [item for pos, item in enumerate(l) if pos % n != 0]
    return lTrain, lTest

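First, we load the tercets of each cantica and split them into a training set and an evaluation set: every 4th tercet (indices 0, 4, 8, …) goes to the evaluation set and the rest to the training set. For example, `lInf_train` is the training set for the cantica Inferno and `lInf_evaluation` is the evaluation set on which the trained model will be assessed. The same naming is used for the other two cantiche.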

In [4]:
# One tercet out of every EVAL_SPLIT_STRIDE is held out for evaluation
# (split_data sends indices 0, n, 2n, ... to the second returned list).
EVAL_SPLIT_STRIDE = 4

lInf, lPur, lPar = load_data()
lInf_train, lInf_evaluation = split_data(lInf, EVAL_SPLIT_STRIDE)
lPur_train, lPur_evaluation = split_data(lPur, EVAL_SPLIT_STRIDE)
lPar_train, lPar_evaluation = split_data(lPar, EVAL_SPLIT_STRIDE)

Now we build a set (`sDictCommon`) containing every distinct word that appears in the training tercets of at least one of the three cantiche:

In [5]:
def build_dictionary(lTercets):
    """Return the set of distinct whitespace-separated words found in ``lTercets``.

    ``lTercets`` is an iterable of strings; each string is tokenised with
    ``str.split()`` (no lower-casing or punctuation removal is applied).
    """
    return {word for tercet in lTercets for word in tercet.split()}

In [6]:
# Class label -> integer index.
# NOTE(review): a later cell assigns hCls2Idx again using the alphabetical
# order of the labels, which gives a different mapping — confirm which one
# is intended.
hCls2Idx = {'inferno': 0, 'purgatorio': 1, 'paradiso': 2}

# Class label -> list of training tercets of that cantica.
hlTercetsTrain = {
    'inferno': lInf_train,
    'purgatorio': lPur_train,
    'paradiso': lPar_train
}

# Union of the words seen in the training tercets of the three cantiche.
# NOTE(review): `cls`, `lTercets` and `sDictCls` stay bound at module level
# after the loop (to the values of the last class visited); make sure no
# later cell relies on them by accident.
sDictCommon = set()
for cls, lTercets in hlTercetsTrain.items():
    sDictCls = build_dictionary(lTercets)
    sDictCommon.update(sDictCls)

We use a pseudo-count strategy to avoid words with a count of 0 in one of the cantiche. For example, a word appearing in Inferno may never appear in Paradiso; its count in Paradiso would then be 0 and the logarithm of its frequency would be $-\infty$. To avoid this, we initialize every counter with `eps`, which is a hyperparameter of our model.

In [7]:
eps = 0.001  # pseudo-count given to every word in every class

# One dictionary per class, holding every word of sDictCommon with the
# pseudo-count as starting value. Despite the name, at this stage the values
# are counts; they are turned into log-frequencies further down.
h_clsLogProb = {}
for cls in hlTercetsTrain:
    h_clsLogProb[cls] = dict.fromkeys(sDictCommon, eps)

The dictionary `h_clsLogProb` contains three keys (`'inferno'`, `'purgatorio'`, `'paradiso'`); for each key it holds a dictionary with every word of `sDictCommon` (i.e. every word appearing in the training tercets of any cantica), with its counter initialized to `eps = 0.001`.
Now we compute the actual word counts separately for each cantica:

In [8]:
# Add 1 to the class counter of a word for each of its occurrences in the
# training tercets of that class. The increments are applied one at a time on
# top of the pseudo-count already stored.
# Not idempotent: re-running this cell counts every occurrence again.
for cls, tercets_class in hlTercetsTrain.items():
    hWordCounts = h_clsLogProb[cls]
    for tercet in tercets_class:
        words = tercet.split()
        for w in words:
            hWordCounts[w] = hWordCounts[w] + 1

Now `h_clsLogProb` contains, for each key (cantica), how many times each word appears in the training tercets of that cantica, plus the pseudo-count `eps`. A word whose counter is still equal to `eps` never appears in that cantica, but it appears in at least one of the other two.

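Now we compute the logarithm of the frequency of each word for each class: $\log \frac{N_{cls,w}}{N_{cls}}$, where $N_{cls,w}$ is the (pseudo-count-smoothed) count of word $w$ in class $cls$ and $N_{cls} = \sum_{w} N_{cls,w}$.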

In [9]:
# Replace, in place, every count N_{cls,w} by log(N_{cls,w}) - log(N_cls),
# where N_cls is the sum of all counts of the class (pseudo-counts included).
# Not idempotent: running this cell twice would take the log of values that
# are already logs.
for cls in hlTercetsTrain:
    hWordStats = h_clsLogProb[cls]
    nWordsCls = sum(hWordStats.values())
    logNWordsCls = np.log(nWordsCls)
    for w, nClsW in hWordStats.items():
        # Only values of existing keys are replaced, so iterating over the
        # dictionary while assigning is safe.
        hWordStats[w] = np.log(nClsW) - logNWordsCls

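Now it's time to compute the matrix of class-conditional log-likelihoods, with one row per class and one column per tercet being scored. We first define a function that scores a single text against every class, and then use it to fill the matrix.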

In [14]:
def S1_compute_logLikelihoods(h_clsLogProb, text):
    """Compute the class-conditional log-likelihood of ``text`` for every class.

    Parameters
    ----------
    h_clsLogProb : dict
        Maps each class label to a dictionary ``{word: log-probability}``.
    text : str
        Text to score; it is tokenised with ``str.split()``.

    Returns
    -------
    dict
        Maps each class label to the sum of the log-probabilities of the words
        of ``text`` under that class. Words that are not keys of the class
        dictionary are ignored. A class for which no word contributes gets the
        integer ``0``.
    """
    words = text.split()  # tokenise once instead of once per class
    logLikelihoodCls = {}
    for cls, hLogProb in h_clsLogProb.items():
        # sum() starts from 0 and adds left to right, like an explicit += loop.
        logLikelihoodCls[cls] = sum(hLogProb[w] for w in words if w in hLogProb)
    return logLikelihoodCls

In [15]:
hCls2Idx = {cls: idx for idx, cls in enumerate(sorted(h_clsLogProb))}
# Map between textual labels (keys of h_clsLogProb) and rows of the score
# matrix, based on the alphabetical order of the labels:
# inferno: 0
# paradiso: 1
# purgatorio: 2

# Tercets to score: the held-out evaluation tercets of the three cantiche,
# concatenated in the order Inferno, Purgatorio, Paradiso. The list is built
# explicitly here; previously this cell read `lTercets`, which was only bound
# as a leftover loop variable of an earlier cell (the training tercets of the
# last class visited there).
lTercets_evaluation = lInf_evaluation + lPur_evaluation + lPar_evaluation

# S[c, t] = log-likelihood of tercet t under class c.
S = np.zeros((len(h_clsLogProb), len(lTercets_evaluation)))
for tIdx, tercet in enumerate(lTercets_evaluation):
    hScores = S1_compute_logLikelihoods(h_clsLogProb, tercet)
    for cls in h_clsLogProb:  # the row comes from hCls2Idx, not from the order of this loop
        clsIdx = hCls2Idx[cls]
        S[clsIdx, tIdx] = hScores[cls]
