In [1]:
import numpy as np
import pandas as pd
import matplotlib as matplot
import nltk
import sklearn as sk
import re
import scipy.sparse

# Question 1

In [2]:
# Empty working frames; later cells add the derived columns to them.
en_df, it_df = pd.DataFrame(), pd.DataFrame()

# Raw CONcreTEXT trial data, one tab-separated file per language
# (read_table uses a tab separator by default).
en_df_raw = pd.read_table('data/CONcreTEXT_trial_EN.tsv')
it_df_raw = pd.read_table('data/CONcreTEXT_trial_IT.tsv')

en_df_raw.head()

Unnamed: 0,TARGET,POS,INDEX,TEXT,MEAN
0,achievement,N,3,"Bring up academic achievements , awards , and ...",3.06
1,achievement,N,9,"Please list people you have helped , your pers...",3.03
2,activate,V,1,Add activated carbon straight to your vodka .,3.83
3,activate,V,15,"Place sensors around your garden , and when a ...",5.51
4,adventure,N,9,Look for a partner that shares your level of a...,2.03


In [3]:
def _clean_sentence(text):
    """Trim surrounding whitespace from one raw sentence and lowercase it."""
    return text.strip().lower()


en_df['SENTENCES'] = en_df_raw['TEXT'].apply(_clean_sentence)
it_df['SENTENCES'] = it_df_raw['TEXT'].apply(_clean_sentence)

In [4]:
# A token is a run of ASCII letters, accented letters in the À-ÿ range
# (excluding × and ÷) and apostrophes, so contractions and accented words
# are kept as single tokens.
tokenizer = nltk.RegexpTokenizer(r"[A-Za-zÀ-ÖØ-öø-ÿ']+")
for frame in (en_df, it_df):
    frame['WORDS'] = frame['SENTENCES'].apply(tokenizer.tokenize)

In [5]:
def _with_boundaries(words):
    """Return the word list wrapped in start- and end-of-sentence markers."""
    return ["<s>", *words, "</s>"]


en_df['TOKENS'] = en_df['WORDS'].apply(_with_boundaries)
it_df['TOKENS'] = it_df['WORDS'].apply(_with_boundaries)
en_df.loc[0, 'TOKENS']

['<s>',
 'bring',
 'up',
 'academic',
 'achievements',
 'awards',
 'and',
 'other',
 'milestones',
 'in',
 'your',
 'life',
 '</s>']

In [6]:
# Flatten the per-sentence word lists into one list and build the vocabulary.
en_words = []
for sentence in en_df['WORDS']:
    en_words.extend(sentence)

en_vocab = nltk.lm.Vocabulary(en_words)
len(en_vocab)

644

In [82]:
class PPMI:
    """Co-occurrence counts and positive PMI scores for a tokenised corpus.

    Parameters
    ----------
    sentences : iterable of token sequences
        One sequence of string tokens per sentence.
    window_size : int, default 3
        How many positions on each side of a word count as its context.

    Attributes
    ----------
    w : int
        The window size.
    sentences
        The sentences exactly as passed in.
    bagofwords : list
        Every token of every sentence, in corpus order.
    vocab : nltk.lm.Vocabulary
        Vocabulary built from ``bagofwords``; ``vocab[word]`` is used as the
        word's corpus count.
    Z : int
        Total number of tokens in the corpus (``len(bagofwords)``).
    comatrix : pandas.DataFrame (float32)
        Symmetric co-occurrence counts, labelled by vocabulary entry on both
        axes. The diagonal is left at 0.
    ppmimx : pandas.DataFrame (float32)
        Symmetric PPMI scores with the same labels. The diagonal is left at 0.
    """

    def __init__(self, sentences, window_size=3):
        self.w = window_size
        self.sentences = sentences

        self.bagofwords = [word for sentence in self.sentences for word in sentence]
        self.vocab = nltk.lm.Vocabulary(self.bagofwords)
        self.Z = len(self.bagofwords)

        # Builds self.comatrix and self.ppmimx.
        self.compute()

    def compute(self):
        """(Re)build ``comatrix`` and ``ppmimx`` from the corpus.

        For every unordered pair of distinct vocabulary entries the
        co-occurrence count is obtained with :meth:`co`, and pairs that
        co-occur at least once get the score
        ``max(0, ln(count * Z / (vocab[w1] * vocab[w2])))``.
        Both matrices are then mirrored across the diagonal.

        The method starts from zeroed matrices on every call, so calling it
        again yields the same result instead of accumulating counts.

        Note: :meth:`co` scans the whole corpus for each pair, so the cost
        grows with (vocabulary size)^2 * (corpus size).
        """
        # dict.fromkeys keeps first-seen order and drops any repeated label,
        # since duplicate labels would make the frames ambiguous.
        tokens = list(dict.fromkeys(self.vocab))
        size = len(tokens)

        # Accumulate in plain NumPy arrays and wrap them in DataFrames once at
        # the end: element-wise writes through chained DataFrame indexing
        # (``df[a][b] = x``) are not guaranteed to reach the frame.
        counts = np.zeros((size, size), dtype=np.float32)
        scores = np.zeros((size, size), dtype=np.float32)

        # Pass 1: co-occurrence counts for one triangle only (co is symmetric
        # for distinct words), then mirror.
        for i, word1 in enumerate(tokens):
            for j in range(i + 1, size):
                counts[j, i] = self.co(word1, tokens[j])
        counts = counts + counts.T

        # Pass 2: PPMI for the same triangle, then mirror.
        for i, word1 in enumerate(tokens):
            for j in range(i + 1, size):
                pair_count = float(counts[j, i])
                if pair_count > 0:
                    numerator = pair_count * self.Z
                    denominator = self.vocab[word1] * self.vocab[tokens[j]]
                    # Natural log; negative PMI is clipped to 0.
                    scores[j, i] = max(0.0, float(np.log(numerator / denominator)))
        scores = scores + scores.T

        self.comatrix = pd.DataFrame(counts, index=tokens, columns=tokens)
        self.ppmimx = pd.DataFrame(scores, index=tokens, columns=tokens)

    def co(self, word1, word2):
        """Count how often ``word2`` appears within the window around ``word1``.

        For every occurrence of ``word1`` in every sentence, the slice from
        ``w`` positions before it to ``w`` positions after it (clipped at the
        sentence start) is scanned for ``word2``. The slice includes the
        position of ``word1`` itself, so ``co(x, x)`` also counts each
        occurrence of ``x`` once.

        Returns
        -------
        int
            Total number of matches over the whole corpus.
        """
        count = 0
        for sentence in self.sentences:
            for index, word in enumerate(sentence):
                if word != word1:
                    continue
                window = sentence[max(0, index - self.w): index + self.w + 1]
                count += sum(1 for neighbour in window if neighbour == word2)
        return count

In [83]:
# Build the English PPMI model from the boundary-marked token lists
# (default window size).
en_ppmi = PPMI(sentences=en_df['TOKENS'])

In [84]:
# Display the English PPMI matrix (rows and columns are vocabulary entries).
en_ppmi.ppmimx

Unnamed: 0,<s>,bring,up,academic,achievements,awards,and,other,milestones,in,...,men,women,wear,same,shoes,woman,whom,she,speaking,<UNK>
<s>,0.000000,2.021548,2.021548,2.714695,0.000000,0.000000,0.000000,0.000000,0.0,0.817575,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0
bring,2.021548,0.000000,5.933570,6.626718,5.933570,0.000000,0.000000,0.000000,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0
up,2.021548,5.933570,0.000000,6.626718,5.933570,6.626718,0.000000,0.000000,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0
academic,2.714695,6.626718,6.626718,0.000000,6.626718,7.319865,3.823357,0.000000,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0
achievements,0.000000,5.933570,5.933570,6.626718,0.000000,6.626718,3.130210,5.240423,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
woman,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,7.319865,0.000000,0.000000,0.0
whom,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,7.319865,0.000000,0.000000,0.000000,0.0
she,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,7.319865,0.0
speaking,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,7.319865,0.000000,0.0


## Question 2

The algorithm I made for forming the PPMI matrix is pretty simple, but I had to optimize some parts to halve the processing time.

First, we tokenize the input sentences. I didn't do this as part of my PPMI class since I don't think it should be encapsulated by that class, logically speaking.

Once we have fed the input tokenized sentences into the class constructor, we construct an NLTK Vocabulary object, as well as a Pandas DataFrame filled with 0s.

Then, once compute() is called, we listify the vocabulary so we can use slicing. We slice the two dimensions such that we only compute `[x][y]` instead of both `[x][y]` *and* `[y][x]`. Then, for every combination of two words (excluding symmetrical combinations) we use the sliding window algorithm on each sentence. This ensures that we find all co-occurrences for two unique word combinations in each sentence.

The sliding window algorithm is pretty simple:
For each sentence in our corpus, and for each instance of the search word in a given sentence, we go through the neighboring `K` words on either side to see if the second word is found. We avoid overlooking duplicate words in a sentence since we don't use the index() function.

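After we've found all co-occurrences and put them in our matrix, we just loop through the same cells as before (the symmetrical subset such that it's from one corner to the diagonal) and, for every pair that co-occurs at least once, use the formula: `max(0, ln(count(w1, w2) * Z / (count(w1) * count(w2))))`, where `Z` is the total number of tokens in the corpus and `ln` is the natural logarithm (`np.log`). This is the log of the observed co-occurrence probability divided by the product of the two words' individual probabilities, clipped at zero. Then, we just mirror the matrix to the other side of the diagonal.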

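I think the time complexity is around O(n^4) if every loop is treated as O(n): there are four nested loops (two over the vocabulary in compute(), then one over the sentences and one over the words of each sentence in co()) with some if conditionals. Only visiting half of the word pairs cuts the work by a constant factor, which doesn't change the complexity class. More precisely, with `V` vocabulary words, `N` tokens in the corpus and a window of `w` words on each side, the cost is O(V² · N · w). It's not great, but it works!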

## Question 3

I would look at the maximum PPMI value for a given word `w` and use the `pd.Series.argmax()` function to find the index of the row that has the highest correlation. Now, our corpus only has 100 sentences and ~640 unique words (`len(en_vocab)` above is 644). I expect this to be a pretty sparse matrix due to this fact. But let's look at two examples! We can see below that 'awards' is pretty correlated with the words 'achievements' and 'milestones'.

In [93]:
# PPMI of 'awards' against three other words, read from its matrix column.
awards_scores = en_ppmi.ppmimx['awards']
print("PPMI awards, milestones:", awards_scores['milestones'])
print("PPMI awards, achievements:", awards_scores['achievements'])

print("\nPPMI awards, and:", awards_scores['and'])

PPMI awards, milestones: 7.3198647
PPMI awards, achievements: 6.6267176

PPMI awards, and: 3.8233573


Contrast this to 'awards' and 'and'. There's not so much correlation in the corpus there. Now, this doesn't _necessitate_ that 'awards' and 'milestones' are highly correlated in the real world, as our corpus only has 100 sentences, which is not a lot. If we use a corpus of the size of the Google Books library from even just the past year, I think we could find meaningful connections.

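We would find these relationships by using some sort of unsupervised learning model that clusters words together based on their PPMI rows. I think k-means clustering would do well, since it would easily define clusters (k-nearest neighbors, by contrast, is a supervised method that needs labeled examples, although a nearest-neighbor lookup over the PPMI rows is a simple way to list the words most related to a given word).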

# Question 4

In [95]:
# Same model as for English, built from the Italian token lists
# (default window size), then displayed.
it_ppmi = PPMI(sentences=it_df['TOKENS'])
it_ppmi.ppmimx

Unnamed: 0,<s>,guardati,i,piedi,o,fai,finta,di,essere,affascinata,...,conigli,hanno,ottimo,udito,un',ottima,individuare,predatori,facilmente,<UNK>
<s>,0.000000,2.710048,1.668594,2.710048,0.000000,0.000000,0.000000,0.724133,0.918289,0.0,...,2.710048,2.710048,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0
guardati,2.710048,0.000000,4.482005,7.315218,4.917323,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0
i,1.668594,4.482005,0.000000,4.482005,2.084110,4.482005,0.000000,0.000000,0.000000,0.0,...,4.482005,4.482005,0.0,0.000000,0.000000,0.0,4.482005,4.482005,4.482005,0.0
piedi,2.710048,7.315218,4.482005,0.000000,4.917323,7.315218,7.315218,0.000000,0.000000,0.0,...,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0
o,0.000000,4.917323,2.084110,4.917323,0.000000,4.917323,4.917323,0.985497,0.000000,0.0,...,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ottima,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.0,7.315218,7.315218,0.0,0.000000,0.000000,0.000000,0.0
individuare,0.000000,0.000000,4.482005,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,7.315218,7.315218,0.0
predatori,0.000000,0.000000,4.482005,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.0,0.000000,0.000000,0.0,7.315218,0.000000,7.315218,0.0
facilmente,0.000000,0.000000,4.482005,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.0,0.000000,0.000000,0.0,7.315218,7.315218,0.000000,0.0
