## Matrix and Vocabulary Construction

In [6]:
import pandas as pd

import numpy as np

from scipy import sparse

import nltk
nltk.download('stopwords')
nltk.download('punkt')

from nltk import bigrams
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/josemsf/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /home/josemsf/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
# Load the CSV and replace every missing cell with an empty string, so the
# text columns can be concatenated and lower-cased later without NaN handling.
news = pd.read_csv(
    "../data/estadao_noticias_eleicao.csv",
    encoding="utf-8",
).fillna("")

In [8]:
# One text per row: title, subtitle and body joined by single spaces.
content = (
    news["titulo"] + " " + news["subTitulo"] + " " + news["conteudo"]
).fillna("")


<h2>Generating a Co-occurrence Matrix</h2>

In [9]:
def co_occurrence_matrix(corpus):
    '''
        Count, for every pair of words, how many times the first one is
        immediately followed by the second one in the corpus.

        By: https://github.com/allansales
        Source: https://github.com/allansales/information-retrieval/tree/master/Lab%202

        ARGS:
            corpus: iterable of hashable tokens, in reading order. It is
                    materialised once, so iterators/generators are accepted.
        RETURN:
            tuple (matrix, vocab_to_index):
                matrix is a scipy.sparse COO matrix of shape (n, n), where n is
                the number of distinct tokens and matrix[i, j] is the number of
                times token i is directly followed by token j. Entries are
                stored from the most to the least frequent bigram.
                vocab_to_index maps each token to its row/column index; indices
                follow the order in which tokens first appear in the corpus.
    '''
    from collections import Counter

    # The corpus is traversed twice below (vocabulary, then bigrams); a
    # one-shot iterator would be exhausted after the first pass.
    tokens = list(corpus)

    # dict.fromkeys de-duplicates while keeping first-appearance order, which
    # makes the index assignment reproducible from run to run (iterating a
    # set of strings is not).
    vocab_to_index = {word: i for i, word in enumerate(dict.fromkeys(tokens))}
    n = len(vocab_to_index)

    # Adjacent (previous, current) pairs, sorted by descending frequency.
    bigram_freq = Counter(zip(tokens, tokens[1:])).most_common()

    rows = []
    cols = []
    counts = []
    for (previous, current), count in bigram_freq:
        rows.append(vocab_to_index[previous])
        cols.append(vocab_to_index[current])
        counts.append(count)

    matrix = sparse.coo_matrix((counts, (rows, cols)), shape=(n, n))

    return matrix, vocab_to_index

#### Removing punctuation

In [10]:
# Tokens are runs of word characters, so punctuation never reaches the output.
tokenizer = RegexpTokenizer(r'\w+')


def _lowercase_tokens(text):
    '''Lower-case the text and split it into word tokens.'''
    return tokenizer.tokenize(text.lower())


tokens_lists = content.apply(_lowercase_tokens)

#### Removing stopwords

In [48]:
stopword_ = stopwords.words('portuguese')
# Membership is tested once per token of the whole corpus; a set makes each
# test a constant-time lookup instead of a scan of the stopword list.
stopword_set = set(stopword_)
filtered_tokens = tokens_lists.apply(
    lambda tokens: [token for token in tokens if token not in stopword_set]
)


#### Transforming list of lists into one list

In [12]:
# Concatenate the per-document token lists, keeping document order.
tokens = []
for tokens_list in filtered_tokens:
    tokens.extend(tokens_list)

In [28]:
# Build the bigram co-occurrence matrix for the whole corpus; `vocab` maps
# each word to its row/column index in `matrix`.
matrix, vocab = co_occurrence_matrix(tokens)


<h2>Get the Top 3 Most Frequently Co-occurring Words</h2>

In [29]:

def top_3(word):
    '''
        Get the 3 words that most frequently follow the word received as argument.

        ARGS:
            word: String whose most frequent successors are looked up
        RETURN:
            List: the successor words, most frequent first, as converted by
                  get_word_by_id
        RAISES:
            KeyError: if word is not in the vocabulary
    '''
    word_id = vocab[word]

    # Row `word_id` of the matrix holds the counts of the bigrams (word, successor).
    in_row = matrix.row == word_id
    successor_ids = matrix.col[in_row]
    counts = matrix.data[in_row]

    # Rank by count explicitly rather than relying on the matrix storing its
    # entries by descending frequency; the stable sort keeps storage order
    # among equally frequent successors.
    ranking = np.argsort(-counts, kind='stable')[:3]
    top = [int(successor_ids[position]) for position in ranking]

    return get_word_by_id(top)


def get_word_by_id(array):
    '''
        Transform a list of word ids into a list of words.

        ARGS:
            array: list of word ids (values of the vocab mapping)
        RETURN:
            A list with one entry per id, in the same order: the word with
            that id, or '' when no word has it
    '''
    wanted = set(array)

    # Reverse lookup restricted to the requested ids: a single pass over the
    # vocabulary regardless of how many ids are asked for.
    id_to_word = {index: word for word, index in vocab.items() if index in wanted}

    return [id_to_word.get(index, '') for index in array]



## Consult Bigram Frequency

In [15]:
# CSR copy of the co-occurrence matrix, used for [row, column] lookups
# in consult_frequency.
consultable_matrix = matrix.tocsr()

In [16]:
def consult_frequency(w1, w2):
    '''
        Look up the stored count of the bigram (w1, w2).

        ARGS:
            w1: first word of the bigram
            w2: second word of the bigram
        RETURN:
            The value of consultable_matrix at (index of w1, index of w2)
    '''
    row = vocab[w1]
    col = vocab[w2]
    return consultable_matrix[row, col]

#### Example

In [17]:
# Count of the bigram "poucos recursos".
w1, w2 = 'poucos', 'recursos'
consult_frequency(w1, w2)

3

<h2>Inverted Index</h2>

In [18]:
# Cache filled by search_term:
#   term -> (number of matching documents, [(idNoticia, term frequency), ...])
INVERTED_INDEX = dict()

In [19]:
def search_term(term, data):
    '''
    Search for a term in the data frame and compute its term frequency in each
    document. The result is stored in INVERTED_INDEX and reused by later
    searches for the same term.

    NOTE: the cache is keyed by term only, so a cached entry is returned even
    when a different data frame is passed.

    ARGS:
        term: String with a single word to be searched in the data frame.
              Matching is case-insensitive.
        data: Data frame with the columns 'idNoticia', 'titulo', 'subTitulo'
              and 'conteudo' (text columns must hold strings).

    RETURN:
        tuple (int n, list l): n is the number of documents in which the term
                               appears at least once; l is a list of tuples
                               (doc, tf), where doc is the notice id and tf is
                               the term frequency in that document.
    '''
    # Documents are lower-cased before matching, so the term must be too;
    # otherwise a capitalised term could never match.
    term = term.lower()

    if term in INVERTED_INDEX:
        return INVERTED_INDEX[term]

    result = []

    # Walk the columns positionally so the search does not depend on the
    # data frame having a default 0..n-1 index.
    columns = [data[name].values
               for name in ('idNoticia', 'titulo', 'subTitulo', 'conteudo')]

    for id_notice, title, sub_title, content in zip(*columns):
        text = nltk.word_tokenize(
            title.lower() + ' ' + sub_title.lower() + ' ' + content.lower()
        )
        tf = text.count(term)
        if tf:
            result.append((id_notice, tf))

    INVERTED_INDEX[term] = (len(result), result)

    return INVERTED_INDEX[term]

In [41]:

def disjunction(query, data):
    '''
    Collect, for every document that contains at least one term of the query,
    the term frequencies of the query terms found in it.

    ARGS:
        query: Either a string with the words to be searched, separated by
               whitespace, or an iterable of terms.
        data: Data frame where the terms will be searched.

    RETURN:
        {doc: l} doc is the notice id and l is a list with the tf (term
        frequency) of each query term present in that document, in query
        order. Terms absent from a document contribute no entry, so l can be
        shorter than the query.

    '''
    # Iterating a plain string would search it character by character.
    if isinstance(query, str):
        query = query.split()

    dic_docs = dict()

    for term in query:
        _, postings = search_term(term, data)
        for id_notice, tf in postings:
            dic_docs.setdefault(id_notice, []).append(tf)

    return dic_docs
    

In [35]:
def vsm_tf(query, data):
    '''
    VSM with Term Frequency weighting.
        Scores every document matched by the query with the sum of the term
        frequencies that disjunction reports for it.

    ARGS:
        query: The query terms, passed unchanged to disjunction.
        data: Data frame where the terms will be searched.

    RETURN:
        List of tuples (doc, tf), where doc is the notice id and tf is the
        sum of the term frequencies collected for that document.

    '''
    return [
        (doc, sum(frequencies))
        for doc, frequencies in disjunction(query, data).items()
    ]

<h1>Analysis</h1>
<p>For a search with the term <i>petrobrás</i>, we got the following top 3 most frequent co-occurring words:</p>

In [53]:
# Print the ranked co-occurring words, numbered from 1.
top = top_3('petrobrás')
for rank, word in enumerate(top, start=1):
    print('%d °: %s' % (rank, word))

1 °: paulo
2 °: é
3 °: graça


<p>The total number of documents returned for the query containing only the term <i>petrobrás</i> was:</p>

In [61]:
# Baseline: number of documents matched by the single-term query.
print(len(vsm_tf(["petrobrás"], news)))

1043


<p>On the other hand, for the expanded query, the total number of documents was:</p>

In [62]:
# Number of documents matched by the three co-occurring words found above.
# NOTE(review): `top` holds only the co-occurring words, so the original term
# 'petrobrás' is not part of this query — confirm that this is intended.
result = vsm_tf(top, news)
print(len(result))

6402


<p>
    <p style="font-size:160%;">A closer look</p>
    
The question is: does the result make sense?
As far as we can see, the answer is YES!
For the term 'petrobrás' we got the word <i>paulo</i> as a result, so what is the connection?
    First, we should know what <i>Petrobrás</i> is: a semi-public Brazilian multinational corporation in the petroleum industry, headquartered in Rio de Janeiro, Brazil.
Second, what is <i>paulo</i>? It is the first name of Paulo Roberto Costa, a Brazilian engineer and former director of Petrobras Supply between 2004 and 2012.
So it makes total sense as a result for the expanded query!

Looking at <b>precision</b> and <b>recall</b>, there is a trade-off.

In the simple query we have fewer documents than in the expanded one; however, those documents may be more related 
to the user's search, so we can say that we have better <b>precision</b> but maybe not the best <b>recall</b>. The expanded query returns a greater number of documents, but it loses <b>precision</b>.


</p>