## Matrix and Vocabulary Construction

In [35]:
import pandas as pd

import numpy as np

from scipy import sparse

import nltk
nltk.download('stopwords')
nltk.download('punkt')

from nltk import bigrams
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/manorlf/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/manorlf/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [52]:
# Load the news dataset and replace every missing value with an empty string,
# so the text columns can be concatenated and lower-cased without NaN checks.
news = pd.read_csv(
    "../data/estadao_noticias_eleicao.csv",
    encoding="utf-8",
).fillna("")

In [53]:
# One text per news item: title, subtitle and body joined by single spaces.
content = (
    news["titulo"] + " " + news["subTitulo"] + " " + news["conteudo"]
).fillna("")


In [38]:
def co_occurrence_matrix(corpus):
    """Count adjacent-token pairs (bigrams) of ``corpus`` in a sparse matrix.

    Parameters
    ----------
    corpus : iterable of hashable tokens
        The tokens in reading order.

    Returns
    -------
    tuple
        ``(matrix, vocab_to_index)`` where ``matrix`` is an ``n x n``
        ``scipy.sparse.coo_matrix`` (``n`` = number of distinct tokens) whose
        entry ``[i, j]`` is the number of times token ``j`` immediately
        follows token ``i``, and ``vocab_to_index`` maps each token to its
        row/column index.

    Notes
    -----
    * Indices are assigned in order of first appearance, so the mapping is
      the same on every run (iterating a ``set`` of strings is not).
    * The COO entries are stored in descending order of count, with ties in
      order of first occurrence.
    """
    from collections import Counter
    from itertools import islice

    # Materialise once so the corpus can be traversed several times.
    tokens = list(corpus)

    # dict.fromkeys keeps the first-appearance order and drops duplicates.
    vocab_to_index = {word: i for i, word in enumerate(dict.fromkeys(tokens))}
    n = len(vocab_to_index)

    # Pair every token with its successor without building a bigram list.
    bigram_counts = Counter(zip(tokens, islice(tokens, 1, None)))

    rows = []
    cols = []
    counts = []
    # most_common() yields the pairs from the most to the least frequent.
    for (previous, current), count in bigram_counts.most_common():
        rows.append(vocab_to_index[previous])
        cols.append(vocab_to_index[current])
        counts.append(count)

    matrix = sparse.coo_matrix((counts, (rows, cols)), shape=(n, n))

    return matrix, vocab_to_index

#### Removing punctuation

In [39]:
# `\w+` keeps only runs of word characters, so punctuation is dropped.
tokenizer = RegexpTokenizer(r'\w+')
# Lower-case each text before tokenizing; yields one token list per news item.
tokens_lists = content.apply(lambda text: tokenizer.tokenize(text.lower()))

#### Removing stopwords

In [40]:
stopword_ = stopwords.words('portuguese')
# Membership is tested once per token of the whole corpus: use a frozenset
# (constant-time lookup) instead of scanning the stop-word list every time.
_stopword_set = frozenset(stopword_)
filtered_tokens = tokens_lists.apply(
    lambda tokens: [token for token in tokens if token not in _stopword_set]
)

#### Transforming list of lists into one list

In [41]:
# Concatenate the per-document token lists, in document order, into one list.
tokens = [
    word
    for document_tokens in filtered_tokens
    for word in document_tokens
]

In [42]:
# `matrix` holds the bigram counts; `vocab` maps each token to its matrix index.
matrix, vocab = co_occurrence_matrix(tokens)

In [65]:

def top_3(word):
    """Return the three words that most often come right after ``word``.

    Reads the module-level ``vocab`` (token -> index) and ``matrix`` (COO
    bigram counts, row = first word, column = following word).

    Parameters
    ----------
    word : str
        A token present in ``vocab``.

    Returns
    -------
    list of str
        The followers ordered from the highest to the lowest count, as
        produced by ``get_word_by_id`` (which pads with ``''`` when fewer
        than three followers exist).

    Raises
    ------
    KeyError
        If ``word`` is not in ``vocab``.
    """
    word_id = vocab[word]
    # Positions, inside the COO arrays, of every bigram starting with `word`.
    positions = np.flatnonzero(matrix.row == word_id)
    # Rank by count explicitly instead of trusting the storage order of the
    # entries. mergesort is stable, so ties keep their stored order.
    ranking = np.argsort(-matrix.data[positions], kind='mergesort')[:3]
    top = matrix.col[positions[ranking]].tolist()
    return get_word_by_id(top)


def get_word_by_id(array):
    """Translate vocabulary indices back into their words.

    Reads the module-level ``vocab`` (token -> index).

    Parameters
    ----------
    array : sequence of int
        Vocabulary indices to translate.

    Returns
    -------
    list of str
        The word for each index, position by position. An index that is not
        in ``vocab`` yields ``''``. The list always has at least three
        entries (padded with ``''``), as callers of the original
        implementation received; longer inputs are now supported instead of
        raising ``IndexError``.
    """
    # Invert the mapping once rather than scanning the vocabulary per index.
    index_to_word = {index: word for word, index in vocab.items()}
    result = [''] * max(3, len(array))
    for slot, word_id in enumerate(array):
        result[slot] = index_to_word.get(word_id, '')
    return result



## Consult Bigram Frequency

In [10]:
# CSR copy of the bigram matrix; consult_frequency reads single [row, col]
# entries from it.
consultable_matrix = matrix.tocsr()

In [11]:
def consult_frequency(w1, w2):
    """Return how many times ``w2`` immediately follows ``w1``.

    Looks both words up in the module-level ``vocab`` and reads the matching
    entry of ``consultable_matrix``. Raises ``KeyError`` when either word is
    not in ``vocab``.
    """
    first_id = vocab[w1]
    second_id = vocab[w2]
    return consultable_matrix[first_id, second_id]

#### Example

In [12]:
# Number of times 'recursos' appears right after 'poucos' in the corpus.
w1 = 'poucos'
w2 = 'recursos'
consult_frequency(w1, w2)

3

In [44]:
# Term -> ids of the news items containing it; starts empty.
inverted_index = {}

In [54]:

def find_term(term, data):
    """Return the ids of the news items whose title or body contains ``term``.

    The first search for a term tokenizes every title and body; the result
    is stored in the module-level ``inverted_index`` and served from there on
    later calls.

    Parameters
    ----------
    term : str
        Token to look for. It is compared as given against lower-cased
        tokens, so a term containing upper-case letters never matches.
    data : pandas.DataFrame
        Must provide the columns ``idNoticia``, ``titulo`` and ``conteudo``.

    Returns
    -------
    list
        The ``idNoticia`` values of the matching rows, in row order. The
        same list object is kept in the cache, so callers should not mutate
        it.
    """
    if term in inverted_index:
        return inverted_index[term]

    result = []
    # Iterate over the column values directly, so the search does not depend
    # on the DataFrame having a default 0..n-1 index.
    for news_id, title, body in zip(data['idNoticia'], data['titulo'], data['conteudo']):
        # The body is only tokenized when the title did not match.
        if (contains_term(term, nltk.word_tokenize(title.lower()))
                or contains_term(term, nltk.word_tokenize(body.lower()))):
            result.append(news_id)

    inverted_index[term] = result
    return result
        
def contains_term(term, array):
    """Return True when some element of ``array`` equals ``term``, else False."""
    return any(term == candidate for candidate in array)



In [63]:
def conjunction(terms, data):
    """Collect the ids of the news items matched by the given terms.

    Each term is searched with ``find_term`` and the id sets are merged with
    a set union, so despite the function's name an item is returned when it
    matches ANY of the terms.

    Returns a ``set`` of ids (empty when ``terms`` is empty).
    """
    matches_per_term = [find_term(term, data) for term in terms]
    return set().union(*matches_per_term)
                     
        

In [64]:
# Expand the query 'dilma' with its three most frequent followers, then count
# the news items matched by the expanded term list.
top = top_3('dilma') + ['dilma']
print(top)
result = conjunction(top, news)
print(len(result))

['rousseff', 'é', 'disse', 'dilma']
7290
