In [1]:
import nltk
import spacy
import os
from utils.lexical import normalizador



# Loading corpora data to main memory

In [2]:
# Root directory that holds one sub-directory per corpus.
corpora_path = '../data/corpora/'

# Keep only real directories: os.listdir() also returns stray files (for
# example '.DS_Store'), and listing such an entry below would raise
# NotADirectoryError. Sorting makes the load order reproducible, because
# os.listdir() returns entries in arbitrary order.
corpora_dirs = sorted(
    entry for entry in os.listdir(corpora_path)
    if os.path.isdir(os.path.join(corpora_path, entry))
)
normalizer = normalizador.Normalizador()

# corpora maps corpus name -> {'raw', 's_tokenized', 'w_tokenized', 's_w_tokenized'},
# each value being a list. Only 'raw' is filled in this cell.
corpora = {}
for corpus in corpora_dirs:
    corpus_dir = os.path.join(corpora_path, corpus)
    # Regular files only, in sorted order so 'raw' is built deterministically.
    files = sorted(
        path
        for path in (os.path.join(corpus_dir, f) for f in os.listdir(corpus_dir))
        if os.path.isfile(path)
    )
    corpora[corpus] = {'raw': [], 's_tokenized': [], 'w_tokenized': [], 's_w_tokenized': []}
    for file in files:
        with open(file, 'r', encoding='utf-8') as txt_file:
            # Every line of every file becomes one entry of 'raw'.
            text = txt_file.readlines()
            corpora[corpus]['raw'].extend(text)

In [3]:
#corpora['esporte']['raw'][0]


* Removing trash read from the sports blog

In [4]:
import re

# Strip the leading "Pesquisar este blog " text from every raw 'esporte' entry.
# The pattern is anchored with '^', so only a match at the very start of an
# entry is removed; entries without it are left unchanged.
remove_pattern = "^Pesquisar este blog "
sports_raw = corpora['esporte']['raw']
for i, text in enumerate(sports_raw):
    sports_raw[i] = re.sub(remove_pattern, '', text)

In [5]:
#corpora['esporte']['raw'][0]

# Tokenizing by sentences, words and both

In [6]:
for corpus in corpora:
    # Collect results in fresh lists and assign them at the end. Appending
    # directly to the lists stored in `corpora` would duplicate every entry
    # each time this cell is re-run, silently skewing all later statistics.
    s_tokenized = []    # per raw entry: list of sentences
    s_w_tokenized = []  # per raw entry: list of sentences, each a list of tokens
    w_tokenized = []    # per raw entry: list of normalized word tokens

    for text in corpora[corpus]['raw']:
        # Tokenizing by sentences
        sentences = normalizer.tokenize_sentences(text)
        s_tokenized.append(sentences)

        # Tokenizing by sentence and then by words (text is NOT normalized here)
        words_by_sent_list = [normalizer.tokenize_words(sent) for sent in sentences]
        s_w_tokenized.append(words_by_sent_list)

        # Lower-casing, then removing accents and punctuation
        text_normalized = normalizer.to_lowercase(text)
        text_normalized = normalizer.remove_accents(text_normalized)
        text_normalized = normalizer.remove_punctuation(text_normalized)

        # Tokenizing the normalized text by words
        words = normalizer.tokenize_words(text_normalized)
        w_tokenized.append(words)

    corpora[corpus]['s_tokenized'] = s_tokenized
    corpora[corpus]['s_w_tokenized'] = s_w_tokenized
    corpora[corpus]['w_tokenized'] = w_tokenized

# Counting word frequencies

In [7]:
# NOTE(review): this list is not read anywhere in this cell; the filtering
# below goes through normalizer.remove_stopwords instead.
stopwords = nltk.corpus.stopwords.words('portuguese')

# word_frequencies maps corpus -> {word: number of occurrences}
word_frequencies = {}
for corpus in corpora:
    counts = word_frequencies[corpus] = {}
    for tokens_list in corpora[corpus]['w_tokenized']:
        # Count only the tokens that survive stop-word removal.
        for w in normalizer.remove_stopwords(tokens_list):
            counts[w] = counts.get(w, 0) + 1

In [8]:
#print(word_frequencies['esporte'])
#print(word_frequencies['ciencia_e_tecnologia'])


# The 20 most frequent words

In [9]:
import operator
import math

# Sort key: the count stored in the second slot of each (word, count) pair.
by_count = operator.itemgetter(1)

# increasing_freqs maps corpus -> list of (word, count) pairs, rarest first.
increasing_freqs = {}
for corpus, freqs in word_frequencies.items():
    pairs = list(freqs.items())
    # Two independent stable sorts: reversing one list to obtain the other
    # would flip the relative order of words that share the same count.
    increasing_freqs[corpus] = sorted(pairs, key=by_count)
    decreasing_freqs = sorted(pairs, key=by_count, reverse=True)

    print('\n' + corpus + ":\n")
    for word, freq in decreasing_freqs[:20]:
        print("\t\t{} => {}".format(word, freq))


ciencia_e_tecnologia:

		nao => 1101
		ser => 968
		equipe => 750
		sao => 720
		pode => 626
		energia => 584
		forma => 542
		inteligencia => 532
		artificial => 527
		imagem => 488
		universidade => 482
		tambem => 464
		madeira => 463
		ainda => 450
		podem => 436
		tecnologia => 415
		calor => 413
		sistema => 412
		dados => 410
		disse => 407

esporte:

		nao => 1421
		ja => 503
		time => 478
		sao => 457
		jogo => 453
		equipe => 451
		contra => 423
		copa => 390
		ser => 368
		dois => 368
		final => 365
		tambem => 364
		brasileiro => 362
		brasil => 355
		partida => 344
		ainda => 342
		1 => 337
		apos => 334
		tempo => 326
		gol => 326



# The 20 least frequent words 

In [10]:
# Print the first 20 (word, count) pairs of each corpus' ascending-by-count list.
for corpus, ascending_pairs in increasing_freqs.items():
    print('\n' + corpus + ':\n')
    for word, freq in ascending_pairs[:20]:
        print("\t\t{} => {}".format(word, freq))
        


ciencia_e_tecnologia:

		propriocepcao => 1
		equilibrados => 1
		pernas => 1
		alcancem => 1
		verifiquem => 1
		mioeletricas => 1
		recuperem => 1
		voluntario => 1
		explorando => 1
		antebraco => 1
		fortemente => 1
		tornando => 1
		adiante => 1
		excelentes => 1
		silvestro => 1
		micera => 1
		restabelece => 1
		externas => 1
		inseridos => 1
		traduzir => 1

esporte:

		hornets => 1
		fecharam => 1
		contaram => 1
		chauncey => 1
		billups => 1
		distribuir => 1
		converter => 1
		seattle => 1
		supersonics => 1
		dallas => 1
		retrospecto => 1
		esp => 1
		oferecidos => 1
		recebidos => 1
		gaspar => 1
		espanhola => 1
		convencelos => 1
		liberalo => 1
		liberacao => 1
		causada => 1


# The average word size

In [11]:
# av_word_size_by_corpus maps corpus -> average word length in characters,
# computed over every token in 'w_tokenized' (stop words included).
av_word_size_by_corpus = {}
for corpus in corpora:
    total_words, total_chars = 0, 0
    for word_list in corpora[corpus]['w_tokenized']:
        total_words += len(word_list)
        total_chars += sum(len(word) for word in word_list)

    # An empty corpus has no words; report 0.0 instead of dividing by zero.
    av_word_size = total_chars / total_words if total_words else 0.0
    # Store the result so the dictionary declared above is actually populated.
    av_word_size_by_corpus[corpus] = av_word_size

    print("\nCorpus {}:\n".format(corpus))
    print("\n\tAverage word size: {}".format(av_word_size))
        


Corpus ciencia_e_tecnologia:


	Average word size: 5.363682337516389

Corpus esporte:


	Average word size: 4.733147660752371




# Average sentence size in number of words

In [12]:
# Average number of tokens per sentence for each corpus.
# Note that punctuation is counted here: 's_w_tokenized' is built from the
# raw sentences, before any punctuation removal.
for corpus in corpora:
    num_of_sentences = 0
    num_of_words = 0
    for text in corpora[corpus]['s_w_tokenized']:
        num_of_sentences += len(text)
        num_of_words += sum(len(sentence) for sentence in text)

    # A corpus without sentences would otherwise raise ZeroDivisionError.
    av_words_by_sent = num_of_words / num_of_sentences if num_of_sentences else 0.0

    # The colon belongs right after the corpus name (it used to be printed on a
    # line of its own), matching the header format of the other cells.
    print("\nCorpus {}:\n".format(corpus))
    print("\n\tAverage number of words by sentence: {}".format(av_words_by_sent))


Corpus ciencia_e_tecnologia
:

	Average number of words by sentece: 29.450447183949723

Corpus esporte
:

	Average number of words by sentece: 23.471547536433032


# The 20 longest words

* Calculating the size of each word in each corpus:

In [13]:
# word_sizes maps corpus -> {word: length of the word in characters}.
# Each distinct word appears once, however often it occurs in the corpus.
word_sizes = {}
for corpus in corpora:
    word_sizes[corpus] = {
        word: len(word)
        for word_list in corpora[corpus]['w_tokenized']
        for word in word_list
    }


* Printing the 20 longest words of each corpus:

In [14]:
# For each corpus, list the 20 words with the greatest character length.
for corpus, sizes in word_sizes.items():
    # Stable sort by length, longest first; ties keep their insertion order.
    decreasing_sizes = sorted(sizes.items(), key=lambda pair: pair[1], reverse=True)
    print("\nCorpus {}:\n".format(corpus))
    for word, _length in decreasing_sizes[:20]:
        print("\t* {}".format(word))


Corpus ciencia_e_tecnologia:

	* httptecnologiasocialfbborgbr
	* hidretacaomoagemdehidretacao
	* umidificacaodesumidificacao
	* 101103physrevlett121177202
	* 101016jijrefrig201901006
	* 101126sciroboticsaau6914
	* 101016jheliyon2017e00234
	* 101126sciroboticsaav1488
	* 101021acsnanolett8b05051
	* 10108813672630188083041
	* acusticogravitacionais
	* titanioaluminiovanadio
	* 101038s4159801838303x
	* dixoncosmographicacom
	* 101038s41467018062448
	* titanioniobiozirconio
	* 101038s41467018080553
	* benzodioxociclohexeno
	* 101016jjoule201711007
	* 101038nenergy2017144

Corpus esporte:

	* corinthianseficiencia
	* setembrocorinthians5
	* diferenciacaotalvez
	* corinthians30081987
	* corinthians12121991
	* corinthians10051998
	* corinthians12052002
	* janeirocorinthians5
	* outubrocorinthians4
	* janeirocorinthians4
	* estatisticasultimo
	* profissionalnumero
	* brasileiro13121990
	* agostocorinthians4
	* tempointernacional
	* coordenadortecnico
	* pinheirosmackenzie
	* experienciajacques

# The 20 most frequent bigrams

In [15]:
for corpus in corpora:
    # Collect every bigram of the corpus. Stop words are dropped first so the
    # most frequent pairs are more informative.
    bigrams = []
    for tokens_list in corpora[corpus]['w_tokenized']:
        word_list = normalizer.remove_stopwords(tokens_list)
        bigrams.extend(nltk.bigrams(word_list))

    # bigrams_freq maps str(bigram) -> number of occurrences.
    bigrams_freq = {}
    for key in map(str, bigrams):
        bigrams_freq[key] = bigrams_freq.get(key, 0) + 1

    # Most frequent first.
    ordered_bigram_freqs = sorted(
        bigrams_freq.items(),
        key=operator.itemgetter(1),
        reverse=True,
    )

    # Show the 20 most frequent bigrams of this corpus.
    print("\nCorpus {}:\n".format(corpus))
    for str_bigram, freq in ordered_bigram_freqs[:20]:
        print("\t{} => {}".format(str_bigram, freq))


Corpus ciencia_e_tecnologia:

	('inteligencia', 'artificial') => 470
	('pode', 'ser') => 294
	('disse', 'professor') => 202
	('podem', 'ser') => 162
	('celulas', 'solares') => 141
	('temperatura', 'ambiente') => 125
	('et', 'al') => 120
	('comprimento', 'onda') => 99
	('lei', 'termodinamica') => 92
	('ate', 'agora') => 89
	('agua', 'salgada') => 82
	('colegas', 'universidade') => 78
	('aprendizagem', 'profunda') => 75
	('resistente', 'fogo') => 75
	('ponto', 'vista') => 75
	('primeira', 'vista') => 74
	('ampla', 'gama') => 72
	('gerado', 'maquina') => 72
	('espumas', 'metalicas') => 72
	('eletricidade', 'partir') => 72

Corpus esporte:

	('sao', 'paulo') => 258
	('campeonato', 'brasileiro') => 99
	('copa', 'mundo') => 94
	('segundo', 'tempo') => 83
	('primeiro', 'tempo') => 82
	('copa', 'brasil') => 82
	('neste', 'domingo') => 71
	('oitavas', 'final') => 70
	('neste', 'sabado') => 64
	('selecao', 'brasileira') => 62
	('formula', '1') => 57
	('serie', 'b') => 53
	('1', '0') => 50
	('an