## Term Frequency (TF)
Relative frequency of a term in a document
    = term instances / total terms
    
## Inverse Document Frequency (IDF)
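A measure of how rare a term is across the whole collection of documents
    = log(total docs / docs containing the term)

Note: the `idf` function in this notebook adds 1 to the denominator, i.e. log(total docs / (1 + docs containing the term)).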

In [1]:
from string import punctuation
import nltk 
nltk.download("stopwords")
from nltk.corpus import stopwords

# Pegando documento do Github e abrindo
# Doc1
!curl https://raw.githubusercontent.com/MicrosoftLearning/AI-Introduction/master/files/Moon.txt -o Moon.txt
    
# Abrindo documento em modo de leitura e imprimindo seu conteudo
doc1 = open("Moon.txt", "r")
doc1Txt = doc1.read()
print(doc1Txt)
# removendo numeros
txt = ''.join(n for n in doc1Txt if not n.isdigit())

# removendo pontuacao e convertendo todo o texto para minusculo
txt = ''.join(n for n in txt if n not in punctuation).lower()
txt = ' '.join([word for word in txt.split() if word not in (stopwords.words('english'))])

# imprimindo o texto normalizado
print(txt)
print("--------------------------------------------------------------------------------------------------------")

# Doc2
!curl https://raw.githubusercontent.com/MicrosoftLearning/AI-Introduction/master/files/Gettysburg.txt -o Gettysburg.txt
doc2 = open("Gettysburg.txt", "r")
doc2Txt = doc2.read()
print(doc2Txt)
txt2 = ''.join(n for n in doc2Txt if not n.isdigit())
txt2 = ''.join(n for n in txt2 if n not in punctuation).lower()
txt2 = ' '.join([word for word in txt2.split() if word not in (stopwords.words('english'))])
print(txt2)
print("--------------------------------------------------------------------------------------------------------")

# Doc3
!curl https://raw.githubusercontent.com/MicrosoftLearning/AI-Introduction/master/files/Cognitive.txt -o Cognitive.txt
doc3 = open("Cognitive.txt", "r")
doc3Txt = doc3.read()
print(doc3Txt)
txt3 = ''.join(n for n in doc3Txt if not n.isdigit())
txt3 = ''.join(n for n in txt3 if n not in punctuation).lower()
txt3 = ' '.join([word for word in txt3.split() if word not in (stopwords.words('english'))])
print(txt3)
print("---------------------------------------------------------------------------------------------------------")

[nltk_data] Downloading package stopwords to /home/mrcs/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   630  100   630    0     0    542      0  0:00:01  0:00:01 --:--:--   543


### Obter valores de TF-IDF para as três principais palavras em cada documento

In [2]:
# Install the textblob library and define the TF-IDF functions

#!pip install -U textblob
import math
from textblob import TextBlob as tb

def tf(word, doc):
    """Return the term frequency of ``word`` in ``doc``.

    Computed as the number of occurrences of ``word`` in ``doc.words``
    divided by the total number of words in the document.

    Args:
        word: The term to count.
        doc: An object exposing a ``words`` sequence that supports ``count``
            and ``len`` (a TextBlob in this notebook).

    Returns:
        The relative frequency as a float, or 0.0 when the document has no
        words (instead of raising ZeroDivisionError).
    """
    words = doc.words
    total_terms = len(words)
    if total_terms == 0:
        # An empty document contains no instance of any term.
        return 0.0
    return words.count(word) / total_terms

def contains(word, docs):
    """Count how many documents in ``docs`` include ``word``.

    Args:
        word: The term to look for.
        docs: An iterable of objects exposing a ``words`` container.

    Returns:
        The number of documents whose ``words`` contain ``word``.
    """
    matching_docs = 0
    for document in docs:
        if word in document.words:
            matching_docs += 1
    return matching_docs

def idf(word, docs):
    """Return the smoothed inverse document frequency of ``word``.

    Computed as ``log(N / (1 + n))`` where ``N`` is ``len(docs)`` and ``n`` is
    the number of documents containing ``word``. Because of the ``+ 1`` in the
    denominator, the result is 0 when the word appears in all but one document
    and negative when it appears in every document.

    Args:
        word: The term to score.
        docs: A sized collection of documents, passed on to ``contains``.

    Returns:
        The natural logarithm of the smoothed ratio, as a float.
    """
    total_docs = len(docs)
    smoothed_hits = 1 + contains(word, docs)
    return math.log(total_docs / smoothed_hits)

def tfidf(word, doc, docs):
    """Return the TF-IDF score of ``word`` for ``doc`` within ``docs``.

    The score is the product of the term frequency of ``word`` in ``doc``
    and the inverse document frequency of ``word`` across ``docs``.

    Args:
        word: The term to score.
        doc: The document the term frequency is measured in.
        docs: The full collection used for the inverse document frequency.

    Returns:
        ``tf(word, doc) * idf(word, docs)``.
    """
    term_frequency = tf(word, doc)
    inverse_doc_frequency = idf(word, docs)
    return term_frequency * inverse_doc_frequency

# Build the document collection with textblob
doc1 = tb(txt)
doc2 = tb(txt2)
doc3 = tb(txt3)
docs = [doc1,doc2,doc3]

# Use TF-IDF to get the three most important words of each document
print("---------------------------------------------------------------------------------------------------------")
for i, doc in enumerate(docs):
    print("Top palavras no documento: {}".format(i + 1))
    # Score each distinct word only once. dict.fromkeys keeps first-occurrence
    # order, so the stable sort below breaks ties exactly as it would when
    # iterating over every (repeated) word.
    scores = {word: tfidf(word, doc, docs) for word in dict.fromkeys(doc.words)}
    # Highest score first; keep the top three entries.
    sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse = True)
    for word, score in sorted_words[:3]:
        print("\tPalavra: {}, TF-IDF: {}".format(word, round(score,5)))
    print("---------------------------------------------------------------------------------------------------------\n")
