Imports iniciais

In [82]:
# Standard library
import re
import string

# Third-party
import nltk
import numpy as np
import pandas as pd
import spacy
from nltk.stem import RSLPStemmer
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from threadpoolctl import threadpool_limits

# Download the Portuguese spaCy model only when it is not installed yet.
# An unconditional download re-fetches the package on every run and asks for a
# kernel restart each time.
if not spacy.util.is_package('pt_core_news_md'):
    spacy.cli.download('pt_core_news_md')

# Rule files required by RSLPStemmer. Kept as the last expression of the cell
# so its return value is still displayed as the cell output.
nltk.download('rslp')

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('pt_core_news_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


[nltk_data] Downloading package rslp to
[nltk_data]     C:\Users\Usuario\AppData\Roaming\nltk_data...
[nltk_data]   Package rslp is already up-to-date!


True

In [83]:
# Lazily-built NLP resources shared by every call to proccess_text, so the
# spaCy model and the stemmer are loaded once instead of once per call.
_TEXT_RESOURCES = {}


def _get_text_resources():
    """Return the (spaCy pipeline, RSLP stemmer) pair, building it on first use."""
    if not _TEXT_RESOURCES:
        # Build both before storing so a failure never leaves a half-filled cache.
        nlp = spacy.load("pt_core_news_md")
        stemmer = RSLPStemmer()
        _TEXT_RESOURCES.update(nlp=nlp, stemmer=stemmer)
    return _TEXT_RESOURCES["nlp"], _TEXT_RESOURCES["stemmer"]


def proccess_text(text):
    """Tokenize, normalize and reduce a Portuguese text.

    Args:
        text: Raw input string.

    Returns:
        A 3-tuple ``(tokens_filtrados, stemmed_tokens, lemmatized_tokens)``:
        the lower-cased tokens left after stop-word/punctuation filtering,
        their RSLP stems, and the lemmas spaCy assigns when the filtered
        tokens are joined with spaces and parsed again.
    """
    nlp, stemmer = _get_text_resources()

    doc = nlp(text)
    stop_words = nlp.Defaults.stop_words

    # NOTE: `token not in string.punctuation` is a substring test against the
    # punctuation string, exactly as in the original filter; behaviour is kept
    # unchanged on purpose.
    tokens_filtrados = [
        token for token in (tok.text.lower() for tok in doc)
        if token not in stop_words and token not in string.punctuation
    ]

    stemmed_tokens = [stemmer.stem(token) for token in tokens_filtrados]
    # Lemmas are taken from a second parse of the filtered text, not from `doc`.
    lemmatized_tokens = [token.lemma_ for token in nlp(' '.join(tokens_filtrados))]

    return tokens_filtrados, stemmed_tokens, lemmatized_tokens

def tokens_query(query):
    """Return only the filtered tokens that proccess_text produces for query."""
    return proccess_text(query)[0]

def stem_query(query):
    """Return only the stemmed tokens that proccess_text produces for query."""
    return proccess_text(query)[1]

def lemma_query(query):
    """Return only the lemmatized tokens that proccess_text produces for query."""
    return proccess_text(query)[2]



In [84]:
def bag_of_words(corpus):
    """Fit a CountVectorizer on ``corpus``.

    Returns:
        ``(vectorizer, matrix, frame)``: the fitted vectorizer, the
        document-term matrix returned by ``fit_transform``, and the same
        matrix as a dense DataFrame with one column per vocabulary term.
    """
    vetorizador = CountVectorizer()
    matriz_termos = vetorizador.fit_transform(corpus)
    tabela = pd.DataFrame(
        data=matriz_termos.toarray(),
        columns=vetorizador.get_feature_names_out(),
    )
    return vetorizador, matriz_termos, tabela

def tf_idf(corpus):
    """Fit a TfidfVectorizer on ``corpus``.

    Returns:
        ``(vectorizer, matrix, frame)``: the fitted vectorizer, the
        document-term matrix returned by ``fit_transform``, and the same
        matrix as a dense DataFrame with one column per vocabulary term.
    """
    vetorizador = TfidfVectorizer()
    matriz_pesos = vetorizador.fit_transform(corpus)
    tabela = pd.DataFrame(
        data=matriz_pesos.toarray(),
        columns=vetorizador.get_feature_names_out(),
    )
    return vetorizador, matriz_pesos, tabela


# Read the pre-processed CSV files (movies first, then comments).
data_filmes_processados, data_comentarios_processados = (
    pd.read_csv(caminho)
    for caminho in (
        'filmes_processados_csv/filmes_processados.csv',
        'filmes_processados_csv/comentarios_processados.csv',
    )
)

Criação dos corpus

In [85]:
# One list of documents per text representation (tokens / stems / lemmas).
# A processed text that ended up empty is read back from CSV as NaN, and the
# scikit-learn vectorizers reject NaN documents, so missing values are turned
# into empty strings before building each corpus.
corpus_filmes_tokens = data_filmes_processados['resumo_tokens'].fillna('').tolist()
corpus_filmes_stems = data_filmes_processados['resumo_stem'].fillna('').tolist()
corpus_filmes_lemmas = data_filmes_processados['resumo_lema'].fillna('').tolist()

corpus_comentarios_tokens = data_comentarios_processados['conteudo_tokens'].fillna('').tolist()
corpus_comentarios_stems = data_comentarios_processados['conteudo_stem'].fillna('').tolist()
corpus_comentarios_lemmas = data_comentarios_processados['conteudo_lema'].fillna('').tolist()


BoW Filmes Tokens

In [87]:
# Fit the bag-of-words model on the token corpus of the movie summaries and
# keep the fitted vectorizer, the document-term matrix and its DataFrame view.
print("Matriz Documento-Termo (BoW):")
bow_filmes_tokens_vec, x_bow_filmes_tokens, bow_filmes_tokens = bag_of_words(corpus_filmes_tokens)
# Bare last expression: shown as the cell output.
bow_filmes_tokens

Matriz Documento-Termo (BoW):


Unnamed: 0,10,100,11,117,118,12,126,15,156,177,...,órfãs,ótima,ótimas,ótimo,ótimos,última,últimos,única,únicas,único
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,1,1,0,0,0
2,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,1,1,0,0,1,1,0,0,0,...,0,1,0,1,0,0,0,0,0,0


BoW Filmes Stems

In [88]:
# Fit the bag-of-words model on the stemmed corpus of the movie summaries and
# keep the fitted vectorizer, the document-term matrix and its DataFrame view.
print("Matriz Documento-Termo (BoW):")
bow_filmes_stems_vec,  x_bow_filmes_stems, bow_filmes_stems = bag_of_words(corpus_filmes_stems)
# Bare last expression: shown as the cell output.
bow_filmes_stems

Matriz Documento-Termo (BoW):


Unnamed: 0,10,100,11,117,118,12,126,15,156,177,...,óbv,óbvi,ócul,ódi,óper,órbit,órfã,ótim,últ,únic
0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,2,0
2,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
9,0,1,1,0,0,1,1,0,0,0,...,0,0,0,0,0,0,1,2,0,0


BoW Filmes Lemmas

In [89]:
# Fit the bag-of-words model on the lemmatized corpus of the movie summaries and
# keep the fitted vectorizer, the document-term matrix and its DataFrame view.
print("Matriz Documento-Termo (BoW):")
bow_filmes_lemas_vec, x_bow_filmes_lemas, bow_filmes_lemas = bag_of_words(corpus_filmes_lemmas)
# Bare last expression: shown as the cell output.
bow_filmes_lemas

Matriz Documento-Termo (BoW):


Unnamed: 0,10,100,11,117,118,12,126,15,156,177,...,órbita,órfão,órfãs,ótim,ótima,ótimm,ótimo,ótimos,último,único
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,2,0
2,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,1,1,0,0,1,1,0,0,0,...,0,1,0,0,0,1,1,0,0,0


TF-IDF Filmes Tokens

In [64]:
# Fit the TF-IDF model on the token corpus of the movie summaries and keep the
# fitted vectorizer, the document-term matrix and its DataFrame view.
print("Matriz Documento-Termo (TF-IDF):")
tfidf_filmes_tokens_vec, x_tfidf_filmes_tokens, tfidf_filmes_tokens = tf_idf(corpus_filmes_tokens)
# Bare last expression: shown as the cell output.
tfidf_filmes_tokens

Matriz Documento-Termo (TF-IDF):


Unnamed: 0,10,100,11,117,118,12,126,15,156,177,...,órfãs,ótima,ótimas,ótimo,ótimos,última,últimos,única,únicas,único
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.07212,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.04429,0.0,0.0,0.0,0.03839,0.04104,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.07099,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.04357,0.04357,0.0,0.0,0.04357,0.04357,0.0,0.0,0.0,...,0.0,0.0355,0.0,0.02898,0.0,0.0,0.0,0.0,0.0,0.0


TF-IDF Filmes Stems

In [90]:
# Fit the TF-IDF model on the stemmed corpus of the movie summaries and keep the
# fitted vectorizer, the document-term matrix and its DataFrame view.
print("Matriz Documento-Termo (TF-IDF):")
tfidf_filmes_stems_vec, x_tfidf_filmes_stems, tfidf_filmes_stems = tf_idf(corpus_filmes_stems)
# Bare last expression: shown as the cell output.
tfidf_filmes_stems

Matriz Documento-Termo (TF-IDF):


Unnamed: 0,10,100,11,117,118,12,126,15,156,177,...,óbv,óbvi,ócul,ódi,óper,órbit,órfã,ótim,últ,únic
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.06734,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05023
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03205,0.07081,0.0
2,0.0,0.0,0.0,0.0,0.07641,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.07461,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.09413,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.04517,0.04517,0.0,0.0,0.04517,0.04517,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.04027,0.05147,0.0,0.0


TF-IDF Filmes Lemmas

In [92]:
# Fit the TF-IDF model on the lemmatized corpus of the movie summaries and keep
# the fitted vectorizer, the document-term matrix and its DataFrame view.
print("Matriz Documento-Termo (TF-IDF):")
tfidf_filmes_lemas_vec, x_tfidf_filmes_lemas, tfidf_filmes_lemas = tf_idf(corpus_filmes_lemmas)
# Display the matrix announced by the header above, as the sibling cells do.
tfidf_filmes_lemas

Matriz Documento-Termo (TF-IDF):


Busca por Similaridade

In [99]:
def procurar_e_rankear(query, funcao_processar_texto, vectorizer, X_corpus, corpus,
                       nome_metodo, top_k=3, limiar=0.01):
    """Rank the corpus documents by cosine similarity to ``query`` and print the best ones.

    Args:
        query: Raw query text.
        funcao_processar_texto: Callable that turns the raw query into a
            sequence of terms (they are joined with spaces before vectorizing).
        vectorizer: Fitted vectorizer exposing ``transform``.
        X_corpus: Document-term matrix the query vector is compared against.
        corpus: Indexable collection of documents, aligned with the rows of
            ``X_corpus``; used only to print the matching text.
        nome_metodo: Label shown in the printed header.
        top_k: How many of the best-ranked documents to consider (default 3).
        limiar: Minimum similarity (exclusive) a document needs to be printed
            (default 0.01).

    Raises:
        ValueError: If ``top_k`` is negative.

    Returns:
        None. Results are printed; nothing is returned so that a call placed
        as the last statement of a notebook cell produces no extra output.
    """
    if top_k < 0:
        raise ValueError("top_k must be non-negative")

    termos = funcao_processar_texto(query)

    q_vec = vectorizer.transform([" ".join(termos)])
    sim_scores = cosine_similarity(q_vec, X_corpus).ravel()
    # Indices ordered from the highest to the lowest similarity.
    rank = np.argsort(sim_scores)[::-1]

    print(f"Top-{top_k} Similares para a Query (usando {nome_metodo}):")
    encontrados = 0
    for i in rank[:top_k]:
        if sim_scores[i] > limiar:  # only show documents with some similarity
            print(f"  Doc{i+1} (score={sim_scores[i]:.3f}): {corpus[i]}")
            encontrados += 1
    if not encontrados:
        # Make the "no match" case explicit instead of printing a bare header.
        print("  Nenhum documento acima do limiar de similaridade.")
    print("-" * 40)

# Run the same query against the three bag-of-words representations
# (tokens, stems, lemmas), printing a separator line between consecutive runs.
configuracoes_bow = (
    (tokens_query, bow_filmes_tokens_vec, x_bow_filmes_tokens, corpus_filmes_tokens, "BoW Filmes Tokens"),
    (stem_query, bow_filmes_stems_vec, x_bow_filmes_stems, corpus_filmes_stems, "BoW Filmes Stems"),
    (lemma_query, bow_filmes_lemas_vec, x_bow_filmes_lemas, corpus_filmes_lemmas, "BoW Filmes Lema"),
)
for indice, config in enumerate(configuracoes_bow):
    if indice > 0:
        print("------------------AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA----------------------")
    procurar_e_rankear("Animação bonita", *config)

Top-3 Similares para a Query (usando BoW Filmes Tokens):
  Doc2 (score=0.175): assistir filmes cinema mudo trata-se outro estilo refletido visão mundo época própria linguagem cinematográfica cinema simples expressivo contava história basicamente emoções pirotecnias atuais muita criatividade cinema falado seja ruim existem várias pérolas produzidas formato cinema mudo charme justamente sentimento wall-e ousada produção pixar traz volta    ousada foge completamente panorama cinema animação comercial última década chega curioso visto ascensão animação computadorizada começou toy story 1º longa-metragem ... pixar curioso estranho longo anos estúdio mostrou passo frente investiu melhoria técnica animação roteiro projetos diferencial invés explorar extremo mesma fórmula pixar busca criar filmes pode-se perceber situações surpreendentes seja pequenos detalhes contexto conceito filme wall-e chega espantoso suposta animação infantil tantas referências história cinema seguindo diferente comum an

In [101]:
# Run the same query against the three TF-IDF representations
# (tokens, stems, lemmas), printing a separator line between consecutive runs.
configuracoes_tfidf = (
    (tokens_query, tfidf_filmes_tokens_vec, x_tfidf_filmes_tokens, corpus_filmes_tokens, "TF-IDF Filmes Tokens"),
    (stem_query, tfidf_filmes_stems_vec, x_tfidf_filmes_stems, corpus_filmes_stems, "TF-IDF Filmes Stems"),
    (lemma_query, tfidf_filmes_lemas_vec, x_tfidf_filmes_lemas, corpus_filmes_lemmas, "TF-IDF Filmes Lema"),
)
for indice, config in enumerate(configuracoes_tfidf):
    if indice > 0:
        print("------------------AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA----------------------")
    procurar_e_rankear("Animação bonita", *config)

Top-3 Similares para a Query (usando TF-IDF Filmes Tokens):
  Doc2 (score=0.148): assistir filmes cinema mudo trata-se outro estilo refletido visão mundo época própria linguagem cinematográfica cinema simples expressivo contava história basicamente emoções pirotecnias atuais muita criatividade cinema falado seja ruim existem várias pérolas produzidas formato cinema mudo charme justamente sentimento wall-e ousada produção pixar traz volta    ousada foge completamente panorama cinema animação comercial última década chega curioso visto ascensão animação computadorizada começou toy story 1º longa-metragem ... pixar curioso estranho longo anos estúdio mostrou passo frente investiu melhoria técnica animação roteiro projetos diferencial invés explorar extremo mesma fórmula pixar busca criar filmes pode-se perceber situações surpreendentes seja pequenos detalhes contexto conceito filme wall-e chega espantoso suposta animação infantil tantas referências história cinema seguindo diferente comum