In [3]:
import pandas as pd
import numpy as np

import nltk
from nltk import pos_tag

In [4]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import nltk

nltk.download('stopwords')
nltk.download('punkt')
nltk.download("wordnet")
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

def clean_text(text_string, punctuations=r'''!()-[]{};:'\"\,<>./?@#$%^&*_~'''):
    """
    Cleans input text and returns a list of processed tokens (nouns only).
    
    Parameters:
    - text_string (str): Input text to be cleaned.

    Returns:
    - list: A list of cleaned, stemmed, and lemmatized nouns.
    """

    # Check if the input is valid
    if isinstance(text_string, list):
        text_string = ' '.join(text_string)  # Convert list to string if necessary
    elif not isinstance(text_string, str):
        return []  # Return empty list for invalid inputs

    # Cleaning URLs
    string = re.sub(r'https?://\S+|www\.\S+', '', text_string)

    # Cleaning HTML elements
    string = re.sub(r'<.*?>', '', string)

    # Removing punctuations
    string = re.sub(r'[^\w\s]', '', string)

    # Converting the text to lowercase
    string = string.lower()

    # Removing stop words
    filtered_words = [word for word in string.split() if word not in stopwords.words('english')]

    # Custom stop words list
    customlist = ['not', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn',
                  "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn',
                  "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn',
                  "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]

    # Removing custom stop words
    final_words = list(set(filtered_words) - set(customlist))

    # Tokenization
    tokens = word_tokenize(' '.join(final_words))

    # Remove numbers and keep only alphabetic words
    tokens = [word for word in tokens if word.isalpha()]

    # Extract nouns from tokens
    nouns = extract_nouns(tokens)

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in nouns]

    # Stemming
    stemmer = PorterStemmer()
    stemmed_words = [stemmer.stem(word) for word in lemmatized_words]

    return stemmed_words

def extract_nouns(token_list):
    """
    Extracts nouns from a list of tokens using part-of-speech tagging.

    Parameters:
    - token_list (list): List of words to be processed.

    Returns:
    - list: A list of nouns from the input tokens.
    """
    # Ensure the input is a list of strings
    if not isinstance(token_list, list) or not all(isinstance(word, str) for word in token_list):
        raise ValueError("Expected a list of strings.")

    # Part-of-speech tagging
    pos_tags = pos_tag(token_list)

    # Extract nouns based on POS tags
    nouns = [word for word, pos in pos_tags if pos in ['NN', 'NNS', 'NNP', 'NNPS']]

    return nouns



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\toto9\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\toto9\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\toto9\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
# Ler DF geral e criar coluna com descrição em listas

df = pd.read_csv(r"C:\Users\toto9\Desktop\PAC\Trabalho\books_with_genres.csv")
df["descriptionList"] = (df["description"]).apply(lambda x: x.split() if isinstance(x, str) else x)
df.dropna(subset=["descriptionList"], axis=0, inplace=True)
df.head()

Unnamed: 0,Title,description,authors,image,categories,title_raw,genres,descriptionList
0,the red witch,romance dutch east indies takes place,['Garland Roark'],httpbooksgooglecombookscontentidahtvqeacaajpri...,childrens literature,The Red Witch,Children and Education,"[romance, dutch, east, indies, takes, place]"
1,childrens book illustration design ii,using stepbystep sketches instructions book gu...,['Martin Salisbury'],httpbooksgooglecombookscontentidyxnbeaaaqbajpr...,childrens literature,Children's Book Illustration & Design II,Children and Education,"[using, stepbystep, sketches, instructions, bo..."
2,elsie dinsmore collection,pious young girl difficulty establishing relat...,['Martha Finley'],httpbooksgooglecombookscontentiducfaqaaiaajpri...,childrens literature,Elsie Dinsmore Collection,Children and Education,"[pious, young, girl, difficulty, establishing,..."
3,parade of stories child horizons,selection stories folk fairy tales poems autho...,"['Esther M. Bjoland', 'Anne Neigoff']",httpbooksgooglecombookscontentidwpcyhzbtocprin...,childrens literature,Parade of Stories (Child Horizons),Children and Education,"[selection, stories, folk, fairy, tales, poems..."
4,rommel drives on deep into egypt,rommel drives deep egypt richard brautigans ei...,['Richard Brautigan'],httpbooksgooglecombookscontentidkaaaaamaajprin...,childrens literature,Rommel Drives on Deep into Egypt,Children and Education,"[rommel, drives, deep, egypt, richard, brautig..."


In [6]:
# Criar coluna só com os substantivos
df["descriptionNouns"] = df["descriptionList"].apply(extract_nouns)

In [23]:
# Visualizar contagem por gênero
df["genres"].value_counts()

genres
Non-Fiction                    35546
Fiction                        15218
Arts and Culture                3421
Hobbies and Leisure             3133
Children and Education          2066
Literature and Poetry           2048
Comics and Graphic Novels        532
Lifestyle and Relationships      305
History and Society              263
Science Fiction and Fantasy       55
Name: count, dtype: int64

In [25]:
# Selecionar apenas livros de um gênero -> repetir o processo para quantos gêneros forem necessários
df_simplificado = df[df["genres"] == "Fiction"]
df_simplificado.shape

(15218, 9)

In [26]:
# Criar matriz

from sklearn.feature_extraction.text import CountVectorizer

count_matrix = CountVectorizer(max_features=10000)
df_simplificado["descriptionNounsString"] = df_simplificado["descriptionNouns"].apply(lambda x: " ".join(x))

X = count_matrix.fit_transform(df_simplificado["descriptionNounsString"])

count_matrix.get_feature_names_out()
print(X.toarray())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_simplificado["descriptionNounsString"] = df_simplificado["descriptionNouns"].apply(lambda x: " ".join(x))


[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [27]:
# Construir DF vetorizado (opcional, permite uma melhor visualização do processo)

#vectorized_df = pd.DataFrame(X.toarray(), columns=count_matrix.get_feature_names_out(), index=df_simplificado["Title"])
#vectorized_df.to_csv("books_description_vectorized.csv")

In [28]:
# Aplicar cosine_similarity à matriz

from sklearn.metrics.pairwise import cosine_similarity

similarities = cosine_similarity(X)
print(similarities)

[[1.         0.         0.20412415 ... 0.         0.         0.        ]
 [0.         1.         0.         ... 0.04244764 0.         0.        ]
 [0.20412415 0.         1.         ... 0.05812382 0.         0.        ]
 ...
 [0.         0.04244764 0.05812382 ... 1.         0.         0.        ]
 [0.         0.         0.         ... 0.         1.         0.        ]
 [0.         0.         0.         ... 0.         0.         1.        ]]


In [31]:
# Salvar DF com os nomes dos livros -> substituir nome dos ficheiros

df_livros = pd.DataFrame(similarities, columns=df_simplificado["Title"], index=df_simplificado["Title"]).reset_index()
df_livros.head()
#df_livros.to_csv("df_livros_correl.csv")

Title,Title.1,the hunting of the snark,rowan and the keeper of the crystal rowan of rin,the dastardly murder of dirty pete,just so stories watermill classic,megan in ancient greece pb magic attic club,mother west wind why stories,the torpedo run great railway adventure,the hunting of the snark.1,the bedford incident,...,winterwood,only you sierra the sierra jensen series,snow walker,z for zachariah,the little town where time stood still f a novel,the last apprentice revenge of the witch,crazy for you sweet dreams series,jake finds out making out,born of the sun,the whispering knights complete unabridged
0,the hunting of the snark,1.0,0.0,0.204124,0.0,0.0,0.0,0.182574,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,rowan and the keeper of the crystal rowan of rin,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.187867,0.048795,0.0,0.0,0.0,0.042448,0.0,0.0
2,the dastardly murder of dirty pete,0.204124,0.0,1.0,0.0,0.0,0.0,0.111803,0.204124,0.0,...,0.032009,0.0,0.0,0.0,0.0,0.0,0.0,0.058124,0.0,0.0
3,just so stories watermill classic,0.0,0.0,0.0,1.0,0.0,0.57735,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,megan in ancient greece pb magic attic club,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.057735,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
