# Tasca M9T01, Anàlisi de sentiment i textos

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import display
import scipy.stats as stats

In [3]:
# configurar els gràfics
sns.set_style('whitegrid')
sns.set_palette('flare')

In [56]:
# import libraries
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
# download nltk corpus (first time only)
import nltk

### Càrrega de l'arxiu

In [75]:
import PyPDF2
with open('C:/Users/Patricia/Desktop/prideandprejudice.pdf', 'rb') as file:
    reader = PyPDF2.PdfReader(file)
    text = ""
    for page in reader.pages:
        text += page.extract_text()

Pel anàlisi de sentiment farè servir un pdf que conté "Orgull i prejudici" (en anglès, "Pride and Prejudice") és una novel·la escrita per Jane Austen. Publicada per primera vegada el 1813, la història es desenvolupa a la zona rural d'Anglaterra al segle XIX i segueix les vivències de la família Bennet. L'eix central de la trama és la relació entre Elizabeth Bennet, una dona intel·ligent i vivaç, i Fitzwilliam Darcy, un home orgullós i aparentment arrogant.

La novel·la explora temes com el matrimoni, la societat i les expectatives de gènere de l'època. Mitjançant diàlegs perspicaços i situacions còmiques, Austen retrata les complexitats de les relacions humanes i critica la rigidesa social i la importància excessiva que es donava a l'estatus i els diners en la societat de l'època.

Al llarg de la història, Elizabeth ha d'enfrontar-se als seus prejudicis i superar les seves pròpies concepcions errònies sobre el caràcter de Darcy. Al mateix temps, Darcy ha d'aprendre a deixar de banda el seu orgull i prejudici inicials i reconèixer el veritable valor d'Elizabeth. Junts, Elizabeth i Darcy descobreixen l'amor i la capacitat de transcendir les expectatives i convencions socials.

    També és de les primeres obres en tenir el clixé "enemies to lovers"

### Preprocessat

Primer dividim el text per capitols

In [76]:
# Patró de coincidència per extreure els marcadors de capítol i el contingut
pattern = r"Chapter (\d+)\n([\s\S]*?)(?=(Chapter \d+|$))"

# Troba tots els marcadors de capítol i el contingut amb el patró
matches = re.findall(pattern, text)

# Crea una llista de capítols amb el número de capítol i el contingut
chapters = [(int(match[0]), match[1].strip()) for match in matches]

# Crea un DataFrame amb els capítols
df = pd.DataFrame(chapters, columns=["Chapter", "Content"])

print(df)

    Chapter                                            Content
0         1  It is a truth universally acknowledged, that a...
1         2  Mr. Bennet was among the earliest of those who...
2         3  Not all that Mrs. Bennet, however, with the as...
3         4  When Jane and Elizabeth were alone, the former...
4         5  Within a short walk of Longbourn lived a famil...
..      ...                                                ...
56       57  The discomposure of spirits which this extraor...
57       58  Instead of receiving any such letter of excuse...
58       59  “My dear Lizzy, where can you have been walkin...
59       60  Elizabeth’s spirits soon rising to playfulness...
60       61  Happy for all her maternal feelings was the da...

[61 rows x 2 columns]


Com que el text selecionat és una novel·la caldria identificar i eliminar el soroll, així com normalitzar els caràcters sense tenir fer-li masking a les nostres dades. Com que és un pdf també caldrà revisar si hi ha etiquetes HTML o URLs

In [103]:
import re
from unidecode import unidecode

def normalize_text(df, column):
    # Passos d'eliminació de soroll i normalització de caràcters
    passos = [
        (r'<[^<]+?>', '', 'Etiquetes HTML'),      # Eliminació d'etiquetes HTML
        (r'http\S+', '', 'URLs'),                  # Eliminació d'URLs
        (r'[^\w\s]', '', 'puntuacio'),             # Eliminació de símbols de puntuació
        (r'\d+', '', 'nombres'),                    # Eliminació de nombres
        (None, lambda x: x.lower(), ''),           # Conversió a minúscules
        (None, lambda x: unidecode(x), '')         # Eliminació d'accentuació i diacrítics
    ]

    # Aplicació dels passos al text
    for index, row in df.iterrows():
        text = row[column]
        for patro, substitucio, etiqueta in passos:
            if patro:
                text = re.sub(patro, substitucio, text)
            elif substitucio:
                text = substitucio(text)
        df.at[index, column] = text

    return df

In [135]:
normalized = normalize_text(df, 'Content')
print(normalized)

    Chapter                                            Content
0         1  it is a truth universally acknowledged that a ...
1         2  mr bennet was among the earliest of those who ...
2         3  not all that mrs bennet however with the assis...
3         4  when jane and elizabeth were alone the former ...
4         5  within a short walk of longbourn lived a famil...
..      ...                                                ...
56       57  the discomposure of spirits which this extraor...
57       58  instead of receiving any such letter of excuse...
58       59  my dear lizzy where can you have been walking ...
59       60  elizabeths spirits soon rising to playfulness ...
60       61  happy for all her maternal feelings was the da...

[61 rows x 2 columns]


In [121]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Inicialització del vectoritzador TfidfVectorizer
vectorizer = TfidfVectorizer()

# Vectorització de la columna 'Content' de la taula normalized per obtenir la matriu term-document
matriu_term_document = vectorizer.fit_transform(normalized['Content'])

# Llista de termes (columnes) de la matriu term-document
termes = list(vectorizer.vocabulary_.keys())

##### Ara si fariem el preprocessat

In [158]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer

# Crear función preprocess_text con stemming
def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    filtered_tokens = [token for token in tokens if token not in stopwords.words('english')]
    lemmatizer = WordNetLemmatizer()
    stemmed_tokens = [PorterStemmer().stem(lemmatizer.lemmatize(token)) for token in filtered_tokens]
    processed_text = ' '.join(stemmed_tokens)
    return processed_text

In [156]:
normalized['Content'] = normalized['Content'].apply(preprocess_text)

Unnamed: 0,Chapter,Content
0,1,it is a truth universally acknowledged that a ...
1,2,mr bennet was among the earliest of those who ...
2,3,not all that mrs bennet however with the assis...
3,4,when jane and elizabeth were alone the former ...
4,5,within a short walk of longbourn lived a famil...
...,...,...
56,57,the discomposure of spirits which this extraor...
57,58,instead of receiving any such letter of excuse...
58,59,my dear lizzy where can you have been walking ...
59,60,elizabeths spirits soon rising to playfulness ...


In [157]:
# Preprocess the text and remove common words
processed = normalized['Chapter'], normalized['Content'].apply(preprocess_text)

In [160]:
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

# Inicialització del vectoritzador TfidfVectorizer
vectorizer = TfidfVectorizer()

# Vectorització de la columna 'Content' de la taula normalized per obtenir la matriu term-document
matriu_term_document = vectorizer.fit_transform(normalized['Content'])

# Inicialització de l'algoritme LSA
lsa = TruncatedSVD(10)

# Aplicació de LSA a la matriu term-document
lsa_result = lsa.fit_transform(matriu_term_document)

# Mostra els resultats de LSA
print("Resultats de LSA:")
print(lsa_result)

Resultats de LSA:
[[ 7.21676619e-01  3.96945540e-01  1.24751497e-01  3.33803254e-02
  -1.43514809e-02 -1.77103496e-02  3.10801158e-01  1.00407788e-01
   1.70887463e-01 -1.15026763e-01]
 [ 7.56716104e-01  2.92989137e-01  5.32954812e-02 -7.98415592e-02
   1.91427183e-01 -6.51728461e-02  1.82899489e-01  1.18320601e-01
   1.96920554e-01  1.91401802e-01]
 [ 8.41735793e-01 -1.06051137e-01  2.47017482e-01 -5.23061972e-02
   1.25919800e-01  5.60619017e-02  2.78723217e-02 -3.49739073e-02
   1.12529230e-01  4.28861339e-02]
 [ 8.12646859e-01 -5.40285212e-02  1.98583083e-01 -5.01543615e-02
  -3.64706505e-03  6.17833851e-02  1.18408217e-01 -2.58518618e-01
   4.72238240e-02 -1.07335900e-01]
 [ 7.36654295e-01  1.05145655e-01  3.89032340e-01 -2.49605931e-01
   2.07200186e-02  2.03435308e-01 -2.40640338e-01  2.07421454e-03
  -2.18387689e-02 -7.14826666e-03]
 [ 8.97534192e-01  7.18568686e-02  5.73537587e-02 -6.01430394e-02
   6.67893353e-03 -6.93988348e-02 -3.84064524e-02 -7.75318487e-02
  -2.93761948e-

In [163]:
matriu_term_document

<61x7670 sparse matrix of type '<class 'numpy.float64'>'
	with 41260 stored elements in Compressed Sparse Row format>

In [None]:
Realitzar mineria de texts.
Interpretació dels textos.
Anàlisis dels resultats.

## Exercici 1
Agafa un text en anglès que vulguis, i calcula'n la freqüència de les paraules.

    He pujat el pdf d'un llibre (Orgull i prejudici)

In [165]:
# Creació del DataFrame de freqüències normalitzades
df_frequencies = pd.DataFrame(matriu_term_document.toarray(), columns=termes)

# Aplicació de LSA a la matriu de freqüències normalitzades
lsa_result = lsa.fit_transform(df_frequencies)

In [173]:
df_frequencies

Unnamed: 0,it,is,truth,universally,acknowledged,that,single,man,in,posses,...,bath,heretofore,arrear,sportive,prehend,liberties,indignant,pollution,city,uniting
0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.00000,0.0,0.0,0.0,0.000000,0.047707,0.000000,0.000000,0.0,0.0
1,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.00000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0
2,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.007975,...,0.00000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0
3,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.00000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0
4,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.00000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.044617,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.00000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0
57,0.0,0.0,0.0,0.0,0.013972,0.0,0.0,0.0,0.0,0.012050,...,0.00000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0
58,0.0,0.0,0.0,0.0,0.014261,0.0,0.0,0.0,0.0,0.012299,...,0.00000,0.0,0.0,0.0,0.018958,0.000000,0.000000,0.000000,0.0,0.0
59,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.00000,0.0,0.0,0.0,0.000000,0.000000,0.025763,0.000000,0.0,0.0


In [169]:
df_frequencies.describe()

Unnamed: 0,it,is,truth,universally,acknowledged,that,single,man,in,posses,...,bath,heretofore,arrear,sportive,prehend,liberties,indignant,pollution,city,uniting
count,61.0,61.0,61.0,61.0,61.0,61.0,61.0,61.0,61.0,61.0,...,61.0,61.0,61.0,61.0,61.0,61.0,61.0,61.0,61.0,61.0
mean,0.00032,0.000358,0.001617,0.000239,0.001213,0.000458,0.000514,0.000636,0.001446,0.007484,...,0.001961,0.000144,0.00031,0.000514,0.000311,0.000782,0.000422,0.000731,0.000382,0.000369
std,0.002501,0.002797,0.004651,0.001864,0.004152,0.003581,0.004018,0.004964,0.004909,0.007362,...,0.006894,0.001121,0.002419,0.004018,0.002427,0.006108,0.003299,0.005713,0.002981,0.002878
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.007292,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.011359,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,0.019534,0.021842,0.018053,0.014557,0.018905,0.027965,0.031379,0.038771,0.020305,0.030679,...,0.045178,0.008755,0.018893,0.031379,0.018958,0.047707,0.025763,0.044617,0.023285,0.02248


Obtenim que hi han 7670 paraules distintes

# Exercici 2
Treu les stopwords i realitza stemming al teu conjunt de dades.

In [144]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Descargar las stopwords y el léxico en inglés si no están descargados
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('words')

# Crear función preprocess_text
def preprocess_text(text):
    # Tokenización de palabras
    tokens = word_tokenize(text.lower())
    
    # Filtrado de tokens eliminando stopwords y palabras no reconocidas en inglés
    english_words = set(nltk.corpus.words.words())
    filtered_tokens = [token for token in tokens if token not in stopwords.words('english') and token in english_words]
    
    # Lemmatización de tokens
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
    
    # Reconstrucción del texto procesado
    processed_text = ' '.join(lemmatized_tokens)
    
    return processed_text

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Patricia\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Patricia\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Patricia\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [145]:
# Preprocess the text and remove common words
processed = normalized['Content'].apply(preprocess_text)

In [146]:
filtered_tokens

NameError: name 'filtered_tokens' is not defined

In [131]:
# Preprocess the text
processed = normalized['Content'].apply(preprocess_text)

## Exercici 3
Realitza sentiment analysis al teu conjunt de dades.

Abans de fer anàlisi de sentiment en de saber-hi què estem buscant, hem de fer una matriu term-document, què en aquest cas seria un array perquè no més tenim un document

In [None]:
import PyPDF2
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

# Obre el fitxer PDF en mode de lectura binària
with open('C:/Users/Patricia/Desktop/prideandprejudice.pdf', 'rb') as file:
    # Crea un objecte de lectura PDF
    reader = PyPDF2.PdfReader(file)

    # Llegeix cada pàgina i extreu'n el text
    text = ''
    for page in reader.pages:
        text += page.extract_text()

    # Tokenitza el text en paraules
    tokens = word_tokenize(text.lower())

    # Calcula la distribució de freqüència de les paraules
    fdist = FreqDist(tokens)

    # Obté les 10 paraules més freqüents relacionades amb "amor"
    keywords = [word for word, freq in fdist.most_common(10) if 'love' in word or 'affection' in word]

    # Imprimeix les paraules clau
    print("Paraules clau relacionades amb 'amor':")
    for keyword in keywords:
        print(keyword)

In [49]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Inicialització del vectoritzador TfidfVectorizer
vectorizer = TfidfVectorizer()

# Vectorització del document per obtenir la matriu term-document
matriu_term_document = vectorizer.fit_transform([text])

# Llista de termes (columnes) de la matriu term-document
termes = list(vectorizer.vocabulary_.keys())

# Mostra la matriu term-document
print("Matriu term-document:")
print(matriu_term_document.toarray())

Matriu term-document:
[[9.4581235e-05 9.4581235e-05 9.4581235e-05 ... 9.4581235e-05
  9.4581235e-05 1.8916247e-04]]


In [52]:
from sklearn.decomposition import TruncatedSVD

# Nombre de components per a l'LSA
num_components = 2

# Aplicació de LSA
lsa = TruncatedSVD(n_components=num_components)
lsa.fit(matriu_term_document.reshape(1, -1))

# Matriu de termes i conceptes
matriu_termes_conceptes = lsa.transform(matriu_term_document.reshape(1, -1))

# Mostra la matriu de termes i conceptes
print("Matriu de termes i conceptes:")
print(matriu_termes_conceptes)

Matriu de termes i conceptes:
[[1.]]


In [53]:
matriu_termes_conceptes

array([[1.]])

In [39]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

# Descarrega els recursos necessaris per a l'anàlisi de sentiment
nltk.download('vader_lexicon')

# Inicialitza l'analitzador de sentiment
analyzer = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Patricia\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [40]:
def get_sentiment_category(text):
    positive_keywords = ['sympathy', 'affection', 'love']
    negative_keywords = ['hate', 'disgust', 'repulsion']

    sentiment_scores = analyzer.polarity_scores(text)
    compound_score = sentiment_scores['compound']

    if compound_score >= 0.05:
        sentiment_category = 'Love'
    elif compound_score <= -0.05:
        sentiment_category = 'Hate'
    else:
        tokenized_text = nltk.word_tokenize(text.lower())
        if any(keyword in tokenized_text for keyword in positive_keywords):
            sentiment_category = 'Sympathy'
        elif any(keyword in tokenized_text for keyword in negative_keywords):
            sentiment_category = 'Hate'
        else:
            sentiment_category = 'Neutral'

    return sentiment_category

In [41]:
get_sentiment_category(processed_text)

'Love'