---
### Text Mining

📌 Ejemplo inicial


---

In [None]:
# Librerías necesarias
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline


In [None]:

# Download the NLTK stop-word lists, then import the corpus reader used by the following cells.
nltk.download('stopwords')
from nltk.corpus import stopwords


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Load NLTK's Spanish stop-word list and show how many entries it contains.
stop_words = stopwords.words('spanish')
len(stop_words)

313

In [None]:
# Load the Spanish stop-word list again under a second name and preview its first 15 entries.
stop_words_es = stopwords.words('spanish')
stop_words_es[:15]

['de',
 'la',
 'que',
 'el',
 'en',
 'y',
 'a',
 'los',
 'del',
 'se',
 'las',
 'por',
 'un',
 'para',
 'con']

In [None]:

# Toy sentiment dataset: five Spanish reviews and their labels, kept as two
# parallel lists and paired up into (text, label) tuples.
review_texts = [
    "Este producto es excelente, me encantó",
    "Muy malo, no volvería a comprarlo",
    "Es regular, esperaba algo mejor",
    "No me gusta para nada",
    "Es increíble, lo mejor que he comprado",
]
review_labels = ["positivo", "negativo", "neutro", "negativo", "positivo"]
data = list(zip(review_texts, review_labels))



In [None]:
# Naive tokenization by splitting on single spaces: punctuation stays attached
# to the neighbouring word (e.g. 'excelente,').
pepe = "Este producto es excelente, me encantó"
pepe.split(" ")

['Este', 'producto', 'es', 'excelente,', 'me', 'encantó']

In [None]:

# Split the (text, label) pairs into two parallel tuples.
texts = tuple(text for text, _ in data)
labels = tuple(label for _, label in data)


In [None]:

# Configure a TF-IDF vectorizer that ignores NLTK's Spanish stop words.
spanish_stopwords = stopwords.words('spanish')
vectorizer = TfidfVectorizer(stop_words=spanish_stopwords)


In [None]:
# Show the configured vectorizer (it has not been fitted yet at this point).
vectorizer

---
¿Qué hace `TfidfVectorizer`?

`TfidfVectorizer` realiza los siguientes pasos: <br>

* **Tokenización**: Divide cada documento en palabras (o tokens).
* **Cálculo de Term Frequency (TF)**: Cuenta la frecuencia de cada palabra en un documento. Es decir, cuántas veces aparece cada palabra en el documento.
* **Cálculo de Inverse Document Frequency (IDF)**: Calcula la frecuencia inversa de la palabra en todo el conjunto de documentos. Las palabras que aparecen en muchos documentos obtienen un peso bajo, mientras que las palabras que aparecen en pocos documentos obtienen un peso alto.
* **Multiplicación TF-IDF:** Multiplica el valor de la frecuencia de cada palabra (TF) por el valor de la frecuencia inversa (IDF), generando una representación ponderada de cada palabra en cada documento.
* **Representación numérica:** Convierte los textos en una matriz donde cada fila representa un documento y cada columna representa una palabra del vocabulario del conjunto de documentos. Los valores en la matriz representan los pesos TF-IDF de cada palabra en cada documento.

<br>

**¿Por qué es útil?**  <br>

El TF-IDF ayuda a identificar palabras importantes dentro de un documento en comparación con un conjunto más amplio de documentos. Las palabras comunes en todos los documentos (como stopwords) tendrán un peso bajo, mientras que las palabras que son únicas o importantes en un documento específico tendrán un peso alto.

---

In [None]:

# Hold out 20% of the samples for testing with a fixed seed.
# With only five reviews this leaves a single test sample, so the reported
# accuracy can only be 0.00 or 1.00.
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)


In [None]:

# Chain the TF-IDF vectorizer and a multinomial Naive Bayes classifier so raw
# strings can be passed straight to fit/predict.
classifier = MultinomialNB()
model = make_pipeline(vectorizer, classifier)

# Train on the training split; the fitted pipeline is the cell's output.
model.fit(X_train, y_train)


In [None]:

# Score the pipeline on the held-out split (mean accuracy) and report it
# with two decimals.
accuracy = model.score(X_test, y_test)
print("Precisión del modelo: {:.2f}".format(accuracy))


Precisión del modelo: 0.00


In [None]:

# Predict the sentiment label of two unseen sentences with the trained pipeline.
nuevos_textos = ["Es un mal producto", "Lo recomiendo a todos"]
predicciones = model.predict(nuevos_textos)
print(predicciones)

['positivo' 'positivo']


---


## Preprocesamiento del Texto


---



In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, SnowballStemmer





In [None]:
# Download the Punkt tokenizer models and the stop-word lists from NLTK.
nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:

# Example text (English) used by the preprocessing steps below.
texto = "The product is absolutely amazing. I totally recommend it!"



In [None]:
# Tokenization: lower-case the text and split it into word and punctuation tokens.
tokens = word_tokenize(texto.lower())


In [None]:
# Inspect the tokens; punctuation marks appear as separate tokens.
tokens

['the',
 'product',
 'is',
 'absolutely',
 'amazing',
 '.',
 'i',
 'totally',
 'recommend',
 'it',
 '!']

In [None]:

# Stop-word and punctuation removal.
# Build the English stop-word set once: the original called
# stopwords.words('english') inside the comprehension, rebuilding the list for
# every token, and tested membership against a list instead of a set.
english_stopwords = set(stopwords.words('english'))
# str.isalnum() discards pure-punctuation tokens such as '.' and '!'.
tokens_filtrados = [
    word for word in tokens
    if word.isalnum() and word not in english_stopwords
]



In [None]:
# Inspect the tokens that survived stop-word and punctuation filtering.
tokens_filtrados

['product', 'absolutely', 'amazing', 'totally', 'recommend']

In [None]:
# Stemming with the Porter algorithm: reduce every filtered token to its stem.
stemmer = PorterStemmer()
tokens_stemmed = list(map(stemmer.stem, tokens_filtrados))

In [None]:
# Inspect the Porter stems.
tokens_stemmed

['product', 'absolut', 'amaz', 'total', 'recommend']

In [None]:
# Stemming with the Snowball stemmer configured for Spanish.
# NOTE(review): tokens_filtrados holds English tokens, so the Spanish rules
# leave them unchanged in the recorded output — confirm whether 'english'
# (or a Spanish input text) was intended here.
stemmer2 = SnowballStemmer('spanish')
tokens_stemmed2 = list(map(stemmer2.stem, tokens_filtrados))

In [None]:
# Inspect the output of the Spanish Snowball stemmer.
tokens_stemmed2

['product', 'absolutely', 'amazing', 'totally', 'recommend']

---

Transformación

---

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Three short Spanish documents used to illustrate the TF-IDF matrix.
textos = ["Este es un documento corto.",
          "El documento es largo y contiene muchas palabras interesantes.",
          "Aquí hay otro documento con palabras distintas."]

# TF-IDF vectorization with default settings (no stop-word list).
# Note: this rebinds the name `vectorizer`; the pipeline built earlier keeps
# its own reference to the previous vectorizer object.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(textos)

# Vocabulary (column order of the matrix) followed by the dense TF-IDF matrix,
# one row per document.
print(vectorizer.get_feature_names_out())
print(X.toarray())

['aquí' 'con' 'contiene' 'corto' 'distintas' 'documento' 'el' 'es' 'este'
 'hay' 'interesantes' 'largo' 'muchas' 'otro' 'palabras' 'un']
[[0.         0.         0.         0.50461134 0.         0.29803159
  0.         0.38376993 0.50461134 0.         0.         0.
  0.         0.         0.         0.50461134]
 [0.         0.         0.39206263 0.         0.         0.2315585
  0.39206263 0.29817373 0.         0.         0.39206263 0.39206263
  0.39206263 0.         0.29817373 0.        ]
 [0.41074684 0.41074684 0.         0.         0.41074684 0.2425937
  0.         0.         0.         0.41074684 0.         0.
  0.         0.41074684 0.31238356 0.        ]]


---
### Tokenización

---

In [None]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

texto = "Hola, ¿cómo estás? Este es un ejemplo de tokenización de oraciones y palabras."

# The text is Spanish, so select the Spanish Punkt model explicitly; NLTK
# otherwise falls back to the English model, whose sentence-boundary and
# abbreviation statistics were learned from English text.

# Sentence tokenization
oraciones = sent_tokenize(texto, language='spanish')
print("Oraciones:", oraciones)

# Word tokenization (the language selects the model used for the internal
# sentence-splitting step)
palabras = word_tokenize(texto, language='spanish')
print("Palabras:", palabras)

Oraciones: ['Hola, ¿cómo estás?', 'Este es un ejemplo de tokenización de oraciones y palabras.']
Palabras: ['Hola', ',', '¿cómo', 'estás', '?', 'Este', 'es', 'un', 'ejemplo', 'de', 'tokenización', 'de', 'oraciones', 'y', 'palabras', '.']


---
### Stemming

---

In [None]:
from nltk.stem import PorterStemmer

# Porter stemming strips suffixes by rule, so the irregular form "ran" is not
# mapped to "run" (see the printed result).
stemmer = PorterStemmer()
palabras = ["running", "ran", "run", "runs"]
palabras_stemmed = list(map(stemmer.stem, palabras))

print('Original:', palabras)
print("Stemming:", palabras_stemmed)


Original: ['running', 'ran', 'run', 'runs']
Stemming: ['run', 'ran', 'run', 'run']


---

### Lematización

---


In [None]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

# WordNet data is required by the lemmatizer.
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

palabras = ["running", "ran", "run", "runs"]

# pos='v' tells the lemmatizer to treat every word as a verb.
palabras_lemmatized = list(
    map(lambda palabra: lemmatizer.lemmatize(palabra, pos='v'), palabras)
)

print('Original:', palabras)
print("Lematización:", palabras_lemmatized)

Original: ['running', 'ran', 'run', 'runs']
Lematización: ['run', 'run', 'run', 'run']


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


---
### TF-IDF

---

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Small Spanish corpus for the TF-IDF example.
documentos = [
    "Este es un documento.",
    "Este documento es sobre minería de texto.",
    "Otro documento más para minería de texto."
]

# Fit a default TF-IDF vectorizer on the corpus and transform it in one step.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(documentos)

# Print the learned vocabulary and the dense TF-IDF matrix (one row per document).
print("Vocabulario:", vectorizer.get_feature_names_out())
print("TF-IDF matriz:", X.toarray())


Vocabulario: ['de' 'documento' 'es' 'este' 'minería' 'más' 'otro' 'para' 'sobre'
 'texto' 'un']
TF-IDF matriz: [[0.         0.37311881 0.4804584  0.4804584  0.         0.
  0.         0.         0.         0.         0.63174505]
 [0.36930805 0.28680065 0.36930805 0.36930805 0.36930805 0.
  0.         0.         0.48559571 0.36930805 0.        ]
 [0.33729513 0.26193976 0.         0.         0.33729513 0.44350256
  0.44350256 0.44350256 0.         0.33729513 0.        ]]


---

### Embeddings

---

In [None]:
from gensim.models import Word2Vec

# Training data: a list of tokenized sentences (each sentence is a list of words).
oraciones = [
    ["el", "gato", "juega", "en", "el", "jardín"],
    ["el", "perro", "corre", "por", "el", "jardín"],
    ["el", "gato", "y", "el", "perro", "juegan", "juntos", "en", "el", "jardín"]
]

# Build the Word2Vec model.
# sg=1 selects Skip-gram; use sg=0 for CBOW.
# seed + workers=1 make training repeatable: without a fixed seed and with
# several worker threads, the vectors and the neighbour ranking below change
# on every run. (Full reproducibility across interpreter launches also
# requires a fixed PYTHONHASHSEED, per the gensim documentation.)
modelo = Word2Vec(
    sentences=oraciones,
    vector_size=100,
    window=5,
    min_count=1,
    sg=1,
    seed=42,
    workers=1,
)

# Look up the embedding of a single word.
vector_gato = modelo.wv['gato']
print("Vector de 'gato':\n", vector_gato)

# Nearest neighbours of "gato" by cosine similarity.
similares_a_gato = modelo.wv.most_similar('gato')
print("Palabras similares a 'gato':\n", similares_a_gato)

# Persist the trained model for later use.
modelo.save("modelo_word2vec.model")



Vector de 'gato':
 [-0.00713894  0.00124083 -0.00718011 -0.00224425  0.00372071  0.00583406
  0.00119796  0.00210444 -0.00411236  0.00722572 -0.00630599  0.00464761
 -0.00821761  0.00203708 -0.00497604 -0.00424879 -0.00310546  0.00565595
  0.00579721 -0.00497747  0.0007721  -0.00849667  0.00781029  0.00925731
 -0.00274082  0.00079899  0.00074817  0.00547986 -0.00860765  0.00058388
  0.0068724   0.00223193  0.00112286 -0.00932091  0.00847863 -0.00626282
 -0.0029928   0.00349275 -0.00077476  0.00141079  0.00178238 -0.00682758
 -0.00972305  0.00904022  0.00620059 -0.00691051  0.00340207  0.00020468
  0.00475238 -0.00712002  0.00402759  0.00434751  0.00995347 -0.00447528
 -0.00139264 -0.00732204 -0.00969987 -0.00908257 -0.0010239  -0.00650675
  0.004851   -0.00616424  0.00252469  0.00074005 -0.00339489 -0.00097797
  0.00997873  0.00914652 -0.00446518  0.00908166 -0.00564001  0.00593426
 -0.00309793  0.00343223  0.00301771  0.0069003  -0.00237081  0.00877744
  0.00758855 -0.00954625 -0.0080

---


---
### Sentiment Analysis

https://www.datacamp.com/tutorial/text-analytics-beginners-nltk

---
What is Sentiment Analysis


Sentiment analysis is a technique used to determine the emotional tone or sentiment expressed in a text. It involves analyzing the words and phrases used in the text to identify the underlying sentiment, whether it is positive, negative, or neutral.

Sentiment analysis has a wide range of applications, including social media monitoring, customer feedback analysis, and market research.

One of the main challenges in sentiment analysis is the inherent complexity of human language. Text data often contains sarcasm, irony, and other forms of figurative language that can be difficult to interpret using traditional methods.

However, recent advances in natural language processing (NLP) and machine learning have made it possible to perform sentiment analysis on large volumes of text data with a high degree of accuracy.

---

Three Methodologies for Sentiment Analysis


There are several ways to perform sentiment analysis on text data, with varying degrees of complexity and accuracy. The most common methods include a lexicon-based approach, a machine learning (ML) based approach, and a pre-trained transformer-based deep learning approach. Let’s look at each in more detail:

**Lexicon-based analysis**

This type of analysis, such as the NLTK Vader sentiment analyzer, involves using a set of predefined rules and heuristics to determine the sentiment of a piece of text. These rules are typically based on lexical and syntactic features of the text, such as the presence of positive or negative words and phrases.

While lexicon-based analysis can be relatively simple to implement and interpret, it may not be as accurate as ML-based or transformer-based approaches, especially when dealing with complex or ambiguous text data.

**Machine learning (ML)**

This approach involves training a model to identify the sentiment of a piece of text based on a set of labeled training data. These models can be trained using a wide range of ML algorithms, including decision trees, support vector machines (SVMs), and neural networks.

ML-based approaches can be more accurate than rule-based analysis, especially when dealing with complex text data, but they require a larger amount of labeled training data and may be more computationally expensive.

**Pre-trained transformer-based deep learning**

A deep learning-based approach, as seen with BERT and GPT-4, involves using pre-trained models trained on massive amounts of text data. These models use complex neural networks to encode the context and meaning of the text, allowing them to achieve state-of-the-art accuracy on a wide range of NLP tasks, including sentiment analysis. However, these models require significant computational resources and may not be practical for all use cases.

Lexicon-based analysis is a straightforward approach to sentiment analysis, but it may not be as accurate as more complex methods.
Machine learning-based approaches can be more accurate, but they require labeled training data and may be more computationally expensive.
Pre-trained transformer-based deep learning approaches can achieve state-of-the-art accuracy but require significant computational resources and may not be practical for all use cases.
The choice of approach will depend on the specific needs and constraints of the project at hand.

---

In [None]:


import nltk

# Download only the resources the sentiment-analysis section uses, instead of
# the complete 'all' collection (every corpus and model NLTK distributes):
#   punkt / punkt_tab -> word_tokenize (recent NLTK releases load 'punkt_tab')
#   stopwords         -> stopwords.words('english')
#   wordnet / omw-1.4 -> WordNetLemmatizer
#   vader_lexicon     -> SentimentIntensityAnalyzer
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4', 'vader_lexicon'):
    nltk.download(resource)

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_rus to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |  

True

---
### Preprocessing Text

Text preprocessing is a crucial step in performing sentiment analysis, as it helps to clean and normalize the text data, making it easier to analyze. The preprocessing step involves a series of techniques that help transform raw text data into a form you can use for analysis. Some common text preprocessing techniques include tokenization, stop word removal, stemming, and lemmatization.


Tokenization
Tokenization is a text preprocessing step in sentiment analysis that involves breaking down the text into individual words or tokens. This is an essential step in analyzing text data as it helps to separate individual words from the raw text, making it easier to analyze and understand. Tokenization is typically performed using NLTK's built-in word_tokenize function, which can split the text into individual words and punctuation marks.

Stop words
Stop word removal is a crucial text preprocessing step in sentiment analysis that involves removing common and irrelevant words that are unlikely to convey much sentiment. Stop words are words that are very common in a language and do not carry much meaning, such as "and," "the," "of," and "it." These words can cause noise and skew the analysis if they are not removed.

By removing stop words, the remaining words in the text are more likely to indicate the sentiment being expressed. This can help to improve the accuracy of the sentiment analysis. NLTK provides a built-in list of stop words for several languages, which can be used to filter out these words from the text data.

Stemming and Lemmatization
Stemming and lemmatization are techniques used to reduce words to their root forms. Stemming involves removing the suffixes from words, such as "ing" or "ed," to reduce them to their base form. For example, the word "jumping" would be stemmed to "jump."

Lemmatization, however, involves reducing words to their dictionary base form (lemma) based on their part of speech. For example, when treated as verbs, both "jumped" and "jumping" are lemmatized to "jump"; when treated as a noun (the default `pos` of NLTK's `WordNetLemmatizer`), "jumping" is left as "jumping".

To learn more about stemming and lemmatization, check out our Stemming and Lemmatization in Python tutorial.

---


---
### Bag of Words (BoW) Model

The bag of words model is a technique used in natural language processing (NLP) to represent text data as a set of numerical features. In this model, each document or piece of text is represented as a "bag" of words, with each word in the text represented by a separate feature or dimension in the resulting vector. The value of each feature is determined by the number of times the corresponding word appears in the text.

The bag of words model is useful in NLP because it allows us to analyze text data using machine learning algorithms, which typically require numerical input. By representing text data as numerical features, we can train machine learning models to classify text or analyze sentiments.

The example in the next section will use the NLTK Vader model for sentiment analysis on the Amazon customer dataset. In this particular example, we do not need to perform this step because the NLTK Vader API accepts text as an input instead of numeric vectors, but if you were building a supervised machine learning model to predict sentiment (assuming you have labeled data), you would have to transform the processed text into a bag of words model before training the machine learning model.



---

In [None]:
# import libraries
import pandas as pd

import nltk

from nltk.sentiment.vader import SentimentIntensityAnalyzer

from nltk.corpus import stopwords

from nltk.tokenize import word_tokenize

from nltk.stem import WordNetLemmatizer


# Download the NLTK resources this section needs (a no-op once they are
# installed). The full 'all' collection is unnecessary here:
#   punkt / punkt_tab -> word_tokenize (recent NLTK releases load 'punkt_tab')
#   stopwords         -> stopwords.words('english')
#   wordnet / omw-1.4 -> WordNetLemmatizer
#   vader_lexicon     -> SentimentIntensityAnalyzer
import nltk

for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4', 'vader_lexicon'):
    nltk.download(resource)






[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_eng is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_r

True

In [None]:
# Read the Amazon review dataset from the PyCaret repository and preview
# its first rows.
AMAZON_REVIEWS_URL = 'https://raw.githubusercontent.com/pycaret/pycaret/master/datasets/amazon.csv'

df = pd.read_csv(AMAZON_REVIEWS_URL)

df.head()

Unnamed: 0,reviewText,Positive
0,This is a one of the best apps acording to a b...,1
1,This is a pretty good version of the game for ...,1
2,this is a really cool game. there are a bunch ...,1
3,"This is a silly game and can be frustrating, b...",1
4,This is a terrific game on any pad. Hrs of fun...,1


In [None]:
# Show the full, untruncated text of the first review.
df.iloc[0].reviewText

'This is a one of the best apps acording to a bunch of people and I agree it has bombs eggs pigs TNT king pigs and realustic stuff'

In [None]:
# Build the stop-word set and the lemmatizer once. The original rebuilt the
# stop-word list for every token and a new lemmatizer for every review, which
# dominates the runtime when the function is applied to the whole dataset.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
_LEMMATIZER = WordNetLemmatizer()


def preprocess_text(text):
    """Normalize one review before sentiment scoring.

    The text is lower-cased and tokenized with NLTK's ``word_tokenize``,
    English stop words are removed, each remaining token is lemmatized with
    ``WordNetLemmatizer`` (default part of speech), and the tokens are joined
    back together with single spaces.

    Args:
        text: Raw review text.

    Returns:
        The processed review as one space-separated string.
    """
    # Tokenize the lower-cased text
    tokens = word_tokenize(text.lower())

    # Remove stop words (set membership instead of a per-token list rebuild)
    filtered_tokens = [token for token in tokens if token not in _ENGLISH_STOPWORDS]

    # Lemmatize the remaining tokens
    lemmatized_tokens = [_LEMMATIZER.lemmatize(token) for token in filtered_tokens]

    # Join the tokens back into a string
    return ' '.join(lemmatized_tokens)

# Apply preprocess_text to every review. This overwrites the 'reviewText'
# column in place, so the raw text is no longer available in df afterwards.

df['reviewText'] = df['reviewText'].apply(preprocess_text)
df

Unnamed: 0,reviewText,Positive
0,one best apps acording bunch people agree bomb...,1
1,pretty good version game free . lot different ...,1
2,really cool game . bunch level find golden egg...,1
3,"silly game frustrating , lot fun definitely re...",1
4,terrific game pad . hr fun . grandkids love . ...,1
...,...,...
19995,app fricken stupid.it froze kindle wont allow ...,0
19996,please add ! ! ! ! ! need neighbor ! ginger101...,1
19997,love ! game . awesome . wish free stuff house ...,1
19998,love love love app side fashion story fight wo...,1


In [None]:
# Initialize the NLTK VADER sentiment analyzer (shared by get_sentiment).

analyzer = SentimentIntensityAnalyzer()


def get_sentiment(text, threshold=0.05):
    """Classify a text as positive (1) or not positive (0) with VADER.

    The decision uses VADER's normalized ``compound`` score, which combines
    the positive and negative evidence in the text. The previous rule,
    ``scores['pos'] > 0``, labelled a text positive as soon as it contained a
    single positive-lexicon word, regardless of how negative the rest was.

    Args:
        text: Text to score.
        threshold: Minimum compound score for a positive label. The default
            of 0.05 is the cut-off recommended by the VADER authors.

    Returns:
        1 if the compound score is at least ``threshold``, otherwise 0.
    """
    scores = analyzer.polarity_scores(text)

    return 1 if scores['compound'] >= threshold else 0




# Score every (already preprocessed) review and store the predicted label
# in a new 'sentiment' column.

df['sentiment'] = df['reviewText'].apply(get_sentiment)

df

Unnamed: 0,reviewText,Positive,sentiment
0,one best apps acording bunch people agree bomb...,1,1
1,pretty good version game free . lot different ...,1,1
2,really cool game . bunch level find golden egg...,1,1
3,"silly game frustrating , lot fun definitely re...",1,1
4,terrific game pad . hr fun . grandkids love . ...,1,1
...,...,...,...
19995,app fricken stupid.it froze kindle wont allow ...,0,0
19996,please add ! ! ! ! ! need neighbor ! ginger101...,1,1
19997,love ! game . awesome . wish free stuff house ...,1,1
19998,love love love app side fashion story fight wo...,1,1


In [None]:
from sklearn.metrics import confusion_matrix

# Rows are the true labels ('Positive'), columns the predicted labels
# ('sentiment'), both in the order 0, 1.
print(confusion_matrix(df['Positive'], df['sentiment']))

[[ 1131  3636]
 [  576 14657]]


In [None]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 of the predicted labels against the
# ground-truth 'Positive' column.
print(classification_report(df['Positive'], df['sentiment']))

              precision    recall  f1-score   support

           0       0.66      0.24      0.35      4767
           1       0.80      0.96      0.87     15233

    accuracy                           0.79     20000
   macro avg       0.73      0.60      0.61     20000
weighted avg       0.77      0.79      0.75     20000

