# Tokenization
Tenemos 2 ejemplos

*   Utilizando el paquete spacy
*   Utilizando el paquete nltk



In [2]:
import spacy
nlp = spacy.load('en_core_web_sm')
doc = nlp("Tokenization is the process of breaking down text into  tokens.")
tokens = [token.text for token in doc]
tokens

['Tokenization',
 'is',
 'the',
 'process',
 'of',
 'breaking',
 'down',
 'text',
 'into',
 ' ',
 'tokens',
 '.']

In [None]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

nltk.download('punkt')

text = "Natural language processing (NLP) is a field of computer science, artificial intelligence and computational linguistics concerned with the interactions between computers and human (natural) languages, and, in particular, concerned with programming computers to fruitfully process large natural language corpora. Challenges in natural language processing frequently involve natural language understanding, natural language generation (frequently from formal, machine-readable logical forms), connecting language and machine perception, managing human-computer dialog systems, or some combination thereof."

print(sent_tokenize(text))
print(word_tokenize(text))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


['Natural language processing (NLP) is a field of computer science, artificial intelligence and computational linguistics concerned with the interactions between computers and human (natural) languages, and, in particular, concerned with programming computers to fruitfully process large natural language corpora.', 'Challenges in natural language processing frequently involve natural language understanding, natural language generation (frequently from formal, machine-readable logical forms), connecting language and machine perception, managing human-computer dialog systems, or some combination thereof.']
['Natural', 'language', 'processing', '(', 'NLP', ')', 'is', 'a', 'field', 'of', 'computer', 'science', ',', 'artificial', 'intelligence', 'and', 'computational', 'linguistics', 'concerned', 'with', 'the', 'interactions', 'between', 'computers', 'and', 'human', '(', 'natural', ')', 'languages', ',', 'and', ',', 'in', 'particular', ',', 'concerned', 'with', 'programming', 'computers', 't

# Stopwords


In [None]:
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')

stopwords_en = stopwords.words('english')
stopwords_es = stopwords.words('spanish')
print(stopwords_en)
print(stopwords_es)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
text = "Natural language processing (NLP) is a field of computer science, artificial intelligence and computational linguistics concerned with the interactions between computers and human (natural) languages, and, in particular, concerned with programming computers to fruitfully process large natural language corpora. Challenges in natural language processing frequently involve natural language understanding, natural language generation (frequently from formal, machine-readable logical forms), connecting language and machine perception, managing human-computer dialog systems, or some combination thereof."
clean_text = [word for word in text.split(" ") if word not in stopwords_en]

print(text.split(" "))
print(clean_text)

['Natural', 'language', 'processing', '(NLP)', 'is', 'a', 'field', 'of', 'computer', 'science,', 'artificial', 'intelligence', 'and', 'computational', 'linguistics', 'concerned', 'with', 'the', 'interactions', 'between', 'computers', 'and', 'human', '(natural)', 'languages,', 'and,', 'in', 'particular,', 'concerned', 'with', 'programming', 'computers', 'to', 'fruitfully', 'process', 'large', 'natural', 'language', 'corpora.', 'Challenges', 'in', 'natural', 'language', 'processing', 'frequently', 'involve', 'natural', 'language', 'understanding,', 'natural', 'language', 'generation', '(frequently', 'from', 'formal,', 'machine-readable', 'logical', 'forms),', 'connecting', 'language', 'and', 'machine', 'perception,', 'managing', 'human-computer', 'dialog', 'systems,', 'or', 'some', 'combination', 'thereof.']
['Natural', 'language', 'processing', '(NLP)', 'field', 'computer', 'science,', 'artificial', 'intelligence', 'computational', 'linguistics', 'concerned', 'interactions', 'computers'

# Stemming

In [None]:
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
nltk.download('punkt')

stemmer = PorterStemmer()
sentence = "The quick brown foxes are jumping over the lazy dogs"

words = word_tokenize(sentence)
stemmed_words = [stemmer.stem(word) for word in words]

print("Original Sentence:", sentence.split(" "))
print("Stemmed Words:", stemmed_words)

Original Sentence: ['The', 'quick', 'brown', 'foxes', 'are', 'jumping', 'over', 'the', 'lazy', 'dogs']
Stemmed Words: ['the', 'quick', 'brown', 'fox', 'are', 'jump', 'over', 'the', 'lazi', 'dog']


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Lemmatization

In [None]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

sentence = "The quick brown foxes are jumping over the lazy dogs"
words = word_tokenize(sentence)
lemmatized_words = [lemmatizer.lemmatize(word, pos='v') for word in words]

# Print the original sentence and the lemmatized words
print("Original Sentence:", sentence.split(" "))
print("Lemmatized Words:", lemmatized_words)

Original Sentence: ['The', 'quick', 'brown', 'foxes', 'are', 'jumping', 'over', 'the', 'lazy', 'dogs']
Lemmatized Words: ['The', 'quick', 'brown', 'fox', 'be', 'jump', 'over', 'the', 'lazy', 'dog']


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Normalization

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string

nltk.download('punkt')
nltk.download('stopwords')

# Example sentence
sentence = "The quick brown foxes are jumping over the lazy dogs. Isn't this exciting?"

# Step 1: Lowercasing
normalized_sentence = sentence.lower()

# Step 2: Tokenization
tokens = word_tokenize(normalized_sentence)

# Step 3: Removing stopwords and punctuation
punctuation = set(string.punctuation)
filtered_tokens = [token for token in tokens if token not in punctuation]

# Print the original sentence and the normalized tokens
print("Original Sentence:", sentence.split(" "))
print("Normalized Tokens:", filtered_tokens)

Original Sentence: ['The', 'quick', 'brown', 'foxes', 'are', 'jumping', 'over', 'the', 'lazy', 'dogs.', "Isn't", 'this', 'exciting?']
Normalized Tokens: ['the', 'quick', 'brown', 'foxes', 'are', 'jumping', 'over', 'the', 'lazy', 'dogs', 'is', "n't", 'this', 'exciting']


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

# Noise & Outliers

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter

nltk.download('punkt')
nltk.download('stopwords')

# Example corpus (a small collection of sentences)
corpus = [
    "The quick brown fox jumps over the lazy dog.",
    "The quick brown fox jumps over the lazy dog.",
    "Python is a versatile programming language used in data science.",
    "Python is a versatile programming language used widely in data science.",
    "Today's weather forecast predicts rain in the evening.",
    "Today's weather forecast predicts heavy rain in the evening.",
    "xxyyzzaabb is a rare genetic disorder affecting 1 in 100,000 people.",
    "is a rare disorder affecting 1 in 100,000 people."
]

# Tokenize and normalize the corpus
tokenized_words = []
stop_words = set(stopwords.words('english'))

for sentence in corpus:
    words = word_tokenize(sentence.lower())
    tokenized_words.extend(words)

# Count frequencies of each word
word_freq = Counter(tokenized_words)

# Identify outliers (rare words)
outliers = [word for word, freq in word_freq.items() if freq == 1]

print("Original Corpus:\n", "\n".join(corpus))
print("\nOutliers (Rare Words):", outliers)

Original Corpus:
 The quick brown fox jumps over the lazy dog.
The quick brown fox jumps over the lazy dog.
Python is a versatile programming language used in data science.
Python is a versatile programming language used widely in data science.
Today's weather forecast predicts rain in the evening.
Today's weather forecast predicts heavy rain in the evening.
xxyyzzaabb is a rare genetic disorder affecting 1 in 100,000 people.
is a rare disorder affecting 1 in 100,000 people.

Outliers (Rare Words): ['widely', 'heavy', 'xxyyzzaabb', 'genetic']


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
import string

# Example sentence with noise
noisy_sentence = "The quick brown fox jumps over the lazy dog!!!"

# Remove punctuation
cleaned_sentence = noisy_sentence.translate(str.maketrans('', '', string.punctuation))

print("Noisy Sentence:", noisy_sentence)
print("Cleaned Sentence:", cleaned_sentence)

Noisy Sentence: The quick brown fox jumps over the lazy dog!!!
Cleaned Sentence: The quick brown fox jumps over the lazy dog


# Vectorization

## TF-IDF

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Example corpus
corpus = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

# Initialize TF-IDF Vectorizer
vectorizer = TfidfVectorizer()

# Fit and transform the corpus to TF-IDF vectors
tfidf_matrix = vectorizer.fit_transform(corpus)

# Print the feature names (words) in the TF-IDF matrix
print("Feature names:", vectorizer.get_feature_names_out())

# Print the TF-IDF matrix (sparse format)
print("TF-IDF Matrix:")
print(tfidf_matrix.toarray())

Feature names: ['and' 'document' 'first' 'is' 'one' 'second' 'the' 'third' 'this']
TF-IDF Matrix:
[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]
 [0.         0.6876236  0.         0.28108867 0.         0.53864762
  0.28108867 0.         0.28108867]
 [0.51184851 0.         0.         0.26710379 0.51184851 0.
  0.26710379 0.51184851 0.26710379]
 [0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]


## BoW

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

# Example corpus (a list of documents)
corpus = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

# Initialize CountVectorizer
vectorizer = CountVectorizer()

# Fit and transform the corpus to BoW vectors
bow_matrix = vectorizer.fit_transform(corpus)

# Print feature names (words)
print("Feature names:", vectorizer.get_feature_names_out())

# Print BoW matrix (sparse format)
print("BoW Matrix:")
print(bow_matrix.toarray())

Feature names: ['and' 'document' 'first' 'is' 'one' 'second' 'the' 'third' 'this']
BoW Matrix:
[[0 1 1 1 0 0 1 0 1]
 [0 2 0 1 0 1 1 0 1]
 [1 0 0 1 1 0 1 1 1]
 [0 1 1 1 0 0 1 0 1]]


# Manejar datos no balanceados

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.datasets import make_classification

# Generate a synthetic imbalanced dataset
X, y = make_classification(n_samples=1000, n_features=20, n_classes=2,
                           weights=[0.95, 0.05], random_state=42)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply SMOTE to the training data only
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Initialize and train a classifier (e.g., Random Forest)
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train_resampled, y_train_resampled)

# Predict on the test set
y_pred = clf.predict(X_test)

# Evaluate the model
print("Classification Report:")
print(classification_report(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.98      0.97       189
           1       0.57      0.36      0.44        11

    accuracy                           0.95       200
   macro avg       0.77      0.67      0.71       200
weighted avg       0.94      0.95      0.94       200

