# Text Processing

In [1]:
# Download the small Spanish spaCy model "es_core_news_sm"; a later cell loads it with spacy.load.
# NOTE(review): this runs whichever `python3` is first on PATH — confirm it is the same
# environment as the notebook kernel, otherwise spacy.load may not find the model.
!python3 -m spacy download es_core_news_sm

Collecting es-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-3.8.0/es_core_news_sm-3.8.0-py3-none-any.whl (12.9 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.9/12.9 MB 110.4 MB/s eta 0:00:00
Installing collected packages: es-core-news-sm
Successfully installed es-core-news-sm-3.8.0
✔ Download and installation successful
You can now load the package via spacy.load('es_core_news_sm')
⚠ Restart to reload dependencies
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [2]:
# Standard library
import re

# Third-party (all imports grouped here, ahead of any side-effecting call)
import nltk
import spacy
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer

# Fetch the NLTK data packages; each call logs its own progress lines.
for resource in ('wordnet', 'omw-1.4', 'stopwords'):
    nltk.download(resource)

# Load the Spanish spaCy pipeline fetched by the `spacy download` command in the first cell.
nlp = spacy.load("es_core_news_sm")

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## Regular Expressions

In [3]:
# Sample text mixing prose, two email addresses, a ten-digit phone number and a junk token.
text = "It is only with the heart that one can see rightly; what is essential is invisible to the eye. And my email is djikstra08@gmail.com; my cellphone number is 5534823204. 126gywbq. iorveth@yahoo.com"

# Raw string (r'...') so the backslashes reach the regex engine untouched.
# The domain must be dot-separated labels (at least one dot), which keeps
# punctuation that follows an address -- such as a sentence-ending period --
# out of the match.
email_pattern = re.compile(r'[\w.-]+@[\w-]+(?:\.[\w-]+)+')
emails = email_pattern.findall(text)
print(f"Emails: {emails}")

Emails: ['djikstra08@gmail.com', 'iorveth@yahoo.com']


In [4]:
# Pull out every run of exactly ten digits delimited by word boundaries
# (the cellphone number in the sample text).
ten_digit_run = re.compile(r'\b\d{10}\b')
numbers = ten_digit_run.findall(text)
print(f"Numbers: {numbers}")

Numbers: ['5534823204']


In [5]:
# Lemmatization: reduce a word to its base (dictionary) form.
# Stemming: reduce a word to its root.

# English lemmatization, treating every word as a verb (pos="v").
lemmatizer = WordNetLemmatizer()
words = ["running", "ran", "run", "studies", "holding", "loved"]
lemmas = list(map(lambda w: lemmatizer.lemmatize(w, pos="v"), words))
print(lemmas)

['run', 'run', 'run', 'study', 'hold', 'love']


In [6]:
# Spanish lemmatization: the words are joined into one space-separated string,
# run through the spaCy pipeline, and each resulting token's lemma is collected.
palabras = ["corriendo", "correr", "corre", "corrió", "corremos", "corren", "corredor", "corredizo"]
frase = " ".join(palabras)
doc = nlp(frase)
lemmas = [token.lemma_ for token in doc]
print(lemmas)

['correr', 'correr', 'correr', 'correr', 'corremo', 'correr', 'corredor', 'corredizo']


In [7]:
# Spanish stemming with the Snowball stemmer, applied to the same word list as above.
stemmer = SnowballStemmer("spanish")
stems = list(map(stemmer.stem, palabras))
print(stems)

['corr', 'corr', 'corr', 'corr', 'corr', 'corr', 'corredor', 'corrediz']


In [8]:
# Bag-of-words: CountVectorizer splits each sentence into terms and counts them.
corpus = [
    "Live life like its the last breath you take for that breath is the whole essence of living, the little things in life are what connects us to all the big things we live for",
    "It is only with the heart that one can see rightly",
    "There is some good in this world, and it’s worth fighting for.",
    "In three words I can sum up everything I've learned about life: it goes on.",
]
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

# Dense count matrix: one row per sentence, one column per vocabulary term.
term_counts = X.toarray()
print(term_counts)

# Vocabulary terms labelling the matrix columns.
vocabulary = vectorizer.get_feature_names_out()
print(vocabulary)

[[0 1 0 1 1 2 0 1 1 0 0 2 0 0 0 1 1 0 1 1 0 2 1 1 2 1 1 0 0 0 0 0 0 0 1 1
  4 0 2 0 0 1 0 1 0 1 1 1 0 0 0 0 1]
 [0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 1 1 0 0 0 0 0 0 0 0 0 0 1 1 1 1 0 0 0 1
  1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0]
 [0 0 1 0 0 0 0 0 0 0 1 1 0 1 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
  0 1 0 1 0 0 0 0 0 0 0 0 0 0 1 1 0]
 [1 0 0 0 0 0 1 0 0 1 0 0 1 0 0 1 0 1 0 0 1 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0
  0 0 0 0 1 0 1 0 1 0 0 0 0 1 0 0 0]]
['about' 'all' 'and' 'are' 'big' 'breath' 'can' 'connects' 'essence'
 'everything' 'fighting' 'for' 'goes' 'good' 'heart' 'in' 'is' 'it' 'its'
 'last' 'learned' 'life' 'like' 'little' 'live' 'living' 'of' 'on' 'one'
 'only' 'rightly' 'see' 'some' 'sum' 'take' 'that' 'the' 'there' 'things'
 'this' 'three' 'to' 'up' 'us' 've' 'we' 'what' 'whole' 'with' 'words'
 'world' 'worth' 'you']


In [9]:
# Lemmatize the English vocabulary learned by the CountVectorizer.
# No `pos` argument is passed here; in the recorded output verb forms such as
# 'connects' and 'learned' come back unchanged and 'us' becomes 'u'.
lemmatizer = WordNetLemmatizer()
feature_names = vectorizer.get_feature_names_out()
lemmas = [lemmatizer.lemmatize(term) for term in feature_names]
print(lemmas)

['about', 'all', 'and', 'are', 'big', 'breath', 'can', 'connects', 'essence', 'everything', 'fighting', 'for', 'go', 'good', 'heart', 'in', 'is', 'it', 'it', 'last', 'learned', 'life', 'like', 'little', 'live', 'living', 'of', 'on', 'one', 'only', 'rightly', 'see', 'some', 'sum', 'take', 'that', 'the', 'there', 'thing', 'this', 'three', 'to', 'up', 'u', 've', 'we', 'what', 'whole', 'with', 'word', 'world', 'worth', 'you']
