<a href="https://colab.research.google.com/github/lmcanavals/ml/blob/main/representaciones_de_texto.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# TF-IDF

## Corpus

In [1]:
%%file corpus.txt
saludo al sol con una sonrisa
cuando sonries se te ven los dientes
como los dientes de un sol de caricatura

Writing corpus.txt


In [12]:
import numpy as np
import pandas as pd

from nltk.tokenize import WhitespaceTokenizer
from nltk.tokenize import RegexpTokenizer
import re

# Reoresentación binaria

In [11]:
with open("corpus.txt") as f:
    corpus = f.readlines()
temp = []
for line in corpus:
    temp.append(line.split())
corpus = temp

print(corpus)

[['saludo', 'al', 'sol', 'con', 'una', 'sonrisa'], ['cuando', 'sonries', 'se', 'te', 'ven', 'los', 'dientes'], ['como', 'los', 'dientes', 'de', 'un', 'sol', 'de', 'caricatura']]


In [12]:
wordsidx = []
for line in corpus:
    wordsidx.extend(line)

wordsidx = list(set(wordsidx))
print(wordsidx)
print(len(wordsidx))
words = {word: idx for idx, word in enumerate(wordsidx)}
print(words)

['se', 'saludo', 'cuando', 'una', 'te', 'ven', 'dientes', 'de', 'los', 'caricatura', 'con', 'al', 'como', 'sol', 'sonrisa', 'un', 'sonries']
17
{'se': 0, 'saludo': 1, 'cuando': 2, 'una': 3, 'te': 4, 'ven': 5, 'dientes': 6, 'de': 7, 'los': 8, 'caricatura': 9, 'con': 10, 'al': 11, 'como': 12, 'sol': 13, 'sonrisa': 14, 'un': 15, 'sonries': 16}


In [13]:
cols, rows = len(wordsidx), len(corpus)
dataset = np.zeros((rows, cols), dtype=int)
for i, line in enumerate(corpus):
    for word in line:
        dataset[i, words[word]] = 1

print(dataset)

[[0 1 0 1 0 0 0 0 0 0 1 1 0 1 1 0 0]
 [1 0 1 0 1 1 1 0 1 0 0 0 0 0 0 0 1]
 [0 0 0 0 0 0 1 1 1 1 0 0 1 1 0 1 0]]


### Some real data

In [4]:
url="https://raw.githubusercontent.com/lmcanavals/ml/main/data/spam.csv"
spamdf = pd.read_csv(url,encoding ='latin1')
spamdf.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [6]:
sentence = spamdf["v2"][0]
print(sentence)

Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...


In [8]:
token = WhitespaceTokenizer().tokenize(sentence)
print(token)
print(sentence.split())

['Go', 'until', 'jurong', 'point,', 'crazy..', 'Available', 'only', 'in', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet...', 'Cine', 'there', 'got', 'amore', 'wat...']
['Go', 'until', 'jurong', 'point,', 'crazy..', 'Available', 'only', 'in', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet...', 'Cine', 'there', 'got', 'amore', 'wat...']


In [9]:
%timeit WhitespaceTokenizer().tokenize(sentence)

7.29 µs ± 728 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [10]:
%timeit sentence.split()

526 ns ± 11.1 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


In [13]:
wordsRE = '[a-zA-Z]+'
tokenizer = RegexpTokenizer(wordsRE)
tokens = tokenizer.tokenize(sentence)
print(tokens)

['Go', 'until', 'jurong', 'point', 'crazy', 'Available', 'only', 'in', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet', 'Cine', 'there', 'got', 'amore', 'wat']


Econtremos todos los valores monetarios

In [31]:
wordsRE = '\$\d+\.?\d+?'
tokenizer = RegexpTokenizer(wordsRE)
tokens = [ token for e in spamdf["v2"] for token in tokenizer.tokenize(e) ]
print(tokens)

['$350', '$95', '$50', '$50', '$700', '$900', '$700', '$900', '$5.0', '$5.0', '$140', '$180', '$900']


In [37]:
wordsRE = 'S[a-zA-Z]*'
tokenizer = RegexpTokenizer(wordsRE)
tokens = [ token for e in spamdf["v2"] for token in tokenizer.tokenize(e) ]
print(tokens)

['SIX', 'SH', 'SH', 'SUNDAY', 'S', 'SCOTLAND', 'So', 'S', 'SG', 'SEEING', 'SMS', 'Sptv', 'SPTV', 'Sherawat', 'Suprman', 'StarWars', 'Sorry', 'Sorry', 'Send', 'S', 'ShrAcomOrSglSuplt', 'S', 'SOMETHING', 'Still', 'Sorry', 'Sorry', 'Sorry', 'Smile', 'Smile', 'Smile', 'Smile', 'Smile', 'SOMEONE', 'Smiling', 'Simply', 'So', 'She', 'So', 'Statement', 'So', 'SUM', 'Sunshine', 'Sony', 'SP', 'Sir', 'Stop', 'Sindu', 'S', 'STOP', 'S', 'See', 'SMS', 'STOP', 'Send', 'STOP', 'SHOULD', 'Service', 'S', 'See', 'Sir', 'S', 'SHA', 'She', 'Sounds', 'Sorry', 'ST', 'S', 'Sorry', 'Sam', 'So', 'SITUATION', 'SON', 'So', 'SSSSSSSEEEEEE', 'SPORTSx', 'SIC', 'Same', 'Stop', 'STOP', 'S', 'SPECIAL', 'Send', 'So', 'STUPID', 'SS', 'SLVYL', 'Shit', 'SMS', 'Sister', 'Swtheart', 'So', 'Special', 'Send', 'So', 'Sorry', 'SUZY', 'Sorry', 'ST', 'SLAP', 'SELF', 'SH', 'S', 'S', 'S', 'Sirji', 'SSCO', 'SIC', 'SH', 'Suite', 'Since', 'S', 'So', 'SonyEricsson', 'Services', 'So', 'Some', 'So', 'Sony', 'So', 'Sir', 'Salam', 'SSCO', '

## Document labeling

## Stemming and Lemmatisation

In [None]:
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')

In [45]:
from nltk.stem import PorterStemmer, WordNetLemmatizer

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
print(stemmer.stem("studying"))
print(lemmatizer.lemmatize("feet"))
print(lemmatizer.lemmatize("mice"))

studi
foot
mouse


In [46]:
print(stemmer.stem("car's"))
print(lemmatizer.lemmatize("car's"))

car'
car's


In [50]:
from nltk.stem.lancaster import LancasterStemmer
import nltk.stem as st

In [49]:
stemmer1 = LancasterStemmer()
print(stemmer1.stem("car's"))

car's


In [52]:
print(stemmer1.stem("studying"))
print(stemmer1.stem("studied"))

study
study


In [58]:
print(lemmatizer.lemmatize("are", pos='v'))

be


## Part-of-speech PoS

In [59]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [60]:
from nltk.tokenize import sent_tokenize

In [64]:
sentences = sent_tokenize(" ".join(spamdf['v2'].array))
print(len(sentences))

7757


In [65]:
sentences[:5]

['Go until jurong point, crazy..',
 'Available only in bugis n great world la e buffet... Cine there got amore wat... Ok lar...',
 'Joking wif u oni... Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005.',
 "Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's U dun say so early hor... U c already then say... Nah I don't think he goes to usf, he lives around here though FreeMsg Hey there darling it's been 3 week's now and no word back!",
 "I'd like some fun you up for it still?"]

In [66]:
from nltk.tokenize import word_tokenize

In [67]:
words = word_tokenize(" ".join(spamdf['v2'].array))
print(len(words))

104144
