In [1]:
import re
import unicodedata
import sys
import spacy
import nltk

import numpy as np

from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk import pos_tag
from nltk import word_tokenize
from transformers import pipeline
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')  # สำหรับ NLTK 3.9 ขึ้นไป

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

6.1 : Cleaning Text

In [3]:
text_data = ["  Interrobang. By Aishwarya Henriette  ",
             "Parking And Going. BY Karl Gautier",
             "  Today Is The night. By Jarek Prakash  ",]

strip_whitespace = [string.strip() for string in text_data]

strip_whitespace

['Interrobang. By Aishwarya Henriette',
 'Parking And Going. BY Karl Gautier',
 'Today Is The night. By Jarek Prakash']

In [4]:
remove_periods  = [string.replace(".","") for string in strip_whitespace]

remove_periods

['Interrobang By Aishwarya Henriette',
 'Parking And Going BY Karl Gautier',
 'Today Is The night By Jarek Prakash']

In [5]:
def capitalizer(string: str) -> str:
    return string.upper()

[capitalizer(string) for string in remove_periods]

['INTERROBANG BY AISHWARYA HENRIETTE',
 'PARKING AND GOING BY KARL GAUTIER',
 'TODAY IS THE NIGHT BY JAREK PRAKASH']

In [6]:
def replace_letters_with_X(string: str) -> str:
    return re.sub(r'[a-zA-Z]', 'X', string)

[replace_letters_with_X(string) for string in remove_periods]

['XXXXXXXXXXX XX XXXXXXXXX XXXXXXXXX',
 'XXXXXXX XXX XXXXX XX XXXX XXXXXXX',
 'XXXXX XX XXX XXXXX XX XXXXX XXXXXXX']

Discussion

In [7]:
s = "machine learning in python cookbook"

find_n = s.find("n")

start_with_m = s.startswith("m")

end_with_python = s.endswith("python")

is_alnum = s.isalnum()
is_alpha = s.isalpha()

encode_as_utf8 = s.encode("utf-8")

decode = encode_as_utf8.decode("utf-8")

print(find_n, start_with_m, end_with_python, is_alnum, is_alpha, encode_as_utf8, decode,sep = "|")

5|True|False|False|False|b'machine learning in python cookbook'|machine learning in python cookbook


6.2 : Parsing and Cleaning HTML

In [8]:
html = "<div class='full_name'>"\
        "<span style='font-weight:bold'>Masego"\
        "</span> Azra</div>"

soup = BeautifulSoup(html, 'lxml')
soup.find('div', {"class" : 'full_name'}).text

'Masego Azra'

6.3 : Removing Punctuation

In [9]:
text_data = ["Hi!!!! I. Love. This. Song...." , 
             '10000% Agree!!!! #LoveIT' ,
             'Right?!?!']

punctuatuation = dict.fromkeys(
    (i for i in range(sys.maxunicode)
     if unicodedata.category(chr(i)).startswith('P')
     ),
    None
    )

[String.translate(punctuatuation) for String in text_data]

['Hi I Love This Song', '10000 Agree LoveIT', 'Right']

6.4 : Tokenizing Text

In [10]:
string = "The science of today is the technology of tomorrow."
word_tokenize(string)


['The',
 'science',
 'of',
 'today',
 'is',
 'the',
 'technology',
 'of',
 'tomorrow',
 '.']

In [11]:
string = "The science of today is the technology of tomorrow. Tomorrow is today."
sent_tokenize(string)

['The science of today is the technology of tomorrow.', 'Tomorrow is today.']

6.5 : Removing Stop Words

In [12]:
tokenized_words = ['i',
                   'am',
                   'going',
                   'to',
                   'go',
                   'to',
                   'the',
                   'store',
                   'and',
                   'park',
                  ]
stop_words = stopwords.words('english')
[word for word in tokenized_words if word not in stop_words]

['going', 'go', 'store', 'park']

Discussion

In [13]:
stop_words[:5]

['a', 'about', 'above', 'after', 'again']

6.6 : Stemming Words

In [14]:
tokenized_words = ['i','am','humbled','by','this','traditional','meeting']

porter = PorterStemmer()

[porter.stem(word) for word in tokenized_words]

['i', 'am', 'humbl', 'by', 'thi', 'tradit', 'meet']

6.7 : Tagging Parts of Speech

In [15]:
text_data = "Chris loved outdoor running"
text_tagged = pos_tag(word_tokenize(text_data))
text_tagged

[('Chris', 'NNP'), ('loved', 'VBD'), ('outdoor', 'RP'), ('running', 'VBG')]

In [16]:
[word for word,tag in text_tagged if tag in ['NN' , 'NNS', 'NNP' , 'NNPS']]

['Chris']

In [17]:
tweets = ["I am eating a burrito for breakfast",
 "Political science is an amazing field",
 "San Francisco is an awesome city"]

tagged_tweets = []

for tweet in tweets:
    tweet_tag = nltk.pos_tag(word_tokenize(tweet))
    tagged_tweets.append([tag for word,tag in tweet_tag])

one_hot_multi = MultiLabelBinarizer()
one_hot_multi.fit_transform(tagged_tweets)

array([[1, 1, 0, 1, 0, 1, 1, 1, 0],
       [1, 0, 1, 1, 0, 0, 0, 0, 1],
       [1, 0, 1, 1, 1, 0, 0, 0, 1]])

In [18]:
one_hot_multi.classes_

array(['DT', 'IN', 'JJ', 'NN', 'NNP', 'PRP', 'VBG', 'VBP', 'VBZ'],
      dtype=object)

6.8 : Performing Named-Entity Recognition

In [19]:
nlp = spacy.load("en_core_web_sm")
doc = nlp("Elon Musk offered to buy Twitter using $21B of his own money.")
print(doc.ents)

(Elon Musk, Twitter, 21B)


In [20]:
for entity in doc.ents:
    print(entity.text , entity.label_,sep=",")

Elon Musk,PERSON
Twitter,PERSON
21B,MONEY


6.9 : Encoding Text as a Bag of Words

In [21]:
test_data = np.array(['I love Brazil. Brazil!',
                      'Sweden is best',
                      'Germany beats both'])

count = CountVectorizer()
bag_of_words = count.fit_transform(test_data)
bag_of_words

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 8 stored elements and shape (3, 8)>

In [22]:
bag_of_words.toarray()

array([[0, 0, 0, 2, 0, 0, 1, 0],
       [0, 1, 0, 0, 0, 1, 0, 1],
       [1, 0, 1, 0, 1, 0, 0, 0]])

In [23]:
count.get_feature_names_out()

array(['beats', 'best', 'both', 'brazil', 'germany', 'is', 'love',
       'sweden'], dtype=object)

Discussion

In [24]:
count_2gram = CountVectorizer(ngram_range=(1,2),
                              stop_words="english",
                              vocabulary=['brazil'])
bag  = count_2gram.fit_transform(test_data)
bag.toarray()

array([[2],
       [0],
       [0]])

In [25]:
count_2gram.vocabulary_

{'brazil': 0}

6.10 : Weighting Word Importance

In [26]:
text_data = np.array(['I love Brazil. Brazil!',
                      'Sweden is best',
                      'Germany beats both'])

tfidf = TfidfVectorizer()
feature_matrix = tfidf.fit_transform(text_data)
feature_matrix

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 8 stored elements and shape (3, 8)>

In [27]:
feature_matrix.toarray()

array([[0.        , 0.        , 0.        , 0.89442719, 0.        ,
        0.        , 0.4472136 , 0.        ],
       [0.        , 0.57735027, 0.        , 0.        , 0.        ,
        0.57735027, 0.        , 0.57735027],
       [0.57735027, 0.        , 0.57735027, 0.        , 0.57735027,
        0.        , 0.        , 0.        ]])

In [28]:
tfidf.vocabulary_

{'love': 6,
 'brazil': 3,
 'sweden': 7,
 'is': 5,
 'best': 1,
 'germany': 4,
 'beats': 0,
 'both': 2}

6.11 : Using Text Vectors to Calculate Text Similarity in a Search Query

In [29]:
text_data = np.array(['I love Brazil. Brazil!',
                      'Sweden is best',
                      'Germany beats both'])

tfidf = TfidfVectorizer()
feature_matrix = tfidf.fit_transform(text_data)

text = "Brazil is the best"
vector = tfidf.transform([text])
vector

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 3 stored elements and shape (1, 8)>

In [30]:
cosine_similarities = linear_kernel(vector,feature_matrix).flatten()
related_doc_indicies = cosine_similarities.argsort()[:-10:-1]
print([(text_data[i],cosine_similarities[i]) for i in related_doc_indicies])

[(np.str_('Sweden is best'), np.float64(0.6666666666666666)), (np.str_('I love Brazil. Brazil!'), np.float64(0.5163977794943222)), (np.str_('Germany beats both'), np.float64(0.0))]


6.12 : Using a Sentiment Analysis Classifier

In [32]:
classifier = pipeline("sentiment-analysis")

sentiment_1 = classifier("I hate machine learning! It's the absolute worst.")
sentiment_2 = classifier( "Machine learning is the absolute"  "bees knees I love it so much!")

print(sentiment_1,sentiment_2)

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.





Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Device set to use cpu


[{'label': 'NEGATIVE', 'score': 0.9998020529747009}] [{'label': 'POSITIVE', 'score': 0.9995730519294739}]
