<h1>Chap06 - Cleaning Text</h1>

Imports

In [97]:
import re
import lxml
import sys
import unicodedata

import numpy as np
import pandas as pd

from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk import pos_tag

from sklearn.preprocessing import LabelBinarizer, MultiLabelBinarizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from nltk.corpus import brown
from nltk.tag import UnigramTagger, BigramTagger, TrigramTagger



## 6.1 Cleaning Text 

In [2]:
text_data = ["   Interrobang. By Aishwarya Henriette    ",
             "Parking And Going. By Karl Gautier",
             "   Today Is The night. By Jarek Prakash   "]

In [3]:
strip_whitespace = [string.strip() for string in text_data]

In [4]:
strip_whitespace

['Interrobang. By Aishwarya Henriette',
 'Parking And Going. By Karl Gautier',
 'Today Is The night. By Jarek Prakash']

In [5]:
remove_periods = [string.replace(".", "") for string in strip_whitespace]

In [6]:
remove_periods

['Interrobang By Aishwarya Henriette',
 'Parking And Going By Karl Gautier',
 'Today Is The night By Jarek Prakash']

In [7]:
def capitalizer(string: str) -> str:
    """Return a copy of ``string`` with every cased character upper-cased."""
    uppercased = string.upper()
    return uppercased

[capitalizer(string) for string in remove_periods]

['INTERROBANG BY AISHWARYA HENRIETTE',
 'PARKING AND GOING BY KARL GAUTIER',
 'TODAY IS THE NIGHT BY JAREK PRAKASH']

In [8]:
def replace_letters_with_X(string: str) -> str:
    """Mask every ASCII letter (a-z, A-Z) in ``string`` with a capital ``X``.

    Characters outside that range (digits, whitespace, punctuation,
    non-ASCII letters) are returned unchanged.
    """
    ascii_letter = re.compile(r"[a-zA-Z]")
    return ascii_letter.sub("X", string)

[replace_letters_with_X(string) for string in remove_periods]

['XXXXXXXXXXX XX XXXXXXXXX XXXXXXXXX',
 'XXXXXXX XXX XXXXX XX XXXX XXXXXXX',
 'XXXXX XX XXX XXXXX XX XXXXX XXXXXXX']

## 6.2 Parsing and Cleaning HTML 

<div class='full_name'><span style='font-weight:bold'>Masego</span> Azra</div>

In [9]:
html = "<div class='full_name'><span style='font-weight:bold'>Masego</span> Azra</div>"

In [10]:
soup = BeautifulSoup(html, features='lxml')

In [11]:
soup.find('div', {'class': 'full_name'}).text

'Masego Azra'

## 6.3 Removing Punctuation 

In [13]:
text_data = ['Hi!!!! I. Love. This. Song....',
             '10000% Agree!!!! #LoveIT',
             'Right!?!?']

# Translation table mapping every Unicode punctuation code point to None, so
# that str.translate() deletes those characters. General categories whose
# name starts with 'P' are the punctuation classes.
# range() excludes its stop value, so the bound is sys.maxunicode + 1 to make
# the scan cover the final code point as well.
punctuation = dict.fromkeys(
    code_point
    for code_point in range(sys.maxunicode + 1)
    if unicodedata.category(chr(code_point)).startswith('P')
)

[string.translate(punctuation) for string in text_data]

['Hi I Love This Song', '10000 Agree LoveIT', 'Right']

## 6.4 Tokenizing Text 

In [37]:
#Tokenize words
string = 'The science of today is the technology of tomorrow'
word_tokenize(string)

['The', 'science', 'of', 'today', 'is', 'the', 'technology', 'of', 'tomorrow']

In [38]:
#Tokenize sentences
string = 'The science of today is the technology of tomorrow. Tomorrow is today'
sent_tokenize(string)

['The science of today is the technology of tomorrow.', 'Tomorrow is today']

## 6.5 Removing Stop Words

In [41]:
tokenized_words = ['i',
                  'am',
                  'going',
                  'to',
                  'go',
                  'to',
                  'the',
                  'store',
                  'and',
                  'park']

In [42]:
# English stop words as returned by NLTK; kept as a list because a later
# cell slices it (stop_words[:5]).
stop_words = stopwords.words('english')
# Membership tests against a list scan it linearly for every token; a set
# gives constant-time lookups and produces the same filtered result.
stop_word_set = set(stop_words)
[word for word in tokenized_words if word not in stop_word_set]

['going', 'go', 'store', 'park']

In [46]:
stop_words[:5]

['i', 'me', 'my', 'myself', 'we']

## 6.6  Stemming Words 

In [47]:
tokenized_words = ['i', 'am', 'humbled', 'by', 'this', 'traditional', 'meeting']

In [48]:
porter = PorterStemmer()

In [49]:
[porter.stem(word) for word in tokenized_words]

['i', 'am', 'humbl', 'by', 'thi', 'tradit', 'meet']

## 6.7 Tagging Parts of Speech 

In [52]:
text_data = 'Chris loved outdoor running'
text_tagged = pos_tag(word_tokenize(text_data))
text_tagged

[('Chris', 'NNP'), ('loved', 'VBD'), ('outdoor', 'RP'), ('running', 'VBG')]

In [53]:
[word for word, tag in text_tagged if tag in ['NN', 'NNS', 'NNP', 'NNPS']]

['Chris']

In [54]:
tweets = ['I am eating a burrito for breakfast', 
          'Political science is an amazing field',
          'San Francisco is an awesome city']

In [58]:
# Collect, for every tweet, the list of part-of-speech tags of its tokens.
tagged_tweets = []
for tweet in tweets:
    # Call `pos_tag` directly: the notebook imports it with
    # `from nltk import pos_tag` and never binds the name `nltk`, so
    # `nltk.pos_tag(...)` raises NameError on a freshly started kernel.
    tweet_tag = pos_tag(word_tokenize(tweet))
    tagged_tweets.append([tag for word, tag in tweet_tag])

# Encode each tweet as a binary indicator row over the distinct tags seen.
one_hot_multi = MultiLabelBinarizer()
one_hot_multi.fit_transform(tagged_tweets)

array([[1, 1, 0, 1, 0, 1, 1, 1, 0],
       [1, 0, 1, 1, 0, 0, 0, 0, 1],
       [1, 0, 1, 1, 1, 0, 0, 0, 1]])

In [64]:
# Tagged sentences from the 'news' category of the Brown corpus.
sentences = brown.tagged_sents(categories = 'news')

# First 4000 sentences train the taggers; the remainder is held out for scoring.
train = sentences[:4000]
test = sentences[4000:]

# Chain the taggers: each higher-order tagger is given the lower-order one
# as its `backoff`.
unigram = UnigramTagger(train)
bigram = BigramTagger(train, backoff=unigram)
trigram = TrigramTagger(train, backoff=bigram)

# Score the trigram tagger (with its backoff chain) on the held-out sentences.
# NOTE(review): recent NLTK releases appear to deprecate `evaluate` in favour
# of `accuracy` — confirm against the installed NLTK version.
trigram.evaluate(test)

0.8174734002697437

## 6.8 Encoding Text as a Bag of Words

In [82]:
# Three short documents to vectorize.
text_data = np.array(['I Love Brazil. Brazil!',
                      'Sweden is best',
                      'Germany beats both'])

# Fit a CountVectorizer on the documents and transform them in one step.
count = CountVectorizer()
bag_of_words = count.fit_transform(text_data)
# Convert the result to an array so the individual counts are displayed.
bag_of_words.toarray()

array([[0, 0, 0, 2, 0, 0, 1, 0],
       [0, 1, 0, 0, 0, 1, 0, 1],
       [1, 0, 1, 0, 1, 0, 0, 0]], dtype=int64)

In [83]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement (requires scikit-learn >= 1.0).
# It returns a NumPy array, so wrap it in list() to display a plain list.
list(count.get_feature_names_out())

['beats', 'best', 'both', 'brazil', 'germany', 'is', 'love', 'sweden']

In [86]:
# Label each count column with its vocabulary term. get_feature_names_out()
# replaces get_feature_names(), which was removed in scikit-learn 1.2
# (requires scikit-learn >= 1.0).
pd.DataFrame(bag_of_words.toarray(), columns=count.get_feature_names_out())

Unnamed: 0,beats,best,both,brazil,germany,is,love,sweden
0,0,0,0,2,0,0,1,0
1,0,1,0,0,0,1,0,1
2,1,0,1,0,1,0,0,0


In [94]:
# Vectorizer configured with a 1- to 2-gram range, the 'english' stop-word
# setting, and a fixed one-term vocabulary ('brazil').
count_2gram = CountVectorizer(ngram_range=(1,2), stop_words='english', vocabulary=['brazil'])
# Fit and transform text_data with that configuration.
bag = count_2gram.fit_transform(text_data)

In [95]:
bag.toarray()

array([[2],
       [0],
       [0]], dtype=int64)

In [96]:
count_2gram.vocabulary_

{'brazil': 0}

## 6.9 Weighting Word Importance 

In [98]:
# Fit a TfidfVectorizer on text_data and transform it in one step.
tfidf = TfidfVectorizer()
feature_matrix = tfidf.fit_transform(text_data)
# Display the resulting matrix object (call .toarray() to see the values).
feature_matrix

<3x8 sparse matrix of type '<class 'numpy.float64'>'
	with 8 stored elements in Compressed Sparse Row format>

In [99]:
feature_matrix.toarray()

array([[0.        , 0.        , 0.        , 0.89442719, 0.        ,
        0.        , 0.4472136 , 0.        ],
       [0.        , 0.57735027, 0.        , 0.        , 0.        ,
        0.57735027, 0.        , 0.57735027],
       [0.57735027, 0.        , 0.57735027, 0.        , 0.57735027,
        0.        , 0.        , 0.        ]])

In [100]:
tfidf.vocabulary_

{'love': 6,
 'brazil': 3,
 'sweden': 7,
 'is': 5,
 'best': 1,
 'germany': 4,
 'beats': 0,
 'both': 2}