# Cleaning Text

In [1]:
# Sample strings carrying stray leading/trailing whitespace
text_data = [
    '  Interstellar. By Christopher Nolan     ',
    "Tommy Hilfiger is Expensive.",
    "Slash. The mighty player. ",
]

# Trim whitespace from both ends of every string
strip_whitespaces = list(map(str.strip, text_data))
strip_whitespaces

['Interstellar. By Christopher Nolan',
 'Tommy Hilfiger is Expensive.',
 'Slash. The mighty player.']

In [2]:
# Delete every period from the whitespace-stripped strings
remove_periods = [
    stripped.replace(".", "")
    for stripped in strip_whitespaces
]
remove_periods

['Interstellar By Christopher Nolan',
 'Tommy Hilfiger is Expensive',
 'Slash The mighty player']

In [3]:
# Upper-case each period-free string
uppercase_letter = [cleaned.upper() for cleaned in remove_periods]
uppercase_letter

['INTERSTELLAR BY CHRISTOPHER NOLAN',
 'TOMMY HILFIGER IS EXPENSIVE',
 'SLASH THE MIGHTY PLAYER']

# Parsing and Cleaning HTML

In [4]:
from bs4 import BeautifulSoup

# Sample markup: a div containing a bold <span> followed by bare text
html = """
        <div class='full_name'><span style='font-weight:bold'>Alexios</span>Sparta</div>
        
        """
# Parse the markup with the lxml parser
soup = BeautifulSoup(html, "lxml")
# Find the div by its class attribute and show its text content without tags
soup.find("div", attrs={"class": "full_name"}).get_text()


'AlexiosSparta'

# Removing Punctuation from Text

In [5]:
import unicodedata
import sys

text_data = ['Hi!!!!!!! I. Love. This. Song.....',
            '10000000000% Agree!!! #LoveIt',
            'Right?']

# Translation table that maps every Unicode punctuation code point to None;
# str.translate() deletes characters whose table entry is None.
# Unicode general categories starting with 'P' are the punctuation classes.
# sys.maxunicode is itself the largest valid code point, so the range has to
# include it (hence the + 1).
punctuation = dict.fromkeys(
    code_point
    for code_point in range(sys.maxunicode + 1)
    if unicodedata.category(chr(code_point)).startswith('P')
)
[data.translate(punctuation) for data in text_data]

['Hi I Love This Song', '10000000000 Agree LoveIt', 'Right']

# Tokenizing Text

In [8]:
# Word tokenization: split a string into its individual word tokens
from nltk.tokenize import word_tokenize

string = "The science of today is the technology of tomorrow"

# Displays the tokens as a list of strings, in their original order
word_tokenize(string)

['The', 'science', 'of', 'today', 'is', 'the', 'technology', 'of', 'tomorrow']

In [9]:
# Sentence tokenization: split a text into its individual sentences
from nltk.tokenize import sent_tokenize

string = "The science of today is the technology of tomorrow. Tomorrow is today."

# Displays one string per sentence
sent_tokenize(string)

['The science of today is the technology of tomorrow.', 'Tomorrow is today.']

# Removing Stop Words 

In [10]:
from nltk.corpus import stopwords

# Already-tokenized, lower-case words to filter
tokenized_words = ['i', 'am', 'going', 'to', 'go', 'the', 'shop', 'and', 'park']

# NLTK's English stop-word list (kept as a list so it can still be sliced/inspected)
stop_words = stopwords.words('english')

# Membership tests against a set are O(1); testing against the list would
# rescan the whole stop-word list for every token.
stop_word_lookup = set(stop_words)

# Keep only the tokens that are not stop words
[word for word in tokenized_words if word not in stop_word_lookup]

['going', 'go', 'shop', 'park']

# Stemming Words

In [11]:
from nltk.stem.porter import PorterStemmer

# Tokens to reduce to their stems
tokenized_words = ['i', 'am', 'going', 'to', 'go', 'the', 'shop', 'and', 'park']

# Porter stemmer instance, applied to each token in turn
porter = PorterStemmer()
list(map(porter.stem, tokenized_words))

['i', 'am', 'go', 'to', 'go', 'the', 'shop', 'and', 'park']

# Tagging Parts of Speech

In [12]:
from nltk import pos_tag
from nltk import word_tokenize

# Sentence to annotate
text_data = 'Chris loved outdoor running'

# Tokenize first, then tag: the result is a list of (token, tag) pairs
text_tagged = pos_tag(
    tokens=word_tokenize(text=text_data),
)
text_tagged

[('Chris', 'NNP'), ('loved', 'VBD'), ('outdoor', 'RP'), ('running', 'VBG')]

# Encoding Text as a Bag of Words

In [15]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# Three short documents to encode
text_data = np.array(['I love Brazil. Brazil!', 'Sweden is best', 'Germany beats both'])

# Learn the vocabulary and count each term's occurrences per document
count = CountVectorizer()
bag_of_words = count.fit_transform(text_data)

# One row per document, one column per vocabulary term.
# The frame is built in a single constructor call, so the count matrix is
# densified only once instead of once per column.
# get_feature_names_out() replaces get_feature_names(), which was deprecated
# in scikit-learn 1.0 and removed in 1.2.
df = pd.DataFrame(bag_of_words.toarray(), columns=count.get_feature_names_out())

df

Unnamed: 0,beats,best,both,brazil,germany,is,love,sweden
0,0,0,0,2,0,0,1,0
1,0,1,0,0,0,1,0,1
2,1,0,1,0,1,0,0,0


# Weighting Word Importance

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Same three documents as in the bag-of-words example
text_data = np.array(['I love Brazil. Brazil!', 'Sweden is best', 'Germany beats both'])

# Fit the tf-idf vectorizer on the documents and transform them in one step
tfidf = TfidfVectorizer()
feature_matrix = tfidf.fit_transform(text_data)

# Show the weights as a dense array: one row per document, one column per term
feature_matrix.toarray()

array([[0.        , 0.        , 0.        , 0.89442719, 0.        ,
        0.        , 0.4472136 , 0.        ],
       [0.        , 0.57735027, 0.        , 0.        , 0.        ,
        0.57735027, 0.        , 0.57735027],
       [0.57735027, 0.        , 0.57735027, 0.        , 0.57735027,
        0.        , 0.        , 0.        ]])

In [18]:
# Mapping from each learned term to its column index in feature_matrix
tfidf.vocabulary_

{'love': 6,
 'brazil': 3,
 'sweden': 7,
 'is': 5,
 'best': 1,
 'germany': 4,
 'beats': 0,
 'both': 2}

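The formula for idf (with the smoothing that `TfidfVectorizer` applies by default) is

$\text{idf}(t) = \log{\frac{1 + n_d}{1 + \text{df}(d,t)}} + 1$

where $n_d$ is the total number of documents and $\text{df}(d,t)$ is the number of documents that contain the term $t$,

and the formula for tf-idf is $\text{tf-idf}(t,d) = \text{tf}(t,d) \times \text{idf}(t)$

By default `TfidfVectorizer` then scales each document's row to unit Euclidean (L2) length, which is why the first row above is $(2, 1)/\sqrt{5} \approx (0.894, 0.447)$ for "brazil" and "love".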