In [1]:
#
import pandas as pd
import numpy as np

<h4 class="text-center"> Unstructure Test </h4>

In [2]:
text_one = [' Inside. by xyz_author   ',
           'Limbo. by some_author',
           'Owlboy. by some_dev']

In [3]:
text_one

[' Inside. by xyz_author   ', 'Limbo. by some_author', 'Owlboy. by some_dev']

<span class="badge"> Strip WhiteSpace </span>

In [4]:
strip_whitespace = [ string.strip() for string in text_one]

In [5]:
strip_whitespace

['Inside. by xyz_author', 'Limbo. by some_author', 'Owlboy. by some_dev']

<span class="badge"> Remove Periods </span>

In [6]:
remove_periods = [ string.replace('.','') for string in strip_whitespace]

In [7]:
remove_periods

['Inside by xyz_author', 'Limbo by some_author', 'Owlboy by some_dev']

<span class="badge"> Capitalize </span>

In [8]:
capitalize_text = [ string.upper() for string in remove_periods]

In [9]:
capitalize_text

['INSIDE BY XYZ_AUTHOR', 'LIMBO BY SOME_AUTHOR', 'OWLBOY BY SOME_DEV']

<span class="badge"> Regular Expression </span>

In [10]:
import re

In [11]:
def replace_letter(string:str) -> str:
    return re.sub(r'[a-zA-Z]', 'X', string)

In [12]:
[ replace_letter(string) for string in capitalize_text]

['XXXXXX XX XXX_XXXXXX', 'XXXXX XX XXXX_XXXXXX', 'XXXXXX XX XXXX_XXX']

<h4 class="text-center"> Web Scraping </h4>

In [13]:
from bs4 import BeautifulSoup
import requests

In [14]:
url = requests.get('https://en.wikipedia.org/wiki/Cicada_3301')

In [15]:
url.text

'<!DOCTYPE html>\n<html class="client-nojs" lang="en" dir="ltr">\n<head>\n<meta charset="UTF-8"/>\n<title>Cicada 3301 - Wikipedia</title>\n<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"1fe35d60-208b-477d-9464-7a7361445497","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"Cicada_3301","wgTitle":"Cicada 3301","wgCurRevisionId":984371936,"wgRevisionId":984371936,"wgArticleId":41190833,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["CS1 maint: numeric names: authors list","Articles with short description","Short description is different from Wikidata","Secret societies","Works of unknown authorship

In [16]:
soup = BeautifulSoup(url.text,'lxml')

In [17]:
soup

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<title>Cicada 3301 - Wikipedia</title>
<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"1fe35d60-208b-477d-9464-7a7361445497","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"Cicada_3301","wgTitle":"Cicada 3301","wgCurRevisionId":984371936,"wgRevisionId":984371936,"wgArticleId":41190833,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["CS1 maint: numeric names: authors list","Articles with short description","Short description is different from Wikidata","Secret societies","Works of unknown authorship","Mul

In [18]:
soup.select('.mw-headline')

[<span class="mw-headline" id="Purpose">Purpose</span>,
 <span class="mw-headline" id="Resolution">Resolution</span>,
 <span class="mw-headline" id="Types_of_clues">Types of clues</span>,
 <span class="mw-headline" id="Allegations_against_the_group">Allegations against the group</span>,
 <span class="mw-headline" id="Allegations_of_illegal_activity">Allegations of illegal activity</span>,
 <span class="mw-headline" id="Claims_of_being_a_cult">Claims of being a cult</span>,
 <span class="mw-headline" id="In_popular_culture">In popular culture</span>,
 <span class="mw-headline" id="Music">Music</span>,
 <span class="mw-headline" id="See_also">See also</span>,
 <span class="mw-headline" id="References">References</span>,
 <span class="mw-headline" id="External_links">External links</span>]

In [19]:
soup.select('.mw-headline')[0].getText()

'Purpose'

In [20]:
len(soup.select('.mw-headline'))

11

In [21]:
for i in soup.select('.mw-headline'):
    print(i.text)

Purpose
Resolution
Types of clues
Allegations against the group
Allegations of illegal activity
Claims of being a cult
In popular culture
Music
See also
References
External links


<h4 class="text-center"> Punctuation </h4>

In [22]:
import sys
import unicodedata

In [23]:
text_two =  [
    'Purpose!!....',
    '100% Resolution',
    'Types of #clues?!'
]

In [24]:
text_two

['Purpose!!....', '100% Resolution', 'Types of #clues?!']

In [25]:
#a dictionary of punctuation character
punctuation = dict.fromkeys(i for i in range(sys.maxunicode) if unicodedata.category(chr(i)).startswith('P'))

In [26]:
[string.translate(punctuation) for string in text_two]

['Purpose', '100 Resolution', 'Types of clues']

<h4 class="text-center"> Tokenizing </h4>

In [27]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Nill\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

<span class="badge"> Word </span>

In [28]:
from nltk.tokenize import word_tokenize

In [29]:
text_three = 'Something is wrong i can feel it'
text_three
word_tokenize(text_three)

['Something', 'is', 'wrong', 'i', 'can', 'feel', 'it']

<span class="badge"> Sentence </span>

In [30]:
from nltk.tokenize import sent_tokenize

In [31]:
text_four = 'Something is wrong i can feel it. I can be wrong too.'
sent_tokenize(text_four)

['Something is wrong i can feel it.', 'I can be wrong too.']

<h4 class="text-center"> Removing Stop Words (on,in,of..) </h4>

In [32]:
from nltk.corpus import stopwords

In [33]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Nill\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [34]:
text_five= 'Is this the end of the world or something terrible going to happen'

In [35]:
tokenize_words = word_tokenize(text_five)

In [36]:
tokenize_words

['Is',
 'this',
 'the',
 'end',
 'of',
 'the',
 'world',
 'or',
 'something',
 'terrible',
 'going',
 'to',
 'happen']

In [37]:
stop_word = stopwords.words('english')

In [38]:
stop_word[:5]

['i', 'me', 'my', 'myself', 'we']

In [39]:
[word for word in tokenize_words if word not in stop_word]

['Is', 'end', 'world', 'something', 'terrible', 'going', 'happen']

<h4 class="text-center"> Stemming Words </h4>

In [40]:
tokenize_words

['Is',
 'this',
 'the',
 'end',
 'of',
 'the',
 'world',
 'or',
 'something',
 'terrible',
 'going',
 'to',
 'happen']

In [41]:
#convert word to their original form
from nltk.stem.porter import PorterStemmer

In [42]:
porter = PorterStemmer()

In [43]:
[porter.stem(word) for word in tokenize_words]

['Is',
 'thi',
 'the',
 'end',
 'of',
 'the',
 'world',
 'or',
 'someth',
 'terribl',
 'go',
 'to',
 'happen']

<h4 class="text-center"> Parts of Speech </h4>

In [48]:
from nltk import pos_tag
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Nill\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [49]:
text_data = 'He is supposed to be very good person'

In [50]:
tag_speech = pos_tag(word_tokenize(text_data))

In [51]:
tag_speech

[('He', 'PRP'),
 ('is', 'VBZ'),
 ('supposed', 'VBN'),
 ('to', 'TO'),
 ('be', 'VB'),
 ('very', 'RB'),
 ('good', 'JJ'),
 ('person', 'NN')]

In [52]:
"""
NNP Proper noun, singular
NN Noun, singular or mass
RB Adverb
VBD Verb, past tense
VBG Verb, gerund or present participle
JJ Adjective
PRP Personal pronoun
"""

'\nNNP Proper noun, singular\nNN Noun, singular or mass\nRB Adverb\nVBD Verb, past tense\nVBG Verb, gerund or present participle\nJJ Adjective\nPRP Personal pronoun\n'

In [53]:
#### Real life example of pos_tag

In [54]:
posts = ['He is the best MMA Fighter',
        'This is the right time for vote',
        'Can i go i have work to do']

In [55]:
tag_post = []

In [60]:
for post in posts:
    tag_ = pos_tag(word_tokenize(post))
    tag_post.append([tag for word,tag in tag_])

In [61]:
tag_post

[['PRP', 'VBZ', 'DT', 'JJS', 'NN', 'NN'],
 ['DT', 'VBZ', 'DT', 'JJ', 'NN', 'IN', 'NN'],
 ['MD', 'VB', 'VBP', 'RB', 'VBP', 'NN', 'TO', 'VB']]

In [62]:
from sklearn.preprocessing import MultiLabelBinarizer

In [63]:
one_hot = MultiLabelBinarizer()

In [64]:
one_hot.fit_transform(tag_post)

array([[1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1],
       [1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0]])

In [65]:
one_hot.classes_

array(['DT', 'IN', 'JJ', 'JJS', 'MD', 'NN', 'PRP', 'RB', 'TO', 'VB',
       'VBP', 'VBZ'], dtype=object)

In [66]:
pd.DataFrame(one_hot.fit_transform(tag_post) , columns = one_hot.classes_)

Unnamed: 0,DT,IN,JJ,JJS,MD,NN,PRP,RB,TO,VB,VBP,VBZ
0,1,0,0,1,0,1,1,0,0,0,0,1
1,1,1,1,0,0,1,0,0,0,0,0,1
2,0,0,0,0,1,1,0,1,1,1,1,0


<h4 class="text-center"> Bag of Words </h4>

In [67]:
from sklearn.feature_extraction.text import CountVectorizer

In [68]:
text = np.array(posts)

In [69]:
text

array(['He is the best MMA Fighter', 'This is the right time for vote',
       'Can i go i have work to do'], dtype='<U31')

In [70]:
count = CountVectorizer()

In [71]:
count.fit_transform(text)

<3x17 sparse matrix of type '<class 'numpy.int64'>'
	with 19 stored elements in Compressed Sparse Row format>

In [72]:
count.fit_transform(text).toarray()

array([[1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0],
       [0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1]], dtype=int64)

In [73]:
count.get_feature_names()

['best',
 'can',
 'do',
 'fighter',
 'for',
 'go',
 'have',
 'he',
 'is',
 'mma',
 'right',
 'the',
 'this',
 'time',
 'to',
 'vote',
 'work']

In [77]:
pd.DataFrame(count.fit_transform(text).toarray(), columns= count.get_feature_names())

Unnamed: 0,best,can,do,fighter,for,go,have,he,is,mma,right,the,this,time,to,vote,work
0,1,0,0,1,0,0,0,1,1,1,0,1,0,0,0,0,0
1,0,0,0,0,1,0,0,0,1,0,1,1,1,1,0,1,0
2,0,1,1,0,0,1,1,0,0,0,0,0,0,0,1,0,1


<h4 class="text-center"> Word Importance </h4>

In [78]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [79]:
tfid = TfidfVectorizer()

In [80]:
tfid.fit_transform(text)

<3x17 sparse matrix of type '<class 'numpy.float64'>'
	with 19 stored elements in Compressed Sparse Row format>

In [81]:
tfid.fit_transform(text).toarray()

array([[0.44036207, 0.        , 0.        , 0.44036207, 0.        ,
        0.        , 0.        , 0.44036207, 0.3349067 , 0.44036207,
        0.        , 0.3349067 , 0.        , 0.        , 0.        ,
        0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.40301621,
        0.        , 0.        , 0.        , 0.30650422, 0.        ,
        0.40301621, 0.30650422, 0.40301621, 0.40301621, 0.        ,
        0.40301621, 0.        ],
       [0.        , 0.40824829, 0.40824829, 0.        , 0.        ,
        0.40824829, 0.40824829, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.40824829,
        0.        , 0.40824829]])

In [83]:
tfid.vocabulary_ #frequency of the words

{'he': 7,
 'is': 8,
 'the': 11,
 'best': 0,
 'mma': 9,
 'fighter': 3,
 'this': 12,
 'right': 10,
 'time': 13,
 'for': 4,
 'vote': 15,
 'can': 1,
 'go': 5,
 'have': 6,
 'work': 16,
 'to': 14,
 'do': 2}