# Web scrapping

In [None]:
# !pip install beautifulsoup4

In [2]:
from bs4 import BeautifulSoup 


html_doc = """<html><head><title>Trip Advisor page</title></head>
<body>
<p> ... </p>
<p class="comment-title"><b>A must-see for a visitor to London</b></p>

<p class="comment-body">Beautiful views of London and information-packed Royal Observatory exhibits. 
During the week it was not crowded. I would highly recommend the Royal Observatory Greenwich.
Find more below:
<a href="http://example.com/London-eyes" class="attraction" id="link1">London eyes</a>,
<a href="http://example.com/sample" class="attraction" id="link2">Royal Greenwich Museum</a> and 
follow me for more info.</p>

<p class="next-comment">...</p>
"""

soupified = BeautifulSoup(html_doc, "html.parser") 

# extract the text of p tag with 'title' class 
title = soupified.find('p', {'class': 'comment-body'}).get_text()
print("title: " , title)

title:  Beautiful views of London and information-packed Royal Observatory exhibits. 
During the week it was not crowded. I would highly recommend the Royal Observatory Greenwich.
Find more below:
London eyes,
Royal Greenwich Museum and 
follow me for more info.


In [3]:
from bs4 import BeautifulSoup 

html_doc = """<article class="container">
    <header class="pageheader">
        <div class="downloadandShare">
            <div class="row">
                <div class="column column-8">
    <nav aria-label="breadcrumb" class="pageheader-breadcrumb text-small clearfix">
        <ol>
                <li>
                        <a href="/for-organisations/">For organisations</a><span>/</span>
                </li>
                <li>
                        <a href="/for-organisations/guide-to-data-protection/">Guide to Data Protection</a><span>/</span>
                </li>
                <li>
                        <span class="current" aria-current="page" aria-label="Current page">
                            Guide to the General Data Protection Regulation (GDPR)
                        </span>
                </li>
        </ol>
    </nav>
                    <h1 id="multipage-heading">Guide to the UK General Data Protection Regulation (UK GDPR)</h1>
                    <div id="multipage-snippet">
                        

                    </div>
                </div>
                <div class="pageheader-download column column-4 column-indent-1">
                    <a href="#" id="toggle-hiddenpanel-headershare"><span class="h4">Share<span class="invisible">(Opens Share panel)</span></span><span class="button-circle"><span class="icon-share"></span></span></a>
                            <a href="#" id="toggle-hiddenpanel-download"><span class="h4">Download options<span class="invisible">(Opens download panel)</span></span><span class="button-circle"><span class="icon-download"></span></span></a>

                </div>
            </div>
"""

soupified = BeautifulSoup(html_doc, "html.parser") 

# extract the text of h1 tag with 'multipage-heading' id 
h1_text = soupified.find('h1', {'id': 'multipage-heading'}).get_text()
print("Title: " , h1_text)


Title:  Guide to the UK General Data Protection Regulation (UK GDPR)


# Extract text from PDF files

In [None]:
# !pip install PyPDF2

In [5]:
import PyPDF2
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

filename = './data/2103.00020.pdf'
open_filename = open(filename, 'rb')

pdf_reader = PyPDF2.PdfReader(open_filename)

#pdf_reader = PyPDF2.PdfFileReader(BytesIO(pdf_file))
page_obj = pdf_reader.pages[0]
page_1 = page_obj.extract_text()
print(page_1)

Learning Transferable Visual Models From Natural Language Supervision
Alec Radford* 1Jong Wook Kim* 1Chris Hallacy1Aditya Ramesh1Gabriel Goh1Sandhini Agarwal1
Girish Sastry1Amanda Askell1Pamela Mishkin1Jack Clark1Gretchen Krueger1Ilya Sutskever1
Abstract
State-of-the-art computer vision systems are
trained to predict a ﬁxed set of predetermined
object categories. This restricted form of super-
vision limits their generality and usability since
additional labeled data is needed to specify any
other visual concept. Learning directly from raw
text about images is a promising alternative which
leverages a much broader source of supervision.
We demonstrate that the simple pre-training task
of predicting which caption goes with which im-
age is an efﬁcient and scalable way to learn SOTA
image representations from scratch on a dataset
of 400 million (image, text) pairs collected from
the internet. After pre-training, natural language
is used to reference learned visual concepts (or
describe n

# Text extractiong from images

In [None]:
# !pip install pytesseract

In [6]:
from PIL import Image
import pytesseract 

pytesseract.pytesseract.tesseract_cmd = r"C:\Users\SES100\AppData\Local\Programs\Tesseract-OCR\tesseract.exe"
filename = "./data/scanned_text.png"
text = pytesseract.image_to_string(Image.open(filename))
print(text)

Another common source of textual data is scanned documents. Text extraction from
scanned documents is typically done through optical character recognition (OCR),
using libraries such as Tesseract [25, 26]. Consider the example image—a snippet
from a 1950 article in a journal [27]—shown in Figure 2-5.



# Unicode Normalization

In [3]:
text = "😎 HUGE LIST OF UNICODE AND EMOJI SYMBOLS TO COPY AND PASTE 😀"
unicode_text = text.encode('utf-8')
print(unicode_text)

b'\xf0\x9f\x98\x8e HUGE LIST OF UNICODE AND EMOJI SYMBOLS TO COPY AND PASTE \xf0\x9f\x98\x80'


# Sentence segmentation

In [36]:
from nltk.tokenize import sent_tokenize, word_tokenize 

mytext = '''I love the NLP module. 
            Dr. Roni and Prof. Luke are working in the AI field.
            We expect to see 3 sentences inside this text.'''

my_sentences = sent_tokenize(mytext)
print("Number of sentences: " , len(my_sentences))
print(my_sentences)

Number of sentences:  3
['I love the NLP module.', 'Dr. Roni and Prof. Luke are working in the AI field.', 'We expect to see 3 sentences inside this text.']


In [37]:
import spacy
nlp = spacy.load('en_core_web_sm')

textObj = nlp(mytext)
for sentence in textObj.sents:
    print(sentence)

I love the NLP module.

            Dr. Roni and Prof. Luke are working in the AI field.

            
We expect to see 3 sentences inside this text.


# Word tokenization

In [38]:
for sentence in my_sentences:
    my_words = word_tokenize(sentence)
    print("Sentence: ", sentence)
    print("Tokens: ", my_words)


Sentence:  I love the NLP module.
Tokens:  ['I', 'love', 'the', 'NLP', 'module', '.']
Sentence:  Dr. Roni and Prof. Luke are working in the AI field.
Tokens:  ['Dr.', 'Roni', 'and', 'Prof.', 'Luke', 'are', 'working', 'in', 'the', 'AI', 'field', '.']
Sentence:  We expect to see 3 sentences inside this text.
Tokens:  ['We', 'expect', 'to', 'see', '3', 'sentences', 'inside', 'this', 'text', '.']


In [39]:
sentence = '''There are $10,000 and €1000 which are there 
        just for testing a tokenizer'''
tokens = word_tokenize(sentence)
print("Tokens: ", tokens)

Tokens:  ['There', 'are', '$', '10,000', 'and', '€1000', 'which', 'are', 'there', 'just', 'for', 'testing', 'a', 'tokenizer']


In [18]:
sentence = '''Let's go to N.Y.!'''
tokens = word_tokenize(sentence)
print("Tokens: ", tokens)

Tokens:  ['Let', "'s", 'go', 'to', 'N.Y.', '!']


# Stop words removal

In [40]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\SES100\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [41]:
from nltk.corpus import stopwords

def remove_stop_words(sentence):
    stop_words = set(stopwords.words('english'))
    tokens = [ token for token in word_tokenize(sentence) 
                if token not in stop_words ]
    print("Tokens: ", tokens)

text = "There are $10,000 and €1000 which are there just for testing a tokenizer"
remove_stop_words(text)

Tokens:  ['There', '$', '10,000', '€1000', 'testing', 'tokenizer']


In [21]:
stop_words = set(stopwords.words('english'))
print(stop_words)

{'through', 'needn', "hadn't", 'him', 'as', 'doesn', 'are', 'further', 'is', 'such', 'weren', 'few', "that'll", 'both', 're', 'and', 'while', 'has', 'its', 'me', 'with', "shouldn't", 'theirs', 'too', 'when', 'shouldn', 'ourselves', 'hadn', 'below', 'shan', 'no', 'a', "shan't", 'same', 'yourselves', 'themselves', 'herself', 'who', 'to', 'ain', 'wasn', "didn't", 'but', 'between', 'that', 'have', 'ma', 'don', 'about', 'll', 'against', 't', 'after', 'off', 'you', 'ours', 'under', "she's", 'whom', 'during', 'what', 'he', 'before', 'am', "mightn't", 'how', "couldn't", 'of', 'itself', 'here', 'if', 'this', "you'd", 'from', 'wouldn', 'or', 'be', "you're", 'all', 'these', 'been', 'nor', 'above', 'very', 's', 'himself', 'why', 'should', 'only', 'mightn', 'out', 'i', 'then', "you've", "haven't", 'at', 'down', 'isn', 'will', 'o', "hasn't", 'each', "wasn't", 'until', 'won', 'up', 'having', "weren't", 'myself', 'hers', 'did', 'can', "won't", 'an', 'haven', 'being', 'which', "mustn't", 'my', 'now', '

# Stemming

In [22]:
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()
text = "This code shows you how stemming works."
words = [ stemmer.stem(t) for t in word_tokenize(text) ]
print(words)


['thi', 'code', 'show', 'you', 'how', 'stem', 'work', '.']


In [23]:
stemmer = PorterStemmer()
text1 = "It's good to learn NLP."
words = [ stemmer.stem(t) for t in word_tokenize(text1) ]
print(words)

text2 = "It's better to learn NLP."
words = [ stemmer.stem(t) for t in word_tokenize(text2) ]
print(words)


['it', "'s", 'good', 'to', 'learn', 'nlp', '.']
['it', "'s", 'better', 'to', 'learn', 'nlp', '.']


# Lemmatization

In [24]:
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\SES100\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\SES100\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [25]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

text1 = "Please turn right at the next light."
words = [ lemmatizer.lemmatize(t, pos='a') for t in word_tokenize(text1) ]
print(words)

text2 = "She is always right."
words = [ lemmatizer.lemmatize(t, pos='a') for t in word_tokenize(text2) ]
print(words)

['Please', 'turn', 'right', 'at', 'the', 'next', 'light', '.']
['She', 'is', 'always', 'right', '.']


In [42]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

text1 = "It's good to learn NLP."
words = [ lemmatizer.lemmatize(t, pos='a') 
            for t in word_tokenize(text1) ]
print(words)

text2 = "It's better to learn NLP."
words = [ lemmatizer.lemmatize(t, pos='a') 
            for t in word_tokenize(text2) ]
print(words)

['It', "'s", 'good', 'to', 'learn', 'NLP', '.']
['It', "'s", 'good', 'to', 'learn', 'NLP', '.']


In [27]:
import spacy
sp = spacy.load('en_core_web_sm')

text1 = "It's better to learn NLP."
words = [ sp(t) for t in word_tokenize(text1) ]
lemmatized_dic = {}
for word in words:
    lemmatized_dic[word[0].text] = word[0].lemma_

print(lemmatized_dic)

{'It': 'it', "'s": 'be', 'better': 'well', 'to': 'to', 'learn': 'learn', 'NLP': 'NLP', '.': '.'}


# Digit/punctuation removal & lowercasing

In [28]:
from nltk.tokenize import word_tokenize 
from string import punctuation

text1 = "A test! to $100 @ remove, Digits & Punctuation.! marks?"

def remove_digits_punctuation_lower_case(text):
    return [token.lower() for token in word_tokenize(text) 
        if not token.isnumeric() and token not in punctuation]

print(remove_digits_punctuation_lower_case(text1))

['a', 'test', 'to', 'remove', 'digits', 'punctuation.', 'marks']


# Part Of Speech tagging

In [29]:
import spacy

nlp = spacy.load('en_core_web_sm')

doc = nlp('It\'s better to learn NLP.')
for token in doc:
    print(token.text, token.lemma_, token.pos_, 
    token.shape_, token.is_alpha, token.is_stop)

It it PRON Xx True True
's be AUX 'x False True
better well ADJ xxxx True False
to to PART xx True True
learn learn VERB xxxx True False
NLP NLP PROPN XXX True False
. . PUNCT . False False
