In [1]:
# Capturing Text Data
# Plain Text

import os

# Read a plain text file into memory.
# The encoding is passed explicitly so the result does not depend on the
# platform's default locale encoding.
# NOTE(review): assumes the file is UTF-8 (the sample shown is plain ASCII,
# which is a subset of UTF-8) — confirm.
hieroglyph_path = os.path.join("data", "hieroglyph.txt")
with open(hieroglyph_path, 'r', encoding='utf-8') as f:
    text = f.read()

# The file handle is no longer needed once the contents are held in `text`.
print(text)

Hieroglyphic writing dates from c. 3000 BC, and is composed of hundreds of symbols. A hieroglyph can represent a word, a sound, or a silent determinative; and the same symbol can serve different purposes in different contexts. Hieroglyphs were a formal script, used on stone monuments and in tombs, that could be as detailed as individual works of art.



In [2]:
# Tabular Data
# Extract text column from a dataframe

import pandas as pd

# Load the news CSV into a DataFrame and preview two of its columns.
# NOTE(review): this path uses the directory 'Data' while the plain-text cell
# uses 'data'; on a case-sensitive filesystem only one spelling can match —
# confirm which one exists.
news_csv = os.path.join('Data', 'news.csv')
df = pd.read_csv(news_csv)
df[['publisher', 'title']].head()


Unnamed: 0,publisher,title
0,Livemint,Fed's Charles Plosser sees high bar for change...
1,IFA Magazine,US open: Stocks fall after Fed official hints ...
2,IFA Magazine,"Fed risks falling 'behind the curve', Charles ..."
3,Moneynews,Fed's Plosser: Nasty Weather Has Curbed Job Gr...
4,NASDAQ,Plosser: Fed May Have to Accelerate Tapering Pace


In [3]:
# Convert text column to lowercase

# The .str accessor applies lower() to each value of the column; the result
# replaces the original 'title' column in df.
lowercased_titles = df['title'].str.lower()
df['title'] = lowercased_titles
df[['title', 'publisher']].head()

Unnamed: 0,title,publisher
0,fed's charles plosser sees high bar for change...,Livemint
1,us open: stocks fall after fed official hints ...,IFA Magazine
2,"fed risks falling 'behind the curve', charles ...",IFA Magazine
3,fed's plosser: nasty weather has curbed job gr...,Moneynews
4,plosser: fed may have to accelerate tapering pace,NASDAQ


In [4]:
# Online Resource
# Fetch data from a REST API
import requests
import json

# Request the quote-of-the-day JSON document.
# The timeout stops the cell from blocking indefinitely on an unresponsive
# server, and raise_for_status() turns an HTTP error response (4xx/5xx) into
# an exception here instead of a confusing JSON/KeyError failure later on.
r = requests.get("https://quotes.rest/qod.json", timeout=10)
r.raise_for_status()

# Decode the JSON body and pretty-print it for inspection.
res = r.json()
print(json.dumps(res, indent=4))

{
    "success": {
        "total": 1
    },
    "contents": {
        "quotes": [
            {
                "quote": "Think ahead. Don't let day-to-day operations drive out planning.",
                "length": "64",
                "author": "Donald Rumsfeld",
                "tags": [
                    "inspire",
                    "planning",
                    "time-management"
                ],
                "category": "inspire",
                "language": "en",
                "date": "2021-01-22",
                "permalink": "https://theysaidso.com/quote/donald-rumsfeld-think-ahead-dont-let-day-to-day-operations-drive-out-planning",
                "id": "KBvgCo8_w5IEf7fBSFVFjgeF",
                "background": "https://theysaidso.com/img/qod/qod-inspire.jpg",
                "title": "Inspiring Quote of the day"
            }
        ]
    },
    "baseurl": "https://theysaidso.com",
    "copyright": {
        "year": 2023,
        "url": "https://theysaidso.com"


In [5]:
# Drill into the response: take the first entry of contents -> quotes,
# then print its quote text followed by the author on the next line.
quotes = res['contents']['quotes']
q = quotes[0]
print(q['quote'], '\n--', q['author'])

Think ahead. Don't let day-to-day operations drive out planning. 
-- Donald Rumsfeld


In [6]:
# Cleaning
# Fetch a web page

# Download the page as raw HTML.
# The timeout bounds how long the cell can block, and raise_for_status()
# stops on an HTTP error so later cells never try to parse an error page.
r = requests.get("https://news.ycombinator.com", timeout=10)
r.raise_for_status()
print(r.text)

<html lang="en" op="news"><head><meta name="referrer" content="origin"><meta name="viewport" content="width=device-width, initial-scale=1.0"><link rel="stylesheet" type="text/css" href="news.css?U9IPZKYheFex0oNnJZBm">
        <link rel="shortcut icon" href="favicon.ico">
          <link rel="alternate" type="application/rss+xml" title="RSS" href="rss">
        <title>Hacker News</title></head><body><center><table id="hnmain" border="0" cellpadding="0" cellspacing="0" width="85%" bgcolor="#f6f6ef">
        <tr><td bgcolor="#ff6600"><table border="0" cellpadding="0" cellspacing="0" width="100%" style="padding:2px"><tr><td style="width:18px;padding-right:4px"><a href="https://news.ycombinator.com"><img src="y18.gif" width="18" height="18" style="border:1px white solid;"></a></td>
                  <td style="line-height:12pt; height:10px;"><span class="pagetop"><b class="hnname"><a href="news">Hacker News</a></b>
              <a href="newest">new</a> | <a href="front">past</a> | <a href=

In [7]:
# Remove HTML tags using RegEx
import re

# Non-greedy match of everything from '<' up to the next '>'.
# Note: '.' does not match a newline here, so a tag that is split across
# several lines is left in place.
pattern = re.compile(r'<.*?>')
stripped_html = pattern.sub('', r.text)
print(stripped_html)


        
          
        Hacker News
        
                  Hacker News
              new | past | comments | ask | show | jobs | submit            
                              login
                          
              

              
      1.      AWS announces forks of Elasticsearch and Kibana (amazon.com)
        1041 points by ke4qqq 13 hours ago  | hide | 752&nbsp;comments              
      
                
      2.      Complete rewrite of ownCloud to move away from LAMP (owncloud.com)
        79 points by veddox 2 hours ago  | hide | 59&nbsp;comments              
      
                
      3.      Show HN: 128-bit, roughly-ordered, URL-safe UUIDs (github.com/anthonynsimon)
        10 points by amzans 43 minutes ago  | hide | 1&nbsp;comment              
      
                
      4.      The spat between Google and Australia, as reported on HN (algolia.com)
        9 points by ColinWright 28 minutes ago  | hide | 4&nbsp;comments              
      
   

In [8]:
from bs4 import BeautifulSoup

# Parse the downloaded HTML with the html5lib parser, then print only the
# text content of the resulting tree (markup removed).
page_html = r.text
soup = BeautifulSoup(page_html, "html5lib")
page_text = soup.get_text()
print(page_text)


        
          
        Hacker News
        
                  Hacker News
              new | past | comments | ask | show | jobs | submit            
                              login
                          
              

              
      1.      AWS announces forks of Elasticsearch and Kibana (amazon.com)
        1041 points by ke4qqq 13 hours ago  | hide | 752 comments              
      
                
      2.      Complete rewrite of ownCloud to move away from LAMP (owncloud.com)
        79 points by veddox 2 hours ago  | hide | 59 comments              
      
                
      3.      Show HN: 128-bit, roughly-ordered, URL-safe UUIDs (github.com/anthonynsimon)
        10 points by amzans 43 minutes ago  | hide | 1 comment              
      
                
      4.      The spat between Google and Australia, as reported on HN (algolia.com)
        9 points by ColinWright 28 minutes ago  | hide | 4 comments              
      
                
      

In [14]:
# Find all articles

# Select every <tr> element carrying the "athing" class and display the first.
summaries = soup.find_all(name="tr", class_="athing")
first_summary = summaries[0]
first_summary

<tr class="athing" id="25865094">
      <td align="right" class="title" valign="top"><span class="rank">1.</span></td>      <td class="votelinks" valign="top"><center><a href="vote?id=25865094&amp;how=up&amp;goto=news" id="up_25865094"><div class="votearrow" title="upvote"></div></a></center></td><td class="title"><a class="storylink" href="https://aws.amazon.com/blogs/opensource/stepping-up-for-a-truly-open-source-elasticsearch">AWS announces forks of Elasticsearch and Kibana</a><span class="sitebit comhead"> (<a href="from?site=amazon.com"><span class="sitestr">amazon.com</span></a>)</span></td></tr>

In [15]:
# Extract title

# The title is the text of the row's "storylink" anchor, with surrounding
# whitespace trimmed.
first_link = summaries[0].find("a", class_='storylink')
first_link.get_text().strip()

'AWS announces forks of Elasticsearch and Kibana'

In [20]:
# Find all articles, extract titles

# Gather the title text of every article row on the page.
articles = []
summaries = soup.find_all("tr",class_='athing')
for summary in summaries:
    link = summary.find("a",class_ = 'storylink')
    # find() returns None when the row has no matching anchor; skip such rows
    # instead of failing with AttributeError on None.get_text().
    if link is None:
        continue
    title = link.get_text().strip()
    articles.append(title)

# The count makes it obvious if rows were skipped or nothing matched.
print(len(articles),"Article summaries found. Sample:")
articles[0:5]

30 Article summaries found. Sample:


['AWS announces forks of Elasticsearch and Kibana',
 'Complete rewrite of ownCloud to move away from LAMP',
 'Show HN: 128-bit, roughly-ordered, URL-safe UUIDs',
 'The spat between Google and Australia, as reported on HN',
 'Show HN: Git News – Social News website for developers and GitHub fans']

In [21]:
# Normalization
# Case Normalization

# Sample text used by the normalization cells; written as adjacent string
# literals (one per sentence) that Python joins into a single string.
text = (
    "The first time you see The Second Renaissance it may look boring. "
    "Look at it at least twice and definitely watch part 2. "
    "It will change your view of the matrix. "
    "Are the human people the ones who started the war ? "
    "Is AI a bad thing ?"
)
print(text)

The first time you see The Second Renaissance it may look boring. Look at it at least twice and definitely watch part 2. It will change your view of the matrix. Are the human people the ones who started the war ? Is AI a bad thing ?


In [24]:
# Case normalization: convert the whole string to lowercase and keep the
# result under the same name for the following cells.
lowercased = text.lower()
text = lowercased
print(text)

the first time you see the second renaissance it may look boring. look at it at least twice and definitely watch part 2. it will change your view of the matrix. are the human people the ones who started the war ? is ai a bad thing ?


In [25]:
# Remove punctuation characters

# Replace every character that is not an ASCII letter or digit with a space.
# This blanks out punctuation, and any other non-alphanumeric character too.
non_alphanumeric = re.compile(r'[^a-z0-9A-Z]')
text = non_alphanumeric.sub(' ', text)
print(text)

the first time you see the second renaissance it may look boring  look at it at least twice and definitely watch part 2  it will change your view of the matrix  are the human people the ones who started the war   is ai a bad thing  


In [26]:
# Tokenization
# Split text into tokens (words)

# With no separator given, split() breaks on runs of whitespace and never
# yields empty strings, so repeated spaces do not produce empty tokens.
words = text.split(sep=None)
print(words)


['the', 'first', 'time', 'you', 'see', 'the', 'second', 'renaissance', 'it', 'may', 'look', 'boring', 'look', 'at', 'it', 'at', 'least', 'twice', 'and', 'definitely', 'watch', 'part', '2', 'it', 'will', 'change', 'your', 'view', 'of', 'the', 'matrix', 'are', 'the', 'human', 'people', 'the', 'ones', 'who', 'started', 'the', 'war', 'is', 'ai', 'a', 'bad', 'thing']


In [27]:
# NLTK: Natural Language ToolKit

import os
import nltk

# Add ./nltk_data (relative to the current working directory) to the list of
# locations NLTK searches for its data files.
local_nltk_data = os.path.join(os.getcwd(), "nltk_data")
nltk.data.path.append(local_nltk_data)

In [28]:
# Second sample text, used by the NLTK tokenization cells; the adjacent
# string literals are joined by Python into one string.
text = (
    "Dr. Smith graduated from the University of Washington. "
    "He later started an analytics firm called Lux, "
    "which catered to enterprise customers."
)
print(text)

Dr. Smith graduated from the University of Washington. He later started an analytics firm called Lux, which catered to enterprise customers.


In [29]:
# Split text into words using NLTK

from nltk.tokenize import word_tokenize

# Tokenize the sample text with NLTK and keep the token list in `words`.
tokens = word_tokenize(text)
words = tokens
print(words)

['Dr.', 'Smith', 'graduated', 'from', 'the', 'University', 'of', 'Washington', '.', 'He', 'later', 'started', 'an', 'analytics', 'firm', 'called', 'Lux', ',', 'which', 'catered', 'to', 'enterprise', 'customers', '.']


In [30]:
from nltk.tokenize import sent_tokenize

# Segment the sample text into a list of sentence strings.
sentence_list = sent_tokenize(text)
sentences = sentence_list
print(sentences)

['Dr. Smith graduated from the University of Washington.', 'He later started an analytics firm called Lux, which catered to enterprise customers.']


In [31]:
# List stop words
from nltk.corpus import stopwords
# The stopwords corpus names its word lists in lowercase ('english'); the
# capitalised spelling only resolves on case-insensitive filesystems.
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [32]:
# Start again from the original sample sentence (adjacent literals are joined
# by Python into a single string).
text = (
    "The first time you see The Second Renaissance it may look boring. "
    "Look at it at least twice and definitely watch part 2. "
    "It will change your view of the matrix. "
    "Are the human people the ones who started the war ? "
    "Is AI a bad thing ?"
)

# Normalize: lowercase first, then turn every non-alphanumeric character
# into a space.
lowercased = text.lower()
text = re.sub(r"[^a-zA-Z0-9]", " ", lowercased)

# Tokenize on whitespace.
words = text.split()
print(words)

['the', 'first', 'time', 'you', 'see', 'the', 'second', 'renaissance', 'it', 'may', 'look', 'boring', 'look', 'at', 'it', 'at', 'least', 'twice', 'and', 'definitely', 'watch', 'part', '2', 'it', 'will', 'change', 'your', 'view', 'of', 'the', 'matrix', 'are', 'the', 'human', 'people', 'the', 'ones', 'who', 'started', 'the', 'war', 'is', 'ai', 'a', 'bad', 'thing']


In [33]:
# Remove stop words

# Build the stop-word set once instead of re-reading the word list for every
# token; set membership is also a constant-time lookup.
# The corpus file id is lowercase 'english' — the capitalised spelling only
# resolves on case-insensitive filesystems.
stop_words = set(stopwords.words('english'))
words = [w for w in words if w not in stop_words]
print(words)

['first', 'time', 'see', 'second', 'renaissance', 'may', 'look', 'boring', 'look', 'least', 'twice', 'definitely', 'watch', 'part', '2', 'change', 'view', 'matrix', 'human', 'people', 'ones', 'started', 'war', 'ai', 'bad', 'thing']


In [34]:
# Sentence Parsing

# Toy context-free grammar. A PP may attach either to a VP (VP -> VP PP) or
# to an NP (NP -> Det N PP), so one sentence can yield more than one tree.
grammar_rules = """
S -> NP VP
PP -> P NP
NP -> Det N | Det N PP | 'I'
VP -> V NP | VP PP
Det -> 'an' | 'my'
N -> 'elephant' | 'pajamas'
V -> 'shot'
P -> 'in'
"""
my_grammar = nltk.CFG.fromstring(grammar_rules)
parser = nltk.ChartParser(my_grammar)

# Tokenize the sentence, then print every parse tree the grammar admits.
sentence = word_tokenize("I shot an elephant in my pajamas")
parse_trees = parser.parse(sentence)
for tree in parse_trees:
    print(tree)

(S
  (NP I)
  (VP
    (VP (V shot) (NP (Det an) (N elephant)))
    (PP (P in) (NP (Det my) (N pajamas)))))
(S
  (NP I)
  (VP
    (V shot)
    (NP (Det an) (N elephant) (PP (P in) (NP (Det my) (N pajamas))))))


In [38]:
# Stemming & Lemmatization
# Stemming

from nltk.stem.porter import PorterStemmer

# Reduce each word to its stem, using one stemmer instance for the whole list.
stemmer = PorterStemmer()
stemmed = [stemmer.stem(w) for w in words]
print(stemmed)

['first', 'time', 'see', 'second', 'renaiss', 'may', 'look', 'bore', 'look', 'least', 'twice', 'definit', 'watch', 'part', '2', 'chang', 'view', 'matrix', 'human', 'peopl', 'one', 'start', 'war', 'ai', 'bad', 'thing']


In [46]:
# Lemmatization

from nltk.stem.wordnet import WordNetLemmatizer
# One lemmatizer instance is enough for every call in this cell.
lemmatizer = WordNetLemmatizer()

# Reduce words to their root form.
# NOTE(review): this pass already specifies pos='v', so the verb pass below
# repeats the same operation; if a default (no pos) pass was intended here,
# drop the pos argument — confirm.
lemmed = [lemmatizer.lemmatize(w, pos='v') for w in words]
# Print the list that was just built (the name was previously misspelled as
# `lemmad`, which raised NameError).
print(lemmed)

# Lemmatize verbs by specifying pos
lemmed = [lemmatizer.lemmatize(w, pos='v') for w in lemmed]
print(lemmed)

['first', 'time', 'see', 'second', 'renaissance', 'may', 'look', 'bore', 'look', 'least', 'twice', 'definitely', 'watch', 'part', '2', 'change', 'view', 'matrix', 'human', 'people', 'ones', 'start', 'war', 'ai', 'bad', 'thing']
['first', 'time', 'see', 'second', 'renaissance', 'may', 'look', 'bore', 'look', 'least', 'twice', 'definitely', 'watch', 'part', '2', 'change', 'view', 'matrix', 'human', 'people', 'ones', 'start', 'war', 'ai', 'bad', 'thing']
