# Web Scraping with RegEX + Pandas

### Scraping most repetitive words from Dr. Martin Luther King Jr. 'I have a Dream' speech

In [2]:
from bs4 import BeautifulSoup
import requests

In [3]:
url = 'http://www.analytictech.com/mb021/mlk.htm'

page = requests.get(url)

soup = BeautifulSoup(page.text,'html')

print(soup)


<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML//EN">
<html>
<head>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<meta content="Microsoft FrontPage 4.0" name="GENERATOR"/>
<title>Martin Luther King Jr.'s 1962 Speech</title>
</head>
<body alink="#FF0000" bgcolor="#FFFFFF" link="#0000FF" text="#000000" vlink="#551A8B">
<h1><font size="5">Transcript of speech by </font><br/>
Dr. Martin Luther King Jr. <br/>
August 28, 1963. Lincoln Memorial in Washington D.C. </h1>
<hr color="#008080" noshade="" size="5"/>
<p>I am happy to join with you today in what will go down in
history as the greatest demonstration for freedom in the history
of our nation. </p>
<p>Five score years ago a great American in whose symbolic shadow
we stand today signed the Emancipation Proclamation. This
momentous decree came as a great beckoning light of hope to
millions of Negro slaves who had been seared in the flames of
withering injustice. It came as a joyous daybreak to end the long
night of their c

In [13]:
mlkj_speech = soup.find_all('p')

In [14]:
type(mlkj_speech)

bs4.element.ResultSet

In [15]:
speech_combined = [ p.text  for p in mlkj_speech] # filtering out all the <p> tags

print(speech_combined)

['I am happy to join with you today in what will go down in\r\nhistory as the greatest demonstration for freedom in the history\r\nof our nation. ', 'Five score years ago a great American in whose symbolic shadow\r\nwe stand today signed the Emancipation Proclamation. This\r\nmomentous decree came as a great beckoning light of hope to\r\nmillions of Negro slaves who had been seared in the flames of\r\nwithering injustice. It came as a joyous daybreak to end the long\r\nnight of their captivity. ', 'But one hundred years later the Negro is still not free. One\r\nhundred years later the life of the Negro is still sadly crippled\r\nby the manacles of segregation and the chains of discrimination. ', 'One hundred years later the Negro lives on a lonely island of\r\npoverty in the midst of a vast ocean of material prosperity. ', 'One hundred years later the Negro is still languishing in the\r\ncomers of American society and finds himself in exile in his own\r\nland. ', "We all have come to t

In [16]:
type(speech_combined)

list

In [19]:
' '.join(speech_combined) # removed ', ' in between the sentences

'I am happy to join with you today in what will go down in\r\nhistory as the greatest demonstration for freedom in the history\r\nof our nation.  Five score years ago a great American in whose symbolic shadow\r\nwe stand today signed the Emancipation Proclamation. This\r\nmomentous decree came as a great beckoning light of hope to\r\nmillions of Negro slaves who had been seared in the flames of\r\nwithering injustice. It came as a joyous daybreak to end the long\r\nnight of their captivity.  But one hundred years later the Negro is still not free. One\r\nhundred years later the life of the Negro is still sadly crippled\r\nby the manacles of segregation and the chains of discrimination.  One hundred years later the Negro lives on a lonely island of\r\npoverty in the midst of a vast ocean of material prosperity.  One hundred years later the Negro is still languishing in the\r\ncomers of American society and finds himself in exile in his own\r\nland.  We all have come to this hallowed spo

In [20]:
string_speech = ' '.join(speech_combined)

In [29]:
string_speech.replace('\r\n' , ' ') # removing \r\n from the text by replacing it with blank space

'I am happy to join with you today in what will go down in history as the greatest demonstration for freedom in the history of our nation.  Five score years ago a great American in whose symbolic shadow we stand today signed the Emancipation Proclamation. This momentous decree came as a great beckoning light of hope to millions of Negro slaves who had been seared in the flames of withering injustice. It came as a joyous daybreak to end the long night of their captivity.  But one hundred years later the Negro is still not free. One hundred years later the life of the Negro is still sadly crippled by the manacles of segregation and the chains of discrimination.  One hundred years later the Negro lives on a lonely island of poverty in the midst of a vast ocean of material prosperity.  One hundred years later the Negro is still languishing in the comers of American society and finds himself in exile in his own land.  We all have come to this hallowed spot to remind America of the fierce ur

In [30]:
string_speech_cleaned = string_speech.replace('\r\n' , ' ')

In [31]:
import re

In [33]:
speech_no_punctuation = re.sub(r'[^\w\s]' , ' ',string_speech_cleaned ) # remove all the punctuation marks and white spaces
print(speech_no_punctuation)

I am happy to join with you today in what will go down in history as the greatest demonstration for freedom in the history of our nation   Five score years ago a great American in whose symbolic shadow we stand today signed the Emancipation Proclamation  This momentous decree came as a great beckoning light of hope to millions of Negro slaves who had been seared in the flames of withering injustice  It came as a joyous daybreak to end the long night of their captivity   But one hundred years later the Negro is still not free  One hundred years later the life of the Negro is still sadly crippled by the manacles of segregation and the chains of discrimination   One hundred years later the Negro lives on a lonely island of poverty in the midst of a vast ocean of material prosperity   One hundred years later the Negro is still languishing in the comers of American society and finds himself in exile in his own land   We all have come to this hallowed spot to remind America of the fierce urg

In [47]:
speech_broken_out = re.split(r'\s+',speech_lower) # splitting each work by delimeter, white space and putting in a list
print(speech_broken_out)

['i', 'am', 'happy', 'to', 'join', 'with', 'you', 'today', 'in', 'what', 'will', 'go', 'down', 'in', 'history', 'as', 'the', 'greatest', 'demonstration', 'for', 'freedom', 'in', 'the', 'history', 'of', 'our', 'nation', 'five', 'score', 'years', 'ago', 'a', 'great', 'american', 'in', 'whose', 'symbolic', 'shadow', 'we', 'stand', 'today', 'signed', 'the', 'emancipation', 'proclamation', 'this', 'momentous', 'decree', 'came', 'as', 'a', 'great', 'beckoning', 'light', 'of', 'hope', 'to', 'millions', 'of', 'negro', 'slaves', 'who', 'had', 'been', 'seared', 'in', 'the', 'flames', 'of', 'withering', 'injustice', 'it', 'came', 'as', 'a', 'joyous', 'daybreak', 'to', 'end', 'the', 'long', 'night', 'of', 'their', 'captivity', 'but', 'one', 'hundred', 'years', 'later', 'the', 'negro', 'is', 'still', 'not', 'free', 'one', 'hundred', 'years', 'later', 'the', 'life', 'of', 'the', 'negro', 'is', 'still', 'sadly', 'crippled', 'by', 'the', 'manacles', 'of', 'segregation', 'and', 'the', 'chains', 'of', '

In [48]:
import pandas as pd


In [54]:
df = pd.DataFrame(speech_broken_out).value_counts() # counting the occurance of each words in the created list
df

0     
the       54
of        49
to        29
and       27
a         20
          ..
jews       1
joyous     1
judged     1
land       1
lord       1
Name: count, Length: 324, dtype: int64

In [60]:
df.to_csv(r'C:\Users\nehas\Desktop\Projects\python_projects\mlkj_speech_web_pull\mlkj_speech_counts.csv',index_label = 'words')