# Rule based Modeling using NLTK     

### Download and Retrieve text and Image from Article

#### Retrieve text from web page 

In [58]:
# urllib.request (the Python 3 successor of urllib2) is used to download a web page
import urllib.request

# library to parse the HTML page
from bs4 import BeautifulSoup

articleURL = "https://www.washingtonpost.com/powerpost/female-senators-are-increasingly-on-receiving-end-of-insults-from-male-officials/2017/07/27/6b0b6078-72d7-11e7-9eac-d56bd5568db8_story.html?hpid=hp_hp-top-table-main_gopmen-817pm%3Ahomepage%2Fstory&utm_term=.3ccd7d73c12d"

# Download the web page. The context manager closes the HTTP response as soon
# as the body has been read; 'ignore' drops bytes that are not valid UTF-8
# instead of raising a decoding error.
with urllib.request.urlopen(articleURL) as response:
    page = response.read().decode('utf8', 'ignore')

# instantiate a BeautifulSoup object backed by the lxml parser
soup = BeautifulSoup(page, "lxml")


#### Check if we have downloaded our page correctly 

In [59]:
soup

<!DOCTYPE html>
<html class="article layout_article rendering-context-www" itemscope="" itemtype="http://schema.org/NewsArticle" lang="en"> <head> <script id="_$cookiemonster">(function(b,m){function d(b,d){this.wl={map:f.map.concat(b||[]),reg:f.reg.concat(d||[])}}var f={reg:[],map:[]};d.prototype.ommNom=function(){return this.nom(!0,void 0)};d.prototype.allows=function(b){return!(-1<this.nom(!1,[b]).indexOf(b))};d.prototype.nom=function(d,f){for(var c=[],l=b.location.hostname.split("").reverse().join("").slice(0,18),g=f||b.cookie.split(";"),a,h,e=0;e<g.length,a=g[e];e++)a=a.trim().split("\x3d")[0].toLowerCase(),-1<this.wl.map.indexOf(a)||c.push(a);for(var k=0;k<this.wl.reg.length,
h=this.wl.reg[k];k++)for(e=h.lastIndex=0;e<g.length,a=g[e];e++)a=a.trim().split("\x3d")[0].toLowerCase(),h.test(a)?-1<c.indexOf(a)&&c.splice(c.indexOf(a),1):0>c.indexOf(a)&&0>this.wl.map.indexOf(a)&&c.push(a);d&&("moc.tsopnotgnihsaw"==l&&(this.wl.reg.length||this.wl.map.length)?setTimeout(function(a){return 

#### Retrieve and combine all texts from the article element 

In [63]:
# Concatenate the text of every <article> element, separated by single spaces
text = ' '.join(article.text for article in soup.find_all('article'))

#### Review our text 

In [62]:
text

' Republican female senators whose disapproval of the GOP health-care effort has at times endangered its progress are facing an increasingly pointed backlash from men in their party, including a handful of comments that invoked physical retaliation. In the past week, Sen. Susan Collins (Maine) has been challenged by a male lawmaker to a duel. She and Sen. Lisa Murkowski (Alaska) were told that they and others deserve a physical reprimand for their decisions not to support Republican health-care proposals. Murkowski, who voted with Collins against starting the health-care debate this week, was specifically called out by President Trump on Twitter and told by a Cabinet official that Alaska could suffer for her choice, according to a colleague.  The language of retribution increasingly adopted by Republican men reflects Trump’s influence and underscores the challenges GOP women can face when opposing the consensus of their party, which remains dominated by men, outside experts said. A vid

#### Module to retrieve text 

In [66]:
def getTextWapo(url):
    """Download a web page and return the text of its ``<article>`` elements.

    The name suggests it is meant for Washington Post article pages, but the
    code only relies on the page containing ``<article>`` tags.

    Parameters
    ----------
    url : str
        Address of the page to download.

    Returns
    -------
    str
        The text of every ``<article>`` element joined with single spaces;
        an empty string when the page contains no ``<article>`` element.
    """
    # The context manager closes the HTTP response once the body is read,
    # rather than leaving the connection open until garbage collection.
    with urllib.request.urlopen(url) as response:
        # 'ignore' drops bytes that are not valid UTF-8 instead of raising.
        page = response.read().decode('utf8', 'ignore')
    soup = BeautifulSoup(page, "lxml")
    return ' '.join(article.text for article in soup.find_all('article'))

#### Module to retrieve image 

In [72]:
def getImages(url):
    """Download a web page and return the ``src`` of each ``<img>`` tag.

    Parameters
    ----------
    url : str
        Address of the page to download.

    Returns
    -------
    list
        One entry per ``<img>`` tag, in the order ``find_all`` yields them.
        An entry is ``None`` when the tag has no ``src`` attribute, so list
        positions always correspond to the tags found on the page.
    """
    # The context manager closes the HTTP response once the body is read,
    # rather than leaving the connection open until garbage collection.
    with urllib.request.urlopen(url) as response:
        raw_html = response.read()
    soup = BeautifulSoup(raw_html, "lxml")  # Set up a "soup" which BeautifulSoup can search
    # Extract the 'src' attribute from every 'img' tag
    return [img.get('src') for img in soup.find_all('img')]


#### Check that we have images from the article

In [73]:
# Collect the src of every <img> tag on a second article page (a different URL from articleURL above)
images = getImages("https://www.washingtonpost.com/news/post-politics/wp/2017/07/27/scaramucci-if-reince-wants-to-explain-that-hes-not-a-leaker-let-him-do-that/?tid=pm_pop_b&utm_term=.9491b116a4eb") 

#### Retrieve image link from the array 

In [70]:
images[8]

'https://www.washingtonpost.com/graphics/politics/can-he-do-that-trump-podcast/images/8_CanHeDoThat_LSR_small.jpg'

### Preprocess Text using the NLTK Library 

In [74]:
# Used to break down text into sentences and words
from nltk.tokenize import sent_tokenize,word_tokenize

#Used to create a list of stop words from the article
from nltk.corpus import stopwords
from string import punctuation

#### Tokenize the text into sentences

In [76]:
# Split the article text into a list of sentence strings (original casing kept)
sents = sent_tokenize(text)

#### Review that text has been tokenized into sentences

In [77]:
sents

[' Republican female senators whose disapproval of the GOP health-care effort has at times endangered its progress are facing an increasingly pointed backlash from men in their party, including a handful of comments that invoked physical retaliation.',
 'In the past week, Sen. Susan Collins (Maine) has been challenged by a male lawmaker to a duel.',
 'She and Sen. Lisa Murkowski (Alaska) were told that they and others deserve a physical reprimand for their decisions not to support Republican health-care proposals.',
 'Murkowski, who voted with Collins against starting the health-care debate this week, was specifically called out by President Trump on Twitter and told by a Cabinet official that Alaska could suffer for her choice, according to a colleague.',
 'The language of retribution increasingly adopted by Republican men reflects Trump’s influence and underscores the challenges GOP women can face when opposing the consensus of their party, which remains dominated by men, outside exp

#### Tokenize the text into words

In [79]:
# Lower-case the text before tokenizing so that word counts are case-insensitive
word_sent = word_tokenize(text.lower())

#### Review that the text has been tokenized into words

In [80]:
word_sent

['republican',
 'female',
 'senators',
 'whose',
 'disapproval',
 'of',
 'the',
 'gop',
 'health-care',
 'effort',
 'has',
 'at',
 'times',
 'endangered',
 'its',
 'progress',
 'are',
 'facing',
 'an',
 'increasingly',
 'pointed',
 'backlash',
 'from',
 'men',
 'in',
 'their',
 'party',
 ',',
 'including',
 'a',
 'handful',
 'of',
 'comments',
 'that',
 'invoked',
 'physical',
 'retaliation',
 '.',
 'in',
 'the',
 'past',
 'week',
 ',',
 'sen.',
 'susan',
 'collins',
 '(',
 'maine',
 ')',
 'has',
 'been',
 'challenged',
 'by',
 'a',
 'male',
 'lawmaker',
 'to',
 'a',
 'duel',
 '.',
 'she',
 'and',
 'sen.',
 'lisa',
 'murkowski',
 '(',
 'alaska',
 ')',
 'were',
 'told',
 'that',
 'they',
 'and',
 'others',
 'deserve',
 'a',
 'physical',
 'reprimand',
 'for',
 'their',
 'decisions',
 'not',
 'to',
 'support',
 'republican',
 'health-care',
 'proposals',
 '.',
 'murkowski',
 ',',
 'who',
 'voted',
 'with',
 'collins',
 'against',
 'starting',
 'the',
 'health-care',
 'debate',
 'this',
 '

#### Remove Stop words 

In [10]:
# English stop words plus every single punctuation character from string.punctuation
_stopwords = set(stopwords.words('english')).union(punctuation)

In [11]:
# Keep only the tokens that are neither stop words nor punctuation
word_sent = [token for token in word_sent if token not in _stopwords]

#### Review that we have removed stop words from our list of words

In [81]:
word_sent

['republican',
 'female',
 'senators',
 'whose',
 'disapproval',
 'of',
 'the',
 'gop',
 'health-care',
 'effort',
 'has',
 'at',
 'times',
 'endangered',
 'its',
 'progress',
 'are',
 'facing',
 'an',
 'increasingly',
 'pointed',
 'backlash',
 'from',
 'men',
 'in',
 'their',
 'party',
 ',',
 'including',
 'a',
 'handful',
 'of',
 'comments',
 'that',
 'invoked',
 'physical',
 'retaliation',
 '.',
 'in',
 'the',
 'past',
 'week',
 ',',
 'sen.',
 'susan',
 'collins',
 '(',
 'maine',
 ')',
 'has',
 'been',
 'challenged',
 'by',
 'a',
 'male',
 'lawmaker',
 'to',
 'a',
 'duel',
 '.',
 'she',
 'and',
 'sen.',
 'lisa',
 'murkowski',
 '(',
 'alaska',
 ')',
 'were',
 'told',
 'that',
 'they',
 'and',
 'others',
 'deserve',
 'a',
 'physical',
 'reprimand',
 'for',
 'their',
 'decisions',
 'not',
 'to',
 'support',
 'republican',
 'health-care',
 'proposals',
 '.',
 'murkowski',
 ',',
 'who',
 'voted',
 'with',
 'collins',
 'against',
 'starting',
 'the',
 'health-care',
 'debate',
 'this',
 '

### Auto summarize by extracting the most important sentences

#### Frequency distribution of words

In [82]:
from nltk.probability import FreqDist
# Count how many times each token of word_sent occurs
freq = FreqDist(word_sent)
# Bare expression so the notebook displays the counts
freq

FreqDist({'!': 1,
          '$': 1,
          '(': 10,
          ')': 10,
          ',': 49,
          '--': 1,
          '.': 44,
          '1': 1,
          '1804': 1,
          '2010': 1,
          ':': 1,
          '?': 1,
          '@': 1,
          '[': 4,
          ']': 4,
          'a': 24,
          'a.': 1,
          'aaron': 1,
          'about': 3,
          'about.': 1,
          'aca': 2,
          'accelerate': 1,
          'according': 2,
          'act': 2,
          'ad': 1,
          'adopted': 1,
          'ads': 1,
          'affecting': 1,
          'affordable': 1,
          'after': 2,
          'against': 3,
          'aggressive': 1,
          'alaska': 5,
          'alexander': 1,
          'ali': 1,
          'all': 1,
          'almost': 1,
          'also': 1,
          'am': 1,
          'amber': 1,
          'amend': 1,
          'america': 1,
          'american': 2,
          'americans': 1,
          'among': 1,
          'an': 4,
          'and': 25,

#### Compute significance score for each sentence in the article using the word frequency 

In [83]:
from collections import defaultdict

# Maps sentence index -> sum of the frequencies of its counted tokens.
# A sentence with no token present in freq gets no entry at all.
ranking = defaultdict(int)

for idx, sentence in enumerate(sents):
    score = sum(
        freq[token]
        for token in word_tokenize(sentence.lower())
        if token in freq
    )
    # A score of zero means no token was found in freq, so leave the key absent
    if score:
        ranking[idx] += score

In [89]:
ranking

defaultdict(int,
            {0: 372,
             1: 315,
             2: 279,
             3: 508,
             4: 523,
             5: 422,
             6: 1385,
             7: 677,
             8: 461,
             9: 233,
             10: 293,
             11: 306,
             12: 159,
             13: 229,
             14: 517,
             15: 162,
             16: 44,
             17: 44,
             18: 307,
             19: 61,
             20: 638,
             21: 502,
             22: 111,
             23: 569,
             24: 259,
             25: 573,
             26: 315,
             27: 459,
             28: 65,
             29: 261,
             30: 156,
             31: 521,
             32: 527,
             33: 463,
             34: 256,
             35: 357,
             36: 101,
             37: 382,
             38: 80,
             39: 602,
             40: 272,
             41: 612,
             42: 479,
             43: 83,
             44: 11})

#### The top most significant sentences

In [90]:
# nlargest comes from the standard-library heapq module; it is imported here
# because nothing above this cell brings it into scope, and the call would
# otherwise raise NameError on a freshly started kernel.
from heapq import nlargest

# Indices of the 4 highest-scoring sentences, highest score first
sents_idx = nlargest(4, ranking, key=ranking.get)

In [91]:
sents_idx

[6, 7, 20, 41]

#### Print our summary by putting together the most significant sentences

In [93]:
# Sort the selected indices so the summary follows the article's sentence order
[sents[j] for j in sorted(sents_idx)]

['“Masculine dominance in the Republican Party is not only in numbers but in culture,” said Kelly Dittmar, a scholar at the Center for American Women and Politics at Rutgers University and the author of “Navigating Gendered Terrain: Stereotypes and Strategy in Political Campaigns.”   “When the person who is supposed to be the leader of the party shows it’s okay to use those sorts of attacks, whether they are specifically gendered or not, that is something that catches on at other levels,” Dittmar said.',
 '“We see it in the [elected officials] who feel it’s okay to say things like this.”       (Amber Ferguson/The Washington Post)    [Senate GOP leaders work to round up votes for modest health-care overhaul]  Collins and Murkowski have been among the Senate’s most consistently skeptical voices as Republicans accelerate their effort to amend the 2010 Affordable Care Act (ACA).',
 '), in an apparent reference to Collins, told a radio host Friday that if she were a “guy from south Texas, I

#### Module to summarize text given a text and number of sentences for the summary 

In [20]:
def summarize(text, n):
    """Return the ``n`` highest-scoring sentences of ``text`` in original order.

    A sentence's score is the sum, over its lower-cased tokens, of how often
    each token occurs in the whole text. English stop words and punctuation
    characters are excluded from the counts and therefore contribute nothing.

    Parameters
    ----------
    text : str
        The text to summarize.
    n : int
        Number of sentences to keep; must not exceed the number of sentences
        found in ``text``.

    Returns
    -------
    list of str
        The selected sentences, ordered as they appear in ``text``.

    Raises
    ------
    AssertionError
        If ``n`` is larger than the number of sentences in ``text``.
    """
    # Imported locally: no notebook cell imports nlargest, so relying on the
    # kernel namespace would raise NameError after a restart.
    from heapq import nlargest

    sents = sent_tokenize(text)

    assert n <= len(sents), "n cannot exceed the number of sentences in text"

    _stopwords = set(stopwords.words('english') + list(punctuation))
    word_sent = [word for word in word_tokenize(text.lower())
                 if word not in _stopwords]
    freq = FreqDist(word_sent)

    # Score every sentence, including those with no counted token (score 0),
    # so that exactly n sentences are always available for selection.
    scores = [
        sum(freq[w] for w in word_tokenize(sent.lower()) if w in freq)
        for sent in sents
    ]

    # Pick the n best indices, then restore document order for readability.
    sents_idx = nlargest(n, range(len(sents)), key=scores.__getitem__)
    return [sents[j] for j in sorted(sents_idx)]

In [87]:
summarize(text,3)

['“Masculine dominance in the Republican Party is not only in numbers but in culture,” said Kelly Dittmar, a scholar at the Center for American Women and Politics at Rutgers University and the author of “Navigating Gendered Terrain: Stereotypes and Strategy in Political Campaigns.”   “When the person who is supposed to be the leader of the party shows it’s okay to use those sorts of attacks, whether they are specifically gendered or not, that is something that catches on at other levels,” Dittmar said.',
 '“We see it in the [elected officials] who feel it’s okay to say things like this.”       (Amber Ferguson/The Washington Post)    [Senate GOP leaders work to round up votes for modest health-care overhaul]  Collins and Murkowski have been among the Senate’s most consistently skeptical voices as Republicans accelerate their effort to amend the 2010 Affordable Care Act (ACA).',
 'Rep. Earl L. “Buddy” Carter (R-Ga.) told MSNBC on Wednesday that someone should “go over there to that Senat