In [16]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

In [75]:
url = "https://newsroom.fb.com/news/2018/07/removing-bad-actors-on-facebook"

In [107]:
page = urlopen(url).read().decode('utf8','ignore') 
soup = BeautifulSoup(page,"lxml")

In [94]:
text = soup.find("div", {"class": "post-content"}).text

In [95]:
from nltk.tokenize import sent_tokenize,word_tokenize
from nltk.corpus import stopwords
from string import punctuation

In [106]:
sents = sent_tokenize(text)
sents[:10]

['\nToday we removed 32 Pages and accounts from Facebook and Instagram because they were involved in coordinated inauthentic behavior.',
 'This kind of behavior is not allowed on Facebook because we don’t want people or organizations creating networks of accounts to mislead others about who they are, or what they’re doing.',
 'We’re still in the very early stages of our investigation and don’t have all the facts — including who may be behind this.',
 'But we are sharing what we know today given the connection between these bad actors and protests that are planned in Washington next week.',
 'We will update this post with more details when we have them, or if the facts we have change.',
 'It’s clear that whoever set up these accounts went to much greater lengths to obscure their true identities than the Russian-based Internet Research Agency (IRA) has in the past.',
 'We believe this could be partly due to changes we’ve made over the last year to make this kind of abuse much harder.',
 

In [105]:
word_sent = word_tokenize(text.lower())
word_sent[:10]

['today',
 'we',
 'removed',
 '32',
 'pages',
 'and',
 'accounts',
 'from',
 'facebook',
 'and']

In [104]:
mylist = ['’', '“', '”']
_stopwords = set(stopwords.words('english') + list(punctuation) + mylist)


In [84]:
word_sent=[word for word in word_sent if word not in _stopwords]

In [103]:
from nltk.probability import FreqDist
freq = FreqDist(word_sent)


In [31]:
from heapq import nlargest

In [87]:
nlargest(10, freq, key=freq.get)

['ira',
 'accounts',
 'facebook',
 'actors',
 'pages',
 'information',
 'attribution',
 'group',
 'one',
 'activity']

In [102]:
from collections import defaultdict
ranking = defaultdict(int)

for i,sent in enumerate(sents):
    for w in word_tokenize(sent.lower()):
        if w in freq:
            ranking[i] += freq[w]
            

In [92]:
sents_idx = nlargest(4, ranking, key=ranking.get)
sents_idx

[13, 12, 96, 10]

In [93]:
[sents[j] for j in sorted(sents_idx)]

['It’s why we’re investing heavily in more people and better technology to prevent bad actors misusing Facebook — as well as working much more closely with law enforcement and other tech companies to better understand the threats we face.',
 'Sample Content\n\xa0\nJuly 31, 2018\nWhat We’ve Found So Far\nBy Nathaniel Gleicher, Head of Cybersecurity Policy\nAbout two weeks ago we identified the first of eight Pages and 17 profiles on Facebook, as well as seven Instagram accounts, that violate our ban on coordinated inauthentic behavior.',
 'We removed all of them this morning once we’d completed our initial investigation and shared the information with US law enforcement agencies, Congress, other technology companies, and the\xa0Atlantic Council’s Digital Forensic Research Lab, a research organization that helps us identify and analyze abuse on Facebook.',
 'Applying the Framework to Our New Discovery\nHere is how we use this framework to discuss attribution of the accounts and Pages we 

In [98]:
def summarize(text, n):
    sents = sent_tokenize(text)
    
    assert n <= len(sents)
    word_sent = word_tokenize(text.lower())
    _stopwords = set(stopwords.words('english') + list(punctuation))
    
    word_sent=[word for word in word_sent if word not in _stopwords]
    freq = FreqDist(word_sent)
    
    
    ranking = defaultdict(int)
    
    for i,sent in enumerate(sents):
        for w in word_tokenize(sent.lower()):
            if w in freq:
                ranking[i] += freq[w]
             
        
    sents_idx = nlargest(n, ranking, key=ranking.get)
    return [sents[j] for j in sorted(sents_idx)]

In [100]:
summarize(text,4)

['It’s why we’re investing heavily in more people and better technology to prevent bad actors misusing Facebook — as well as working much more closely with law enforcement and other tech companies to better understand the threats we face.',
 'We removed all of them this morning once we’d completed our initial investigation and shared the information with US law enforcement agencies, Congress, other technology companies, and the\xa0Atlantic Council’s Digital Forensic Research Lab, a research organization that helps us identify and analyze abuse on Facebook.',
 'The most followed Facebook Pages were “Aztlan Warriors,” “Black Elevation,” “Mindful Being,” and “Resisters.” The remaining Pages had between zero and 10 followers, and the Instagram accounts had zero followers.',
 'It’s why we’re following up on thousands of leads, including information from law enforcement and lessons we learned from last year’s IRA investigation.']