In [1]:
import requests
import urllib.request
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from collections import defaultdict
from string import punctuation
from heapq import nlargest
from math import log
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans


In [2]:
def getWashPostText(url,token):
    
    try:
        page = urllib.request.urlopen(url).read().decode('utf8')
    except:
        return (None,None)
    soup = BeautifulSoup(page)
    if soup is None:
        return (None,None)
    
    text = ""
    if soup.find_all(token) is not None:
        
        text = ''.join(map(lambda p: p.text, soup.find_all(token)))
       
        soup2 = BeautifulSoup(text)
        
        if soup2.find_all('p')!=[]:
            
            text = ''.join(map(lambda p: p.text, soup2.find_all('p')))
    return text, soup.title.text

In [3]:
def getNYTText(url,token):
    response = requests.get(url)
    soup = BeautifulSoup(response.content)
    page = str(soup)
    title = soup.find('title').text
    mydivs = soup.findAll("p", {"class":"story-body-text story-content"})
    text = ''.join(map(lambda p:p.text, mydivs))
    return text, title
    

In [4]:
def scrapeSource(url, magicFrag='2020',scraperFunction=getNYTText,token='None'):
    urlBodies = {}
    request = urllib.request.Request(url)
    response = urllib.request.urlopen(request)
    soup = BeautifulSoup(response)

    numErrors = 0
    for a in soup.findAll('a'):
        try:
            url = a['href']
            if( (url not in urlBodies) and 
               ((magicFrag is not None and magicFrag in url) 
               or magicFrag is None)):
                body = scraperFunction(url,token)

                if body and len(body) > 0:
                    urlBodies[url] = body
                print(url)
        except:
            numErrors += 1
           
    return urlBodies

In [5]:
class FrequencySummarizer:
    def __init__(self,min_cut=0.1,max_cut=0.9):
      
        self._min_cut = min_cut
        self._max_cut = max_cut
        self._stopwords = set(stopwords.words('english') +
                              list(punctuation) +
                              [u"'s",'"'])
        
    
    def _compute_frequencies(self,word_sent,customStopWords=None):
        freq = defaultdict(int)
        if customStopWords is None:
            stopwords = set(self._stopwords)
        else:
            stopwords = set(customStopWords).union(self._stopwords)
        for sentence in word_sent:
            for word in sentence:
                if word not in stopwords:
                    freq[word] += 1
        m = float(max(freq.values()))
        for word in list(freq.keys()):
            freq[word] = freq[word]/m
            if freq[word] >= self._max_cut or freq[word] <= self._min_cut:
                del freq[word]
        return freq
    
    def extractFeatures(self,article,n,customStopWords=None):
        text = article[0]
        title = article[1]
        sentences = sent_tokenize(text)
        word_sent = [word_tokenize(s.lower()) for s in sentences]
        self._freq = self._compute_frequencies(word_sent,customStopWords)
        if n < 0:
            
            return nlargest(len(self._freq_keys()),self._freq,key=self._freq.get)
        else:
            
            return nlargest(n,self._freq,key=self._freq.get)
    
    def extractRawFrequencies(self, article):
        
        text = article[0]
        title = article[1]
        sentences = sent_tokenize(text)
        word_sent = [word_tokenize(s.lower()) for s in sentences]
        freq = defaultdict(int)
        for s in word_sent:
            for word in s:
                if word not in self._stopwords:
                    freq[word] += 1
        return freq
    
    def summarize(self, article,n):
        text = article[0]
        title = article[1]
        sentences = sent_tokenize(text)
        word_sent = [word_tokenize(s.lower()) for s in sentences]
        self._freq = self._compute_frequencies(word_sent)
        ranking = defaultdict(int)
        for i,sentence in enumerate(word_sent):
            for word in sentence:
                if word in self._freq:
                    ranking[i] += self._freq[word]
        sentences_index = nlargest(n,ranking,key=ranking.get)

        return [sentences[j] for j in sentences_index]

In [7]:
urlWashingtonPostNonTech = "https://www.washingtonpost.com/sports"
urlNewYorkTimesNonTech = "https://www.nytimes.com/pages/sports/index.html"
urlWashingtonPostTech = "https://www.washingtonpost.com/business/technology"
urlNewYorkTimesTech = "http://www.nytimes.com/pages/technology/index.html"

washingtonPostTechArticles = scrapeSource(urlWashingtonPostTech,
                                          '2020',
                                         getWashPostText,
                                         'article') 
washingtonPostNonTechArticles = scrapeSource(urlWashingtonPostNonTech,
                                          '2020',
                                         getWashPostText,
                                         'article')
                
                
newYorkTimesTechArticles = scrapeSource(urlNewYorkTimesTech,
                                       '2020',
                                       getNYTText,
                                       None)
newYorkTimesNonTechArticles = scrapeSource(urlNewYorkTimesNonTech,
                                       '2020',
                                       getNYTText,
                                       None)

https://www.washingtonpost.com/world/asia_pacific/coronavirus-china-live-updates/2020/02/20/3159df72-535c-11ea-b119-4faabac6674f_story.html
https://www.washingtonpost.com/nation/2020/02/20/whistleblower-attorney-threatened/
https://www.washingtonpost.com/local/public-safety/roger-stone-sentence-due-thursday-in-federal-court/2020/02/19/2e01bfc8-4c38-11ea-9b5c-eac5b16dafaa_story.html
https://www.washingtonpost.com/national-security/2020/02/19/mexican-russian-spy/
https://www.washingtonpost.com/business/2020/02/20/morgann-stanley-etrade/
https://www.washingtonpost.com/nation/2020/02/19/police-couple-were-date-ended-up-foiling-an-armed-robber/
https://www.washingtonpost.com/nation/2020/02/20/croydon-police-officer-leaves-in-underwear/
https://www.washingtonpost.com/video-games/2020/02/20/playstation-facebook-cancel-gdc-plans-citing-coronavirus-concerns/
https://www.washingtonpost.com/technology/2020/02/20/mike-bloomberg-sure-looks-like-he-owned-debate-an-edited-video-released-by-mike-bloom

https://www.washingtonpost.com/sports/mac-mcclung-is-injured-again-and-undermanned-georgetown-falls-to-providence/2020/02/19/5c9f4ae0-533a-11ea-929a-64efa7482a77_story.html
https://www.washingtonpost.com/sports/colleges/virginia-keeps-plugging-away-with-a-balanced-effort-against-boston-college/2020/02/19/76c3e9da-533a-11ea-929a-64efa7482a77_story.html
https://www.washingtonpost.com/sports/highschools/freshman-amani-watts-known-as-baby-shaq-makes-an-immediate-impact-for-old-mill/2020/02/19/7e9c14b6-533a-11ea-929a-64efa7482a77_story.html
http://washingtonpost.com/sports/2020/02/19/browns-offensive-lineman-caught-by-texas-border-patrol-with-157-pounds-marijuana/?tid=pm_sports_pop
http://washingtonpost.com/sports/2020/02/20/jordan-reed-released-redskins/?tid=pm_sports_pop
http://washingtonpost.com/sports/mlb/astros-cheating-open-secret/2020/02/11/1830154c-4c41-11ea-9b5c-eac5b16dafaa_story.html?tid=pm_sports_pop
http://washingtonpost.com/sports/2020/02/20/nationals-spring-training-three-fam

In [8]:
articleSummaries = {}
for techUrlDictionary in [newYorkTimesTechArticles, washingtonPostTechArticles]:
    for articleUrl in techUrlDictionary:
        if techUrlDictionary[articleUrl][0] is not None:
            if len(techUrlDictionary[articleUrl][0]) > 0:
                fs = FrequencySummarizer()
                summary = fs.extractFeatures(techUrlDictionary[articleUrl],25)
                articleSummaries[articleUrl] = {'feature-vector': summary,
                                               'label': 'Tech'}
for nontechUrlDictionary in [newYorkTimesNonTechArticles, washingtonPostNonTechArticles]:
    for articleUrl in nontechUrlDictionary:
        if nontechUrlDictionary[articleUrl][0] is not None:
            if len(nontechUrlDictionary[articleUrl][0]) > 0:
                fs = FrequencySummarizer()
                summary = fs.extractFeatures(nontechUrlDictionary[articleUrl],25)
                articleSummaries[articleUrl] = {'feature-vector': summary,
                                               'label': 'Non-Tech'}

In [9]:
def getDoxyDonkeyText(testUrl,token):
    response = requests.get(testUrl)
    soup = BeautifulSoup(response.content)
    page = str(soup)
    title = soup.find("title").text
    mydivs = soup.findAll("div", {"class":token})
    text = ''.join(map(lambda p:p.text,mydivs))
    return text,title
    

testUrl = "http://doxydonkey.blogspot.in"
testArticle = getDoxyDonkeyText(testUrl,"post-body")

fs = FrequencySummarizer()
testArticleSummary = fs.extractFeatures(testArticle, 25)

In [10]:
similarities = {}
for articleUrl in articleSummaries:
    oneArticleSummary = articleSummaries[articleUrl]['feature-vector']
    similarities[articleUrl] = len(set(testArticleSummary).intersection(set(oneArticleSummary)))

labels = defaultdict(int)    
knn = nlargest(5, similarities, key=similarities.get)
for oneNeighbor in knn:
    labels[articleSummaries[oneNeighbor]['label']] += 1

nlargest(1,labels,key=labels.get)

['Tech']

In [25]:
cumulativeRawFrequencies = {'Tech':defaultdict(int),'Non-Tech':defaultdict(int)}
trainingData = {'Tech':washingtonPostTechArticles,'Non-Tech':washingtonPostNonTechArticles}
for label in trainingData:
    for articleUrl in trainingData[label]:
        if len(trainingData[label][articleUrl][0]) > 0:
            fs = FrequencySummarizer()
            rawFrequencies = fs.extractRawFrequencies(trainingData[label][articleUrl])
            for word in rawFrequencies:
                cumulativeRawFrequencies[label][word] += rawFrequencies[word]


In [26]:
from __future__ import division

In [28]:
techiness = 1.0
nontechiness = 1.0
for word in testArticleSummary:
    # for each 'feature' of the test instance - 
    if word in cumulativeRawFrequencies['Tech']:
        techiness *= 1e3*cumulativeRawFrequencies['Tech'][word] / float(sum(cumulativeRawFrequencies['Tech'].values()))
       
    else:
        techiness /= 1e3
    
    if word in cumulativeRawFrequencies['Non-Tech']:
        nontechiness *= 1e3*cumulativeRawFrequencies['Non-Tech'][word] / float(sum(cumulativeRawFrequencies['Non-Tech'].values()))
     
    else:
        nontechiness /= 1e3

techiness *= float(sum(cumulativeRawFrequencies['Tech'].values())) / float(sum(cumulativeRawFrequencies['Tech'].values())) + float(sum(cumulativeRawFrequencies['Non-Tech'].values()))
nontechiness *= float(sum(cumulativeRawFrequencies['Non-Tech'].values())) / float(sum(cumulativeRawFrequencies['Tech'].values())) + float(sum(cumulativeRawFrequencies['Non-Tech'].values()))
if techiness > nontechiness:
    label = 'Tech'
else:
    label = 'Non-Tech'
print(label, techiness, nontechiness)


Tech 3.8033187782055015e-05 5.072239710395789e-18


In [29]:
def getAllDoxyDonkeyPosts(url,links):
    request = urllib.request.Request(url)
    response = urllib.request.urlopen(request)
    soup = BeautifulSoup(response)
    for a in soup.findAll('a'):
        try:
            url = a['href']
            title = a['title']
            if title == "Older Posts":
                print(title, url)
                links.append(url)
                getAllDoxyDonkeyPosts(url,links)
        except:
            title = ""
    return

In [30]:
blogUrl = "http://doxydonkey.blogspot.in"
links = []
getAllDoxyDonkeyPosts(blogUrl,links)
doxyDonkeyPosts = {}
for link in links:
    doxyDonkeyPosts[link] = getDoxyDonkeyText(link,'post-body')


documentCorpus = []
for onePost in doxyDonkeyPosts.values():
    documentCorpus.append(onePost[0])

Older Posts http://doxydonkey.blogspot.com/search?updated-max=2017-05-23T19:53:00-07:00&max-results=7
Older Posts http://doxydonkey.blogspot.com/search?updated-max=2017-05-14T19:02:00-07:00&max-results=7&start=7&by-date=false
Older Posts http://doxydonkey.blogspot.com/search?updated-max=2017-05-02T19:43:00-07:00&max-results=7&start=14&by-date=false
Older Posts http://doxydonkey.blogspot.com/search?updated-max=2017-04-17T19:26:00-07:00&max-results=7&start=21&by-date=false
Older Posts http://doxydonkey.blogspot.com/search?updated-max=2017-04-10T18:56:00-07:00&max-results=7&start=28&by-date=false
Older Posts http://doxydonkey.blogspot.com/search?updated-max=2017-03-30T19:57:00-07:00&max-results=7&start=35&by-date=false
Older Posts http://doxydonkey.blogspot.com/search?updated-max=2017-03-20T19:47:00-07:00&max-results=7&start=42&by-date=false
Older Posts http://doxydonkey.blogspot.com/search?updated-max=2017-03-02T17:42:00-08:00&max-results=7&start=49&by-date=false
Older Posts http://doxyd

Older Posts http://doxydonkey.blogspot.com/search?updated-max=2015-04-23T20:19:00-07:00&max-results=7&start=462&by-date=false
Older Posts http://doxydonkey.blogspot.com/search?updated-max=2015-04-14T19:40:00-07:00&max-results=7&start=469&by-date=false
Older Posts http://doxydonkey.blogspot.com/search?updated-max=2015-04-05T20:22:00-07:00&max-results=7&start=476&by-date=false
Older Posts http://doxydonkey.blogspot.com/search?updated-max=2015-03-24T20:12:00-07:00&max-results=7&start=483&by-date=false
Older Posts http://doxydonkey.blogspot.com/search?updated-max=2015-03-15T20:41:00-07:00&max-results=7&start=490&by-date=false
Older Posts http://doxydonkey.blogspot.com/search?updated-max=2015-03-03T19:30:00-08:00&max-results=7&start=497&by-date=false
Older Posts http://doxydonkey.blogspot.com/search?updated-max=2015-02-22T19:55:00-08:00&max-results=7&start=504&by-date=false
Older Posts http://doxydonkey.blogspot.com/search?updated-max=2015-02-11T20:02:00-08:00&max-results=7&start=511&by-dat

In [31]:
vectorizer = TfidfVectorizer(max_df=0.5,min_df=2,stop_words='english')
X = vectorizer.fit_transform(documentCorpus)
km = KMeans(n_clusters = 5, init = 'k-means++', max_iter = 100, n_init = 1, verbose = True)
km.fit(X)

keywords = {}
for i,cluster in enumerate(km.labels_):
    oneDocument = documentCorpus[i]
    fs = FrequencySummarizer()
    summary = fs.extractFeatures((oneDocument,""),
                                100,
                                [u"according",u"also",u"billion",u"like",u"new", u"one",u"year",u"first",u"last"])
    if cluster not in keywords:
        keywords[cluster] = set(summary)
    else:
        keywords[cluster] = keywords[cluster].intersection(set(summary))

Initialization complete
Iteration  0, inertia 138.118
Iteration  1, inertia 71.499
Converged at iteration 1: center shift 0.000000e+00 within tolerance 7.274641e-09
