### Importing the essential libraries

In [1]:
import urllib.request
import requests
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from collections import defaultdict
from string import punctuation
from heapq import nlargest

### Defining a Web scraping function for Washington Post

In [2]:
def getWashPostText(url, token):
    ### Default function to download content from any webpage
    try:
        page = urllib.urlopen(url).read().decode('utf8')
    ### If not able to download the URL, return None
    except:
        return (None,None)
    ### Instantiating a Soup object on the downloaded page
    soup = BeautifulSoup(page)
    ### Double-checking to make sure the page has been downloaded properly
    if soup is None:
        return (None,None)
    ### Creating an empty string character
    text = ""
    ### Based on the entered token, the article content is extracted
    if soup.find_all(token) is not None:
        text = ''.join(map(lambda p: p.text, soup.find_all(token)))
    ### Second soup object to extract content from the <p> tag
        soup2 = BeautifulSoup(text)
        if soup2.find_all('p') is not None:
            text = ''.join(map(lambda p: p.text, soup2.find_all('p')))
    return text, soup.title.text

### Defining a Web scraping function for The New York Times

In [3]:
def getNYTText(url,token):
    ### Defining an alternative way to download contents of a Webpage
    response = requests.get(url)
    ### Instantiating a Soup object
    soup = BeautifulSoup(response.content)
    ### Extracting the entire strings in the page
    page =str(soup)
    ### Defining the page title from the string
    title = soup.find('title').text
    ### Extracting the article content based on the <p> tag's class
    mydivs = soup.findAll("p", {"class":"story-body-text story-content"})
    text = ''.join(map(lambda p: p.text, mydivs))
    return text, title

### Extracting articles from a single Category of a News portal

In [4]:
def scrapeSource(url, magicFrag='2016', scraperFunction = getNYTText, token='None'):
    urlBodies = {}
    request = urllib.request.Request(url)
    response = urllib.request.urlopen(request)
    soup = BeautifulSoup(response)
    ### Extracting all the links in the page with a timestamp
    numErrors = 0
    for a in soup.findAll('a'):
        try:
            url = a['href']
            if((url not in urlBodies) and (magicFrag is not None and magicFrag in url) 
               or magicFrag is None):
                ### Individual scraper for each article in the Portal's section
                body = scraperFunction(url,token)
                if body and len(body) > 0:
                    urlBodies[url] = body
                print(url)
        except:
            numErrors += 1
    return urlBodies

### Feature Extraction of the Articles

In [5]:
class FreqSummarizer:
    
### Initializing the characteristics of the Member function
    def __init__(self, min_cut = 0.1, max_cut = 0.9):
        self.min_cut = min_cut
        self.max_cut = max_cut
        self._stopwords = set(stopwords.words('english') + list(punctuation) + [u"'s",'"'])
    
### Function for the Dictionary of Words:Frequencies as the key-value pair
    def _compute_freq(self, word_sent,customStopWords=None):
        freq = defaultdict(int)

### If a list of CustomStopwords are available:
        if customStopWords is None:
            stopwords = set(self._stopwords)
        else:
            stopwords = set(customStopWords).union(self._stopwords)

###     Iterating through the words in sentences for incrementing the Frequency count
        for sentence in word_sent:
            for word in sentence:
                if word not in self._stopwords:
                    freq[word] += 1

###     Normalizing the Frequency and Pruning the results based on Caps
        max_freq = float(max(freq.values()))
        for word in list(freq.keys()):
            freq[word] = freq[word]/max_freq
            if freq[word] >= self.max_cut or freq[word] <= self.min_cut:
                del freq[word]
        return freq
    
### Setting up a Function for Feature Extraction
    def extractFeatures(self, article, n, customStopWords = None):
        ### The text of the article is extracted from the tuple
        text = article[0]
        ### The title of the article is extracted from the title of the article
        title = article[1]
        ### Tokenizing into Sentences and Words
        sentences = sent_tokenize(text)
        word_sent = [word_tokenize(s.lower()) for s in sentences]
        ### Calling the member function to compute the frequencies
        self._freq = self._compute_freq(word_sent, customStopWords)
        ### Defining a statement to return ALL features if a Negative number is inputted
        if n < 0:
            return nlargest(len(self._freq_keys()), self._freq, key = self._freq.get)
        ### Defining a statement to return only the 'n' largest features
        else:
            return nlargest(n, self._freq, key = self._freq.get)
        
### Setting up a Function to compute Raw frequencies of the words (Just a Word Count)
    def rawFreq(self, article):
        ### The text of the article is extracted from the tuple
        text = article[0]
        ### The title of the article is extracted from the title of the article
        title = article[1]
        ### Tokenizing into Sentences and Words
        sentences = sent_tokenize(text)
        word_sent = [word_tokenize(s.lower()) for s in sentences]
        freq = defaultdict(int)
        ### Defining the word counter
        for s in word_sent:
            if word not in self._stopwords:
                freq[word] += 1
        return freq
    
### Function for assigning a score to a sentence based on the Frequency of words
    def summarizer(self, article, n):
        ### The text of the article is extracted from the tuple
        text = article[0]
        ### The title of the article is extracted from the title of the article
        title = article[1]
###     Splitting text into list of sentences
        sents = sent_tokenize(text)
###     Creating a list of words from the sentences in the article
        word_sent = [word_tokenize(s.lower()) for s in sents]
        self._freq = self._compute_freq(word_sent)
        ranking = defaultdict(int)

###     Creating a tuple of indices and sentences and incrementing the Rankings dictionary
        for i, sent in enumerate(word_sent):
            for word in sent:
                if word in self._freq:
                    ranking[i] += self._freq[word]

###     Sorting the sentences based on their frequencies
        sents_idx = nlargest(n, ranking, key = ranking.get)
        
        return [sents[j] for j in sents_idx]

### Setting up the Training Dataset

In [6]:
urlWashingtonPostNonTech = "https://www.washingtonpost.com/sports"
urlNewYorkTimesNonTech = "http://www.nytimes.com/pages/sports/index.html"
urlWashingtonPostTech = "https://www.washingtonpost.com/business/technology"
urlNewYorkTimesTech = "http://www.nytimes.com/pages/technology/index.html"

#### Downloading all the content from The Washington Post

In [7]:
WashingtonPostTechArticles = scrapeSource(urlWashingtonPostTech, '2016', getWashPostText, 'article')
WashingtonPostNonTechArticles = scrapeSource(urlWashingtonPostNonTech, '2016', getWashPostText, 'article')

https://www.washingtonpost.com/news/food/wp/2016/04/22/londons-first-nude-restaurant-has-a-waiting-list-11000-names-long/
https://www.washingtonpost.com/news/wonk/wp/2016/04/22/on-this-issue-donald-trump-knows-a-lot-more-than-other-republicans-sad/
https://www.washingtonpost.com/local/she-was-found-pushing-her-dead-son-on-a-swing-now-she-lives-with-what-she-lost/2016/04/20/581732cc-f2c1-11e5-85a6-2132cf446d0a_story.html
https://www.washingtonpost.com/news/the-fix/wp/2016/04/22/president-obama-just-met-the-most-powerful-2-year-old-in-the-world-prince-george/
https://www.washingtonpost.com/sports/capitals/suddenly-the-capitals-are-forced-to-confront-their-ghosts-of-playoffs-past-again/2016/04/22/80f12a06-08c7-11e6-a12f-ea5aed7958dc_story.html
https://www.washingtonpost.com/news/morning-mix/wp/2016/04/22/jesus-came-along-with-the-dark-angels-mother-charged-with-suffocating-her-daughter-to-death/
https://www.washingtonpost.com/news/early-lead/wp/2016/04/21/conor-mcgregor-declares-i-am-not-



 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))


#### Downloading all the content from The New York Times

In [8]:
newYorkTimesTechArticles = scrapeSource(urlNewYorkTimesTech, '2016', getNYTText, None)
newYorkTimesNonTechArticles = scrapeSource(urlNewYorkTimesNonTech, '2016', getNYTText, None)

http://www.nytimes.com/adx/bin/adx_click.html?type=goto&opzn&page=www.nytimes.com/pages/technology/index.html&pos=Bar1&sn2=2630ddf6/7cfc6770&sn1=70c2df77/67d9a8a3&camp=nyt2016_bar1_digihd_BAU_mtr_msg_64RWJ&ad=bar1_digihd_BAU_mtr_v2_fonts_https_64RWJ&goto=http%3A%2F%2Fwww%2Enytimes%2Ecom%2Fsubscriptions%2FMultiproduct%2Flp3004%2Ehtml%3Fadxc%3D283467%26adxa%3D413342%26page%3Dwww.nytimes.com/pages/technology/index.html%26pos%3DBar1%26campaignId%3D64RWJ
http://www.nytimes.com/adx/bin/adx_click.html?type=goto&opzn&page=www.nytimes.com/pages/technology/index.html&pos=Bar1&sn2=2630ddf6/7cfc6770&sn1=aa293c57/b4ec2814&camp=nyt2016_bar1_digihd_BAU_mtr_msg_64RWJ&ad=bar1_digihd_BAU_mtr_v2_fonts_https_64RWJ&goto=https%3A%2F%2Fwww%2Enytimesathome%2Ecom%2Fhd%2F205%3FMediaCode%3DWB7AA%26CMP%3D64RWL%26pos%3DBar1%26campaignId%3D64RWL
http://www.nytimes.com/adx/bin/adx_click.html?type=goto&opzn&page=www.nytimes.com/pages/technology/index.html&pos=Bar1&sn2=2630ddf6/7cfc6770&sn1=70c2df77/67d9a8a3&camp=nyt2



 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))


### Setting up the Training Articles for Classification

In [9]:
articleSummaries = {}

### Article Summaries for Tech articles
for techUrlDictionary in [newYorkTimesTechArticles, WashingtonPostTechArticles]:
    for articleUrl in techUrlDictionary:
        if techUrlDictionary[articleUrl][0] is not None:
            if len(techUrlDictionary[articleUrl][0]) > 0:
                fs = FreqSummarizer()
                summary = fs.extractFeatures(techUrlDictionary[articleUrl], 25)
                articleSummaries[articleUrl] = {'feature-vector': summary, 'label': 'Tech'}
            
### Article Summaries for Non-Tech articles
for nonTechUrlDictionary in [newYorkTimesNonTechArticles, WashingtonPostNonTechArticles]:
    for articleUrl in nonTechUrlDictionary:
        if nonTechUrlDictionary[articleUrl][0] is not None:
            if len(nonTechUrlDictionary[articleUrl][0]) > 0:
                fs = FreqSummarizer()
                summary = fs.extractFeatures(nonTechUrlDictionary[articleUrl], 25)
                articleSummaries[articleUrl] = {'feature-vector': summary, 'label': 'Non-Tech'}

### Setting up the Test Articles for Classification

In [21]:
def getNewsNow(testUrl,token):
    response = requests.get(testUrl)
    soup = BeautifulSoup(response.content)
    page = str(soup)
    title = soup.find("title").text
    mydivs = soup.findAll("a", {"class":token})
    text = ''.join(map(lambda a:a.text,mydivs))
    return text,title

#### Defining the Test case instance

In [22]:
testUrl = "http://www.newsnow.co.uk/h/Sport"
testArticle = getNewsNow(testUrl,"hll")

fs = FreqSummarizer()
testArticleSummary = fs.extractFeatures(testArticle, 25)



 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))


In [36]:
### Defining a dictionary to keep track of the distance between the test instance and each article in the training dataset
similarities = {}
### Finding the number of similar instances or distance between the test article and the training corpus of articles
for articleUrl in articleSummaries:
    oneArticleSummary = articleSummaries[articleUrl]['feature-vector']
    similarities[articleUrl] = len(set(testArticleSummary).intersection(set(oneArticleSummary)))

### Defining a dictionary to keep track of the label and how many instances in the 5NN are Tech or Non-Tech
labels = defaultdict(int)
knn = nlargest(10, similarities, key = similarities.get)
for oneNeighbor in knn:
    labels[articleSummaries[oneNeighbor]['label']] += 1

nlargest(1, labels, key = labels.get)

['Non-Tech']

In [25]:
testArticleSummary

['man',
 'tyson',
 'rugby',
 'star',
 'west',
 'mcgregor',
 'new',
 'says',
 'golden',
 'chelsea',
 'united',
 'super',
 'hurricanes',
 'striker',
 'bizarre',
 'liverpool',
 'slow',
 'tottenham',
 'season',
 'prince',
 'klopp',
 'kohli',
 '2016',
 'city',
 'captain']