In [0]:
%pip install bs4
%pip install lxml
%pip install nltk
%pip install textblob

## STEP 1: Web scraping to collect all posts/articles from a website

In [0]:
#Get all the posts links from a blog

import urllib.request as ur
from bs4 import BeautifulSoup
#import urllib2
#from bs4 import BeautifulSoup

def getAllDoxyDonkeyPosts(url,links):
    """Collect the URL of every "Older Posts" page reachable from ``url``.

    Starting at ``url``, each page is fetched and scanned for anchors whose
    ``title`` attribute is exactly "Older Posts". Every such target is
    printed, appended to ``links`` and crawled in turn, so ``links`` ends up
    holding the pagination chain in discovery order. ``url`` itself is not
    appended.

    Args:
        url: Address of the page to start crawling from.
        links: List that is extended in place with the discovered page URLs.
            URLs already present in it are not crawled again.

    Returns:
        None; the result is delivered through ``links``.

    Raises:
        OSError: If the starting page cannot be fetched (``urllib`` reports
            network failures as ``URLError``, an ``OSError`` subclass). A
            failure on a later page is reported and that branch is dropped,
            keeping the links gathered so far.
    """
    # URLs already queued or crawled; protects against pagination loops.
    seen = set(links)
    seen.add(url)
    # Explicit stack instead of recursion: a long archive would otherwise nest
    # one call per page and could exhaust the interpreter's recursion limit.
    pending = [url]
    while pending:
        page_url = pending.pop()
        if page_url != url:
            print("Older Posts", page_url)
            links.append(page_url)
        try:
            # The context manager closes the HTTP response once it is parsed.
            with ur.urlopen(ur.Request(page_url)) as response:
                # Name the parser explicitly (lxml is installed in the first
                # cell) instead of letting BeautifulSoup guess one.
                soup = BeautifulSoup(response, "lxml")
        except OSError as err:
            if page_url == url:
                raise
            print("Could not fetch", page_url, "-", err)
            continue
        older_pages = []
        # 'a' is the HTML link tag.
        for a in soup.find_all('a'):
            href = a.get('href')
            if not href or a.get('title') != "Older Posts" or href in seen:
                continue
            seen.add(href)
            older_pages.append(href)
        # Push in reverse so the first link found on the page is crawled first.
        pending.extend(reversed(older_pages))

blogUrl = "http://doxydonkey.blogspot.in"
# Seed the list with the front page itself: the crawler only records the "Older Posts"
# targets, so without this the posts shown on blogUrl would never be scraped.
links = [blogUrl]
getAllDoxyDonkeyPosts(blogUrl,links)

In [0]:
#get all the posts from all the links obtained above
def getDoxyDonkeyText(testUrl):
    """Return the text of every bullet point found in the post bodies of a page.

    The page at ``testUrl`` is downloaded and parsed; every ``<li>`` element
    inside a ``<div class="post-body">`` contributes one string. Non-ASCII
    characters are replaced by a single space each; all other characters,
    including genuine question marks, are kept.

    Args:
        testUrl: Address of the blog page to scrape.

    Returns:
        list[str]: One ASCII-only string per bullet point, in page order.
        Empty when the page has no matching elements.

    Raises:
        OSError: If the page cannot be fetched (``urllib`` raises ``URLError``,
            an ``OSError`` subclass).
    """
    request = ur.Request(testUrl)
    # The context manager closes the HTTP response once it is parsed.
    with ur.urlopen(request) as response:
        # Name the parser explicitly (lxml is installed in the first cell)
        # instead of letting BeautifulSoup guess one.
        soup = BeautifulSoup(response, "lxml")

    # Found via the browser's "Inspect Element": the article body sits inside
    # <div class="post-body"> and every article is one <li> bullet in it.
    posts = []
    for div in soup.find_all("div", {"class": 'post-body'}):
        for item in div.find_all("li"):
            # Replace characters directly rather than going through
            # encode(errors='replace') + replace("?", " "), which also erased
            # every real question mark in the text.
            posts.append("".join(ch if ord(ch) < 128 else " " for ch in item.text))
    return posts

In [0]:
# Scrape every collected page and flatten the per-page bullet lists into one corpus.
doxyDonkeyPosts = []
for pageUrl in links:
    doxyDonkeyPosts.extend(getDoxyDonkeyText(pageUrl))

In [0]:
# Preview the first few posts only; displaying the whole corpus floods the cell output.
doxyDonkeyPosts[:5]

## STEP 2: Now that we have all our posts, use K-Means clustering to group similar posts into the same bucket

In [0]:
# Corpus size: the number of bullet-point posts collected by the scraping cells.
len(doxyDonkeyPosts)

In [0]:

from sklearn.feature_extraction.text import TfidfVectorizer
# Turn the corpus into a TF-IDF matrix: one row per post, one column per vocabulary term.
# English stop words are dropped, as are terms found in fewer than 2 posts (min_df)
# or in more than half of the posts (max_df).
vectorizer = TfidfVectorizer(
    stop_words='english',
    min_df=2,
    max_df=0.5,
)
X = vectorizer.fit_transform(doxyDonkeyPosts)
X

In [0]:
# Each row of X represents one post of the corpus and each column one vocabulary term
# kept by the vectorizer. The entries are TF-IDF weights, not 0/1 presence flags.
X[0].shape
# e.g. (1, 13220) on an earlier run: 1 row, 13220 vocabulary terms (the count depends on the scraped corpus)

In [0]:
# Print the stored TF-IDF entries of the first post (row 0 of X).
print(X[0])

In [0]:
from sklearn.cluster import KMeans
# Cluster the posts into 3 groups. With n_init=1 the result hinges on a single random
# k-means++ seeding, so pin random_state: the clusters (and the cluster numbers the later
# cells refer to) are then the same on every run.
km = KMeans(n_clusters=3, init='k-means++', max_iter=100, n_init=1, verbose=True, random_state=42)

In [0]:
# Run K-Means on the TF-IDF matrix; afterwards km.labels_ holds one cluster index per row of X.
km.fit(X)

In [0]:
# km.labels_ holds, for every row of X (i.e. every post), the index of the cluster it was assigned to.
# np.unique(..., return_counts=True) returns the distinct cluster indices together with
# the number of posts that fell into each of them.
import numpy as np
np.unique(km.labels_, return_counts=True)

## STEP 3: Now that our articles are grouped into separate clusters, let us identify the common theme that runs across the articles of each cluster. This can be done by extracting the keywords that best characterise each cluster.

In [0]:
# Print every post's row index next to the cluster K-Means assigned it to.
for postIndex, clusterLabel in enumerate(km.labels_):
    print(postIndex, clusterLabel)

In [0]:
# Build {cluster label: combined text of every post assigned to that cluster}.
postsByCluster = {}
for cluster, oneDocument in zip(km.labels_, doxyDonkeyPosts):
    # int() turns the NumPy label into a plain Python int dictionary key.
    postsByCluster.setdefault(int(cluster), []).append(oneDocument)

# Join with a space so the last word of one post and the first word of the next stay
# separate tokens; plain concatenation fused them and skewed the word frequencies.
text = {cluster: " ".join(posts) for cluster, posts in postsByCluster.items()}


In [0]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from collections import defaultdict
from string import punctuation
from heapq import nlargest
import nltk 

In [0]:
# Get the most important words of each cluster's combined text, excluding stop words
# and punctuation. Importance of a word = its frequency within the cluster.

# The stop-word list and word_tokenize rely on NLTK data files that are not shipped with
# the package; fetch them so the cell also works on a fresh environment. Which tokenizer
# resource is needed ('punkt' or 'punkt_tab') depends on the installed NLTK release, and
# a resource that cannot be fetched is skipped without raising.
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource, quiet=True)

_stopwords = set(stopwords.words('english') + list(punctuation))

keywords = {}  # {cluster: its 100 most frequent words, most frequent first}
counts = {}    # {cluster: FreqDist holding the frequency of every kept word}
# Walk over the clusters actually present in `text` instead of a hard-coded range(3),
# so this cell keeps working when the number of clusters is changed.
for cluster in sorted(text):
    word_sent = [word for word in word_tokenize(text[cluster].lower()) if word not in _stopwords]
    freq = FreqDist(word_sent)
    keywords[int(cluster)] = nlargest(100, freq, key=freq.get)
    counts[int(cluster)] = freq

In [0]:
# Dump the per-cluster keyword lists, a few blank lines, then the per-cluster word counts.
print(f"keywords : {keywords}")
print("\n\n")
print(f"counts : {counts}")

In [0]:
# For every cluster, pick up to 10 of its keywords that appear in no other cluster's
# keyword list, ranked by how often they occur in that cluster.
unique_keys = {}
for cluster in keywords:
    # Keywords of all the other clusters, however many clusters there are
    # (the previous version was hard-wired to exactly three).
    keys_other_clusters = set()
    for other, words in keywords.items():
        if other != cluster:
            keys_other_clusters.update(words)
    unique = set(keywords[cluster]) - keys_other_clusters
    # sorted() fixes the order in which equally frequent words are considered;
    # a bare set can iterate in a different order from one run to the next.
    unique_keys[cluster] = nlargest(10, sorted(unique), key=counts[cluster].get)

In [0]:
# Show the cluster-specific keywords computed in the previous cell.
unique_keys

## STEP 4: Now that we have clustered all articles and verified, via their keywords, that each cluster shares a common theme, let us assign a new article to a cluster using K-Nearest Neighbours

In [0]:
# Unseen news article that the following cells vectorize and assign to one of the clusters.
article = "Facebook Inc. has been giving advertisers an inflated metric for the average time users spent watching a video, a measurement that may have helped boost marketer spending on one of Facebook’s most popular ad products. The company, owner of the world’s largest social network, only counts a video as viewed if it has been seen for more than 3 seconds. The metric it gave advertisers for their average video view time incorporated only the people who had watched the video long enough to count as a view in the first place, inflating the metric because it didn’t count anyone who didn’t watch, or watched for a shorter time. Facebook’s stock fell more than 1.5 percent in extended trading after the miscalculation was earlier reported in the Wall Street Journal. Facebook had disclosed the mistake in a posting on its advertiser help center web page several weeks ago. Big advertising buyers and marketers are upset about the inflated metric, and asked the company for more details, according to the report in the Journal, citing unidentified people familiar with the situation. The Menlo Park, California-based company has kept revenue surging in part because of enthusiasm for its video ads, which advertisers compare in performance to those on Twitter, YouTube and around the web."

In [0]:
from sklearn.neighbors import KNeighborsClassifier
# Train a 3-nearest-neighbour classifier on the clustered corpus:
#   features = the TF-IDF rows of X (one per post)
#   targets  = the K-Means cluster number of each post
classifier = KNeighborsClassifier(n_neighbors=3).fit(X, km.labels_)
classifier


In [0]:
# Vectorize the new article with the vocabulary and IDF weights learned from the corpus.
# Non-ASCII characters are replaced by a space, the same clean-up the scraped posts went
# through, so "Facebook’s" is tokenized like the training text instead of collapsing
# into "Facebooks" as it did when those characters were simply dropped.
test = vectorizer.transform(["".join(ch if ord(ch) < 128 else " " for ch in article)])

In [0]:
# Display the vectorized form of the new article.
test

In [0]:
# Predict the cluster number of the new article from its nearest neighbours in the corpus.
classifier.predict(test)