## Scraping hacker news data for last 24 hours

In [42]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime, timedelta

In [43]:
#define the URL for google news RSS feed(Rih Site Summary)
url='https://news.google.com/rss?h1=en-US&gl=US&ceid=US:en'

In [44]:
#make a request to the RSS feed URL
response=requests.get(url)
response

<Response [200]>

In [45]:
#parse the XML content using BeautifulSoup
soup=BeautifulSoup(response.content,"xml")

In [46]:
#find all <item> elements which represent individual news articles
articles=soup.find_all("item")

In [47]:
articles

[<item><title>What to watch for during Trump’s court appearance - CNN</title><link>https://news.google.com/rss/articles/CBMiV2h0dHBzOi8vd3d3LmNubi5jb20vMjAyMy8wNi8xMy9wb2xpdGljcy90cnVtcC1jb3VydC1hcHBlYXJhbmNlLXdoYXQtdG8td2F0Y2gvaW5kZXguaHRtbNIBW2h0dHBzOi8vYW1wLmNubi5jb20vY25uLzIwMjMvMDYvMTMvcG9saXRpY3MvdHJ1bXAtY291cnQtYXBwZWFyYW5jZS13aGF0LXRvLXdhdGNoL2luZGV4Lmh0bWw?oc=5</link><guid isPermaLink="false">2126021645</guid><pubDate>Tue, 13 Jun 2023 09:01:00 GMT</pubDate><description>&lt;ol&gt;&lt;li&gt;&lt;a href="https://news.google.com/rss/articles/CBMiV2h0dHBzOi8vd3d3LmNubi5jb20vMjAyMy8wNi8xMy9wb2xpdGljcy90cnVtcC1jb3VydC1hcHBlYXJhbmNlLXdoYXQtdG8td2F0Y2gvaW5kZXguaHRtbNIBW2h0dHBzOi8vYW1wLmNubi5jb20vY25uLzIwMjMvMDYvMTMvcG9saXRpY3MvdHJ1bXAtY291cnQtYXBwZWFyYW5jZS13aGF0LXRvLXdhdGNoL2luZGV4Lmh0bWw?oc=5" target="_blank"&gt;What to watch for during Trump’s court appearance&lt;/a&gt;&amp;nbsp;&amp;nbsp;&lt;font color="#6f6f6f"&gt;CNN&lt;/font&gt;&lt;/li&gt;&lt;li&gt;&lt;a href="https://news.google

In [48]:
data=[] #list to store scraped data

In [49]:
#calculate the cutoff time for the past 24 hours
cutoff_time=datetime.now() - timedelta(hours=24)

In [50]:
for article in articles:
    #extract relevant data from each article
    pub_date=datetime.strptime(article.pubDate.text,"%a, %d %b %Y %H:%M:%S %Z")
    #check if the article was published within the last 24 hours
    if pub_date>=cutoff_time:
        title=article.title.text # extract title
        link=article.link.text # extract link
        description=article.description.text #extract description
        
        #append the extracted data to the list
        data.append([title,link,description])
# create a DataFrame from the scraped data
df=pd.DataFrame(data,columns=['Title','Link','Description'])
        

In [51]:
df.head()

Unnamed: 0,Title,Link,Description
0,What to watch for during Trump’s court appeara...,https://news.google.com/rss/articles/CBMiV2h0d...,"<ol><li><a href=""https://news.google.com/rss/a..."
1,"Russia-Ukraine war: List of key events, day 47...",https://news.google.com/rss/articles/CBMiVmh0d...,"<ol><li><a href=""https://news.google.com/rss/a..."
2,Russian missile attack on Zelenskyy's hometown...,https://news.google.com/rss/articles/CBMiZmh0d...,"<ol><li><a href=""https://news.google.com/rss/a..."
3,Three people found dead in major incident in N...,https://news.google.com/rss/articles/CBMiR2h0d...,"<ol><li><a href=""https://news.google.com/rss/a..."
4,Chris Christie slams 'loser' Trump over docume...,https://news.google.com/rss/articles/CBMiaGh0d...,"<ol><li><a href=""https://news.google.com/rss/a..."


#### Apply unsupervised clustering algorithm to get the labelled data

In [52]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

In [53]:
#convert the description text into numerical features
vectorizer=TfidfVectorizer()
X=vectorizer.fit_transform(df['Description'])

In [54]:
#apply k-means clustering
k=5 #number of clusters
kmeans=KMeans(n_clusters=k,random_state=42)
kmeans.fit(X)

KMeans(n_clusters=5, random_state=42)

In [55]:
#get the cluster labels for each news article
labels=kmeans.labels_

In [56]:
labels

array([4, 2, 2, 2, 4, 4, 4, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2,
       2, 3, 3, 2, 2, 0, 2, 2, 2, 2, 2], dtype=int32)

In [57]:
#add the cluster labels to the dataframe
df['Cluster']=labels

In [58]:
#count the number of articles in each cluster
cluster_counts=df['Cluster'].value_counts().sort_index()

In [60]:
cluster_counts

0     2
1     2
2    23
3     2
4     4
Name: Cluster, dtype: int64