In [7]:
import requests
from bs4 import BeautifulSoup
from newspaper import Article
import nltk
import pymongo

In [11]:
import numpy as np

In [102]:
import pandas as pd

In [47]:
!python -m nltk.downloader all

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to C:\Users\Nishant
[nltk_data]    |     Jain\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\abc.zip.
[nltk_data]    | Downloading package alpino to C:\Users\Nishant
[nltk_data]    |     Jain\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\alpino.zip.
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     C:\Users\Nishant
[nltk_data]    |     Jain\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\biocreative_ppi.zip.
[nltk_data]    | Downloading package brown to C:\Users\Nishant
[nltk_data]    |     Jain\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\brown.zip.
[nltk_data]    | Downloading package brown_tei to C:\Users\Nishant
[nltk_data]    |     Jain\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\brown_tei.zip.
[nltk_data]    | Downloading package cess_cat to C:\Users\Nishan

In [5]:
url = "https://www.moneycontrol.com/"
req1 = requests.get(url)

mnc = req1.content

soup_mnc = BeautifulSoup(mnc)

mnc_links = soup_mnc.find_all('a')

In [14]:
links = []
valid = []
for i in range(len(mnc_links)):
    try:
        link = mnc_links[i]['href']
    except:
        continue
    if(link.startswith("https://www.moneycontrol.com/news")):
        if(len(link)>100):
            links.append(link)
            valid.append(i)

In [16]:
_ ,unq_index = np.unique(links,return_index=True)

unq_index.sort()

links = np.array(links)[unq_index]
valid = np.array(valid)[unq_index]


### Get Articles Data

In [151]:
def article_to_text(link):
    article = Article(link)
    article.download()
    article.parse()
    return article.text

#### Clean Data via NLP pipeline

In [None]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()
stop_words = set(stopwords.words('english'))
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
def identity_tokenizer(text):
    return text

In [149]:
def tranform_text(text):
    tokens = nltk.tokenize.word_tokenize(text)

    words = [word for word in tokens if word.isalpha()]

    words = [w for w in words if not w in stop_words]

    stemmed = [porter.stem(word.lower()) for word in words]
    return stemmed

def tfidf(tokenised_text):
    tfdf = TfidfVectorizer(tokenizer=identity_tokenizer, lowercase=False)    

    tfdf_vector = tfdf.fit_transform([tokenised_text])

    feature_names = tfdf.get_feature_names()
    dense = tfdf_vector.todense()
    denselist = dense.tolist()
    df = pd.DataFrame(denselist, columns=feature_names)
    return df

#### Cosine similarity

In [225]:
def cosinesim(link1,link2):  
    doc1_tfidf = tfidf(tranform_text(article_to_text(link1)))

    doc2_tfidf = tfidf(tranform_text(article_to_text(link2)))

    combinde_df = pd.concat([a1,a2], axis=0, ignore_index=True).fillna(0)

    similarity = cosine_similarity([combinde_df.loc[0],combinde_df.loc[1]])
    
    return similarity[0,1]

In [226]:
cosinesim(links[0],links[1])

0.07247518602637545

# Setting up Mongo DB

In [1]:
import pymongo

In [275]:
myclient = pymongo.MongoClient("mongodb://localhost:27017/")

mydb = myclient["mydatabase"]

In [281]:
mycol = mydb["moneycontrol"]
mycol.drop()

In [277]:
mycol.create_index("link", unique = True)

'link_1'

In [282]:
id_mnc = []
for link in links:
    article_dict = {"link":link,"text": article_to_text(link)}
    x = mycol.insert_one(article_dict)
    id_mnc.append(x.inserted_id)

# Economic Times

In [283]:
url = "https://economictimes.indiatimes.com/"
req2 = requests.get(url)

In [284]:
et = req2.content

soup_et = BeautifulSoup(et)

et_links = soup_et.find_all('a')

In [295]:
et_links[100]

<a data-ga-onclick="23 - href" href="/markets/expert-view/voda-idea-survival-depends-on-whether-promoters-can-invest-more-ss-sirohi/articleshow/77068344.cms" target="_blank">Voda-Idea: Ball's in promoters'court</a>

In [297]:
t = et_links[100]["data-ga-onclick"]

In [305]:
t

'23 - href'

In [330]:
et_links_extracted = []
et_valid = []
for i in range(len(et_links)):
    try:
        link = et_links[i]["data-ga-onclick"]
    except:
        continue
    if(link.endswith("href")):
        if(link[0].isnumeric()):
            et_links_extracted.append("https://economictimes.indiatimes.com"+et_links[i]['href'])
            et_valid.append(i)

In [346]:
pd.DataFrame(zip(et_links_extracted,et_valid))

Unnamed: 0,0,1
0,https://economictimes.indiatimes.com/news/econ...,70
1,https://economictimes.indiatimes.com/industry/...,71
2,https://economictimes.indiatimes.com/tech/ites...,72
3,https://economictimes.indiatimes.com/industry/...,73
4,https://economictimes.indiatimes.com/industry/...,74
...,...,...
272,https://economictimes.indiatimes.com/markets/s...,555
273,https://economictimes.indiatimes.com/markets/s...,556
274,https://economictimes.indiatimes.com/markets/s...,557
275,https://economictimes.indiatimes.com/markets/s...,558


In [324]:
_ ,unq_index_et = np.unique(et_links_extracted,return_index=True)

unq_index_et.sort()

et_links_extracted = np.array(et_links_extracted)[unq_index_et]
et_valid = np.array(et_valid)[unq_index_et]

#### Store data in db

In [327]:
etcol = mydb["et"]
etcol.drop()

In [328]:
etcol.create_index("link", unique = True)

'link_1'

In [329]:
id_et = []
for link in et_links_extracted:
    article_dict = {"link":link,"text": article_to_text(link)}
    x = etcol.insert_one(article_dict)
    id_et.append(x.inserted_id)

ArticleException: Article `download()` failed with HTTPSConnectionPool(host='economictimes.indiatimes.comhttps', port=443): Max retries exceeded with url: //economictimes.indiatimes.com/blogs/et-editorials/let-writing-rubber-cheques-stay-criminal/ (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000001F6869C2DF0>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed')) on URL https://economictimes.indiatimes.comhttps://economictimes.indiatimes.com/blogs/et-editorials/let-writing-rubber-cheques-stay-criminal/