# Data Scraping

In [1]:
import requests
from bs4 import BeautifulSoup
from newspaper import Article
import nltk
import pymongo
import numpy as np
import pandas as pd

In [2]:
# !python -m nltk.downloader all

#### Moneycontrol

In [3]:
url = "https://www.moneycontrol.com/"
req1 = requests.get(url)
mnc = req1.content
soup_mnc = BeautifulSoup(mnc)
mnc_links = soup_mnc.find_all('a')

In [4]:
links = []
valid = []
for i in range(len(mnc_links)):
    try:
        link = mnc_links[i]['href']
    except:
        continue
    if(link.startswith("https://www.moneycontrol.com/news")):
        if(len(link)>100):
            links.append(link)
            valid.append(i)

Keeping only unique links

In [5]:
_ ,unq_index = np.unique(links,return_index=True)

unq_index.sort()

links = np.array(links)[unq_index]
valid = np.array(valid)[unq_index]


#### Get Articles Data Function using newspaper library

In [6]:
def article_to_text(link):
    article = Article(link)
    article.download()
    article.parse()
    return [article.text, article.title]

## Clean Data via NLP pipeline

In [7]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()
stop_words = set(stopwords.words('english'))
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [8]:
def identity_tokenizer(text):
    return text

In [9]:
def tranform_text(text):
    tokens = nltk.tokenize.word_tokenize(text)

    words = [word for word in tokens if word.isalpha()]

    words = [w for w in words if not w in stop_words]

    stemmed = [porter.stem(word.lower()) for word in words]
    return stemmed

# Setting up Mongo DB

## MoneyControl

In [10]:
import pymongo

In [11]:
myclient = pymongo.MongoClient("mongodb://localhost:27017/")

mydb = myclient["mydatabase"]

In [12]:
mycol = mydb["moneycontrol"]
mycol.drop()

In [13]:
mycol.create_index("link", unique = True)

'link_1'

In [14]:
id_mnc = []
for link in links:
    text, title = article_to_text(link)
    article_dict = {"link":link,"text": text, "title": title}
    x = mycol.insert_one(article_dict)
    id_mnc.append(x.inserted_id)

In [16]:
# To check if any articles have missing data
# cursor = mycol.find({"$where": 'this.text.length < 700'})
# for document in cursor: pprint(document)

In [31]:
mycol.estimated_document_count()

162

## Economic Times

In [17]:
url = "https://economictimes.indiatimes.com/"
req2 = requests.get(url)

In [18]:
et = req2.content

soup_et = BeautifulSoup(et)

et_links = soup_et.find_all('a')

In [20]:
et_links_extracted = []
et_valid = []
for i in range(len(et_links)):
    try:
        link = et_links[i]["data-ga-onclick"]
    except:
        continue
    if(link.endswith("href")):
        if(link[0].isnumeric()):
            if(len(et_links[i]['href'])>75):
                if(et_links[i]['href'].startswith("https")):
                    et_links_extracted.append(et_links[i]['href'])
                else:
                    et_links_extracted.append("https://economictimes.indiatimes.com"+et_links[i]['href'])
                et_valid.append(i)


In [21]:
# pd.DataFrame(zip(et_links_extracted,et_valid))

In [22]:
_ ,unq_index_et = np.unique(et_links_extracted,return_index=True)

unq_index_et.sort()

et_links_extracted = np.array(et_links_extracted)[unq_index_et]
et_valid = np.array(et_valid)[unq_index_et]

#### Store data in db

In [23]:
etcol = mydb["et"]
etcol.drop()

In [24]:
etcol.create_index("link", unique = True)

'link_1'

In [25]:
def article_to_text_bs4(url):
    html = requests.get(url)
    soup = BeautifulSoup(html.content)
    try:
        text = soup.find("div", class_="artText").get_text(separator = " ")
    except:
        try:
            text = soup.find("section").get_text(separator = " ")
        except:
            raise ValueError()
    return text

In [26]:
id_et = []
for link in et_links_extracted:
    text, title = article_to_text(link)
    article_dict = {"link":link,"text": text, "title": title}
    x = etcol.insert_one(article_dict)
    id_et.append(x.inserted_id)

In [27]:
# cursor = etcol.find({})
# for document in cursor: pprint(document)

In [32]:
etcol.estimated_document_count()

164

In [29]:
cursor = etcol.find({"$where": 'this.text.length < 700'})
id_change = []
for document in cursor: 
#     pprint(document)
    id_change.append(document["_id"])

In [30]:
for ID in id_change:
    etcol.update_one({"_id": pymongo.collection.ObjectId(ID)}, 
                     {"$set":{"text": article_to_text_bs4(etcol.find({"_id": pymongo.collection.ObjectId(ID)})[0]['link'])}})

In [33]:
# for ID in id_change:
#     pprint(etcol.find({"_id": pymongo.collection.ObjectId(ID)})[0])

### TFIDF

#### Get MongoDB to DF

In [34]:
cursor_mnc = mycol.find({}) 
mnc = pd.DataFrame(list(cursor_mnc))

In [36]:
cursor_et = etcol.find({}) 
et = pd.DataFrame(list(cursor_et))

#### Dictionary of docs

In [48]:
mnc_text_list = mnc.text.to_list()
et_text_list = et.text.to_list()
combined_list = mnc_text_list + et_text_list

In [49]:
len(combined_list)

326

In [103]:
combined_list_tokenized = [tranform_text(l) for l in combined_list]

In [109]:
tfdf = TfidfVectorizer(tokenizer=identity_tokenizer,lowercase=False)  

In [110]:
tfdf_vector = tfdf.fit(combined_list_tokenized)



In [377]:
index=tfdf_vector.get_feature_names()

In [170]:
def get_tfidf(text):
    return tfdf_vector.transform([tranform_text(text)]).todense()

In [382]:
get_tfidf(combined_list[0])

matrix([[0.02731861, 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ]])

In [179]:
mnc["tfidf"] = mnc.text.apply(get_tfidf)

In [180]:
et["tfidf"] = et.text.apply(get_tfidf)

### Cosine Similarity

In [222]:
df_cos_sim = pd.DataFrame(index=et._id, columns=mnc._id)

In [225]:
def mongo_id(string_id):
    return pymongo.collection.ObjectId(string_id)

In [226]:
# df_cos_sim.loc[mongo_id("5f17453d0469677f8c1ec960"),mongo_id("5f1744ba0469677f8c1ec8be")]

nan

In [228]:
et.set_index("_id",inplace=True)
mnc.set_index("_id", inplace=True)

In [262]:
mnc.index = mnc.index.astype("category")

In [263]:
et.index =et.index.astype("category")

In [264]:
df_cos_sim.index =df_cos_sim.index.astype("category")
df_cos_sim.columns =df_cos_sim.columns.astype("category")

In [233]:
for index in df_cos_sim.index:
    for col in df_cos_sim.columns:
        df_cos_sim.at[index,col] = cosine_similarity(et.at[index,"tfidf"],mnc.at[col,"tfidf"])[0][0]

In [217]:
# cosine_similarity(mnc.tfidf[0],et.tfidf[0])[0][0]

0.011467326574304998

In [240]:
df_cos_sim

_id,5f1744ba0469677f8c1ec8be,5f1744ba0469677f8c1ec8bf,5f1744bb0469677f8c1ec8c0,5f1744bb0469677f8c1ec8c1,5f1744bb0469677f8c1ec8c2,5f1744bc0469677f8c1ec8c3,5f1744bc0469677f8c1ec8c4,5f1744bc0469677f8c1ec8c5,5f1744bd0469677f8c1ec8c6,5f1744bd0469677f8c1ec8c7,...,5f1745190469677f8c1ec956,5f1745190469677f8c1ec957,5f17451a0469677f8c1ec958,5f17451a0469677f8c1ec959,5f17451b0469677f8c1ec95a,5f17451b0469677f8c1ec95b,5f17451c0469677f8c1ec95c,5f17451d0469677f8c1ec95d,5f17451d0469677f8c1ec95e,5f17451e0469677f8c1ec95f
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5f17453d0469677f8c1ec960,0.0114673,0,0,0,0,0,0.0463961,0,0.0185727,0.016028,...,0.029735,0,0,0.0239734,0.00589689,0.0264672,0.0150747,0,0,0.00496967
5f17453e0469677f8c1ec961,0.0388334,0.0173982,0.0565951,0.0422151,0.0619221,0.0218676,0.0250631,0.0505814,0.0337798,0.0411998,...,0.0379356,0.0249632,0.0447389,0.0436684,0.0533956,0.125201,0.0507401,0.057748,0.0413346,0.039054
5f17453e0469677f8c1ec962,0.0312414,0.0294249,0.0622358,0.0311685,0.0187106,0.0580849,0.0319226,0.0767201,0.0115219,0.0292958,...,0.0322794,0.0323973,0.0152817,0.0406278,0.0296115,0.102691,0.0227118,0.0255522,0.0261444,0.054675
5f17453e0469677f8c1ec963,0.0634909,0.0100158,0.0414677,0.0693265,0.0145547,0.00273807,0.00547404,0.0116996,0.0602043,0.0598529,...,0.0268011,0.0108062,0.0529699,0.0391933,0.0519455,0.0332845,0.00899718,0.00843401,0.0283627,0.0359305
5f17453e0469677f8c1ec964,0.243824,0.0160567,0.218952,0.262918,0.018574,0.258621,0.00108253,0.0126162,0.273079,0.288077,...,0.027143,0.0396664,0.101418,0.193663,0.0621848,0.0423586,0.00917726,0.0065596,0.0169228,0.0238648
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5f1745a10469677f8c1ec9ff,0.0626323,0.0912729,0.0665884,0.0378552,0.0341251,0.0197389,0.027089,0.0180509,0.0440727,0.046347,...,0.0206509,0.0211657,0.0787627,0.0243525,0.0341151,0.0601454,0.0395705,0.0128242,0.0211429,0.0213133
5f1745a10469677f8c1eca00,0.0622099,0.0202952,0.119831,0.0961647,0.0133872,0.042095,0.0107277,0.0123336,0.113164,0.155136,...,0.00865765,0.0903482,0.082271,0.046309,0.0251939,0.0434824,0.0319511,0.00839136,0.0437764,0.0140102
5f1745a20469677f8c1eca01,0.00960579,0,0.01683,0.0282392,0.00690256,0.00635265,0.00708433,0.0174493,0.0120735,0.0198752,...,0.0117588,0.0383286,0.0130895,0.0533339,0.00728739,0.0358674,0.0394179,0.0050149,0.0196411,0.035215
5f1745a30469677f8c1eca02,0.0341932,0.0415469,0.0259471,0.0203028,0.0156772,0.0257346,0.00301928,0.0278039,0.0250946,0.0359356,...,0.0341704,0.00639622,0.0453617,0.0267879,0.0585293,0.0593735,0.0381089,0.0193714,0.0452701,0.0350853


In [294]:
# np.argmax(df_cos_sim.iloc[:,0])

13

In [295]:
# Max cosine similarity of Moneycontrol article with which ET article
max_sim = np.argmax(df_cos_sim.to_numpy(),axis = 0)

In [303]:
max_sim_id = df_cos_sim.index[max_sim]

In [308]:
mnc["Similar ET Article"] = et.loc[max_sim_id].link.values

In [340]:
mnc["Sim_Score"] =  df_cos_sim.max(axis = 0).values

In [343]:
mnc["ET_Title"] = et.loc[max_sim_id].title.values

In [362]:
mnc["ET_Text"] = et.loc[max_sim_id].text.values

In [365]:
mnc["ET_Link"] = et.loc[max_sim_id].link.values
mnc["ET_id"] = max_sim_id

In [352]:
from IPython.display import display, HTML

In [375]:
display(HTML(mnc[["title","link","ET_Title","ET_Link","Sim_Score"]].sort_values(by = ["Sim_Score"],ascending=False).head(20).to_html()))

Unnamed: 0_level_0,title,link,ET_Title,ET_Link,Sim_Score
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
5f1744d70469677f8c1ec8f4,"Government open to announce further steps to revive growth, says FM Sitharaman",https://www.moneycontrol.com/news/business/economy/government-open-to-announce-further-steps-to-revive-growth-says-fm-sitharaman-5578571.html,"Government open to announcing more measures to boost growth, says Nirmala Sitharaman",https://economictimes.indiatimes.com/news/economy/policy/government-open-to-announcing-more-measures-to-boost-growth-says-nirmala-sitharaman/articleshow/77089767.cms,0.945453
5f1744c30469677f8c1ec8d1,Coronavirus outbreak: Amarnath Yatra 2020 cancelled,https://www.moneycontrol.com/news/business/coronavirus-outbreak-amarnath-yatra-2020-cancelled-jammu-kashmir-shrine-board-5578401.html,Amarnath Yatra called off amid coronavirus crisis; 'Aarti' to be broadcast live,https://economictimes.indiatimes.com/news/politics-and-nation/amarnath-yatra-called-off-amid-coronavirus-crisis-aarti-to-be-broadcast-live/articleshow/77091101.cms,0.609797
5f1744e30469677f8c1ec905,Corona Kavach policy | Healthcare workers to get 5% discount on medical insurance premium,https://www.moneycontrol.com/news/business/corona-kavach-policy-healthcare-workers-to-get-5-discount-on-medical-insurance-premium-5547501.html,IRDAI allows Corona Kavach policy to be sold as group health insurance,https://economictimes.indiatimes.com/wealth/insure/health-insurance/irdai-allows-corona-kavach-policy-to-be-sold-as-group-health-insurance/articleshow/77089750.cms,0.609004
5f1744bd0469677f8c1ec8c6,"Axis Bank Q1 profit falls 19% to Rs 1,112 crore, NII grows 19.5%; slippages decline",https://www.moneycontrol.com/news/business/earnings/axis-bank-q1-profit-falls-19-to-rs-1112-crore-nii-grows-19-5-5577831.html,"Axis Bank Q1 results: Profit falls 19% YoY to Rs 1,112 crore as provisions jump 16% YoY",https://economictimes.indiatimes.com/markets/stocks/earnings/axis-bank-q1-results-profit-falls-19-yoy-to-rs-1112-crore-misses-street-estimates/articleshow/77086123.cms,0.598263
5f1744d70469677f8c1ec8f3,"Hiring sentiment seeing improvement, says TeamLease Employment Outlook Report",https://www.moneycontrol.com/news/business/economy/hiring-sentiment-seeing-improvement-says-teamlease-employment-outlook-report-5578131.html,Hiring sentiment showing signs of recovery: TeamLease Employment Outlook for Apr-Sep,https://economictimes.indiatimes.com/multimedia/jobs/hiring-sentiment-showing-signs-of-recovery-teamlease-employment-outlook-for-apr-sep/articleshow/77086450.cms,0.596145
5f1744df0469677f8c1ec900,Corona Kavach health insurance policy evokes good response: Insurers,https://www.moneycontrol.com/news/india/corona-kavach-health-insurance-policy-evokes-good-response-insurers-5568181.html,IRDAI allows Corona Kavach policy to be sold as group health insurance,https://economictimes.indiatimes.com/wealth/insure/health-insurance/irdai-allows-corona-kavach-policy-to-be-sold-as-group-health-insurance/articleshow/77089750.cms,0.578422
5f1744e20469677f8c1ec904,Does covering pre-existing ailments make Corona Kavach and Rakshak must-haves?,https://www.moneycontrol.com/news/business/personal-finance/does-covering-pre-existing-ailments-make-corona-kavach-and-suraksha-must-haves-5553491.html,IRDAI allows Corona Kavach policy to be sold as group health insurance,https://economictimes.indiatimes.com/wealth/insure/health-insurance/irdai-allows-corona-kavach-policy-to-be-sold-as-group-health-insurance/articleshow/77089750.cms,0.529951
5f1744f40469677f8c1ec91d,Deadline extension: 3 attractive tax-saving investments for you,https://www.moneycontrol.com/news/business/personal-finance/deadline-extension-3-attractive-tax-saving-investments-for-you-5516611.html,Best tax saving mutual fund schemes,https://economictimes.indiatimes.com/mf/analysis/which-are-the-best-mutual-fund-schemes-to-save-taxes-in-2020/articleshow/77065021.cms,0.519498
5f1744bb0469677f8c1ec8c1,"Bajaj Finance consolidated Q1 profit falls 19% to Rs 962 crore, new loans down 76%",https://www.moneycontrol.com/news/business/earnings/bajaj-finance-consolidated-q1-profit-falls-19-to-rs-962-crore-new-loans-down-76-5577211.html,"Axis Bank Q1 results: Profit falls 19% YoY to Rs 1,112 crore as provisions jump 16% YoY",https://economictimes.indiatimes.com/markets/stocks/earnings/axis-bank-q1-results-profit-falls-19-yoy-to-rs-1112-crore-misses-street-estimates/articleshow/77086123.cms,0.508532
5f17450a0469677f8c1ec93d,Gold returns 25% so far this year; should you buy more or just hold?,https://www.moneycontrol.com/news/business/personal-finance/gold-returns-25-so-far-this-year-should-you-buy-more-or-just-hold-5531281.html,"Kotak Gold Fund: Don't chase price: Abhishek Bisen, Fund Manager, Kotak Gold Fund",https://economictimes.indiatimes.com/mf/analysis/dont-chase-price-abhishek-bisen-fund-manager-kotak-gold-fund/articleshow/77060460.cms,0.489989


### Example 1:
<b>MNC Title: </b> Gold hovers near nine-year high as virus fears drive safe-haven demand	https://www.moneycontrol.com/news/business/markets/gold-hovers-near-nine-year-high-as-virus-fears-drive-safe-haven-demand-5574941.html	<br>
<b>ET Title: </b>It’s silver’s turn to shine as prices surge to four-year high	https://economictimes.indiatimes.com/markets/commodities/news/its-silvers-turn-to-shine-as-prices-surge-to-four-year-high/articleshow/77087263.cms <br>
<b>Cosine Sim: </b>0.451393 <br> <br>

Though both articles are about price surge of commodity, but one has focus on gold, whereas other is completely about silver. The moneycontrol article talks about silver in one line only, whereas ET article is wholly focussed on silver. <br>
So TFIDF here picked up similarity but couldn't pick up the main topic. Although the cosine score is not high as well.

### Example 2:
<b>MNC Title: </b> Coronavirus outbreak: Amarnath Yatra 2020 cancelled	https://www.moneycontrol.com/news/business/coronavirus-outbreak-amarnath-yatra-2020-cancelled-jammu-kashmir-shrine-board-5578401.html	<br>
<b>ET Title: </b>Amarnath Yatra called off amid coronavirus crisis; 'Aarti' to be broadcast live	https://economictimes.indiatimes.com/news/politics-and-nation/amarnath-yatra-called-off-amid-coronavirus-crisis-aarti-to-be-broadcast-live/articleshow/77091101.cms <br>
<b>Cosine Sim: </b>0.609797 <br> <br>

Here TFIDF an Cosine similarity works great, both articles covers the same topics and same news.

# 