### Abbreviations used:
ET: Economic Times <br>
MNC: Moneycontrol

# Data Scraping

In [2]:
#imports
import requests
from bs4 import BeautifulSoup
from newspaper import Article
import nltk
import pymongo
import numpy as np
import pandas as pd

In [2]:
# !python -m nltk.downloader all

### Moneycontrol

In [3]:
url = "https://www.moneycontrol.com/"
req1 = requests.get(url)
mnc = req1.content
soup_mnc = BeautifulSoup(mnc)
mnc_links = soup_mnc.find_all('a')

In [4]:
links = []
valid = []
for i in range(len(mnc_links)):
    try:
        link = mnc_links[i]['href']
    except:
        continue
    if(link.startswith("https://www.moneycontrol.com/news")):
        if(len(link)>100):
            links.append(link)
            valid.append(i)

Keeping only unique links

In [5]:
_ ,unq_index = np.unique(links,return_index=True)

unq_index.sort()

links = np.array(links)[unq_index]
valid = np.array(valid)[unq_index]


#### Get Articles Data Function using newspaper library

In [3]:
def article_to_text(link):
    article = Article(link)
    article.download()
    article.parse()
    return [article.text, article.title]

## Clean Data via NLP pipeline

In [4]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()
stop_words = set(stopwords.words('english'))
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [5]:
def identity_tokenizer(text):
    return text

### Here serveral text preprocessing techniques are used:
1. Tokenize <br>
2. Remove numeric <br>
3. Remove stopwords, although using TF-IDF will reduce the impact of stop words, but to save processing and space, stop words are removed<br>
4. Stemming: Either stemming or lemmatization could be used, although none is perfect lemmatization is better but computationally expensive. Hence limited here to basic stemming approach. I am using standard porter stemmer provided in NLTK

In [117]:
def tranform_text(text):
    #tokenize
    tokens = nltk.tokenize.word_tokenize(text)
    #remove numeric
    words = [word for word in tokens if word.isalpha()]
    #remove stopwords, using english stopwords from nltk
    words = [w for w in words if not w in stop_words]
    #stemming, using porter stemmer
    stemmed = [porter.stem(word) for word in words]

    return stemmed

# Setting up Mongo DB

## MoneyControl

In [7]:
import pymongo

In [8]:
myclient = pymongo.MongoClient("mongodb://localhost:27017/")

mydb = myclient["mydatabase"]

In [12]:
mycol = mydb["moneycontrol"]
# mycol.drop()

In [13]:
mycol.create_index("link", unique = True)

'link_1'

In [14]:
id_mnc = []
for link in links:
    text, title = article_to_text(link)
    article_dict = {"link":link,"text": text, "title": title}
    x = mycol.insert_one(article_dict)
    id_mnc.append(x.inserted_id)

In [16]:
# To check if any articles have missing data
# cursor = mycol.find({"$where": 'this.text.length < 700'})
# for document in cursor: pprint(document)

In [13]:
mycol.estimated_document_count()

162

## Economic Times

In [17]:
url = "https://economictimes.indiatimes.com/"
req2 = requests.get(url)

In [18]:
et = req2.content

soup_et = BeautifulSoup(et)

et_links = soup_et.find_all('a')

In [20]:
et_links_extracted = []
et_valid = []
for i in range(len(et_links)):
    try:
        link = et_links[i]["data-ga-onclick"]
    except:
        continue
    if(link.endswith("href")):
        if(link[0].isnumeric()):
            if(len(et_links[i]['href'])>75):
                if(et_links[i]['href'].startswith("https")):
                    et_links_extracted.append(et_links[i]['href'])
                else:
                    et_links_extracted.append("https://economictimes.indiatimes.com"+et_links[i]['href'])
                et_valid.append(i)


In [21]:
# pd.DataFrame(zip(et_links_extracted,et_valid))

In [22]:
_ ,unq_index_et = np.unique(et_links_extracted,return_index=True)

unq_index_et.sort()

et_links_extracted = np.array(et_links_extracted)[unq_index_et]
et_valid = np.array(et_valid)[unq_index_et]

### Store data in db for ET

In [14]:
etcol = mydb["et"]
# etcol.drop()

In [24]:
etcol.create_index("link", unique = True)

'link_1'

In [25]:
def article_to_text_bs4(url):
    html = requests.get(url)
    soup = BeautifulSoup(html.content)
    try:
        text = soup.find("div", class_="artText").get_text(separator = " ")
    except:
        try:
            text = soup.find("section").get_text(separator = " ")
        except:
            raise ValueError()
    return text

In [26]:
id_et = []
for link in et_links_extracted:
    text, title = article_to_text(link)
    article_dict = {"link":link,"text": text, "title": title}
    x = etcol.insert_one(article_dict)
    id_et.append(x.inserted_id)

In [27]:
# cursor = etcol.find({})
# for document in cursor: pprint(document)

In [32]:
etcol.estimated_document_count()

164

In [29]:
cursor = etcol.find({"$where": 'this.text.length < 700'})
id_change = []
for document in cursor: 
#     pprint(document)
    id_change.append(document["_id"])

In [30]:
for ID in id_change:
    etcol.update_one({"_id": pymongo.collection.ObjectId(ID)}, 
                     {"$set":{"text": article_to_text_bs4(etcol.find({"_id": pymongo.collection.ObjectId(ID)})[0]['link'])}})

In [33]:
# for ID in id_change:
#     pprint(etcol.find({"_id": pymongo.collection.ObjectId(ID)})[0])

### TFIDF

#### Get MongoDB to DF

In [46]:
# Load MNC data to df
cursor_mnc = mycol.find({}) 
mnc = pd.DataFrame(list(cursor_mnc))

In [47]:
# Load ET data to df
cursor_et = etcol.find({}) 
et = pd.DataFrame(list(cursor_et))

#### Dictionary of docs

In [48]:
mnc_text_list = mnc.text.to_list()
et_text_list = et.text.to_list()
combined_list = mnc_text_list + et_text_list

In [49]:
len(combined_list)

326

In [50]:
combined_list_tokenized = [tranform_text(l) for l in combined_list]

In [51]:
tfdf = TfidfVectorizer(tokenizer=identity_tokenizer,lowercase=False)  

In [52]:
tfdf_vector = tfdf.fit(combined_list_tokenized)



In [53]:
index=tfdf_vector.get_feature_names()

In [54]:
def get_tfidf(text):
    return tfdf_vector.transform([tranform_text(text)]).todense()

In [55]:
get_tfidf(combined_list[0])

matrix([[0.02711837, 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ]])

In [56]:
mnc["tfidf"] = mnc.text.apply(get_tfidf)

In [57]:
et["tfidf"] = et.text.apply(get_tfidf)

### Cosine Similarity

In [58]:
df_cos_sim = pd.DataFrame(index=et._id, columns=mnc._id)

In [59]:
def mongo_id(string_id):
    return pymongo.collection.ObjectId(string_id)

In [60]:
# df_cos_sim.loc[mongo_id("5f17453d0469677f8c1ec960"),mongo_id("5f1744ba0469677f8c1ec8be")]

In [61]:
et.set_index("_id",inplace=True)
mnc.set_index("_id", inplace=True)

In [62]:
mnc.index = mnc.index.astype("category")

In [63]:
et.index =et.index.astype("category")

In [64]:
df_cos_sim.index =df_cos_sim.index.astype("category")
df_cos_sim.columns =df_cos_sim.columns.astype("category")

In [65]:
for index in df_cos_sim.index:
    for col in df_cos_sim.columns:
        df_cos_sim.at[index,col] = cosine_similarity(et.at[index,"tfidf"],mnc.at[col,"tfidf"])[0][0]

In [66]:
# cosine_similarity(mnc.tfidf[0],et.tfidf[0])[0][0]

In [67]:
df_cos_sim

_id,5f1744ba0469677f8c1ec8be,5f1744ba0469677f8c1ec8bf,5f1744bb0469677f8c1ec8c0,5f1744bb0469677f8c1ec8c1,5f1744bb0469677f8c1ec8c2,5f1744bc0469677f8c1ec8c3,5f1744bc0469677f8c1ec8c4,5f1744bc0469677f8c1ec8c5,5f1744bd0469677f8c1ec8c6,5f1744bd0469677f8c1ec8c7,...,5f1745190469677f8c1ec956,5f1745190469677f8c1ec957,5f17451a0469677f8c1ec958,5f17451a0469677f8c1ec959,5f17451b0469677f8c1ec95a,5f17451b0469677f8c1ec95b,5f17451c0469677f8c1ec95c,5f17451d0469677f8c1ec95d,5f17451d0469677f8c1ec95e,5f17451e0469677f8c1ec95f
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5f17453d0469677f8c1ec960,0.0113833,0,0,0,0,0,0.0463922,0,0.0185723,0.0160175,...,0.029735,0,0,0.0234326,0.00589607,0.0264862,0.0150744,0,0,0.00495306
5f17453e0469677f8c1ec961,0.038545,0.0173965,0.0572118,0.0437079,0.0619159,0.0218655,0.0250963,0.0506047,0.0338064,0.0415046,...,0.0379319,0.0249877,0.0445101,0.0387454,0.0534458,0.125254,0.0507531,0.0578647,0.0413523,0.0385354
5f17453e0469677f8c1ec962,0.0296021,0.0280867,0.0592392,0.0287491,0.0178597,0.0554435,0.030535,0.0732817,0.00874733,0.0279569,...,0.0308114,0.0275829,0.0145136,0.0337566,0.0222155,0.100799,0.0217121,0.0245247,0.0249954,0.0712638
5f17453e0469677f8c1ec963,0.0629155,0.00999828,0.0421956,0.0694787,0.0145293,0.00273329,0.00546403,0.0116785,0.0601531,0.0600958,...,0.0267543,0.010861,0.0526122,0.0332807,0.0519954,0.0330805,0.00898128,0.00841827,0.0283123,0.0347582
5f17453e0469677f8c1ec964,0.242036,0.0160567,0.218338,0.262921,0.018574,0.258621,0.00110771,0.0126362,0.273077,0.287894,...,0.027143,0.0396442,0.100909,0.189294,0.062176,0.0423914,0.00918978,0.00657668,0.0169377,0.0238101
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5f1745a10469677f8c1ec9ff,0.0621721,0.0912712,0.0664007,0.0381869,0.0341245,0.0197386,0.0270863,0.0180496,0.0441085,0.0463159,...,0.0206505,0.0212053,0.0783661,0.023333,0.0342102,0.0600722,0.039569,0.0128224,0.0211419,0.0205691
5f1745a10469677f8c1eca00,0.0617539,0.0202952,0.119496,0.0961662,0.0133872,0.042095,0.0107268,0.0123329,0.113161,0.155035,...,0.00865765,0.090282,0.0818583,0.0452644,0.0251904,0.0435137,0.0319504,0.00839035,0.0437751,0.0139633
5f1745a20469677f8c1eca01,0.00953459,0,0.0167815,0.0298457,0.00690198,0.00635213,0.0072485,0.0175826,0.0121035,0.0198893,...,0.0117578,0.0383434,0.0130228,0.0521264,0.00728578,0.0359067,0.0394971,0.00513092,0.0197398,0.0352586
5f1745a30469677f8c1eca02,0.0339426,0.0415469,0.0258744,0.0203031,0.0156772,0.0257346,0.00301904,0.0278024,0.0250941,0.0359122,...,0.0341704,0.00639153,0.0451341,0.0261836,0.0585212,0.0594162,0.0381081,0.0193691,0.0452687,0.034968


In [294]:
# np.argmax(df_cos_sim.iloc[:,0])

13

In [69]:
# Max cosine similarity of Moneycontrol article with which ET article
max_sim = np.argmax(df_cos_sim.to_numpy(),axis = 0)

In [70]:
max_sim_id = df_cos_sim.index[max_sim]

In [71]:
tfidf_results = mnc.copy()

In [72]:
tfidf_results["Similar ET Article"] = et.loc[max_sim_id].link.values

In [73]:
tfidf_results["Sim_Score"] =  df_cos_sim.max(axis = 0).values

In [74]:
tfidf_results["ET_Title"] = et.loc[max_sim_id].title.values

In [75]:
tfidf_results["ET_Text"] = et.loc[max_sim_id].text.values

In [76]:
tfidf_results["ET_Link"] = et.loc[max_sim_id].link.values
tfidf_results["ET_id"] = max_sim_id

In [77]:
from IPython.display import display, HTML

In [78]:
display(HTML(tfidf_results[["title","link","ET_Title","ET_Link","Sim_Score"]].sort_values(by = ["Sim_Score"],ascending=False).head(20).to_html()))

Unnamed: 0_level_0,title,link,ET_Title,ET_Link,Sim_Score
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
5f1744d70469677f8c1ec8f4,"Government open to announce further steps to revive growth, says FM Sitharaman",https://www.moneycontrol.com/news/business/economy/government-open-to-announce-further-steps-to-revive-growth-says-fm-sitharaman-5578571.html,"Government open to announcing more measures to boost growth, says Nirmala Sitharaman",https://economictimes.indiatimes.com/news/economy/policy/government-open-to-announcing-more-measures-to-boost-growth-says-nirmala-sitharaman/articleshow/77089767.cms,0.945453
5f1744c30469677f8c1ec8d1,Coronavirus outbreak: Amarnath Yatra 2020 cancelled,https://www.moneycontrol.com/news/business/coronavirus-outbreak-amarnath-yatra-2020-cancelled-jammu-kashmir-shrine-board-5578401.html,Amarnath Yatra called off amid coronavirus crisis; 'Aarti' to be broadcast live,https://economictimes.indiatimes.com/news/politics-and-nation/amarnath-yatra-called-off-amid-coronavirus-crisis-aarti-to-be-broadcast-live/articleshow/77091101.cms,0.609797
5f1744e30469677f8c1ec905,Corona Kavach policy | Healthcare workers to get 5% discount on medical insurance premium,https://www.moneycontrol.com/news/business/corona-kavach-policy-healthcare-workers-to-get-5-discount-on-medical-insurance-premium-5547501.html,IRDAI allows Corona Kavach policy to be sold as group health insurance,https://economictimes.indiatimes.com/wealth/insure/health-insurance/irdai-allows-corona-kavach-policy-to-be-sold-as-group-health-insurance/articleshow/77089750.cms,0.609021
5f1744bd0469677f8c1ec8c6,"Axis Bank Q1 profit falls 19% to Rs 1,112 crore, NII grows 19.5%; slippages decline",https://www.moneycontrol.com/news/business/earnings/axis-bank-q1-profit-falls-19-to-rs-1112-crore-nii-grows-19-5-5577831.html,"Axis Bank Q1 results: Profit falls 19% YoY to Rs 1,112 crore as provisions jump 16% YoY",https://economictimes.indiatimes.com/markets/stocks/earnings/axis-bank-q1-results-profit-falls-19-yoy-to-rs-1112-crore-misses-street-estimates/articleshow/77086123.cms,0.598281
5f1744d70469677f8c1ec8f3,"Hiring sentiment seeing improvement, says TeamLease Employment Outlook Report",https://www.moneycontrol.com/news/business/economy/hiring-sentiment-seeing-improvement-says-teamlease-employment-outlook-report-5578131.html,Hiring sentiment showing signs of recovery: TeamLease Employment Outlook for Apr-Sep,https://economictimes.indiatimes.com/multimedia/jobs/hiring-sentiment-showing-signs-of-recovery-teamlease-employment-outlook-for-apr-sep/articleshow/77086450.cms,0.59604
5f1744df0469677f8c1ec900,Corona Kavach health insurance policy evokes good response: Insurers,https://www.moneycontrol.com/news/india/corona-kavach-health-insurance-policy-evokes-good-response-insurers-5568181.html,IRDAI allows Corona Kavach policy to be sold as group health insurance,https://economictimes.indiatimes.com/wealth/insure/health-insurance/irdai-allows-corona-kavach-policy-to-be-sold-as-group-health-insurance/articleshow/77089750.cms,0.578432
5f1744e20469677f8c1ec904,Does covering pre-existing ailments make Corona Kavach and Rakshak must-haves?,https://www.moneycontrol.com/news/business/personal-finance/does-covering-pre-existing-ailments-make-corona-kavach-and-suraksha-must-haves-5553491.html,IRDAI allows Corona Kavach policy to be sold as group health insurance,https://economictimes.indiatimes.com/wealth/insure/health-insurance/irdai-allows-corona-kavach-policy-to-be-sold-as-group-health-insurance/articleshow/77089750.cms,0.529957
5f1744f40469677f8c1ec91d,Deadline extension: 3 attractive tax-saving investments for you,https://www.moneycontrol.com/news/business/personal-finance/deadline-extension-3-attractive-tax-saving-investments-for-you-5516611.html,Best tax saving mutual fund schemes,https://economictimes.indiatimes.com/mf/analysis/which-are-the-best-mutual-fund-schemes-to-save-taxes-in-2020/articleshow/77065021.cms,0.519061
5f1744bb0469677f8c1ec8c1,"Bajaj Finance consolidated Q1 profit falls 19% to Rs 962 crore, new loans down 76%",https://www.moneycontrol.com/news/business/earnings/bajaj-finance-consolidated-q1-profit-falls-19-to-rs-962-crore-new-loans-down-76-5577211.html,"Axis Bank Q1 results: Profit falls 19% YoY to Rs 1,112 crore as provisions jump 16% YoY",https://economictimes.indiatimes.com/markets/stocks/earnings/axis-bank-q1-results-profit-falls-19-yoy-to-rs-1112-crore-misses-street-estimates/articleshow/77086123.cms,0.508575
5f17450a0469677f8c1ec93d,Gold returns 25% so far this year; should you buy more or just hold?,https://www.moneycontrol.com/news/business/personal-finance/gold-returns-25-so-far-this-year-should-you-buy-more-or-just-hold-5531281.html,"Kotak Gold Fund: Don't chase price: Abhishek Bisen, Fund Manager, Kotak Gold Fund",https://economictimes.indiatimes.com/mf/analysis/dont-chase-price-abhishek-bisen-fund-manager-kotak-gold-fund/articleshow/77060460.cms,0.488881


### Example 1:
<b>MNC Title: </b> Gold hovers near nine-year high as virus fears drive safe-haven demand	https://www.moneycontrol.com/news/business/markets/gold-hovers-near-nine-year-high-as-virus-fears-drive-safe-haven-demand-5574941.html	<br>
<b>ET Title: </b>It’s silver’s turn to shine as prices surge to four-year high	https://economictimes.indiatimes.com/markets/commodities/news/its-silvers-turn-to-shine-as-prices-surge-to-four-year-high/articleshow/77087263.cms <br>
<b>Cosine Sim: </b>0.451393 <br> <br>

Though both articles are about price surge of commodity, but one has focus on gold, whereas other is completely about silver. The moneycontrol article talks about silver in one line only, whereas ET article is wholly focussed on silver. <br>
So TFIDF here picked up similarity but couldn't pick up the main topic. Although the cosine score is not high as well.

### Example 2:
<b>MNC Title: </b> Coronavirus outbreak: Amarnath Yatra 2020 cancelled	https://www.moneycontrol.com/news/business/coronavirus-outbreak-amarnath-yatra-2020-cancelled-jammu-kashmir-shrine-board-5578401.html	<br>
<b>ET Title: </b>Amarnath Yatra called off amid coronavirus crisis; 'Aarti' to be broadcast live	https://economictimes.indiatimes.com/news/politics-and-nation/amarnath-yatra-called-off-amid-coronavirus-crisis-aarti-to-be-broadcast-live/articleshow/77091101.cms <br>
<b>Cosine Sim: </b>0.609797 <br> <br>

Here TFIDF an Cosine similarity works great, both articles covers the same topics and same news.

# Word2Vec

In [83]:
from gensim.models import Word2Vec, KeyedVectors, Doc2Vec

##### link to download pretrained model https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit 

In [87]:
word_emb_model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True) #using pretrained model

In [135]:
from collections import Counter
import itertools

def map_word_frequency(document):
    return Counter(document)

In [170]:
#transform text as per word2vec corpus, also not using stemming and lowercasing. As per testing on some of articles, stemmed words are not present in the word2vec vocabulary. 
def tranform_text_w2v(text):
    tokens = nltk.tokenize.word_tokenize(text)

    words = [word for word in tokens if word.isalpha()]

    words = [w for w in words if not w in stop_words ]
    
    words_w2v = [w for w in words if w in word_emb_model.vocab]
    
    return words_w2v

In [134]:
#Test
cosine_similarity(get_sif_feature_vectors(tranform_text_w2v(mnc.text[0]),tranform_text_w2v(et.text[1])))[0,1]

0.6830894859718859

In [152]:
#word counts for SIF to be used below
word_counts = map_word_frequency([item for sublist in [tranform_text_w2v(l) for l in combined_list] for item in sublist])

In [171]:
# Tokenizing for word2vec usecase
mnc["tokens"] = mnc.text.apply(tranform_text_w2v)
et["tokens"] = et.text.apply(tranform_text_w2v)

In [160]:
# dropping TF-IDF values
mnc.drop(columns="tfidf", inplace=True)
et.drop(columns="tfidf", inplace=True)

In [168]:
#Calculate average vetor for using word2vec for a article. Also apply SIF (smooth inverse frequency)
def w2v_average(sentence):
#     print(word_emb_model.vector_size) #300
    embedding_size = 300 # size of vectore in word embeddings
    a = 0.001
    vs = np.zeros(embedding_size)
    sentence_length = len(sentence)
    for word in sentence:
        a_value = a / (a + word_counts[word]) # smooth inverse frequency, SIF
        vs = np.add(vs, np.multiply(a_value, word_emb_model[word])) # vs += sif * word_vector
    vs = np.divide(vs, sentence_length) # weighted average
    return vs

In [173]:
#applying above function to tokenized text
mnc["w2v_vec"] = mnc.tokens.apply(w2v_average)
et["w2v_vec"] = et.tokens.apply(w2v_average)

In [202]:
# Function to get similar results for word2vec as in the case of TF-IDF above
def view_w2v_result(top = 20, bottom = False):    
    # defining pd matrix for cosine similarity
    df_cos_sim = pd.DataFrame(index=et.index, columns=mnc.index)
    
    # filling the matrix above
    for index in df_cos_sim.index:
        for col in df_cos_sim.columns:
            df_cos_sim.at[index,col] = cosine_similarity([et.at[index,"w2v_vec"],mnc.at[col,"w2v_vec"]])[0][1]

    # Max cosine similarity of Moneycontrol article with which ET article
    max_sim = np.argmax(df_cos_sim.to_numpy(),axis = 0)
    
    #IDs of max cosine similarity
    max_sim_id = df_cos_sim.index[max_sim]
    
    #creating new df to store results for best match article and visualize below
    tfidf_results = mnc.copy()
    tfidf_results["Similar ET Article"] = et.loc[max_sim_id].link.values
    tfidf_results["Sim_Score"] =  df_cos_sim.max(axis = 0).values
    tfidf_results["ET_Title"] = et.loc[max_sim_id].title.values
    tfidf_results["ET_Text"] = et.loc[max_sim_id].text.values
    tfidf_results["ET_Link"] = et.loc[max_sim_id].link.values
    tfidf_results["ET_id"] = max_sim_id
    
    if bottom:
        #printing the top=top articles
        display(HTML(tfidf_results[["title","link","ET_Title","ET_Link","Sim_Score"]].sort_values(by = ["Sim_Score"],ascending=False).tail(top).to_html()))
    else:
        display(HTML(tfidf_results[["title","link","ET_Title","ET_Link","Sim_Score"]].sort_values(by = ["Sim_Score"],ascending=False).head(top).to_html()))

## Top 20 results

In [188]:
view_w2v_result()

Unnamed: 0_level_0,title,link,ET_Title,ET_Link,Sim_Score
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
5f1745160469677f8c1ec951,"'Trade light though market will remain strong; avoid Yes Bank; Infosys, TCS look good'",https://www.moneycontrol.com/news/business/markets/trade-light-though-market-will-remain-strong-avoid-yes-bank-infosys-tcs-look-good-5576051.html,Financials likely to see a lot of market share shifts: Shibani Sircar Kurian,https://economictimes.indiatimes.com/markets/expert-view/financials-likely-to-see-a-lot-of-market-share-shifts-shibani-sircar-kurian/articleshow/77086932.cms,0.893066
5f1744c70469677f8c1ec8da,Why Akhil Gogoi always invites the wrath of Assam’s ruling class,https://www.moneycontrol.com/news/india/why-akhil-gogoi-always-invites-the-wrath-of-assams-ruling-class-5575851.html,Amarnarth Yatra cancelled amid the COVID-19 pandemic,https://economictimes.indiatimes.com/news/politics-and-nation/amarnarth-yatra-cancelled-amid-the-covid-19-pandemic/videoshow/77087861.cms,0.889134
5f1744ba0469677f8c1ec8be,Trade Setup for Wednesday: Top 15 things to know before Opening Bell,https://www.moneycontrol.com/news/business/markets/trade-setup-for-wednesday-top-15-things-to-know-before-opening-bell-100-5578031.html,Ahead of Market: 12 things that will decide stock action on Wednesday,https://economictimes.indiatimes.com/markets/stocks/news/ahead-of-market-12-things-that-will-decide-stock-action-on-wednesday/articleshow/77091211.cms,0.88871
5f1744d70469677f8c1ec8f3,"Hiring sentiment seeing improvement, says TeamLease Employment Outlook Report",https://www.moneycontrol.com/news/business/economy/hiring-sentiment-seeing-improvement-says-teamlease-employment-outlook-report-5578131.html,Hiring sentiment showing signs of recovery: TeamLease Employment Outlook for Apr-Sep,https://economictimes.indiatimes.com/multimedia/jobs/hiring-sentiment-showing-signs-of-recovery-teamlease-employment-outlook-for-apr-sep/articleshow/77086450.cms,0.881217
5f1744d70469677f8c1ec8f4,"Government open to announce further steps to revive growth, says FM Sitharaman",https://www.moneycontrol.com/news/business/economy/government-open-to-announce-further-steps-to-revive-growth-says-fm-sitharaman-5578571.html,"Government open to announcing more measures to boost growth, says Nirmala Sitharaman",https://economictimes.indiatimes.com/news/economy/policy/government-open-to-announcing-more-measures-to-boost-growth-says-nirmala-sitharaman/articleshow/77089767.cms,0.874278
5f1745180469677f8c1ec954,'Time to be cautious as mini-bubble building up; book profits',https://www.moneycontrol.com/news/business/markets/time-to-be-cautious-as-mini-bubble-building-up-book-profits-5566641.html,Financials likely to see a lot of market share shifts: Shibani Sircar Kurian,https://economictimes.indiatimes.com/markets/expert-view/financials-likely-to-see-a-lot-of-market-share-shifts-shibani-sircar-kurian/articleshow/77086932.cms,0.865808
5f1744be0469677f8c1ec8ca,Indians need to be more firm in saying ‘No’ to Chinese products,https://www.moneycontrol.com/news/opinion/indians-need-to-be-more-firm-in-saying-no-to-chinese-products-5576761.html,What Trump can do to regain his lost supporters and make 2020 Presidential race close,https://economictimes.indiatimes.com/news/international/world-news/what-trump-can-do-to-regain-his-lost-supporters-and-make-2020-presidential-race-close/articleshow/77088362.cms,0.858413
5f1744e80469677f8c1ec90b,Road to wealth creation: An unusual portfolio approach to deal with coronavirus,https://www.moneycontrol.com/news/business/markets/road-to-wealth-creation-an-unusual-portfolio-approach-to-deal-with-coronavirus-4995241.html,SIPs done in big bull markets don't deliver good returns: S Naren,https://economictimes.indiatimes.com/mf/analysis/if-you-have-to-make-money-book-profits-s-naren/articleshow/77078265.cms,0.85754
5f1744c60469677f8c1ec8d8,Those betraying party won't be able to face public: Rajasthan CM Ashok Gehlot,https://www.moneycontrol.com/news/india/those-betraying-party-wont-be-able-to-face-public-rajasthan-cm-ashok-gehlot-5578211.html,Amarnarth Yatra cancelled amid the COVID-19 pandemic,https://economictimes.indiatimes.com/news/politics-and-nation/amarnarth-yatra-cancelled-amid-the-covid-19-pandemic/videoshow/77087861.cms,0.851965
5f1744c50469677f8c1ec8d7,"OnePlus Nord launched with Snapdragon 765G, six cameras, 90Hz AMOLED display, starting at Rs 24,999",https://www.moneycontrol.com/news/technology/oneplus-nord-lauched-with-snapdragon-765g-six-cameras-90hz-amoled-display-starting-at-rs-24999-5578561.html,"OnePlus Nord launched with 6 cameras at Rs 24,999, to be available on August 4",https://economictimes.indiatimes.com/magazines/panache/oneplus-nord-launched-with-90-hz-fluid-display-6-cameras-and-12gb-ram/articleshow/77089002.cms,0.840779


### Bottom 20 results

In [194]:
view_w2v_result(bottom=True)

Unnamed: 0_level_0,title,link,ET_Title,ET_Link,Sim_Score
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
5f1745110469677f8c1ec948,New debit and credit card rules: Only domestic transactions at ATMs & PoS terminals at time of issuance,https://www.moneycontrol.com/news/economy/policy/new-debit-and-credit-card-rules-only-domestic-transactions-at-atms-pos-terminals-at-time-of-issuance-4821521.html,Let writing rubber cheques stay criminal,https://economictimes.indiatimes.com/blogs/et-editorials/let-writing-rubber-cheques-stay-criminal/,0.661618
5f1744d50469677f8c1ec8f0,"Iraq increases oil exports in July, still pumps above OPEC+ target",https://www.moneycontrol.com/news/business/commodities/iraq-increases-oil-exports-in-july-still-pumps-above-opec-target-5578901.html,"Stimulus package breaks new ground in European unity, 27 nations to share financial burden",https://economictimes.indiatimes.com/news/international/business/stimulus-package-breaks-new-ground-in-european-unity-27-nations-to-share-financial-burden/articleshow/77089312.cms,0.661564
5f1745040469677f8c1ec935,This startup will offer loans against cars as collateral,https://www.moneycontrol.com/news/business/startup/this-startup-will-offer-loans-against-cars-as-collateral-5576311.html,"Mirae Asset mutual fund: Axis, ICICI Pru and Mirae Asset mutual fund add maximum folios in FY 19-20",https://economictimes.indiatimes.com/mf/mf-news/axis-icici-pru-and-mirae-asset-mutual-fund-add-maximum-folios-in-fy-19-20/articleshow/77082580.cms,0.661418
5f1744d50469677f8c1ec8f1,"NHAI annuls Rs 2,193-crore bid won by IRB Infra for BOT project in West Bengal",https://www.moneycontrol.com/news/business/nhai-annuls-rs-2193-crore-bid-won-by-irb-infra-for-bot-project-in-west-bengal-5579071.html,Hiranandani Group's Yotta inks pact to set up 13-acre data center park in Chennai,https://economictimes.indiatimes.com/tech/internet/hiranandani-groups-yotta-inks-pact-to-set-up-13-acre-data-center-park-in-chennai/articleshow/77061523.cms,0.654841
5f1744ba0469677f8c1ec8bf,Gainers & Losers: 10 stocks that moved the most on July 21,https://www.moneycontrol.com/news/photos/business/stocks/gainers-losers-10-stocks-that-moved-the-most-on-july-21-5577641.html,Market Watch: Will history repeat for Nifty at this level?,https://economictimes.indiatimes.com/markets/stocks/etmarkets-podcast/market-watch-will-history-repeat-for-nifty-at-this-level/podcast/77068557.cms,0.650928
5f1744d30469677f8c1ec8ec,"International flights on July 21: Daily updates on arrivals, departures under Vande Bharat Mission",https://www.moneycontrol.com/news/india/international-flights-repatriation-on-july-21-daily-updates-on-arrivals-departures-under-vande-bharat-mission-5574261.html,Amarnarth Yatra cancelled amid the COVID-19 pandemic,https://economictimes.indiatimes.com/news/politics-and-nation/amarnarth-yatra-cancelled-amid-the-covid-19-pandemic/videoshow/77087861.cms,0.646273
5f1744be0469677f8c1ec8c9,SBI Life Insurance Q1 net profit up 5.1%,https://www.moneycontrol.com/news/business/companies/sbi-life-insurance-q1-net-profit-up-5-1-5577251.html,"Voda Idea faces around Rs 6,000 cr yearly outflow if AGR repayments spread over 15 years: Analysts",https://economictimes.indiatimes.com/industry/telecom/telecom-news/voda-idea-faces-around-rs-6000-cr-yearly-outflow-if-agr-repayments-spread-over-15-years-analysts/articleshow/77083756.cms,0.645815
5f1745110469677f8c1ec947,"Beginning March 16, newly issued debit and credit cards won't allow international or online transactions unless asked for",https://www.moneycontrol.com/news/business/personal-finance/beginning-march-16-newly-issued-debit-and-credit-cards-wont-allow-international-or-online-transactions-unless-asked-for-5015481.html,Let writing rubber cheques stay criminal,https://economictimes.indiatimes.com/blogs/et-editorials/let-writing-rubber-cheques-stay-criminal/,0.642948
5f1745140469677f8c1ec94b,"SBI to sell stake in credit card JV, to raise Rs 6,000 crore: Report",https://www.moneycontrol.com/news/business/companies/sbi-to-sell-stake-in-credit-card-jv-to-raise-rs-6000-crore-report-4270891.html,Let writing rubber cheques stay criminal,https://economictimes.indiatimes.com/blogs/et-editorials/let-writing-rubber-cheques-stay-criminal/,0.641808
5f1744bc0469677f8c1ec8c5,Business Insight | What's next for India's airlines as COVID-19 impact cuts deep?,https://www.moneycontrol.com/news/business/earnings/business-insight-whats-next-for-indias-airlines-as-covid-19-impact-cuts-deep-5577691.html,"Indian IT firms bring back over 2,000 stranded employees from US, Europe due to Covid-19",https://economictimes.indiatimes.com/tech/ites/indian-it-firms-bring-back-over-2000-stranded-employees-from-us-europe-due-to-covid-19/articleshow/77071411.cms,0.63673


<b>The</b> results for word2vec seems quite different from simple TF-IDF, the striking difference is the similarity scores, none of the articles have the similarity score below 0.5, whereas in TF-IDF we had similarity scores below 0.1
<br>
<br>
<b>Now</b>, if we look at the article pairs with the least cosine score, the best matching ET article for the moneycontrol article titled "Explained | Why AstraZeneca-Oxford vaccine offers hope", with a similarity score of 0.5035, we can see that the two articles talk about the same topic, although the sentiment is different here. They both talk about AstraZeneca-Oxford vaccine, butone in positive sense, and the other one in questioning way. But considering the low similarity score, this is actually a good match.

<b>Another</b> point to notice is that the article from ET titled "Financials likely to see a lot of market share shifts: Shibani Sircar Kurian" appears to be top match for 4 articles in our top 20 output above. Which is strange. The reason being the articles talks about in general the market conditions, hence have similar scope. But, one article titles "Gold, fixed-income wrap 2019: Yellow metal's prices zoom, long-term bonds deliver" talks about gold and focussed on that, whereas there is no mention of Gold in ET article. Hence, this similarity is mostly based on the words used in both of the articles. <br> <br>

With Word2Vec we are using average of word vectors with SIF (Smooth Inverse Frequency). Since it is average we cannot completely rely on it to provide meaningful distinciton between two pieces of text, hence this is the limitation of this model. We can see many mismatch titles in bottom 20, although some may have a bit of similar content which could be the reason for the score and match. For improvements one can look at top 5 matches instead and see which one is the most relevant among them. <br> <br>

<b>For positives: </b><br>
There is a perfecr match of these two articles:<br><br>
MNC: Hiring sentiment seeing improvement, says TeamLease Employment Outlook Report	https://www.moneycontrol.com/news/business/economy/hiring-sentiment-seeing-improvement-says-teamlease-employment-outlook-report-5578131.html <br>
ET: Hiring sentiment showing signs of recovery: TeamLease Employment Outlook for Apr-Sep	https://economictimes.indiatimes.com/multimedia/jobs/hiring-sentiment-showing-signs-of-recovery-teamlease-employment-outlook-for-apr-sep/articleshow/77086450.cms	<br>
Sim Score: 0.881217

# Doc2Vec

In [207]:
doc_emb_model = Doc2Vec.load('enwiki_dbow/doc2vec.bin') #using pretrained model

In [222]:
from gensim.utils import simple_preprocess

In [258]:
mnc.drop(columns=["tokens","w2v_vec"], inplace=True)
et.drop(columns=["tokens","w2v_vec"], inplace=True)

In [259]:
def d2v_vec(text):
    tokens = simple_preprocess(text)[:200] # tokenize and keep 200 words only
    return doc_emb_model.infer_vector(tokens)

In [261]:
mnc["d2v_vec"] = mnc.text.apply(d2v_vec)
et["d2v_vec"] = et.text.apply(d2v_vec)

In [326]:
def view_d2v_results(top = 20, bottom = False):
    et_vecs = np.stack(et.d2v_vec)
    argmax = []
    cos_sim = []
    for vec in mnc.d2v_vec:
        arr1 = doc_emb_model.wv.cosine_similarities(vec,et_vecs)
        argmax.append(arr1.argmax())
        cos_sim.append(arr1.max())
        
    
    #creating new df to store results for best match article and visualize below
    tfidf_results = mnc.copy()
    tfidf_results["Sim_Score"] =  cos_sim
    tfidf_results["ET_Title"] = et.title.values[argmax]
    tfidf_results["ET_Text"] = et.text.values[argmax]
    tfidf_results["ET_Link"] = et.link.values[argmax]
    
    if bottom:
        #printing the top=top articles
        display(HTML(tfidf_results[["title","link","ET_Title","ET_Link","Sim_Score"]].sort_values(by = ["Sim_Score"],ascending=False).tail(top).to_html()))
    else:
        display(HTML(tfidf_results[["title","link","ET_Title","ET_Link","Sim_Score"]].sort_values(by = ["Sim_Score"],ascending=False).head(top).to_html()))

## Top 20 results doc2vec

In [329]:
view_d2v_results()

Unnamed: 0_level_0,title,link,ET_Title,ET_Link,Sim_Score
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
5f1744d70469677f8c1ec8f4,"Government open to announce further steps to revive growth, says FM Sitharaman",https://www.moneycontrol.com/news/business/economy/government-open-to-announce-further-steps-to-revive-growth-says-fm-sitharaman-5578571.html,"Government open to announcing more measures to boost growth, says Nirmala Sitharaman",https://economictimes.indiatimes.com/news/economy/policy/government-open-to-announcing-more-measures-to-boost-growth-says-nirmala-sitharaman/articleshow/77089767.cms,0.956811
5f1744c30469677f8c1ec8d1,Coronavirus outbreak: Amarnath Yatra 2020 cancelled,https://www.moneycontrol.com/news/business/coronavirus-outbreak-amarnath-yatra-2020-cancelled-jammu-kashmir-shrine-board-5578401.html,Amarnath Yatra called off amid coronavirus crisis; 'Aarti' to be broadcast live,https://economictimes.indiatimes.com/news/politics-and-nation/amarnath-yatra-called-off-amid-coronavirus-crisis-aarti-to-be-broadcast-live/articleshow/77091101.cms,0.760326
5f1744bd0469677f8c1ec8c6,"Axis Bank Q1 profit falls 19% to Rs 1,112 crore, NII grows 19.5%; slippages decline",https://www.moneycontrol.com/news/business/earnings/axis-bank-q1-profit-falls-19-to-rs-1112-crore-nii-grows-19-5-5577831.html,"Axis Bank Q1 results: Profit falls 19% YoY to Rs 1,112 crore as provisions jump 16% YoY",https://economictimes.indiatimes.com/markets/stocks/earnings/axis-bank-q1-results-profit-falls-19-yoy-to-rs-1112-crore-misses-street-estimates/articleshow/77086123.cms,0.713166
5f1744e30469677f8c1ec905,Corona Kavach policy | Healthcare workers to get 5% discount on medical insurance premium,https://www.moneycontrol.com/news/business/corona-kavach-policy-healthcare-workers-to-get-5-discount-on-medical-insurance-premium-5547501.html,How much premium will you pay for indemnity-type Covid Kavach health insurance?,https://economictimes.indiatimes.com/wealth/insure/health-insurance/how-much-premium-will-you-pay-for-indemnity-type-covid-kavach-health-insurance/articleshow/77033216.cms,0.673375
5f1744bd0469677f8c1ec8c8,"Bajaj Finserv Q1 net profit jumps 44% YoY; Bajaj Finance creates provision of Rs 1,450 crore",https://www.moneycontrol.com/news/business/markets/bajaj-finserv-q1-net-profit-jumps-44-yoy-bajaj-finance-creates-provision-of-rs-1450-crore-5577851.html,"Axis Bank Q1 results: Profit falls 19% YoY to Rs 1,112 crore as provisions jump 16% YoY",https://economictimes.indiatimes.com/markets/stocks/earnings/axis-bank-q1-results-profit-falls-19-yoy-to-rs-1112-crore-misses-street-estimates/articleshow/77086123.cms,0.660461
5f1744ba0469677f8c1ec8be,Trade Setup for Wednesday: Top 15 things to know before Opening Bell,https://www.moneycontrol.com/news/business/markets/trade-setup-for-wednesday-top-15-things-to-know-before-opening-bell-100-5578031.html,Ahead of Market: 12 things that will decide stock action on Wednesday,https://economictimes.indiatimes.com/markets/stocks/news/ahead-of-market-12-things-that-will-decide-stock-action-on-wednesday/articleshow/77091211.cms,0.653671
5f1744fa0469677f8c1ec927,Gold hovers near nine-year high as virus fears drive safe-haven demand,https://www.moneycontrol.com/news/business/markets/gold-hovers-near-nine-year-high-as-virus-fears-drive-safe-haven-demand-5574941.html,It’s silver’s turn to shine as prices surge to four-year high,https://economictimes.indiatimes.com/markets/commodities/news/its-silvers-turn-to-shine-as-prices-surge-to-four-year-high/articleshow/77087263.cms,0.650821
5f1745170469677f8c1ec952,"DAILY VOICE: Ignore SMS tips if you are serious about making money in stocks, says Rajesh Palviya of Axis Securities",https://www.moneycontrol.com/news/business/markets/daily-voice-ignore-sms-tips-if-you-are-serious-about-making-money-in-stocks-says-rajesh-palviya-of-axis-securities-5568841.html,Market Watch: How long can the bulls continue this run?,https://economictimes.indiatimes.com/markets/stocks/etmarkets-podcast/market-watch-how-long-can-the-bulls-continue-this-run/podcast/77088350.cms,0.643877
5f1744d90469677f8c1ec8f6,Rajasthan HC requests Speaker to defer action on disqualification notices till Friday,https://www.moneycontrol.com/news/india/rajasthan-hc-requests-speaker-to-defer-action-on-disqualification-notices-till-friday-5577561.html,Sachin Pilot sends legal notice to Congress MLA Malinga over horse-trading charge,https://economictimes.indiatimes.com/news/politics-and-nation/sachin-pilot-sends-legal-notice-to-congress-mla-malinga-over-horse-trading-charge/articleshow/77095070.cms,0.634762
5f1744bb0469677f8c1ec8c1,"Bajaj Finance consolidated Q1 profit falls 19% to Rs 962 crore, new loans down 76%",https://www.moneycontrol.com/news/business/earnings/bajaj-finance-consolidated-q1-profit-falls-19-to-rs-962-crore-new-loans-down-76-5577211.html,"Axis Bank Q1 results: Profit falls 19% YoY to Rs 1,112 crore as provisions jump 16% YoY",https://economictimes.indiatimes.com/markets/stocks/earnings/axis-bank-q1-results-profit-falls-19-yoy-to-rs-1112-crore-misses-street-estimates/articleshow/77086123.cms,0.627297


## Bottom 20 results doc2vec

In [330]:
view_d2v_results(bottom=True)

Unnamed: 0_level_0,title,link,ET_Title,ET_Link,Sim_Score
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
5f1744c10469677f8c1ec8cc,IATA chief: Four reasons to be optimistic about aviation,https://www.moneycontrol.com/news/opinion/iata-chief-four-reasons-to-be-optimistic-about-aviation-5575961.html,India funds halve shadow bank debt holdings as crisis persists,https://economictimes.indiatimes.com/markets/stocks/news/india-funds-halve-shadow-bank-debt-holdings-as-crisis-persists/articleshow/77091670.cms,0.452862
5f1745190469677f8c1ec956,NCDEX to launch 'Options on Goods' contracts of 3 commodities from July 27,https://www.moneycontrol.com/news/business/commodities/ncdex-to-launch-options-on-goods-contracts-of-3-commodities-from-july-27-5578821.html,16 firms show up at Indian Railways' pre-application conference on private trains,https://economictimes.indiatimes.com/industry/transportation/railways/16-firms-show-up-at-indian-railways-pre-application-conference-on-private-trains/articleshow/77094623.cms,0.451758
5f1744c40469677f8c1ec8d3,Anatomy of a ventilator procurement order: How arbitrary changes to specifications led to delays and cost overruns,https://www.moneycontrol.com/news/business/companies/anatomy-of-a-ventilator-procurement-order-how-arbitrary-changes-to-specifications-led-to-delays-and-cost-overruns-5577601.html,India funds halve shadow bank debt holdings as crisis persists,https://economictimes.indiatimes.com/markets/stocks/news/india-funds-halve-shadow-bank-debt-holdings-as-crisis-persists/articleshow/77091670.cms,0.448027
5f1744f70469677f8c1ec923,Unitech board to complete stuck real estate projects in 4 years: sources,https://www.moneycontrol.com/news/business/real-estate/unitech-board-to-complete-stuck-real-estate-projects-in-4-years-sources-5570811.html,Indian Hotels Company announces 100% acquisition of Sea Rock hotel site,https://economictimes.indiatimes.com/industry/services/hotels-/-restaurants/indian-hotels-company-announces-100-acquisition-of-sea-rock-hotel-site/articleshow/77088037.cms,0.444158
5f1744cf0469677f8c1ec8e5,Explained: What's behind the Twitter Bitcoin hack?,https://www.moneycontrol.com/news/technology/explained-whats-behind-the-twitter-bitcoin-hack-5562451.html,Jeff Bezos adds record $13 billion in single day to his fortune,https://economictimes.indiatimes.com/markets/stocks/news/jeff-bezos-adds-record-13-billion-in-single-day-to-his-fortune/articleshow/77080750.cms,0.444021
5f1745000469677f8c1ec930,Don’t wait for a hard landing to set your money matters in order,https://www.moneycontrol.com/news/business/personal-finance/dont-wait-for-a-hard-landing-to-set-your-money-matters-in-order-5217561.html,Infosys could be a multi-bagger for next 10 years: Rajat Sharma,https://economictimes.indiatimes.com/markets/expert-view/infosys-could-be-a-multi-bagger-for-next-10-years-rajat-sharma/articleshow/77087710.cms,0.441567
5f1744f00469677f8c1ec918,Woman in her 80s asked to pay tax on undisclosed Rs 196 crore in Swiss account: Report,https://www.moneycontrol.com/news/business/woman-asked-to-pay-tax-on-undisclosed-rs-196-crore-in-swiss-account-report-5566061.html,Rahul Bajaj to step down as Bajaj Finance chairman; stock drops 6%,https://economictimes.indiatimes.com/markets/stocks/news/rahul-bajaj-to-step-down-as-bajaj-finance-chairman-stock-drops-6/articleshow/77082037.cms,0.438472
5f1744fd0469677f8c1ec92b,"Gold prices steady at Rs 49,267 per 10 gram, silver slips Rs 110 per kg",https://www.moneycontrol.com/news/business/commodities/gold-prices-steady-at-rs-49267-per-10-gram-silver-slips-rs-110-per-kg-5556691.html,India's Covid-19 fatality rate drops to 2.43% from 3.36% on June 17: Health Ministry,https://economictimes.indiatimes.com/news/politics-and-nation/indias-covid-19-fatality-rate-drops-to-2-43-from-3-36-on-june-17-health-ministry/articleshow/77087897.cms,0.438329
5f1744e10469677f8c1ec902,Buying a car after August 1? Know the own damage policy changes in store,https://www.moneycontrol.com/news/business/personal-finance/why-a-long-term-own-damage-motor-insurance-policy-is-a-must-have-5559161.html,Can NCB discount be claimed when buying motor insurance policies issued under sandbox regulations?,https://economictimes.indiatimes.com/wealth/insure/motor-insurance/can-ncb-discount-be-claimed-when-buying-motor-insurance-policies-issued-under-sandbox-regulations/articleshow/76970291.cms,0.435374
5f17450c0469677f8c1ec940,Urban Planning | The potential of moving the national capital,https://www.moneycontrol.com/news/opinion/urban-planning-the-potential-of-moving-the-national-capital-3976361.html,"COVID-19 has aggravated pains for deal-making, 2020 to be uncertain year: Report",https://economictimes.indiatimes.com/news/company/corporate-trends/covid-19-has-aggravated-pains-for-deal-making-2020-to-be-uncertain-year-report/articleshow/77086984.cms,0.433799


## doc2vec Insights

# Bert with Cosine

In [203]:
import gc

In [204]:
gc.collect()

14893