# Data Scraping

In [1]:
import requests
from bs4 import BeautifulSoup
from newspaper import Article
import nltk
import pymongo
import numpy as np
import pandas as pd

In [2]:
# !python -m nltk.downloader all

#### Moneycontrol

In [3]:
# Download the Moneycontrol home page and collect every anchor tag on it.
url = "https://www.moneycontrol.com/"
# A timeout keeps the cell from hanging forever on a stalled connection.
req1 = requests.get(url, timeout=30)
# Fail loudly on a 4xx/5xx response instead of scraping links out of an error page.
req1.raise_for_status()
mnc = req1.content
# Name the parser explicitly so the parse does not depend on which optional
# parsers happen to be installed (and bs4 does not warn about guessing one).
soup_mnc = BeautifulSoup(mnc, "html.parser")
mnc_links = soup_mnc.find_all('a')

In [4]:
# Keep Moneycontrol news URLs and remember the position of the anchor each one
# came from (valid[k] is the index into mnc_links of links[k]).
links = []
valid = []
for i, anchor in enumerate(mnc_links):
    # .get returns None for anchors without an href, so no bare `except:` is
    # needed and genuine errors are no longer swallowed.
    link = anchor.get('href')
    if link is None:
        continue
    # Only URLs under /news that are longer than 100 characters are kept;
    # presumably the length cut-off separates article pages from section pages.
    if link.startswith("https://www.moneycontrol.com/news") and len(link) > 100:
        links.append(link)
        valid.append(i)

Keep only the unique links, preserving the order in which they first appear on the page.

In [5]:
# np.unique reports where each distinct link first occurs; sorting those
# positions puts the survivors back into their original page order.
unq_index = np.sort(np.unique(links, return_index=True)[1])

# Apply the same selection to both sequences so they stay aligned.
links = np.asarray(links)[unq_index]
valid = np.asarray(valid)[unq_index]


#### Function to fetch an article's text and title using the newspaper library

In [6]:
def article_to_text(link):
    """Download and parse the article at ``link`` with newspaper's ``Article``.

    Returns a two-element list ``[text, title]`` taken from the parsed article.
    """
    fetched = Article(link)
    fetched.download()
    fetched.parse()
    body, headline = fetched.text, fetched.title
    return [body, headline]

## Clean Data via NLP pipeline

In [7]:
# NLP and vectorisation helpers used by the cleaning and TF-IDF cells.
# NOTE(review): presumably the NLTK 'stopwords' corpus must already be
# downloaded for stopwords.words() to succeed - confirm.
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
# Single stemmer instance shared by every tranform_text call.
porter = PorterStemmer()
# Held as a set because tranform_text tests every token for membership in it.
stop_words = set(stopwords.words('english'))
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [8]:
def identity_tokenizer(text):
    """Return the argument untouched.

    Handed to ``TfidfVectorizer`` as its tokenizer, because the documents
    given to the vectorizer in this notebook are already lists of tokens.
    """
    return text

In [9]:
def tranform_text(text):
    """Turn raw article text into a list of cleaned, stemmed tokens.

    Steps: tokenise with NLTK, keep purely alphabetic tokens, lower-case them,
    drop English stop words, then Porter-stem what remains.

    Lower-casing happens *before* the stop-word filter. The filter compares
    against ``stop_words`` exactly, so filtering first (as the code used to)
    let capitalised stop words such as "The" or "And" through.

    Parameters
    ----------
    text : str
        Raw article text.

    Returns
    -------
    list of str
        Stemmed, lower-case tokens in their original order.
    """
    tokens = nltk.tokenize.word_tokenize(text)

    # isalpha() discards numbers, punctuation and mixed tokens such as "Q3".
    lowered = [token.lower() for token in tokens if token.isalpha()]

    content_words = [word for word in lowered if word not in stop_words]

    return [porter.stem(word) for word in content_words]

# Setting up Mongo DB

## MoneyControl

In [10]:
import pymongo

In [11]:
# Client for the MongoDB server at localhost:27017.
myclient = pymongo.MongoClient("mongodb://localhost:27017/")

# Database handle that the collection cells index into by collection name.
mydb = myclient["mydatabase"]

In [12]:
# Handle to the "moneycontrol" collection inside mydb.
mycol = mydb["moneycontrol"]
# Drop the collection first so that re-running the notebook does not insert
# on top of documents left over from a previous run.
mycol.drop()

In [13]:
# Unique index on "link" so the same article URL cannot be stored twice.
mycol.create_index("link", unique = True)

'link_1'

In [14]:
# Download every Moneycontrol article and store it as one document per link,
# keeping the ids MongoDB assigned in insertion order.
id_mnc = []
for link in links:
    text, title = article_to_text(link)
    inserted = mycol.insert_one({"link": link, "text": text, "title": title})
    id_mnc.append(inserted.inserted_id)

In [16]:
# To check if any articles have missing data
# cursor = mycol.find({"$where": 'this.text.length < 700'})
# for document in cursor: pprint(document)

In [31]:
# Sanity check: number of documents the moneycontrol collection reports after the inserts.
mycol.estimated_document_count()

162

## Economic Times

In [17]:
# Download the Economic Times home page.
url = "https://economictimes.indiatimes.com/"
# A timeout keeps the cell from hanging forever on a stalled connection.
req2 = requests.get(url, timeout=30)
# Fail loudly on a 4xx/5xx response instead of parsing an error page downstream.
req2.raise_for_status()

In [18]:
# Parse the Economic Times home page and collect every anchor tag on it.
et = req2.content

# Name the parser explicitly so the parse does not depend on which optional
# parsers happen to be installed (and bs4 does not warn about guessing one).
soup_et = BeautifulSoup(et, "html.parser")

et_links = soup_et.find_all('a')

In [20]:
# Pick article links out of the Economic Times anchors.
# et_valid[k] is the index into et_links of et_links_extracted[k].
et_base_url = "https://economictimes.indiatimes.com"
et_links_extracted = []
et_valid = []
for i, anchor in enumerate(et_links):
    # .get returns None for a missing attribute, which replaces the old bare
    # `except:` and also covers anchors that carry the tracking attribute but
    # no href (those used to raise KeyError outside the try block).
    ga_tag = anchor.get("data-ga-onclick")
    href = anchor.get("href")
    if ga_tag is None or href is None:
        continue
    # Only tracking values that start with a digit and end in "href" are kept;
    # presumably that pattern marks article teasers - confirm against the page.
    if not (ga_tag.endswith("href") and ga_tag[0].isnumeric()):
        continue
    # Hrefs of 75 characters or fewer are dropped.
    if len(href) <= 75:
        continue
    # Hrefs that do not start with "https" get the site root prepended.
    if not href.startswith("https"):
        href = et_base_url + href
    et_links_extracted.append(href)
    et_valid.append(i)


In [21]:
# pd.DataFrame(zip(et_links_extracted,et_valid))

In [22]:
# np.unique reports where each distinct link first occurs; sorting those
# positions puts the survivors back into their original page order.
unq_index_et = np.sort(np.unique(et_links_extracted, return_index=True)[1])

# Apply the same selection to both sequences so they stay aligned.
et_links_extracted = np.asarray(et_links_extracted)[unq_index_et]
et_valid = np.asarray(et_valid)[unq_index_et]

#### Store data in db

In [23]:
# Handle to the "et" (Economic Times) collection inside mydb.
etcol = mydb["et"]
# Drop the collection first so that re-running the notebook does not insert
# on top of documents left over from a previous run.
etcol.drop()

In [24]:
# Unique index on "link" so the same article URL cannot be stored twice.
etcol.create_index("link", unique = True)

'link_1'

In [25]:
def article_to_text_bs4(url):
    """Fetch ``url`` and extract the article body text with BeautifulSoup.

    The text is taken from the first ``<div class="artText">`` on the page,
    or from the first ``<section>`` when no such div exists. Text fragments
    are joined with single spaces.

    Parameters
    ----------
    url : str
        Address of the article page.

    Returns
    -------
    str
        Text of the chosen container.

    Raises
    ------
    ValueError
        If the page has neither an ``artText`` div nor a ``<section>``.
    """
    # A timeout keeps one stalled article from hanging the whole update loop.
    html = requests.get(url, timeout=30)
    # Explicit parser: the result no longer depends on which parsers are installed.
    soup = BeautifulSoup(html.content, "html.parser")

    # Explicit None checks replace the nested bare `except:` blocks, which
    # hid every kind of error rather than just a missing element.
    container = soup.find("div", class_="artText")
    if container is None:
        container = soup.find("section")
    if container is None:
        raise ValueError("no 'artText' div or <section> element found at {}".format(url))
    return container.get_text(separator=" ")

In [26]:
# Download every Economic Times article and store it as one document per link,
# keeping the ids MongoDB assigned in insertion order.
id_et = []
for link in et_links_extracted:
    text, title = article_to_text(link)
    inserted = etcol.insert_one({"link": link, "text": text, "title": title})
    id_et.append(inserted.inserted_id)

In [27]:
# cursor = etcol.find({})
# for document in cursor: pprint(document)

In [32]:
# Sanity check: number of documents the et collection reports after the inserts.
etcol.estimated_document_count()

164

In [29]:
# Ids of stored Economic Times articles whose text has a length below 700;
# these are re-scraped with article_to_text_bs4 in the next step.
cursor = etcol.find({"$where": 'this.text.length < 700'})
id_change = [short_doc["_id"] for short_doc in cursor]

In [30]:
# Re-scrape each short article with the BeautifulSoup extractor and overwrite
# its stored text.
for ID in id_change:
    # The ids were read straight from document["_id"], so they are used as
    # they are. The previous pymongo.collection.ObjectId(...) wrapper depended
    # on a name that module only happens to import for its own use.
    id_filter = {"_id": ID}
    article_link = etcol.find(id_filter)[0]['link']
    etcol.update_one(id_filter, {"$set": {"text": article_to_text_bs4(article_link)}})

In [33]:
# for ID in id_change:
#     pprint(etcol.find({"_id": pymongo.collection.ObjectId(ID)})[0])

### TFIDF

#### Get MongoDB to DF

In [34]:
# Load every Moneycontrol document into a DataFrame.
# NOTE: this rebinds `mnc`, which earlier held the raw home-page bytes.
cursor_mnc = mycol.find({}) 
mnc = pd.DataFrame(list(cursor_mnc))

In [36]:
# Load every Economic Times document into a DataFrame.
# NOTE: this rebinds `et`, which earlier held the raw home-page bytes.
cursor_et = etcol.find({}) 
et = pd.DataFrame(list(cursor_et))

#### Dictionary of docs

In [48]:
# One flat corpus: Moneycontrol article texts first, then Economic Times.
mnc_text_list = mnc["text"].tolist()
et_text_list = et["text"].tolist()
combined_list = [*mnc_text_list, *et_text_list]

In [49]:
# Sanity check: total number of article texts across both sources.
len(combined_list)

326

In [103]:
# Clean and stem every article; each corpus entry becomes a list of tokens.
combined_list_tokenized = list(map(tranform_text, combined_list))

In [109]:
# Documents reach the vectorizer already tokenised, lower-cased and stemmed, so
# the tokenizer is a pass-through and lowercasing is switched off.
# token_pattern is set to None because it is ignored whenever a tokenizer is
# supplied; leaving the default in place makes scikit-learn warn about the
# unused parameter.
tfdf = TfidfVectorizer(tokenizer=identity_tokenizer, lowercase=False, token_pattern=None)

In [110]:
# Fit the vectorizer on the tokenised corpus from both sources; the result is
# kept as tfdf_vector, which get_tfidf later calls .transform on.
tfdf_vector = tfdf.fit(combined_list_tokenized)



In [126]:
# Vocabulary terms of the fitted vectorizer.
# get_feature_names() was removed in scikit-learn 1.2, so prefer its
# replacement when it exists and fall back on older installations. list(...)
# keeps `index` a plain list, as it was before.
if hasattr(tfdf_vector, "get_feature_names_out"):
    index = list(tfdf_vector.get_feature_names_out())
else:
    index = tfdf_vector.get_feature_names()

In [170]:
def get_tfidf(text):
    """Return the TF-IDF vector of one raw article text in dense form.

    The text is cleaned with ``tranform_text`` and wrapped in a one-element
    list so the fitted vectorizer treats it as a single document. The
    transformed result is then converted with ``.todense()``.
    """
    tokens = tranform_text(text)
    transformed = tfdf_vector.transform([tokens])
    return transformed.todense()

In [179]:
# Store one dense TF-IDF vector per Moneycontrol article in a new "tfidf" column.
mnc["tfidf"] = mnc.text.apply(get_tfidf)

In [180]:
# Store one dense TF-IDF vector per Economic Times article in a new "tfidf" column.
et["tfidf"] = et.text.apply(get_tfidf)

### Cosine Similarity

In [183]:
# Empty frame with one row per Economic Times article id and one column per
# Moneycontrol article id. NOTE(review): nothing visible here fills it in yet;
# presumably each cell is meant to hold the cosine similarity of that pair.
df_cos_sim = pd.DataFrame(index=et._id, columns=mnc._id)

In [161]:
# test_trans = tfdf_vector.transform([combined_list_tokenized[0]])

In [None]:
# test_df = pd.DataFrame(test_trans.todense().T, index=tfdf_vector.get_feature_names(), columns=["tfidf"])
# test_df