Web Scraping Financial Express

In [1]:
import pandas as pd

In [3]:
# Install the scraping dependencies into the kernel's environment.
# Selenium drives the browser used for web scraping below.
# NOTE(review): `requests` is not imported anywhere in the visible notebook —
# confirm it is still needed.
%pip install requests selenium --quiet

Note: you may need to restart the kernel to use updated packages.


In [4]:
from selenium import webdriver
from selenium.webdriver.common.by import By

The following function opens a page of the website, finds the required HTML tags (websites are built from HTML tags), and extracts the data from them.

Note: this code is written for the Chrome browser.

In [26]:


def get_article_details(url):
    """Scrape one listing page into a DataFrame of article details.

    Loads ``url`` in headless Chrome, finds every ``<div class='entry-wrapper'>``
    container and reads the title, link, author, date and (optional) excerpt
    of each entry.

    Args:
        url: Address of the listing page to load. It is printed before loading
            so progress is visible when several pages are scraped.

    Returns:
        pandas.DataFrame with the columns ``Title``, ``URL``, ``Author``,
        ``Date`` and ``Excerpt``; one row per entry that could be parsed, and
        an empty frame with those columns when none could.
    """
    # Local import keeps this block self-contained. WebDriverException is the
    # base class of Selenium's lookup errors (e.g. NoSuchElementException).
    from selenium.common.exceptions import WebDriverException

    print(url)

    # Settings Chrome will start with.
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")

    columns = ['Title', 'URL', 'Author', 'Date', 'Excerpt']
    records = []

    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get(url)

        # Each article on the page lives in its own container.
        containers = driver.find_elements(By.XPATH, "//div[@class='entry-wrapper']")

        for container in containers:
            try:
                title_element = container.find_element(By.XPATH, ".//h2[@class='entry-title']/a")
                author_element = container.find_element(By.XPATH, ".//div[@class='author-link']/a")
                date_element = container.find_element(By.XPATH, ".//time[@class='entry-date published']")

                # find_elements returns an empty list when nothing matches
                # (find_element would raise), so a missing excerpt becomes ""
                # instead of discarding the whole article.
                excerpt_elements = container.find_elements(
                    By.XPATH, ".//div[@class='hide-for-small-only post-excerpt']/p"
                )

                records.append({
                    "Title": title_element.text,
                    "URL": title_element.get_attribute("href"),
                    "Author": author_element.text,
                    "Date": date_element.text,
                    "Excerpt": excerpt_elements[0].text if excerpt_elements else "",
                })
            except WebDriverException:
                # Best effort: skip entries lacking a title, author or date.
                continue
    finally:
        # Always shut the browser down, even if loading or parsing failed.
        driver.quit()

    # Build the frame once from the collected rows instead of concatenating
    # inside the loop; the explicit column list also covers the empty case.
    return pd.DataFrame(records, columns=columns)

base_url = "https://www.financialexpress.com/market/page/"
article_columns = ['Title', 'URL', 'Author', 'Date', 'Excerpt']

# Scrape listing pages 1 through 9 (range() excludes its end value) and keep
# each page's frame so they can be combined in a single concat afterwards.
page_frames = []
for i in range(1, 10):
    url = f"{base_url}{i}/"
    article_df = get_article_details(url)
    page_frames.append(article_df)

# Skip pages that yielded nothing, then concatenate once. reindex() fixes the
# column order and adds any column a page frame might be missing.
non_empty_frames = [frame for frame in page_frames if not frame.empty]
if non_empty_frames:
    all_article_details = (
        pd.concat(non_empty_frames, ignore_index=True)
        .reindex(columns=article_columns)
    )
else:
    all_article_details = pd.DataFrame(columns=article_columns)



https://www.financialexpress.com/market/page/1/
https://www.financialexpress.com/market/page/2/
https://www.financialexpress.com/market/page/3/
https://www.financialexpress.com/market/page/4/
https://www.financialexpress.com/market/page/5/
https://www.financialexpress.com/market/page/6/
https://www.financialexpress.com/market/page/7/
https://www.financialexpress.com/market/page/8/
https://www.financialexpress.com/market/page/9/


In [27]:
# Preview the first five scraped articles.
all_article_details.head(n=5)

Unnamed: 0,Title,URL,Author,Date,Excerpt
0,"Are markets closed on December 25, 2024?",https://www.financialexpress.com/market/are-ma...,Aniket Sharma,"December 25, 2024 11:09 IST","Stock Market Holidays: Christmas, celebrated o..."
1,The Role of Gold Reserves in National Economie...,https://www.financialexpress.com/market/commod...,FE Business,"December 25, 2024 10:33 IST",Gold reserves are a key financial asset held b...
2,"Newsmakers of 2024: Madhabi Puri Buch, chairpe...",https://www.financialexpress.com/market/newsma...,Joydeep Ghosh,"December 25, 2024 05:45 IST",. The government should not shut the doors of ...
3,Tata Capital listing hopes drive group company...,https://www.financialexpress.com/market/tata-c...,Anupreksha Jain,"December 24, 2024 21:50 IST",Tata Capital had assets under management of Rs...
4,Senores Pharmaceuticals IPO allotment likely o...,https://www.financialexpress.com/market/ipo-ne...,Sparsh Bansal,"December 24, 2024 14:39 IST",Senores Pharmaceuticals IPO GMP Today: Senores...


Sentiment Analysis of the Scraped Data

Preprocessing of the text

In [28]:
from nltk.corpus import stopwords
import nltk
from nltk.stem import WordNetLemmatizer

In [29]:
# Load NLTK's English stop-word list. The corpus is fetched on first use so
# the notebook also runs in an environment where it was never downloaded.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords', quiet=True)
    stop_words = set(stopwords.words('english'))

def remove_stop_words(sentence):
  """Return ``sentence`` without stop words and non-alphabetic tokens.

  The sentence is split on whitespace; a token is kept only if it consists
  purely of letters and is not in ``stop_words``. Kept tokens are re-joined
  with single spaces.

  NOTE(review): a word with punctuation attached (e.g. "buch,") fails the
  ``isalpha`` check and is dropped as a whole — confirm this is intended.
  """
  kept_words = [
      word for word in sentence.split()
      if word.isalpha() and word not in stop_words
  ]
  return ' '.join(kept_words)

In [30]:
def apply_lemmatization(sentence):
    """Lemmatize each whitespace-separated token of ``sentence``.

    Every token is passed to ``WordNetLemmatizer.lemmatize`` with its default
    arguments, and the results are re-joined with single spaces.
    """
    wordnet_lemmatizer = WordNetLemmatizer()
    tokens = sentence.split()
    return ' '.join(map(wordnet_lemmatizer.lemmatize, tokens))

In [31]:
from transformers import pipeline
import torch

# Use Apple's MPS backend when present, otherwise fall back to CUDA and then
# CPU, so the notebook is not tied to one kind of machine.
if torch.backends.mps.is_available():
    sentiment_device = "mps"
elif torch.cuda.is_available():
    sentiment_device = "cuda"
else:
    sentiment_device = "cpu"

# Pin the model and revision that the pipeline previously selected by default,
# so results stay reproducible if the library's default ever changes.
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
    revision="af0f99b",
    device=sentiment_device,
)

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


Here we analyse only the news titles; the same steps can also be applied to the Excerpt column.

In [32]:
# Normalise the titles (this overwrites the "Title" column in place):
# lower-case -> drop stop words / non-alphabetic tokens -> lemmatize.
all_article_details["Title"] = (
    all_article_details["Title"]
    .str.lower()
    .apply(remove_stop_words)
    .apply(apply_lemmatization)
)

# Score every cleaned title in one pipeline call.
cleaned_titles = all_article_details["Title"].tolist()
sentiment = sentiment_pipeline(cleaned_titles)

In [43]:
# Turn the pipeline results into a frame with one row per title
# (columns: label, score).
title_senti_df = pd.DataFrame(data=sentiment)

In [44]:
# Preview the first five sentiment results.
title_senti_df.head(n=5)

Unnamed: 0,label,score
0,NEGATIVE,0.991717
1,POSITIVE,0.996173
2,NEGATIVE,0.711165
3,POSITIVE,0.939162
4,NEGATIVE,0.984885


In [46]:
# Place the sentiment columns next to the article columns; rows are matched
# on the index of the two frames.
all_article_details_with_sentiments = pd.concat(
    [all_article_details, title_senti_df],
    axis="columns",
)

In [47]:
# Preview the first five articles together with their sentiment.
all_article_details_with_sentiments.head(n=5)

Unnamed: 0,Title,URL,Author,Date,Excerpt,label,score
0,market closed december,https://www.financialexpress.com/market/are-ma...,Aniket Sharma,"December 25, 2024 11:09 IST","Stock Market Holidays: Christmas, celebrated o...",NEGATIVE,0.991717
1,role gold reserve national global overview,https://www.financialexpress.com/market/commod...,FE Business,"December 25, 2024 10:33 IST",Gold reserves are a key financial asset held b...,POSITIVE,0.996173
2,newsmakers madhabi puri sebi,https://www.financialexpress.com/market/newsma...,Joydeep Ghosh,"December 25, 2024 05:45 IST",. The government should not shut the doors of ...,NEGATIVE,0.711165
3,tata capital listing hope drive group company ...,https://www.financialexpress.com/market/tata-c...,Anupreksha Jain,"December 24, 2024 21:50 IST",Tata Capital had assets under management of Rs...,POSITIVE,0.939162
4,senor pharmaceutical ipo allotment likely dece...,https://www.financialexpress.com/market/ipo-ne...,Sparsh Bansal,"December 24, 2024 14:39 IST",Senores Pharmaceuticals IPO GMP Today: Senores...,NEGATIVE,0.984885
