In [1]:
import pandas as pd
import numpy as np
from duckduckgo_search import DDGS
import json
import html2text
from langchain.document_loaders import AsyncHtmlLoader
from langchain.document_transformers import Html2TextTransformer
import os

def ddg_search(input_text,max_results=None):
    """
    parameter: input_text is a keyword to search
    The function fetch all the url related to given text and saves in json format
    """
    file_path = "ddgs.json"
    if os.path.exists(file_path):
        # Delete the file
        os.remove(file_path)
        print("File deleted successfully.")
    else:
        print("File does not exist.")

    with DDGS() as ddgs:
        results = [r for r in ddgs.text(f"{input_text}", safesearch = "off",timelimit = "d",max_results = max_results)]
    
    # Write list of dictionaries to JSON file
    with open(file_path, "w") as json_file:
        json.dump(results, json_file)
        print("file created successfully")



def do_webscraping(json_file):
    """
    parameter: it takes json file
    returns a list of dictionaries including page content, title, metadata and clean text of the urls
    """
    with open(f"{json_file}", "r") as json_file:
        data_list = json.load(json_file)

    urls = [x['href'] for x in data_list]

    structured_response = []
    for url in urls:
        try:
            loader = AsyncHtmlLoader(url)
            docs = loader.load()

            html2text_transformer = Html2TextTransformer()
            docs_transformed = html2text_transformer.transform_documents(docs)

            if docs_transformed is not None and len(docs_transformed) > 0:
                metadata = docs_transformed[0].metadata
                title = metadata.get('title', '')

                structured_response.append({
                    'summary': docs_transformed[0].page_content,
                    'title': title,
                    'metadata': metadata,
                    'url':url
                })
        except Exception as e:
            print(f"An unexpected error occurred: {e}")
            continue
    
    return structured_response


In [2]:
input_text = "MRF stocks"
ddg_search(input_text,max_results=5)
response = do_webscraping(json_file="ddgs.json")


File deleted successfully.
file created successfully


Fetching pages: 100%|##########| 1/1 [00:00<00:00,  2.88it/s]
Fetching pages: 100%|##########| 1/1 [00:00<00:00,  6.38it/s]
Fetching pages: 100%|##########| 1/1 [00:00<00:00,  2.15it/s]
Fetching pages: 100%|##########| 1/1 [00:00<00:00,  5.94it/s]
Fetching pages: 100%|##########| 1/1 [00:01<00:00,  1.59s/it]


Create chat model

In [5]:
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.prompts import PromptTemplate
from langchain.schema import StrOutputParser
from langchain.schema.prompt_template import format_document
from langchain_core.runnables import RunnablePassthrough
from langchain.document_loaders import WebBaseLoader


  from .autonotebook import tqdm as notebook_tqdm


In [6]:
response[0]

{'summary': 'English  \n  \n  * Hindi\n  * Gujarati\n\nSpecials\n\nSearch Quotes, News, Mutual Fund NAVs\n\n  * Moneycontrol Trending Stock\n\n  * Infosys INE009A01021, INFY, 500209\n  * State Bank of India INE062A01020, SBIN, 500112\n  * Yes Bank INE528G01027, YESBANK, 532648\n  * Bank Nifty \n  * Nifty 500  \n\n  * Quotes\n  * Mutual Funds\n  * Commodities\n  * Futures & Options\n  * Currency\n  * News\n  * Cryptocurrency\n  * Forum\n  * Notices\n  * Videos\n  * Glossary\n  * All\n\n  * Hello, Login\n\nHello, Login\n\nLog-inor Sign-Up\n\nMy Account\n\n    * My Profile\n    * My Portfolio\n    * My Watchlist\n    * Credit Score₹100 Cash Reward\n    * My Feed\n    * My Messages\n    * Price Alerts\n    * My Profile\n    * My PRO\n    * My Portfolio\n    * My Watchlist\n    * Credit Score₹100 Cash Reward\n    * My Feed\n    * My Messages\n    * Price Alerts\n    * Logout\n    * Chat with Us\n    * Download App\n    * Follow us on:\n\nPremium\n\nMy Feed\n\n->->MC_ENG_DESKTOP/MC_ENG_NEWS/

In [35]:
def model():
    """
    function return a chain which can generate sentiment
    """
    
    prompt_template = """
    Given a context and a keyword, analyze the sentiment of the context if it's related to the keyword.
    If the context is irrelevant to the keyword, return 'Irrelevant text'.
    If relevant, determine whether the sentiment is positive or negative.

    Context: \n {context} \n
    Keyword: \n {keyword} \n

    Answer: 

    Note: Answer must be either "Positive" or "Negetive" or ""Irrelevent Text"
    """
    doc_prompt = PromptTemplate.from_template("{page_content}")
    llm_prompt = PromptTemplate.from_template(prompt_template)

    print(llm_prompt)
    llm = ChatGoogleGenerativeAI(model="gemini-pro",
                    temperature=0.7, top_p=0.85)
    

    stuff_chain = (
        # Extract data from the documents and add to the key `text`.
        {
            "context": lambda docs: "\n\n".join(
                format_document(doc, doc_prompt) for doc in docs
            ),
            "keyword": RunnablePassthrough()

        }
        | llm_prompt         # Prompt for Gemini
        | llm                # Gemini function
        | StrOutputParser()  # output parser
    )

    
    return stuff_chain

In [18]:
input_text = "MRF stocks"
ddg_search(input_text,max_results=5)
response = do_webscraping(json_file="ddgs.json")


File deleted successfully.
file created successfully


Fetching pages: 100%|##########| 1/1 [00:00<00:00,  1.48it/s]
Fetching pages: 100%|##########| 1/1 [00:00<00:00,  2.20it/s]
Fetching pages: 100%|##########| 1/1 [00:00<00:00,  2.80it/s]
Fetching pages: 100%|##########| 1/1 [00:00<00:00,  3.82it/s]
Fetching pages: 100%|##########| 1/1 [00:00<00:00,  2.06it/s]


In [3]:
url_list = [x['url'] for x in response]
url_title = [x['title'] for x in response]

In [4]:
url_list[0]

'https://www.moneycontrol.com/news/business/markets/trade-setup-for-monday-15-things-to-know-before-opening-bell-18-12392891.html'

In [36]:
loader = WebBaseLoader(url_list[0])
docs = loader.load()
res = model()
res.invoke(docs)

input_variables=['context', 'keyword'] template='\n    Given a context and a keyword, analyze the sentiment of the context if it\'s related to the keyword.\n    If the context is irrelevant to the keyword, return \'Irrelevant text\'.\n    If relevant, determine whether the sentiment is positive or negative.\n\n    Context: \n {context} \n\n    Keyword: \n {keyword} \n\n\n    Answer: \n\n    Note: Answer must be either "Positive" or "Negetive" or ""Irrelevent Text"\n    '


'Irrelevant text'

In [5]:
from sentiment_analysis_stock_links.component.model import SentimentAnalyzer

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
model = SentimentAnalyzer.analyze_sentiment()

input_variables=['context', 'keyword'] template='\n        Given a context and a keyword, analyze the sentiment of the context if it\'s related to the keyword.\n        If the context is irrelevant to the keyword, return \'Irrelevant text\'.\n        If relevant, determine whether the sentiment is positive or negative.\n\n        Context: \n {context} \n\n        Keyword: \n {keyword} \n\n\n        Answer: \n\n        Note: Answer must be either "Positive" or "Negetive" or ""Irrelevent Text"\n        '


In [8]:
from langchain.document_loaders import WebBaseLoader

loader = WebBaseLoader(url_list[0])
docs = loader.load()
model.invoke(docs)

NameError: name 'res' is not defined

In [25]:
model = SentimentAnalyzer()

result_list = []
for url in url_list:
    result = model.analyze_sentiment(url_title)
    result_list.append(result)

InvalidSchema: No connection adapters were found for 'Trade setup for Monday: 15 things to know before opening bell'