In [4]:
# To do the same we did in the terminal in the code to understand how the ans and queries are executed
import requests
import json

def get_ollama_response(prompt, model="llama3.2"):
    url = "http://localhost:11434/api/generate"
    # 11434: This is the default port that Ollama uses to listen for API requests.
    headers = {"Content-Type": "application/json"}
    data = {
        "model": model,
        "prompt": prompt,
        "stream": False
    }
    
    try:
        response = requests.post(url, headers=headers, data=json.dumps(data))
        response.raise_for_status()
        return response.json()["response"]
    except Exception as e:
        return f"Error: {e}. Make sure Ollama is running and '{model}' is pulled."

print("Welcome to Simple Ollama Q&A!")
print("Type 'exit' to quit.")

user_question = ''
while True and user_question.lower() != 'bye':
    user_question = input("\nAsk a question: ")
    
    if user_question.lower() == 'bye':
        print("Exiting. Goodbye!")
        break
    
    answer = get_ollama_response(user_question)
    print(f"Answer: {answer}")

Welcome to Simple Ollama Q&A!
Type 'exit' to quit.
Exiting. Goodbye!


In [None]:
# from googlesearch import search
# import requests

In [None]:

 #Written by sir.
# results= search("Messi", num_results=10 )
# all_content =''
# for each in results:
#     print(f"Fetching content from: {each}")
#     content= requests.get(each).text
#     print(content)
#     all_content += content

In [5]:
import os
import requests
from urllib.robotparser import RobotFileParser
from urllib.parse import urlparse
from googlesearch import search
import time
import re

def is_scraping_allowed(url, user_agent="*"):
    try:
        parsed = urlparse(url)
        robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"
        rp = RobotFileParser()
        rp.set_url(robots_url)
        rp.read()
        return rp.can_fetch(user_agent, url)
    except:
        return False  # Safer to assume not allowed

def sanitize_filename(url):
    """Create a safe filename from a URL."""
    filename = re.sub(r'\W+', '_', url)
    return filename[:100] + ".html"

def collect_raw_html(query, num_results=5):
    """
    Searches Google for the query, fetches raw HTML from allowed sites,
    and saves them in a folder called 'html_ollama_folder'.
    """
    # Create folder if it doesn't exist
    folder_name = "html_ollama_folder"
    os.makedirs(folder_name, exist_ok=True)

    collected = 0
    seen = set()
    search_results = list(search(query, num_results=30))  # over-fetch
    url_to_file = {}

    for url in search_results:
        if collected >= num_results:
            break
        if url in seen:
            continue
        seen.add(url)

        print(f"\n🔍 Trying: {url}")

        if not is_scraping_allowed(url):
            print(f"❌ Not allowed by robots.txt: {url}")
            continue

        try:
            response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
            if response.status_code == 200:
                filename = sanitize_filename(url)
                file_path = os.path.join(folder_name, filename)
                url_to_file[url] = file_path

                with open(file_path, "w", encoding="utf-8") as f:
                    f.write(response.text)

                print(f"✅ Saved: {file_path}")
                collected += 1
                time.sleep(1)  # Be polite
            else:
                print(f"⚠️ Status code {response.status_code}: {url}")
        except Exception as e:
            print(f"🚫 Error fetching {url}: {e}")

    print(f"\n✅ Total collected: {collected}/{num_results}")
    return url_to_file



In [6]:
# Collect 5 raw HTML pages related to Messi
html_pages = collect_raw_html("Messi", num_results=5)





🔍 Trying: https://www.beinsports.com/en-us/soccer/la-liga/articles/could-messi-return-to-barcelona-in-2026-before-the-world-cup-2025-06-30
❌ Not allowed by robots.txt: https://www.beinsports.com/en-us/soccer/la-liga/articles/could-messi-return-to-barcelona-in-2026-before-the-world-cup-2025-06-30

🔍 Trying: https://en.wikipedia.org/wiki/Lionel_Messi
✅ Saved: html_ollama_folder/https_en_wikipedia_org_wiki_Lionel_Messi.html

🔍 Trying: https://www.transfermarkt.com/lionel-messi/profil/spieler/28003
❌ Not allowed by robots.txt: https://www.transfermarkt.com/lionel-messi/profil/spieler/28003

🔍 Trying: https://x.com/WeAreMessi?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Eauthor
❌ Not allowed by robots.txt: https://x.com/WeAreMessi?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Eauthor

🔍 Trying: https://www.instagram.com/leomessi/
❌ Not allowed by robots.txt: https://www.instagram.com/leomessi/

🔍 Trying: http://google.com/search?tbm=isch&q=Lionel+Messi
❌ Not allowed by robots.txt: http:

In [7]:
# Get the first HTML content from the dictionary and print 500 chars
first_html = next(iter(html_pages.values()))
print(first_html[:500])
# But the given system prompt will be in string, but the html content is in dictionary so transfering it into string.
all_html_combined = ""

for html in html_pages.values():
    all_html_combined += html + "\n\n"  # Optional spacing between pages

# Now you can use it like a normal string
print(all_html_combined[:500])


html_ollama_folder/https_en_wikipedia_org_wiki_Lionel_Messi.html
html_ollama_folder/https_en_wikipedia_org_wiki_Lionel_Messi.html

html_ollama_folder/https_www_britannica_com_biography_Lionel_Messi.html

html_ollama_folder/https_www_espn_com_soccer_story___id_45627646_zlatan_ibrahimovic_lionel_messi_plays_statues_inter_mi.html

html_ollama_folder/https_www_bbc_com_sport_football_articles_czxe02expnwo.html

html_ollama_folder/https_www_intermiamicf_com_players_lionel_messi_.html




In [11]:
all_content= "From the following HTML content, first remove all HTML tags to extract only the clean, readable text. Then, identify and extract only the sentences or paragraphs that mention the keyword 'Messi'. Ignore any unrelated content or HTML markup. Return a concise and focused textual summary containing only relevant information about Messi" + all_html_combined

In [15]:
processed_text= get_ollama_response(all_content)

In [16]:
processed_text

"Here is the clean, readable text extracted from the HTML content:\n\nOriginal text without HTML tags:\n\nLionel Messi is a professional soccer player born on June 24, 1987 in Rosario, Argentina. He plays as a forward for Paris Saint-Germain and the Argentina national team.\n\nMessi began playing football at a young age and joined the youth academy of Newell's Old Boys at the age of five. In 2000, he moved to Barcelona, where he made his professional debut in 2004.\n\nIn 2011, Messi won the Ballon d'Or award for the first time. He has since become one of the most successful players in the history of the tournament, winning it a record-breaking seven times.\n\nMessi has also had a successful international career, leading Argentina to the final of the 2020 Copa America. However, he and his team lost to Brazil in the final.\n\nIn addition to his on-field success, Messi is known for his charitable work off the field. He has supported various causes, including education and healthcare initi