In [1]:
import requests
import json
import time

def get_mistral_response(prompt, model="mistral"):
    url = "http://localhost:11434/api/generate"
    headers = {"Content-Type": "application/json"}
    data = {
        "model": model,
        "prompt": prompt,
        "stream": False
    }

    try:

        response = requests.post(url, headers=headers, data=json.dumps(data))
        response.raise_for_status()



        result = response.json()
        return result["response"]
    except Exception as e:
        return f"Error: {e}. Make sure Ollama is running and '{model}' is pulled.", -1

# User Interaction Loop
print("Welcome to Simple-Mistral-Q&A")
print("Type 'bye' to quit.")

while True:
    user_input = input("\nAsk a question: ")

    if user_input.lower() == 'bye':
        print("Exiting. Goodbye!")
        break

    
    answer = get_mistral_response(user_input)
    print(f"Answer: {answer}")



Welcome to Simple-Mistral-Q&A
Type 'bye' to quit.
Exiting. Goodbye!


In [2]:
import os
import requests
from urllib.robotparser import RobotFileParser
from urllib.parse import urlparse
from googlesearch import search
import time
import re

def is_scraping_allowed(url, user_agent="*"):
    try:
        parsed = urlparse(url)
        robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"
        rp = RobotFileParser()
        rp.set_url(robots_url)
        rp.read()
        return rp.can_fetch(user_agent, url)
    except:
        return False  # Safer to assume not allowed

def sanitize_filename(url):
    """Create a safe filename from a URL."""
    filename = re.sub(r'\W+', '_', url)
    return filename[:100] + ".html"

def collect_raw_html(query, num_results=5):
    """
    Searches Google for the query, fetches raw HTML from allowed sites,
    and saves them in a folder called 'html_ollama_folder'.
    """
    # Create folder if it doesn't exist
    folder_name = "html_mistral_folder"
    os.makedirs(folder_name, exist_ok=True)

    collected = 0
    seen = set()
    search_results = list(search(query, num_results=30))  # over-fetch
    url_to_file = {}

    for url in search_results:
        if collected >= num_results:
            break
        if url in seen:
            continue
        seen.add(url)

        print(f"\n🔍 Trying: {url}")

        if not is_scraping_allowed(url):
            print(f"❌ Not allowed by robots.txt: {url}")
            continue

        try:
            response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
            if response.status_code == 200:
                filename = sanitize_filename(url)
                file_path = os.path.join(folder_name, filename)
                url_to_file[url] = file_path

                with open(file_path, "w", encoding="utf-8") as f:
                    f.write(response.text)

                print(f"✅ Saved: {file_path}")
                collected += 1
                time.sleep(1)  # Be polite
            else:
                print(f"⚠️ Status code {response.status_code}: {url}")
        except Exception as e:
            print(f"🚫 Error fetching {url}: {e}")

    print(f"\n✅ Total collected: {collected}/{num_results}")
    return url_to_file



In [6]:
# Collect 5 raw HTML pages related to Messi
html_pages = collect_raw_html("Messi", num_results=5)





🔍 Trying: https://www.beinsports.com/en-us/soccer/la-liga/articles/could-messi-return-to-barcelona-in-2026-before-the-world-cup-2025-06-30
❌ Not allowed by robots.txt: https://www.beinsports.com/en-us/soccer/la-liga/articles/could-messi-return-to-barcelona-in-2026-before-the-world-cup-2025-06-30

🔍 Trying: https://en.wikipedia.org/wiki/Lionel_Messi
✅ Saved: html_mistral_folder/https_en_wikipedia_org_wiki_Lionel_Messi.html

🔍 Trying: https://www.transfermarkt.com/lionel-messi/profil/spieler/28003
❌ Not allowed by robots.txt: https://www.transfermarkt.com/lionel-messi/profil/spieler/28003

🔍 Trying: https://x.com/WeAreMessi?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Eauthor
❌ Not allowed by robots.txt: https://x.com/WeAreMessi?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Eauthor

🔍 Trying: http://google.com/search?tbm=isch&q=Lionel+Messi
❌ Not allowed by robots.txt: http://google.com/search?tbm=isch&q=Lionel+Messi

🔍 Trying: https://www.instagram.com/leomessi/
❌ Not allowed by rob

In [7]:
# Get the first HTML content from the dictionary and print 500 chars
first_html = next(iter(html_pages.values()))
print(first_html[:500])
# But the given system prompt will be in string, but the html content is in dictionary so transfering it into string.
all_html_combined = ""

for html in html_pages.values():
    all_html_combined += html + "\n\n"  # Optional spacing between pages

# Now you can use it like a normal string
print(all_html_combined[:500])


html_mistral_folder/https_en_wikipedia_org_wiki_Lionel_Messi.html
html_mistral_folder/https_en_wikipedia_org_wiki_Lionel_Messi.html

html_mistral_folder/https_www_britannica_com_biography_Lionel_Messi.html

html_mistral_folder/https_www_espn_com_soccer_story___id_45627646_zlatan_ibrahimovic_lionel_messi_plays_statues_inter_mi.html

html_mistral_folder/https_www_intermiamicf_com_players_lionel_messi_.html

html_mistral_folder/https_www_espn_com_soccer_player___id_45843_lionel_messi.html




In [8]:
all_content= "From the following HTML content, first remove all HTML tags to extract only the clean, readable text. Then, identify and extract only the sentences or paragraphs that mention the keyword 'Messi'. Ignore any unrelated content or HTML markup. Return a concise and focused textual summary containing only relevant information about Messi" + all_html_combined

In [9]:
processed_text= get_mistral_response(all_content)

In [10]:
processed_text

" After removing HTML tags, I've extracted the following text snippets that mention 'Messi':\n\n1. Lionel Messi is an Argentine professional footballer who plays as a forward for Paris Saint-Germain and captains the Argentina national team. (From Wikipedia)\n\n2. Messi and Zlatan Ibrahimovic share the same statue at Milan's San Siro stadium. (From ESPN)\n\n3. Lionel Messi is a player for Inter Miami CF, an American professional soccer club based in Miami, Florida. (From intermiamicf.com)\n\n4. Lionel Messi is an Argentine footballer who plays as a forward for Paris Saint-Germain and captains the Argentina national team. He is widely regarded as one of the greatest players of all time. (From ESPN)\n\nIn summary, Lionel Messi is an Argentine professional footballer currently playing as a forward for Paris Saint-Germain and captain of the Argentina national team. He has been widely recognized as one of the greatest players in soccer history. Additionally, he shares a statue with Zlatan Ib