In [3]:
# To do the same we did in the terminal in the code to understand how the ans and queries are executed
import requests
import json

def get_deepseek_response(prompt, model="deepseek-r1:1.5b"):
    url = "http://localhost:11434/api/generate"
    # 11434: This is the default port that deepseek-r1:1.5b uses to listen for API requests.
    headers = {"Content-Type": "application/json"}
    data = {
        "model": model,
        "prompt": prompt,
        "stream": False
    }
    
    try:
        response = requests.post(url, headers=headers, data=json.dumps(data))
        response.raise_for_status()
        return response.json()["response"]
    except Exception as e:
        return f"Error: {e}. Make sure deepseek-r1:1.5b is running and '{model}' is pulled."

print("Welcome to Simple deepseek-r1:1.5b Q&A!")
print("Type 'exit' to quit.")

user_question = ''
while True and user_question.lower() != 'bye':
    user_question = input("\nAsk a question: ")
    
    if user_question.lower() == 'bye':
        print("Exiting. Goodbye!")
        break
    
    answer = get_deepseek_response(user_question)
    print(f"Answer: {answer}")

Welcome to Simple deepseek-r1:1.5b Q&A!
Type 'exit' to quit.
Answer: <think>

</think>

Hi! I'm DeepSeek-R1, an AI assistant independently developed by the Chinese company DeepSeek Inc. For detailed information about models and products, please refer to the official documentation.
Answer: <think>
Alright, so I need to figure out how many different ways there are to line up 5 people for a photo. Let's see... I remember something about permutations from math class. Permutations are all the possible arrangements of something, right? So if I have 5 people and I want to arrange them in a line, each person can be in any position, but they can't repeat.

Hmm, so maybe it's just as simple as multiplying the number of choices at each step? Like, for the first position, there are 5 people. Once someone is chosen for the first spot, that person can't be in another spot anymore. So for the second spot, there should be 4 people left. Then, after choosing two people, there are 3 left for the third s

In [None]:
# from googlesearch import search
# import requests

In [None]:

 #Written by sir.
# results= search("Messi", num_results=10 )
# all_content =''
# for each in results:
#     print(f"Fetching content from: {each}")
#     content= requests.get(each).text
#     print(content)
#     all_content += content

In [4]:
import os
import requests
from urllib.robotparser import RobotFileParser
from urllib.parse import urlparse
from googlesearch import search
import time
import re

def is_scraping_allowed(url, user_agent="*"):
    try:
        parsed = urlparse(url)
        robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"
        rp = RobotFileParser()
        rp.set_url(robots_url)
        rp.read()
        return rp.can_fetch(user_agent, url)
    except:
        return False  # Safer to assume not allowed

def sanitize_filename(url):
    """Create a safe filename from a URL."""
    filename = re.sub(r'\W+', '_', url)
    return filename[:100] + ".html"

def collect_raw_html(query, num_results=5):
    """
    Searches Google for the query, fetches raw HTML from allowed sites,
    and saves them in a folder called 'html_deepseek-r1_folder'.
    """
    # Create folder if it doesn't exist
    folder_name = "html_deepseek-r1_folder"
    os.makedirs(folder_name, exist_ok=True)

    collected = 0
    seen = set()
    search_results = list(search(query, num_results=30))  # over-fetch
    url_to_file = {}

    for url in search_results:
        if collected >= num_results:
            break
        if url in seen:
            continue
        seen.add(url)

        print(f"\n🔍 Trying: {url}")

        if not is_scraping_allowed(url):
            print(f"❌ Not allowed by robots.txt: {url}")
            continue

        try:
            response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
            if response.status_code == 200:
                filename = sanitize_filename(url)
                file_path = os.path.join(folder_name, filename)
                url_to_file[url] = file_path

                with open(file_path, "w", encoding="utf-8") as f:
                    f.write(response.text)

                print(f"✅ Saved: {file_path}")
                collected += 1
                time.sleep(1)  # Be polite
            else:
                print(f"⚠️ Status code {response.status_code}: {url}")
        except Exception as e:
            print(f"🚫 Error fetching {url}: {e}")

    print(f"\n✅ Total collected: {collected}/{num_results}")
    return url_to_file



In [5]:
# Collect 5 raw HTML pages related to Messi
html_pages = collect_raw_html("Messi", num_results=5)





🔍 Trying: https://www.beinsports.com/en-us/soccer/la-liga/articles/could-messi-return-to-barcelona-in-2026-before-the-world-cup-2025-06-30
❌ Not allowed by robots.txt: https://www.beinsports.com/en-us/soccer/la-liga/articles/could-messi-return-to-barcelona-in-2026-before-the-world-cup-2025-06-30

🔍 Trying: https://en.wikipedia.org/wiki/Lionel_Messi
✅ Saved: html_deepseek-r1_folder/https_en_wikipedia_org_wiki_Lionel_Messi.html

🔍 Trying: https://www.transfermarkt.com/lionel-messi/profil/spieler/28003
❌ Not allowed by robots.txt: https://www.transfermarkt.com/lionel-messi/profil/spieler/28003

🔍 Trying: http://google.com/search?tbm=isch&q=Lionel+Messi
❌ Not allowed by robots.txt: http://google.com/search?tbm=isch&q=Lionel+Messi

🔍 Trying: https://x.com/WeAreMessi?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Eauthor
❌ Not allowed by robots.txt: https://x.com/WeAreMessi?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Eauthor

🔍 Trying: https://www.instagram.com/leomessi/
❌ Not allowed by

In [6]:
# Get the first HTML content from the dictionary and print 500 chars
first_html = next(iter(html_pages.values()))
print(first_html[:500])
# But the given system prompt will be in string, but the html content is in dictionary so transfering it into string.
all_html_combined = ""

for html in html_pages.values():
    all_html_combined += html + "\n\n"  # Optional spacing between pages

# Now you can use it like a normal string
print(all_html_combined[:500])


html_deepseek-r1_folder/https_en_wikipedia_org_wiki_Lionel_Messi.html
html_deepseek-r1_folder/https_en_wikipedia_org_wiki_Lionel_Messi.html

html_deepseek-r1_folder/https_www_britannica_com_biography_Lionel_Messi.html

html_deepseek-r1_folder/https_www_espn_com_soccer_story___id_45627646_zlatan_ibrahimovic_lionel_messi_plays_statues_inter_mi.html

html_deepseek-r1_folder/https_www_intermiamicf_com_players_lionel_messi_.html

html_deepseek-r1_folder/https_www_espn_com_soccer_player___id_45843_lionel_messi.html




In [15]:
all_content= "From the following HTML content, first remove all HTML tags to extract only the clean, readable text. Then, identify and extract only the sentences or paragraphs that mention the keyword 'Messi'. Ignore any unrelated content or HTML markup. Return a concise and focused textual summary containing only relevant information about Messi. And also ignore your thinking process and don't include that" + all_html_combined

In [16]:
processed_text= get_deepseek_response(all_content)

In [17]:
processed_text

'<think>\nOkay, so I need to figure out how to extract information about Lionel Messi from the given HTML content. Let me start by understanding what\'s being asked. The user provided a bunch of links, and they want me to first remove all HTML tags to get only the text. Then, I have to find and extract sentences or paragraphs that mention the keyword \'Messi\'. Also, I need to ignore any unrelated content.\n\nHmm, looking at the URLs, most are about biographies and soccer articles. So, there\'s probably a lot of non-HTML text here. My first step is to clean up the HTML tags. How do I remove them in HTML? Oh right, using a script that iterates through each link and removes any anchor tag or ul/li elements that might have hrefs set to empty strings.\n\nOnce all HTML is removed, I\'ll get a string of text. Now, I need to find instances where \'Messi\' appears. Since the text can be in different languages, like English, French, etc., I should look for \'Lionel Messi\' and then see if any o