<a href="https://colab.research.google.com/github/nelslindahlx/Random-Notebooks/blob/master/basic_search_engine_with_crawling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Basic Search Engine with Web Crawling in Python

### Cell 1: Install Required Libraries

In [1]:
!pip install requests beautifulsoup4



### Cell 2: Import Necessary Libraries

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import re


### Cell 3: Define the Web Crawler

In [3]:
def clean_text(text):
    # Clean the text by removing unnecessary characters
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return text

def crawl_website(url, max_pages=5):
    crawled_data = []
    pages_to_crawl = [url]
    crawled_urls = set()

    while pages_to_crawl and len(crawled_data) < max_pages:
        current_url = pages_to_crawl.pop(0)
        if current_url in crawled_urls:
            continue

        try:
            response = requests.get(current_url)
            if response.status_code == 200:
                soup = BeautifulSoup(response.content, 'html.parser')
                text = ' '.join([p.get_text() for p in soup.find_all('p')])
                text = clean_text(text)

                if text:
                    crawled_data.append({
                        'url': current_url,
                        'text': text
                    })

                crawled_urls.add(current_url)

                for link in soup.find_all('a', href=True):
                    full_url = requests.compat.urljoin(current_url, link['href'])
                    if full_url not in crawled_urls:
                        pages_to_crawl.append(full_url)

        except Exception as e:
            print(f"Failed to crawl {current_url}: {e}")

    return pd.DataFrame(crawled_data)

# Start crawling
df = crawl_website('https://www.civichonors.com', max_pages=10)
df.head()


Unnamed: 0,url,text
0,https://www.civichonors.com,Civic Honors Graduation with Civic Honors Unlo...
1,https://civichonors.com/,Civic Honors Graduation with Civic Honors Unlo...
2,https://www.civichonors.com#content,Civic Honors Graduation with Civic Honors Unlo...
3,https://www.civichonors.com#search,Civic Honors Graduation with Civic Honors Unlo...
4,https://www.civichonors.com#top,Civic Honors Graduation with Civic Honors Unlo...


### Cell 4: Preprocess Data

In [4]:
# Initialize the TF-IDF Vectorizer
tfidf = TfidfVectorizer(stop_words='english')

# Fit and transform the data
tfidf_matrix = tfidf.fit_transform(df['text'])

# Display the shape of the matrix
tfidf_matrix.shape


(10, 2103)

### Cell 5: Define the Search Function

In [5]:
def search(query, top_n=3):
    # Transform the query to match the TF-IDF matrix
    query_vec = tfidf.transform([query])

    # Calculate the cosine similarities
    cosine_similarities = linear_kernel(query_vec, tfidf_matrix).flatten()

    # Get the top N results
    related_docs_indices = cosine_similarities.argsort()[:-top_n-1:-1]

    # Return the results
    results = df.iloc[related_docs_indices]
    return results

# Testing the search function
query = "community service"
results = search(query)
results


Unnamed: 0,url,text
9,https://civichonors.com/#top,Civic Honors Graduation with Civic Honors Unlo...
8,https://civichonors.com/#search,Civic Honors Graduation with Civic Honors Unlo...
7,https://civichonors.com/#content,Civic Honors Graduation with Civic Honors Unlo...


### Cell 6: Display Results

In [6]:
# Displaying the results in a more readable format
def display_results(results):
    for index, row in results.iterrows():
        print(f"URL: {row['url']}")
        print(f"Text: {row['text'][:200]}...")  # Display the first 200 characters of the text
        print()

# Testing the display function
display_results(results)


URL: https://civichonors.com/#top
Text: Civic Honors Graduation with Civic Honors Unlock the Power of Community Opportunity This book was published in 2006 It has a formal copyright You can buy a physical copy if you want or just read it on...

URL: https://civichonors.com/#search
Text: Civic Honors Graduation with Civic Honors Unlock the Power of Community Opportunity This book was published in 2006 It has a formal copyright You can buy a physical copy if you want or just read it on...

URL: https://civichonors.com/#content
Text: Civic Honors Graduation with Civic Honors Unlock the Power of Community Opportunity This book was published in 2006 It has a formal copyright You can buy a physical copy if you want or just read it on...

