<a href="https://colab.research.google.com/github/muffindonor/cloud_computing_project/blob/main/Untitled4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# התקנת ספריית Firebase Admin
!pip install firebase

# התקנת חבילות נדרשות במידת הצורך
!pip install firebase beautifulsoup4

Collecting firebase
  Downloading firebase-4.0.1-py3-none-any.whl.metadata (6.5 kB)
Downloading firebase-4.0.1-py3-none-any.whl (12 kB)
Installing collected packages: firebase
Successfully installed firebase-4.0.1


In [2]:
import requests
from bs4 import BeautifulSoup
import re
from collections import defaultdict
from urllib.parse import urljoin
from firebase import firebase
import nltk

nltk.download('stopwords')
from nltk.corpus import stopwords


class SearchEngine:
    def __init__(self):
        """Initialize the search engine"""
        self.pages = []
        self.word_locations = defaultdict(list)  # word -> [(page_id, frequency), ...]
        self.stop_words = set(stopwords.words('english'))
        self.firebase_conn = firebase.FirebaseApplication('https://cloudproject-27420-default-rtdb.firebaseio.com/', None)

    def fetch_pages(self, base_url, num_pages=5):
        """Fetch pages from the provided base URL"""
        try:
            response = requests.get(base_url)
            soup = BeautifulSoup(response.text, 'html.parser')

            # Extract and resolve links
            links = []
            for a_tag in soup.find_all('a', href=True):
                full_url = urljoin(base_url, a_tag['href'])
                if full_url.startswith(base_url):  # Ensure links are within the same domain
                    links.append(full_url)

            links = list(set(links))  # Remove duplicates
            links = links[:num_pages]  # Limit to the number of pages
            print(f"Found {len(links)} links. Processing the first {num_pages}.")

            for link in links:
                page_response = requests.get(link)
                page_soup = BeautifulSoup(page_response.text, 'html.parser')
                page_text = page_soup.get_text(separator=" ")

                self.pages.append({
                    'id': link,
                    'title': page_soup.title.string if page_soup.title else 'No Title',
                    'url': link,
                    'content': page_text
                })
                print(f"Retrieved: {link}")

            return True

        except Exception as e:
            print(f"Error fetching pages: {str(e)}")
            return False

    def build_index(self):
        """Build a simple word location index"""
        self.word_locations.clear()

        # Process each page
        for page in self.pages:
            words = re.findall(r'\w+', page['content'].lower())
            word_counts = defaultdict(int)
            for word in words:
                if word not in self.stop_words:
                    word_counts[word] += 1

            for word, count in word_counts.items():
                self.word_locations[word].append({
                    'DocID': page['id'],
                    'Title': page['title'],
                    'Frequency': count
                })

        print("Index built successfully!")

        # Save index to Firebase
        index_data = []
        for term, locations in self.word_locations.items():
            index_data.append({
                'Term': term,
                'DocIDs': locations
            })
        self.firebase_conn.put('/', 'word_index', index_data)
        print("Index saved to Firebase successfully!")


    def search(self, query, num_results=5):
      """Search for a query in the indexed data"""
      query_words = [word.lower() for word in re.findall(r'\w+', query) if word.lower() not in self.stop_words]
      if not query_words:
          return []

      page_scores = defaultdict(lambda: {'matches': 0, 'total_freq': 0})
      for word in query_words:
          for entry in self.word_locations.get(word, []):
              page_id = entry['DocID']
              page_scores[page_id]['matches'] += 1
              page_scores[page_id]['total_freq'] += entry['Frequency']

      ranked_results = sorted(
          [(page_id, scores['matches'], scores['total_freq']) for page_id, scores in page_scores.items()],
          key=lambda x: (x[1], x[2]), reverse=True
      )

      results = []
      for page_id, matches, total_freq in ranked_results[:num_results]:
          page = next((p for p in self.pages if p['id'] == page_id), None)
          if page:
              results.append({
                  'title': page['title'],
                  'url': page['url'],
                  'matching_words': matches,
                  'total_frequency': total_freq
              })
      return results



# usage example
if __name__ == "__main__":
    base_url = "https://www.oracle.com/"
    search_engine = SearchEngine()

    if search_engine.fetch_pages(base_url, num_pages=6):
        search_engine.build_index()
        query = "cloud computing"
        results = search_engine.search(query, num_results=6)
        for result in results:
            print(f"Title: {result['title']}, URL: {result['url']}, Matches: {result['matching_words']}, Frequency: {result['total_frequency']}")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Found 6 links. Processing the first 6.
Retrieved: https://www.oracle.com/customers/clayton/
Retrieved: https://www.oracle.com/
Retrieved: https://www.oracle.com/customers/nhssbs/
Retrieved: https://www.oracle.com/customers/uber/
Retrieved: https://www.oracle.com/customers/aon/
Retrieved: https://www.oracle.com/customers/smarterd/
Index built successfully!
Index saved to Firebase successfully!
Title: Clayton builds a modern supply chain and finance ecosystem with Oracle, URL: https://www.oracle.com/customers/clayton/, Matches: 1, Frequency: 29
Title: Oracle | Cloud Applications and Cloud Platform, URL: https://www.oracle.com/, Matches: 1, Frequency: 19
Title: Uber relies on Oracle Cloud to deliver on promises to its customers  , URL: https://www.oracle.com/customers/uber/, Matches: 1, Frequency: 9
Title: Aon enhances its end-to-end client strategy with Oracle Cloud, URL: https://www.oracle.com/customers/aon/, Matches: 1, Frequency: 7
Title: NHS Shared Business Services gains better insi

In [3]:
import ipywidgets as widgets
from IPython.display import display, clear_output
from firebase import firebase


class SearchUI:
    def __init__(self, firebase_url):
        self.firebase_conn = firebase.FirebaseApplication(firebase_url, None)
        self.index_data = self.fetch_index()

    def fetch_index(self):
        try:
            index = self.firebase_conn.get('/word_index', None)
            return index if index else []
        except Exception as e:
            print(f"Error fetching index from Firebase: {e}")
            return []

    def search_query(self, query):
        query_words = [word.lower() for word in query.split()]
        results = []
        for term in query_words:
            for entry in self.index_data:
                if entry['Term'] == term:
                    for occurrence in entry['DocIDs']:
                        results.append({
                            'page_id': occurrence['DocID'],
                            'title': occurrence['Title'],
                            'frequency': occurrence['Frequency']
                        })
        return results

    def show_search_screen(self):
        clear_output()

        # Dark background style
        background_style = '''
        <style>
            body {
                background-color: #1a202c !important;
                min-height: 100vh;
                margin: 0;
                padding: 20px;
                box-sizing: border-box;
            }
            .widget-button button {
                background: linear-gradient(90deg, #ef4444 0%, #f59e0b 100%) !important;
                border: none !important;
                color: white !important;
                border-radius: 20px !important;
                padding: 8px 25px !important;
                font-weight: bold !important;
                transition: transform 0.2s !important;
                box-shadow: none !important;
            }
            .widget-button button:hover {
                transform: scale(1.05) !important;
            }
            .widget-text input {
                background-color: #2d3748 !important;
                border: none !important;
                color: white !important;
                border-radius: 9999px !important;
                padding: 15px 25px !important;
                width: 100% !important;
            }
            .widget-text input::placeholder {
                color: #718096 !important;
            }
            .admin-button {
                background-color: #3b82f6 !important;
                color: white !important;
                padding: 8px 16px !important;
                border-radius: 6px !important;
                border: none !important;
                cursor: pointer !important;
                position: absolute !important;
                right: 20px !important;
                top: 20px !important;
            }
            .jupyter-widgets.widget-hbox {
                display: none !important;
            }
        </style>
        '''

        # Logo with gradient border
        logo_html = '''
        <div style="text-align: center; margin: 40px auto;">
            <div style="width: 150px; height: 150px; margin: 0 auto; position: relative;">
                <div style="
                    width: 150px;
                    height: 150px;
                    border-radius: 50%;
                    background: white;
                    display: flex;
                    align-items: center;
                    justify-content: center;
                    position: relative;
                    z-index: 1;
                ">
                    <span style="color: #718096; font-size: 16px;">150 × 150</span>
                </div>
                <div style="
                    position: absolute;
                    top: -4px;
                    left: -4px;
                    right: -4px;
                    bottom: -4px;
                    background: linear-gradient(90deg, #ef4444 0%, #f59e0b 50%, #ef4444 100%);
                    border-radius: 50%;
                    z-index: 0;
                "></div>
            </div>
        </div>
        '''

        # Search input
        search_bar = widgets.Text(
            placeholder="Search...",
            layout=widgets.Layout(
                width="400px"
            )
        )

        # Search button using widgets.Button
        search_button = widgets.Button(
            description="Badger Search",
            button_style="warning",
            layout=widgets.Layout(width="auto")
        )

        # Admin button
        admin_button = widgets.HTML('''
            <button class="admin-button">
                Admin Page
            </button>
        ''')

        def on_search_clicked(b):
            query = search_bar.value.strip()
            if query:
                results = self.search_query(query)
                self.show_results_screen(query, results)
            else:
                output = widgets.HTML("<div style='color: #ef4444; text-align: center; margin-top: 10px;'>Please enter a valid query!</div>")
                display(output)

        search_button.on_click(on_search_clicked)

        # Display all elements
        display(widgets.HTML(background_style))
        display(admin_button)
        display(widgets.HTML(logo_html))

        # Center the search elements
        search_container = widgets.VBox([
            search_bar,
            search_button
        ], layout=widgets.Layout(
            display='flex',
            flex_flow='column',
            align_items='center',
            width='100%',
            max_width='700px',
            margin='0 auto'
        ))

        display(search_container)

    def show_results_screen(self, query, results):
        clear_output()

        # Keep the dark theme style
        display(widgets.HTML('''
        <style>
            body {
                background-color: #1a202c !important;
                color: white !important;
            }
            .widget-button button {
                background: linear-gradient(90deg, #ef4444 0%, #f59e0b 100%) !important;
                border: none !important;
                color: white !important;
                border-radius: 20px !important;
                padding: 8px 25px !important;
                font-weight: bold !important;
            }
            .jupyter-widgets.widget-hbox {
                display: none !important;
            }
        </style>
        '''))

        term_count = sum([r['frequency'] for r in results])
        doc_count = len(results)

        header = widgets.HTML(
            f'''
            <div style="margin: 20px; color: white;">
                <h3>Results for: <i>{query}</i></h3>
                <p>Your search appeared <b>{term_count}</b> times in the following links<br>
                About <b>{doc_count}</b> results</p>
            </div>
            '''
        )

        results_html = []
        for result in results:
            results_html.append(f'''
                <div style="display: flex; justify-content: space-between;
                align-items: center; margin: 15px 0; padding: 10px; color: white;">
                    <a href="{result['page_id']}"
                    style="color: #3b82f6; font-size: 16px; text-decoration: none;">
                        {result['title']}
                    </a>
                    <span style="color: #718096;">Frequency: {result['frequency']}</span>
                </div>
            ''')

        results_widget = widgets.HTML('\n'.join(results_html))

        back_button = widgets.Button(
            description="Back to Search",
            button_style="warning",
            layout=widgets.Layout(width="150px")
        )

        def on_back_clicked(b):
            self.show_search_screen()

        back_button.on_click(on_back_clicked)
        display(widgets.VBox([header, results_widget, back_button]))

# Initialize the UI
firebase_url = "https://cloudproject-27420-default-rtdb.firebaseio.com/"
search_ui = SearchUI(firebase_url)
search_ui.show_search_screen()

HTML(value='\n        <style>\n            body {\n                background-color: #1a202c !important;\n    …

HTML(value='\n            <button class="admin-button">\n                Admin Page\n            </button>\n  …

HTML(value='\n        <div style="text-align: center; margin: 40px auto;">\n            <div style="width: 150…

VBox(children=(Text(value='', layout=Layout(width='400px'), placeholder='Search...'), Button(button_style='war…