<a href="https://colab.research.google.com/github/mahersalman/Introducation_To_Cloud_Computing/blob/main/Tutorials/Tutorial6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install requests beautifulsoup4



# **1. Building Index**

## 1.1 Fetching page and returning its soup

In [2]:

import requests
from bs4 import BeautifulSoup

def fetch_page(url, timeout=10):
  """Download `url` and return its HTML parsed with BeautifulSoup.

  Args:
    url: Address of the page to fetch.
    timeout: Seconds to wait for the server before giving up. Without a
      timeout, `requests.get` may block indefinitely on an unresponsive host.

  Returns:
    The parsed soup when the server answers with HTTP 200, otherwise None
    (non-200 status, connection error, timeout or malformed URL).
  """
  try:
    response = requests.get(url, timeout=timeout)
  except requests.RequestException:
    # Report network-level failures the same way as a bad status code,
    # so callers only ever need the `is None` check.
    return None
  if response.status_code != 200:
    return None
  return BeautifulSoup(response.text, 'html.parser')

## 1.2 Return Index of a certain soup

In [3]:
import re

def index_words(soup):
  """Count how often each word occurs in the text of `soup`.

  Args:
    soup: Object exposing `get_text()`, e.g. the result of `fetch_page`.

  Returns:
    dict mapping each lower-cased word to its number of occurrences.
  """
  text = soup.get_text()  # text content with the tags stripped
  index = {}
  # \w+ keeps runs of letters, digits and underscores; punctuation splits words.
  for token in re.findall(r'\w+', text):
    key = token.lower()
    index[key] = index.get(key, 0) + 1
  return index


## 1.3 Removing stop words from an Index

In [29]:
def remove_stop_words(index, stop_words=None):
  """Delete stop words from `index` in place and return it.

  Args:
    index: dict mapping word -> frequency. It is modified in place.
    stop_words: Optional iterable of words to drop. When omitted, a small
      built-in list of English stop words is used.

  Returns:
    The same `index` object, without the stop-word keys.
  """
  if stop_words is None:
    stop_words = {'a', 'an', 'the', 'and', 'or','in', 'on', 'at','to','be','of','that'}
  for stop_word in stop_words:
    # pop() with a default removes the key if present and ignores it otherwise.
    index.pop(stop_word, None)
  return index


## 1.4 Aggregate similar words (words with the same stem) in the Index

In [5]:
from nltk.stem import PorterStemmer

def apply_stemming(index):
  """Merge the counts of words that share the same Porter stem.

  Args:
    index: dict mapping word -> frequency.

  Returns:
    A new dict mapping each stem to the summed frequency of all words
    that reduce to it. `index` itself is left untouched.
  """
  porter = PorterStemmer()
  merged = {}
  for term, freq in index.items():
    root = porter.stem(term)
    merged[root] = merged.get(root, 0) + freq
  return merged


# **2. Handling Search Query**

## 2.1 Counting, for each word in the query, its number of appearances in the Index

In [6]:
def search(query, index):
  """Look up every word of `query` in a stemmed `index`.

  The query is lower-cased, split into \\w+ tokens and stemmed, so it is
  matched against the index in the form `apply_stemming` produces.

  Args:
    query: Free-text search string.
    index: dict mapping stem -> frequency.

  Returns:
    dict mapping each query stem that occurs in `index` to its frequency;
    stems that are not in the index are omitted.
  """
  porter = PorterStemmer()
  stems = (porter.stem(token) for token in re.findall(r'\w+', query.lower()))
  return {stem: index[stem] for stem in stems if stem in index}


# 3. Search engine for a single page (usage example method)
i. creates index for the page

ii. returns a dictionary mapping each (stemmed) query word found in the index to its frequency

In [30]:
def search_engine(url, query):
  """Index a single page and look up the words of `query` in it.

  Args:
    url: Page to fetch and index.
    query: Free-text search string.

  Returns:
    The dict produced by `search` (stem -> frequency), or None when the
    page could not be fetched.
  """
  page = fetch_page(url)
  if page is not None:
    # Pipeline: raw word counts -> drop stop words -> merge by stem.
    page_index = apply_stemming(remove_stop_words(index_words(page)))
    return search(query, page_index)
  return None


Example of using the method search_engine

In [21]:
# url = 'https://en.wikipedia.org/wiki/Bird'
# query = 'bird'
# results = search_engine(url, query)
# print(results)


{'bird': 568}


In [22]:
# url = 'https://en.wikipedia.org/wiki/Bird'
# query = 'birds wings'
# results = search_engine(url, query)
# print(results)
# rank=1
# for word, count in results.items():
#   rank = rank*1/count
# rank = 1-rank
# print(rank)



{'bird': 568, 'wing': 25}
0.9999295774647887


In [23]:
# url = 'https://en.wikipedia.org/wiki/Bird'
# query = 'owls'
# results = search_engine(url, query)
# print(results)
# rank=1
# for word, count in results.items():
#   rank = rank*1/count
# rank = 1-rank
# print(rank)


{'owl': 13}
0.9230769230769231


In [24]:
# url = 'https://w3.braude.ac.il/?lang=en'
# query = 'Industry'
# results = search_engine(url, query)
# print(results)
# rank=1
# for word, count in results.items():
#   rank = rank*1/count
# rank = 1-rank
# print(rank)

{'industri': 8}
0.875


In [25]:
# url = 'https://w3.braude.ac.il/?lang=en'
# query = 'Braude college'
# results = search_engine(url, query)
# print(results)
# rank=1
# for word, count in results.items():
#   rank = rank*1/count
# rank = 1-rank
# print(rank)

{'braud': 13, 'colleg': 8}
0.9903846153846154


In [26]:
# url = 'https://w3.braude.ac.il/?lang=en'
# query = 'Galilee center'
# results = search_engine(url, query)
# print(results)
# rank=1
# for word, count in results.items():
#   rank = rank*1/count
# rank = 1-rank
# print(rank)

{'galile': 15, 'center': 4}
0.9833333333333333


# **Building an Index for OnShape Glossary**

In [14]:
# Currently unused: hand-picked set of CAD-related terms.
words = {
  "point", "circle", "import", "export", "rectangle", "arc", "triangle",
  "hole", "surface", "sketch", "feature", "part", "material", "measure",
  "tool", "update", "edit", "collaborate", "studio", "tab", "draw",
  "geometry", "extrude", "assemble", "plan", "keyboard", "shortcut",
}

In [31]:
def build_index(url):
  """Fetch `url` and build its stemmed word-frequency index.

  Args:
    url: Page to fetch and index.

  Returns:
    dict mapping stem -> frequency with stop words removed, or None when
    the page could not be fetched.
  """
  page = fetch_page(url)
  if page is None:
    return None
  raw_counts = index_words(page)
  without_stop_words = remove_stop_words(raw_counts)
  return apply_stemming(without_stop_words)

In [34]:
# Stemmed word-frequency index of the Onshape glossary page (None if the page could not be fetched).
glossaryIndex=build_index("https://cad.onshape.com/help/Content/Glossary/glossary.htm?tocpath=_____19")

In [33]:
# Print every glossary stem that occurs more than 25 times.
# Reads the notebook-level `glossaryIndex`: `index1` is only a local variable
# inside build_index(), so it is undefined here on a fresh kernel.
for word, frequency in glossaryIndex.items():
  if frequency > 25:
    print(word, frequency)

account 33
set 33
all 644
context 676
option 86
document 164
sketch 220
part 524
studio 369
assembl 279
draw 226
render 159
plan 626
type 656
profession 145
enterpris 166
your 65
thi 66
point 74
one 51
tool 130
for 136
creat 174
sheet 70
geometri 37
which 55
are 112
view 151
model 63
between 40
two 46
keyboard 631
shortcut 631
n 505
see 638
also 648
use 139
dimens 49
allow 67
you 199
place 29
ani 50
enabl 55
by 82
defin 32
along 26
it 57
featur 118
materi 27
s 52
appear 76
submenu 30
open 61
display 53
onshap 98
menu 98
simul 149
instanc 47
load 40
dialog 32
shift 49
show 47
have 29
list 58
can 89
user 43
tab 38
is 162
not 38
add 26
when 91
select 185
color 47
face 94
panel 77
from 66
where 41
plane 67
custom 27
version 36
new 46
current 40
into 26
with 84
line 42
rotat 30
scale 29
surfac 75
metal 41
edit 40
properti 52
camera 40
more 35
insert 27
as 99
entiti 47
axi 34
mate 109
edg 86
connect 45
curv 55
workspac 38
tabl 52
other 39
exist 28
environ 36
hide 33


# Connecting to Firebase

In [18]:
pip install firebase

Collecting firebase
  Downloading firebase-4.0.1-py3-none-any.whl (12 kB)
Installing collected packages: firebase
Successfully installed firebase-4.0.1


In [19]:
pip install firebase-admin



In [37]:
#DO NOT RUN THIS AGAIN, CONSIDER DELETING FROM DB FIRST IF YOU WANT TO RUN AGAIN
#Added to DB most frequent words with frequency above 25.
# from firebase import firebase
# FBconn = firebase.FirebaseApplication('https://cloud-tut6-2a99a-default-rtdb.firebaseio.com/',None)

# for word,frequency in glossaryIndex.items():
#   if frequency>25:
#     data_to_upload = {
#         'term' : word,
#         'frequency' : frequency
#     }
#     result = FBconn.post('/onshapeGlossary/',data_to_upload)
#     print(result)


{'name': '-O1bc_ukOHW988QGPpwk'}
{'name': '-O1bc_whLs0jov94sCNl'}
{'name': '-O1bc_yeu2TqDB2u4c2g'}
{'name': '-O1bca-atBnh8-3CQgz7'}
{'name': '-O1bca5NSfzq_NDv7WqP'}
{'name': '-O1bca7c7OfawJzyvsDB'}
{'name': '-O1bca9ppa_rGs27llnQ'}
{'name': '-O1bcaC1ITtu1ieduNgw'}
{'name': '-O1bcaE9MMR9l7zcmxJz'}
{'name': '-O1bcaGFQJkVkBlqRaFb'}
{'name': '-O1bcaIJFJhvgUWOCUTD'}
{'name': '-O1bcaKVKpoIsVO7iL_j'}
{'name': '-O1bcaMiRp73XFysvxtI'}
{'name': '-O1bcaOsA__XqIR_oZh3'}
{'name': '-O1bcaQxXzvdJmYbo6sw'}
{'name': '-O1bcaTCq5BBRoKT5omC'}
{'name': '-O1bcaVTmG5939UZcjEs'}
{'name': '-O1bcaXg0YMNUgg9C9vt'}
{'name': '-O1bcaZzR7ytjkB_8CRY'}
{'name': '-O1bcab0J_-oXm3BtbRQ'}
{'name': '-O1bcacwhJhPdsfYHUx7'}
{'name': '-O1bcaewju2mVHazYek8'}
{'name': '-O1bcagzhq0iBNyyRrUO'}
{'name': '-O1bcaix-ui8PTIg-i2X'}
{'name': '-O1bcalImwvL21RkXuzC'}
{'name': '-O1bcanaGSs6uNss1Q9T'}
{'name': '-O1bcapw5kscWr8Btshg'}
{'name': '-O1bcarwNBsAjyfqGLMi'}
{'name': '-O1bcau0CMwT-oKXXNpG'}
{'name': '-O1bcaw6jbiIlyR1rOzP'}
{'name': '