<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

# Q1

In [2]:
import requests
from bs4 import BeautifulSoup
from queue import Queue, Empty
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urljoin, urlparse


class MultiThreadScraper:

    def __init__(self, base_url):

        self.base_url = base_url
        self.root_url = '{}://{}'.format(urlparse(self.base_url).scheme, urlparse(self.base_url).netloc)
        self.pool = ThreadPoolExecutor(max_workers=20)
        self.scraped_pages = set([])
        self.to_crawl = Queue()
        self.to_crawl.put(self.base_url)

    def parse_links(self, html):
        soup = BeautifulSoup(html, 'html.parser')
        links = soup.find_all('a', href=True)
        for link in links:
            url = link['href']
            if url.startswith('/') or url.startswith(self.root_url):
                url = urljoin(self.root_url, url)
                if url not in self.scraped_pages:
                    self.to_crawl.put(url)

    def scrape_info(self, html):
        return

    def post_scrape_callback(self, res):
        result = res.result()
        if result and result.status_code == 200:
            self.parse_links(result.text)
            self.scrape_info(result.text)

    def scrape_page(self, url):
        try:
            res = requests.get(url, timeout=(3, 30))
            return res
        except requests.RequestException:
            return

    def run_scraper(self):
        while True:
            try:
                target_url = self.to_crawl.get(timeout=60)
                if target_url not in self.scraped_pages:
                    print("Scraping URL: {}".format(target_url))
                    self.scraped_pages.add(target_url)
                    job = self.pool.submit(self.scrape_page, target_url)
                    job.add_done_callback(self.post_scrape_callback)
            except Empty:
                return
            except Exception as e:
                print(e)
                continue
if __name__ == '__main__':
    s = MultiThreadScraper("https://www.w3schools.com/")
    s.run_scraper()

Scraping URL: https://www.w3schools.com/
Scraping URL: https://www.w3schools.com
Scraping URL: https://www.w3schools.com/html/default.asp
Scraping URL: https://www.w3schools.com/css/default.asp
Scraping URL: https://www.w3schools.com/bootstrap/bootstrap_ver.asp
Scraping URL: https://www.w3schools.com/w3css/default.asp
Scraping URL: https://www.w3schools.com/colors/default.asp
Scraping URL: https://www.w3schools.com/icons/default.asp
Scraping URL: https://www.w3schools.com/graphics/default.asp
Scraping URL: https://www.w3schools.com/howto/default.asp
Scraping URL: https://www.w3schools.com/sass/default.asp
Scraping URL: https://www.w3schools.com/js/default.asp
Scraping URL: https://www.w3schools.com/jquery/default.asp
Scraping URL: https://www.w3schools.com/react/default.asp
Scraping URL: https://www.w3schools.com/angular/default.asp
Scraping URL: https://www.w3schools.com/js/js_json_intro.asp
Scraping URL: https://www.w3schools.com/js/js_ajax_intro.asp
Scraping URL: https://www.w3schoo

Scraping URL: https://www.w3schools.com/howto/howto_js_tabs.asp
Scraping URL: https://www.w3schools.com/howto/howto_css_dropdown.asp
Scraping URL: https://www.w3schools.com/howto/howto_js_accordion.asp
Scraping URL: https://www.w3schools.com/howto/howto_js_sidenav.asp
Scraping URL: https://www.w3schools.com/howto/howto_js_topnav.asp
Scraping URL: https://www.w3schools.com/howto/howto_css_modals.asp
Scraping URL: https://www.w3schools.com/howto/howto_js_progressbar.asp
Scraping URL: https://www.w3schools.com/howto/howto_css_parallax.asp
Scraping URL: https://www.w3schools.com/howto/howto_css_login_form.asp
Scraping URL: https://www.w3schools.com/howto/howto_html_include.asp
Scraping URL: https://www.w3schools.com/howto/howto_google_maps.asp
Scraping URL: https://www.w3schools.com/howto/howto_js_rangeslider.asp
Scraping URL: https://www.w3schools.com/howto/howto_css_tooltip.asp
Scraping URL: https://www.w3schools.com/howto/howto_js_slideshow.asp
Scraping URL: https://www.w3schools.com/ho

Scraping URL: https://www.w3schools.com/colors/colors_picker.asp?colorhex=00008B
Scraping URL: https://www.w3schools.com/colors/color_tryit.asp?color=DarkCyan
Scraping URL: https://www.w3schools.com/colors/color_tryit.asp?hex=008B8B
Scraping URL: https://www.w3schools.com/colors/colors_mixer.asp?colorbottom=008B8B&colortop=FFFFFF
Scraping URL: https://www.w3schools.com/colors/colors_picker.asp?colorhex=008B8B
Scraping URL: https://www.w3schools.com/colors/color_tryit.asp?color=DarkGoldenRod
Scraping URL: https://www.w3schools.com/colors/color_tryit.asp?hex=B8860B
Scraping URL: https://www.w3schools.com/colors/colors_mixer.asp?colorbottom=B8860B&colortop=FFFFFF
Scraping URL: https://www.w3schools.com/colors/colors_picker.asp?colorhex=B8860B
Scraping URL: https://www.w3schools.com/colors/color_tryit.asp?color=DarkGray
Scraping URL: https://www.w3schools.com/colors/color_tryit.asp?hex=A9A9A9
Scraping URL: https://www.w3schools.com/colors/colors_mixer.asp?colorbottom=A9A9A9&colortop=FFFFFF

Scraping URL: https://www.w3schools.com/colors/color_tryit.asp?hex=F8F8FF
Scraping URL: https://www.w3schools.com/colors/colors_mixer.asp?colorbottom=F8F8FF&colortop=FFFFFF
Scraping URL: https://www.w3schools.com/colors/colors_picker.asp?colorhex=F8F8FF
Scraping URL: https://www.w3schools.com/colors/color_tryit.asp?color=Gold
Scraping URL: https://www.w3schools.com/colors/color_tryit.asp?hex=FFD700
Scraping URL: https://www.w3schools.com/colors/colors_mixer.asp?colorbottom=FFD700&colortop=FFFFFF
Scraping URL: https://www.w3schools.com/colors/colors_picker.asp?colorhex=FFD700
Scraping URL: https://www.w3schools.com/colors/color_tryit.asp?color=GoldenRod
Scraping URL: https://www.w3schools.com/colors/color_tryit.asp?hex=DAA520
Scraping URL: https://www.w3schools.com/colors/colors_mixer.asp?colorbottom=DAA520&colortop=FFFFFF
Scraping URL: https://www.w3schools.com/colors/colors_picker.asp?colorhex=DAA520
Scraping URL: https://www.w3schools.com/colors/color_tryit.asp?color=Gray
Scraping UR

Scraping URL: https://www.w3schools.com/colors/color_tryit.asp?color=LightYellow
Scraping URL: https://www.w3schools.com/colors/color_tryit.asp?hex=FFFFE0
Scraping URL: https://www.w3schools.com/colors/colors_mixer.asp?colorbottom=FFFFE0&colortop=FFFFFF
Scraping URL: https://www.w3schools.com/colors/colors_picker.asp?colorhex=FFFFE0
Scraping URL: https://www.w3schools.com/colors/color_tryit.asp?color=Lime
Scraping URL: https://www.w3schools.com/colors/color_tryit.asp?hex=00FF00
Scraping URL: https://www.w3schools.com/colors/colors_mixer.asp?colorbottom=00FF00&colortop=FFFFFF
Scraping URL: https://www.w3schools.com/colors/colors_picker.asp?colorhex=00FF00
Scraping URL: https://www.w3schools.com/colors/color_tryit.asp?color=LimeGreen
Scraping URL: https://www.w3schools.com/colors/color_tryit.asp?hex=32CD32
Scraping URL: https://www.w3schools.com/colors/colors_mixer.asp?colorbottom=32CD32&colortop=FFFFFF
Scraping URL: https://www.w3schools.com/colors/colors_picker.asp?colorhex=32CD32
Scra

Scraping URL: https://www.w3schools.com/colors/color_tryit.asp?color=Silver
Scraping URL: https://www.w3schools.com/colors/color_tryit.asp?hex=C0C0C0
Scraping URL: https://www.w3schools.com/colors/colors_mixer.asp?colorbottom=C0C0C0&colortop=FFFFFF
Scraping URL: https://www.w3schools.com/colors/colors_picker.asp?colorhex=C0C0C0
Scraping URL: https://www.w3schools.com/colors/color_tryit.asp?color=SkyBlue
Scraping URL: https://www.w3schools.com/colors/color_tryit.asp?hex=87CEEB
Scraping URL: https://www.w3schools.com/colors/colors_mixer.asp?colorbottom=87CEEB&colortop=FFFFFF
Scraping URL: https://www.w3schools.com/colors/colors_picker.asp?colorhex=87CEEB
Scraping URL: https://www.w3schools.com/colors/color_tryit.asp?color=SlateBlue
Scraping URL: https://www.w3schools.com/colors/color_tryit.asp?hex=6A5ACD
Scraping URL: https://www.w3schools.com/colors/colors_mixer.asp?colorbottom=6A5ACD&colortop=FFFFFF
Scraping URL: https://www.w3schools.com/colors/colors_picker.asp?colorhex=6A5ACD
Scrapi

KeyboardInterrupt: 

# Q2

In [21]:
import math
def tobin(x):
    xb=bin(x)
    x_bin=xb[2:]
    return x_bin
def todec(x):
    x=int(x)
    decimal=0 
    i=0
    n = 0
    while(x != 0): 
        dec = x % 10
        decimal = decimal + dec * pow(2, i) 
        x = x//10
        i += 1
    return decimal  
    
def deltaencode(x):
    log=math.log2(x)
    l1=1+math.floor(log) # 1+log2x
    l2=tobin(l1)
    l2=str(l2) #log converted to string
    lenl2=len(l2)
    ze=''
    for i in range(0,lenl2-1):
        ze=ze+'0'
    loge=ze+l2
    #print(loge)
    xxbin=tobin(x)
    bin2=xxbin[1:]
    xxbin=str(bin2) #no converted to bin
    #print(bin2)
    encoded=loge+xxbin
    return encoded
def deltadecode(x):
    q=0
    for i in range (0,len(x)):
        if x[i]=='0':
            q+=1
        else:
            break
    xstr=x[q+q+1:]
    xstr1='1'+xstr
    xint=int(xstr1)
    decoded=todec(xint)
    return decoded

def golombcoding(x,b):
    ze=''
    q=int(x/b)
    for i in range(0,q):
        ze=ze+'0'
    g1=ze+'1'
    #print(g1)
    r=x-q*b
    g2=tobin(r)
    #print(r)
    encoded=g1+g2
    return encoded
    

def golombdecoding(x,b):
    z=0
    for i in range (0,len(x)):
        if x[i]=='0':
            z+=1
        else:
            break
    q=z
    #print('q',q)
    r=x[z+1:]
    i=int(math.log2(b))
    d=int(2**(i+1)-b)
    #print('d',d)
    #print('i',i)
    r=r[0:i]
    #print('first r',r)
    
    rd=todec(r)
    #print('rd',rd)
    
    if d<rd:
        rnew=r+x[i]        
        rd=todec(rnew)
        print('second r',rd)
    #rd=todec(r)
    rn=(rd-d)*-1
    #print('rn',rn)
    x=q*b+rn+1
    print(x)
    return x
x1=deltaencode(37)
print(x1)
x2=deltadecode(x1)
print(x2)    
x1=golombcoding(35,11)
print(x1)
x2=str(x1)
#print(x2)
golombdecoding(x1,11)

0011000101
37
000110
37


37

# Q3

In [23]:
from collections import deque
from bs4 import BeautifulSoup
import sys
import urllib
from html.parser import HTMLParser
from urllib.request import urlopen
from urllib import parse
import re

In [24]:
url = "https://en.wikipedia.org/wiki/Web_mining"
urlf = urlopen(url)
soup = BeautifulSoup(urlf.read(),"lxml")

In [25]:
with open("d1.txt","w+") as f:
    for a in soup.find_all('p'):
        #print(a.getText())
        f.write(a.getText()+"\n")

In [26]:
url2 = "https://en.wikipedia.org/wiki/Data_mining"
urlf2 = urlopen(url2)
soup2 = BeautifulSoup(urlf2.read(),"lxml")

In [27]:
with open("d2.txt","w+") as f:
    for a in soup2.find_all('p'):
        # print(a.getText())
        f.write(a.getText()+"\n")

In [28]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
stop_words = set(stopwords.words('english'))

In [29]:
f1 = open("d1.txt","r").read()
f2 = open("d2.txt","r").read()
#print(f2)
rem_stop1 = word_tokenize(f1)
rem_stop2 = word_tokenize(f2)


In [5]:
# Hash Map

class HashMap:
        def __init__(self):
                self.size = 6
                self.map = [None] * self.size
		
        def _get_hash(self, key):
                hash = 0
                for char in str(key):
                        hash += ord(char)
                return hash % self.size
		
        def add(self, key, value):
                key_hash = self._get_hash(key)
                key_value = [key, value]
		
                if self.map[key_hash] is None:
                        self.map[key_hash] = list([key_value])
                        return True
                else:
                        for pair in self.map[key_hash]:
                                if pair[0] == key:
                                        pair[1] = value
                                        return True
                        self.map[key_hash].append(key_value)
                        return True
			
        def get(self, key):
                key_hash = self._get_hash(key)
                if self.map[key_hash] is not None:
                        for pair in self.map[key_hash]:
                                if pair[0] == key:
                                        return pair[1]
                return None
			
        def delete(self, key):
                key_hash = self._get_hash(key)
		
                if self.map[key_hash] is None:
                        return False
                for i in range (0, len(self.map[key_hash])):
                        if self.map[key_hash][i][0] == key:
                                self.map[key_hash].pop(i)
                                return True
                return False
	
        def keys(self):
                arr = []
                for i in range(0, len(self.map)):
                        if self.map[i]:
                                arr.append(self.map[i][0])
                return arr
			
        def print(self):
                print('---PHONEBOOK----')
                for item in self.map:
                        if item is not None:
                                print(str(item))
			
h = HashMap()
h.add('Bob', '567-8888')
h.add('Ming', '293-6753')
h.add('Ming', '333-8233')
h.add('Ankit', '293-8625')
h.add('Aditya', '852-6551')
h.add('Alicia', '632-4123')
h.add('Mike', '567-2188')
h.add('Aditya', '777-8888')
h.print()		
h.delete('Bob')
h.print()
print('Ming: ' + h.get('Ming'))
print(h.keys())

---PHONEBOOK----
[['Mike', '567-2188']]
[['Alicia', '632-4123']]
[['Aditya', '777-8888']]
[['Bob', '567-8888'], ['Ming', '333-8233'], ['Ankit', '293-8625']]
---PHONEBOOK----
[['Mike', '567-2188']]
[['Alicia', '632-4123']]
[['Aditya', '777-8888']]
[['Ming', '333-8233'], ['Ankit', '293-8625']]
Ming: 333-8233
[['Mike', '567-2188'], ['Alicia', '632-4123'], ['Aditya', '777-8888'], ['Ming', '333-8233']]


# Q4

In [32]:
import unicodedata
_WORD_MIN_LENGTH = 3
_STOP_WORDS = frozenset([
'a', 'about', 'above', 'above', 'across', 'after', 'afterwards', 'again', 
'against', 'all', 'almost', 'alone', 'along', 'already', 'also','although',
'always','am','among', 'amongst', 'amoungst', 'amount',  'an', 'and', 'another',
'any','anyhow','anyone','anything','anyway', 'anywhere', 'are', 'around', 'as',
'at', 'back','be','became', 'because','become','becomes', 'becoming', 'been', 
'before', 'beforehand', 'behind', 'being', 'below', 'beside', 'besides', 
'between', 'beyond', 'bill', 'both', 'bottom','but', 'by', 'call', 'can', 
'cannot', 'cant', 'co', 'con', 'could', 'couldnt', 'cry', 'de', 'describe', 
'detail', 'do', 'done', 'down', 'due', 'during', 'each', 'eg', 'eight', 
'either', 'eleven','else', 'elsewhere', 'empty', 'enough', 'etc', 'even', 
'ever', 'every', 'everyone', 'everything', 'everywhere', 'except', 'few', 
'fifteen', 'fify', 'fill', 'find', 'fire', 'first', 'five', 'for', 'former', 
'formerly', 'forty', 'found', 'four', 'from', 'front', 'full', 'further', 'get',
'give', 'go', 'had', 'has', 'hasnt', 'have', 'he', 'hence', 'her', 'here', 
'hereafter', 'hereby', 'herein', 'hereupon', 'hers', 'herself', 'him', 
'himself', 'his', 'how', 'however', 'hundred', 'ie', 'if', 'in', 'inc', 
'indeed', 'interest', 'into', 'is', 'it', 'its', 'itself', 'keep', 'last', 
'latter', 'latterly', 'least', 'less', 'ltd', 'made', 'many', 'may', 'me', 
'meanwhile', 'might', 'mill', 'mine', 'more', 'moreover', 'most', 'mostly', 
'move', 'much', 'must', 'my', 'myself', 'name', 'namely', 'neither', 'never', 
'nevertheless', 'next', 'nine', 'no', 'nobody', 'none', 'noone', 'nor', 'not', 
'nothing', 'now', 'nowhere', 'of', 'off', 'often', 'on', 'once', 'one', 'only',
'onto', 'or', 'other', 'others', 'otherwise', 'our', 'ours', 'ourselves', 'out',
'over', 'own','part', 'per', 'perhaps', 'please', 'put', 'rather', 're', 'same',
'see', 'seem', 'seemed', 'seeming', 'seems', 'serious', 'several', 'she', 
'should', 'show', 'side', 'since', 'sincere', 'six', 'sixty', 'so', 'some', 
'somehow', 'someone', 'something', 'sometime', 'sometimes', 'somewhere', 
'still', 'such', 'system', 'take', 'ten', 'than', 'that', 'the', 'their', 
'them', 'themselves', 'then', 'thence', 'there', 'thereafter', 'thereby', 
'therefore', 'therein', 'thereupon', 'these', 'they', 'thickv', 'thin', 'third',
'this', 'those', 'though', 'three', 'through', 'throughout', 'thru', 'thus', 
'to', 'together', 'too', 'top', 'toward', 'towards', 'twelve', 'twenty', 'two', 
'un', 'under', 'until', 'up', 'upon', 'us', 'very', 'via', 'was', 'we', 'well', 
'were', 'what', 'whatever', 'when', 'whence', 'whenever', 'where', 'whereafter',
'whereas', 'whereby', 'wherein', 'whereupon', 'wherever', 'whether', 'which', 
'while', 'whither', 'who', 'whoever', 'whole', 'whom', 'whose', 'why', 'will', 
'with', 'within', 'without', 'would', 'yet', 'you', 'your', 'yours', 'yourself',
'yourselves', 'the'])

def word_split(text):
    
    word_list = []
    wcurrent = []
    windex = None

    for i, c in enumerate(text):
        if c.isalnum():
            wcurrent.append(c)
            windex = i
        elif wcurrent:
            word = u''.join(wcurrent)
            word_list.append((windex - len(word) + 1, word))
            wcurrent = []

    if wcurrent:
        word = u''.join(wcurrent)
        word_list.append((windex - len(word) + 1, word))

    return word_list

def words_cleanup(words):
   
    cleaned_words = []
    for index, word in words:
        if len(word) < _WORD_MIN_LENGTH or word in _STOP_WORDS:
            continue
        cleaned_words.append((index, word))
    return cleaned_words

def words_normalize(words):
   
    normalized_words = []
    for index, word in words:
        wnormalized = word.lower()
        normalized_words.append((index, wnormalized))
    return normalized_words

def word_index(text):
    
    words = word_split(text)
    words = words_normalize(words)
    words = words_cleanup(words)
    return words

def inverted_index(text):
    
    inverted = {}

    for index, word in word_index(text):
        locations = inverted.setdefault(word, [])
        locations.append(index)

    return inverted

def inverted_index_add(inverted, doc_id, doc_index):
    
    for word, locations in doc_index.items():
        indices = inverted.setdefault(word, {})
        indices[doc_id] = locations
    return inverted

def search(inverted, query):
    
    words = [word for _, word in word_index(query) if word in inverted]
    results = [set(inverted[word].keys()) for word in words]
    return reduce(lambda x, y: x & y, results) if results else []

if __name__ == '__main__':
    doc1 = open("d1.txt","r").read()
    doc2 = open("d2.txt","r").read()



    # Build Inverted-Index for documents
    inverted = {}
    documents = {'id1':doc1, 'id2':doc2}
    for doc_id, text in documents.items():
        doc_index = inverted_index(text)
        inverted_index_add(inverted, doc_id, doc_index)

    # Print Inverted-Index
    for word, doc_locations in inverted.items():
        print (word, doc_locations)


web {'id1': [0, 97, 167, 476, 541, 554, 577, 653, 799, 840, 938, 950, 1034, 1133, 1228, 1353, 1397, 1452, 1470, 1493, 1516, 1622, 1684, 1754, 1804, 1815, 2022, 2112, 2249, 3311, 3701, 3959, 4166, 4340, 6713, 6802, 6837, 6858, 6911, 6960, 6983, 7091, 7231, 7371, 7703, 7856, 7889, 8057, 8081, 8818, 8877, 8901, 9822, 9883, 10201], 'id2': [15621]}
mining {'id1': [4, 38, 156, 490, 589, 667, 813, 964, 1078, 1269, 1401, 1462, 1482, 1507, 1526, 1560, 1825, 1993, 2032, 2120, 2259, 2627, 3321, 3711, 3969, 4148, 4176, 4350, 5582, 5791, 6376, 6727, 6872, 6925, 6974, 6995, 7009, 7754, 7893, 8093, 8827, 9657, 9701, 9826, 10046, 10228, 10352], 'id2': [5, 171, 429, 813, 945, 1309, 1504, 1749, 2008, 2035, 2310, 2569, 2691, 2885, 3101, 3602, 3922, 4061, 4169, 4233, 4582, 4678, 4857, 5083, 5298, 5387, 6201, 6863, 6969, 7158, 7325, 7446, 7522, 7865, 7996, 8050, 8739, 8817, 8880, 9085, 9250, 9815, 10127, 10390, 10412, 10652, 10740, 10785, 11185, 11250, 11345, 11520, 11608, 11708, 11789, 11897, 12028, 12229

negative {'id2': [3896]}
appeared {'id2': [3929]}
1990 {'id2': [3945]}
connotations {'id2': [4001]}
short {'id2': [4021]}
1980s {'id2': [4035]}
phrase {'id2': [4044]}
trademarked {'id2': [4098]}
hnc {'id2': [4113]}
san {'id2': [4120]}
diego {'id2': [4124]}
pitch {'id2': [4148]}
workstation {'id2': [4176]}
consequently {'id2': [4205]}
turned {'id2': [4218]}
archaeology {'id2': [4271]}
harvesting {'id2': [4296]}
gregory {'id2': [4358]}
piatetsky {'id2': [4366]}
shapiro {'id2': [4376]}
coined {'id2': [4384]}
workshop {'id2': [4449]}
topic {'id2': [4470]}
1989 {'id2': [4481, 10158]}
popular {'id2': [4513, 4601]}
press {'id2': [4629]}
communities {'id2': [4635]}
currently {'id2': [4652, 14137]}
interchangeably {'id2': [4718]}
academic {'id2': [4743, 10301]}
major {'id2': [4767]}
forums {'id2': [4773]}
started {'id2': [4793, 4901, 5156]}
1995 {'id2': [4804]}
international {'id2': [4824, 5219, 10198, 10539]}
conference {'id2': [4838, 5233, 5279, 10212, 10505, 10524, 10553]}
montreal {'id2': [