In [37]:
import requests
import bs4
import pandas as pd
import random
import re
from collections import Counter
from difflib import get_close_matches
from pandas import DataFrame
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

In [2]:
USER_AGENT = [
    'Mozilla/5.0 (X11; Linux x86_64; rv:12.0) Gecko/20100101 Firefox/12.0',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:42.0) Gecko/20100101 Firefox/42.0'
]

user_agent = random.choice(USER_AGENT)

In [3]:
header = {'User-Agent': user_agent}

In [105]:
class GoogleSearchURL:
    def __init__(self, search_string: str) -> None:
        search_string = str(search_string)
        self.string = f"https://www.google.com/search?q={'+'.join(search_string.split(' '))}"

    def __repr__(self) -> str:
        return self.string

In [106]:
search = "3m 50 TAPE VINYL 49'x100ft 10 mil"
response = requests.get(GoogleSearchURL(search))

In [6]:
class Body:
    def __init__(self, content: bytes, skip_tags=None):
        self.soup = bs4.BeautifulSoup(content).body

        if skip_tags:
            for tag in skip_tags:
                for t in self.soup.find_all(tag):
                    t.decompose()

    def extract_text(self, join=None):
        texts = []
        def recursive_extract(tag):
            if not isinstance(tag, bs4.element.Tag):
                texts.append(tag.text)
                return

            for t in tag.children:
                recursive_extract(t)
        recursive_extract(self.soup)

        if join: texts = join.join(texts)
        return texts

In [7]:
class Links3M:
    link3m_start = "https://www.3m"

    def __init__(self, body: Body) -> None:
        counter = 0
        links3m = []
        for a in body.soup.find_all('a'):
            href = a['href']
            index = href.find(self.link3m_start)
            if index > 0:
                link = '/'.join(href[index:].split('/')[:-1]) + '/'
                links3m.append((link))
                counter += 1

        self.links = links3m
        self.__length = counter

    def __len__(self): return self.__length

    def __iter__(self):
        for link in self.links:
            yield link

    def __next__(self):
        return next(self.__iter__())

    def __getitem__(self, index):
        return self.links[index]


In [107]:
print(Links3M(Body(response.content)).links)
for link in Links3M(Body(response.content)).links:
    print(link)
    response3m = requests.get(link, headers={"User-Agent": "Mozilla/5.0 (X11; CrOS x86_64 12871.102.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.141 Safari/537.36"}, timeout=30)
    break

['https://www.3mindia.in/3M/en_IN/p/dc/v000060177/', 'https://www.3mindia.in/3M/en_IN/p/d/b00042191/', 'https://www.3m.com/3M/en_US/p/d/b00042191/', 'https://www.3m.com/3M/en_LB/p/d/v000182030/', 'https://www.3mindia.in/3M/en_IN/p/dc/v000060177/']
https://www.3mindia.in/3M/en_IN/p/dc/v000060177/


In [9]:
class CleanText3M:
    remove_tokens = ['\n', '\t', '\r', '©', '™', '®', '\xa0', '  ', '°', '(', ')']

    def __init__(self, text) -> None:
        if isinstance(text, str):
            self.text = self.clean(text)

        if isinstance(text, list):
            texts = []
            for t in text:
                t = self.clean(t)
                if t: texts.append(t)
            self.text = texts
    
    @staticmethod
    def clean(text):
        replace_dict = {token: '' for token in CleanText3M.remove_tokens}
        text = CleanText3M.replace_multiple(replace_dict, text)       
        text = CleanText3M.split_titled(text)
        return ' '.join([word for word in text.split(' ') if word])
        
    @staticmethod
    def replace_multiple(replace_dict, text):
        replace = dict((re.escape(k), v) for k, v in replace_dict.items()) 
        pattern = re.compile("|".join(replace.keys()))
        return pattern.sub(lambda m: replace[re.escape(m.group(0))], text)

    @staticmethod    
    def split_titled(text: str):
        clean_text = []
        for word in text.split(' '):
            if not word.isupper() and not word.isnumeric() and not word.islower():
                # word = ' '.join(re.findall('[A-Z][^A-Z]*', word)) # logic 1
                words = list(filter(None, re.split("([A-Z][^A-Z]*)", word))) # logic 2
                cleaned_words = []
                for w in words:
                    if not w[-1].isalnum() and w[-1] not in '.,?!%':
                        w = w[:-1]
                    
                    if w: cleaned_words.append(w)
                word = ' '.join(cleaned_words)
            clean_text.append(word)

        return ' '.join(clean_text)

In [92]:
cleaned_text = '. '.join(CleanText3M(Body(response3m.content, skip_tags=['img', 'table', 'a', 'script']).extract_text()).text)
cleaned_text

'3M in India. English EN. Save. Products. Industries. Brands. At 3M, we discover and innovate in nearly every industry to help solve problems around the world.. Command. Littmann. Nexcare. Post-it. Scotch Brite. Scotch. 3M Scotchrap Vinyl Corrosion Protection Tape 50, Unprinted, 1 in x 100 ft, Black, 48 Rolls Case, BULK. Main Picture. Item. 1. of. 1. 3M Scotchrap Vinyl Corrosion Protection Tape 50, Unprinted, 1 in x 100 ft, Black, 48 Rolls Case, BULK. 3M ID80008006621. Overview. Details. Specifications. Resources. 1of 1. Hover to zoom. Protects metal piping systems against corrosion. Pressure-sensitive, rubber adhesive ensures excellent tacking and provides optimized adherence. Tape measures 1 in x 100 ft x 10 mil. Comes in durable black PVC. More. Details. 3M Scotchrap Tape 50 is a premium-grade, 10-mil thick, all-weather, PVC backing tape. It withstands temperatures of -55 to 176 degrees F -48 to 80 degrees C. This high-tack adhesive tape offers resistance to corrosion, impact, abras

In [103]:
class SearchScore:
    def __init__(self, text: str, search: str) -> None:
        lower_text = text.lower()
        lower_text_replaced = CleanText3M.replace_multiple({'.': '', ',': ''}, lower_text)
        splits = lower_text_replaced.split(' ')
        
        search_words = search.lower().split(' ')
        result = {}
        for word in search_words:
            matches = get_close_matches(word, splits)
            if matches:
                match_counter = Counter(matches)
                result[word] = list(match_counter.keys())
            else: result[word] = []

        self.keywords = result

        result = {key: {k: [] for k in self.keywords[key]} for key in self.keywords.keys()}
        for sentence in lower_text.split('. '):
            for word in self.keywords.keys():
                keys = self.keywords[word]
                for key in keys:
                    if sentence.__contains__(key):
                        result[word][key].append(sentence)
        self.sentences = result

        W = []
        K = []
        S = []
        for word, word_value in self.sentences.items():
            for key, key_sentences in word_value.items():
                for sentence in key_sentences:
                    W.append(word)
                    K.append(key)
                    S.append(sentence)

        self.M = DataFrame({'search word': W, 'close match': K, 'sentence': S})
        self.matrix = DataFrame(self.M.groupby(['search word', 'close match', 'sentence']).count())
        

In [104]:
SearchScore(cleaned_text, CleanText3M(search).text).M

Unnamed: 0,search word,close match,sentence
0,3m,3m,3m in india
1,3m,3m,"at 3m, we discover and innovate in nearly ever..."
2,3m,3m,3m scotchrap vinyl corrosion protection tape 5...
3,3m,3m,3m scotchrap vinyl corrosion protection tape 5...
4,3m,3m,3m id80008006621
5,3m,3m,"3m scotchrap tape 50 is a premium-grade, 10-mi..."
6,3m,3m,3m scotchrap tape 50 corrosion protection tape...
7,3m,3m,3m 2021
8,3m,3m,the brands listed above are trademarks of 3m.
9,50,50,3m scotchrap vinyl corrosion protection tape 5...
