### **Final crawler implementation version 1.0**

---



**NOTES:**

2.  When doing 1, take the final URL (after any redirects) and process its content, not the first one; also check that it is a valid URL - HALF THERE.
3. Repeat the process until the DB hits 700 entry rows - SOLVED.
4. Add input fields for the constants - OPTIONAL.
5. Unit tests are intentionally not provided, since they would be overkill for our audience.


In [None]:
!pip install selenium webdriver_manager
!pip install tqdm
!pip install polars

Collecting selenium
  Downloading selenium-4.23.1-py3-none-any.whl.metadata (7.1 kB)
Collecting webdriver_manager
  Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl.metadata (12 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.26.2-py3-none-any.whl.metadata (8.6 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.11.1-py3-none-any.whl.metadata (4.7 kB)
Collecting python-dotenv (from webdriver_manager)
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Collecting h11<1,>=0.9.0 (from wsproto>=0.14->trio-websocket~=0.9->selenium)
  Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Downloading selenium-4.23.1-py3-none-any.whl (9.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
from google.colab import drive
import sqlite3
import pandas as pd
import time
import re
import random
import logging
from urllib.parse import urljoin, urlparse, parse_qs
from collections import deque
import requests
from tqdm import tqdm
from abc import ABC, abstractmethod
import datetime


# Mount Google Drive; the SQLite database path defined below lives under this mount point
drive.mount('/content/drive')

# Define constants
MIN_REQUEST_DELAY = 1  # lower clamp for WebCrawler.current_delay (value is passed to time.sleep, i.e. seconds)
MAX_REQUEST_DELAY = 30  # upper clamp applied by WebCrawler.adjust_delay
MAX_PAGES = 700  # loop bound and progress-bar total in WebCrawler.crawl
SAVE_INTERVAL = 10  # WebCrawler.crawl calls save_data() every SAVE_INTERVAL processed pages
MAX_RETRIES = 3  # NOTE(review): not referenced anywhere in the visible code
MAX_REDIRECTS = 2  # redirect-hop limit checked in WebCrawler.extract_page_data

# User Agents for realistic browsing (one is picked at random for every request)
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Firefox/52.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:53.0) Gecko/20100101 Firefox/53.0',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.110 Safari/537.36'
]

# Initialize logging (DEBUG level, so every logging call in this notebook is emitted)
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')

# SQLite database file and table shared by all the DB helper functions below
db_path = '/content/drive/My Drive/db_data.db'
table_name = 'crawled_data'

def create_db_and_table_if_not_exists():
    """Create the crawl table in the SQLite database if it does not exist yet.

    Reads the module-level ``db_path`` and ``table_name``. The table has the
    columns ``URL`` (TEXT, UNIQUE), ``Status_Code`` (INTEGER) and ``Content``
    (TEXT). ``sqlite3.connect`` creates the database file when it is missing.
    """
    conn = sqlite3.connect(db_path)
    try:
        conn.execute(
            f"CREATE TABLE IF NOT EXISTS {table_name} "
            "(URL TEXT UNIQUE, Status_Code INTEGER, Content TEXT)"
        )
        conn.commit()
    finally:
        # Release the file handle even when the DDL statement fails.
        conn.close()

def get_urls():
    """Return the already-stored URLs and the most recently inserted one.

    Reads the module-level ``db_path`` and ``table_name``.

    Returns:
        A ``(urls, last_url)`` tuple: ``urls`` is a set of every URL in the
        table and ``last_url`` is the URL of the row with the highest rowid,
        or ``None`` when the table is empty.
    """
    conn = sqlite3.connect(db_path)
    try:
        # Without ORDER BY the row order is unspecified: SQLite is free to
        # answer this query from the UNIQUE index on URL, which yields URLs in
        # sorted order rather than insertion order. Ordering by rowid makes
        # "last" reliably mean "most recently inserted".
        rows = conn.execute(
            f"SELECT URL FROM {table_name} ORDER BY rowid"
        ).fetchall()
    finally:
        conn.close()

    if not rows:
        return set(), None
    return {row[0] for row in rows}, rows[-1][0]

def write_to_db(df):
    """Append the rows of ``df`` to the crawl table.

    Reads the module-level ``db_path`` and ``table_name``.

    Args:
        df: pandas DataFrame whose columns match the table's columns.

    Raises:
        Whatever ``DataFrame.to_sql`` raises, e.g. when a row violates the
        UNIQUE constraint on ``URL``.
    """
    if df.empty:
        # Nothing to insert; also avoids asking pandas to create a table from
        # a frame that has no columns.
        return

    conn = sqlite3.connect(db_path)
    try:
        df.to_sql(table_name, conn, if_exists='append', index=False)
        conn.commit()
    finally:
        # Close the connection even when the insert fails.
        conn.close()

def count_rows_in_db():
    """Return the number of rows currently stored in the crawl table.

    Reads the module-level ``db_path`` and ``table_name``.
    """
    conn = sqlite3.connect(db_path)
    try:
        (row_count,) = conn.execute(
            f"SELECT COUNT(*) FROM {table_name}"
        ).fetchone()
    finally:
        # Close the connection even when the query fails.
        conn.close()
    return row_count


class CrawlingStrategy(ABC):
    """Abstract interface for the container that holds pending crawl entries.

    Concrete subclasses decide in which order entries are handed back.
    """

    @abstractmethod
    def add_links(self, links):
        """Add every entry of the iterable ``links`` to the pending entries."""

    @abstractmethod
    def get_next(self):
        """Remove and return the next pending entry."""

    @abstractmethod
    def has_next(self):
        """Return whether at least one entry is still pending."""

    @abstractmethod
    def count(self):
        """Return the number of pending entries."""

class BFSCrawlingStrategy(CrawlingStrategy):
    """Breadth-first strategy: entries come back in the order they were added."""

    def __init__(self, start_url):
        """Start with ``start_url`` as the only pending entry."""
        self.queue = deque()
        self.queue.append(start_url)

    def add_links(self, links):
        """Append every entry of ``links`` to the right end of the queue."""
        self.queue += links

    def get_next(self):
        """Remove and return the oldest entry (left end of the queue)."""
        oldest = self.queue.popleft()
        return oldest

    def has_next(self):
        """Return True while the queue is not empty."""
        return bool(self.queue)

    def count(self):
        """Return the number of queued entries."""
        pending = len(self.queue)
        return pending

class DFSCrawlingStrategy(CrawlingStrategy):
    """Depth-first strategy: the most recently added entry comes back first."""

    def __init__(self, start_url):
        """Start with ``start_url`` as the only pending entry."""
        self.stack = []
        self.stack.append(start_url)

    def add_links(self, links):
        """Push every entry of ``links`` onto the top of the stack."""
        self.stack += links

    def get_next(self):
        """Remove and return the entry on top of the stack."""
        newest = self.stack.pop(-1)
        return newest

    def has_next(self):
        """Return True while the stack is not empty."""
        return bool(self.stack)

    def count(self):
        """Return the number of stacked entries."""
        pending = len(self.stack)
        return pending


class WebCrawler:
    """Crawl pages with a pluggable strategy and store them in SQLite.

    Pending entries handed to and received from the crawling strategy are
    ``[url, num_redirects]`` pairs, where ``num_redirects`` counts the
    redirect hops that led to ``url``.

    Relies on the module-level constants (``MIN_REQUEST_DELAY``,
    ``MAX_REQUEST_DELAY``, ``MAX_PAGES``, ``SAVE_INTERVAL``, ``MAX_REDIRECTS``,
    ``USER_AGENTS``) and the DB helpers ``get_urls``, ``write_to_db`` and
    ``count_rows_in_db``.
    """

    # Patterns are matched against the URL *path*, so a query string such as
    # "file.pdf?download=1" cannot hide the extension; extensions are matched
    # case-insensitively.
    _SKIPPED_EXTENSIONS = re.compile(
        r'\.(jpg|jpeg|png|gif|pdf|doc|xls|zip|rar|mp3|mp4)$', re.IGNORECASE
    )
    _USER_PROFILE_PATH = re.compile(r'/user/[a-zA-Z0-9_-]+/?$')
    _HREF = re.compile(r'href=["\'](.*?)["\']')
    # HTTP status codes that carry a Location header to follow.
    _REDIRECT_CODES = frozenset({301, 302, 303, 307, 308})

    def __init__(self, crawling_strategy):
        """Load the stored URLs and seed the strategy when it is empty.

        Args:
            crawling_strategy: object implementing the CrawlingStrategy
                interface; it may already contain a start entry.
        """
        self.visited_urls, last_url = get_urls()
        self.start_url = last_url if last_url else 'https://kalicube.com/learning-spaces/'
        self.crawling_strategy = crawling_strategy
        if not self.crawling_strategy.has_next():
            # Entries must be [url, num_redirects] pairs; a bare string would
            # later be indexed character by character.
            self.crawling_strategy.add_links([[self.start_url, 0]])
        self.data = []
        self.current_delay = MIN_REQUEST_DELAY
        self.pages_crawled = 0
        # URLs already requested during this run. Kept separate from
        # visited_urls because a resumed crawl must re-fetch its start URL
        # (which is already stored) to rediscover links.
        self._fetched_urls = set()

    def is_valid_url(self, url):
        """Return True when ``url`` is a page this crawler should follow.

        A URL is rejected when its scheme is not http/https, when it contains
        a fragment marker or a twitter.com address, when its path does not
        contain ``/learning-spaces/``, when its path ends in a skipped file
        extension or a ``/user/<name>`` profile, or when it carries any
        ``utm_*`` query parameter.
        """
        parsed_url = urlparse(url)
        if parsed_url.scheme not in ('http', 'https'):
            return False
        if '#' in url or 'https://twitter.com' in url:
            return False

        path = parsed_url.path
        # REPLACE /learning-spaces/ with your website directory!
        if "/learning-spaces/" not in path:
            return False
        if self._SKIPPED_EXTENSIONS.search(path) or self._USER_PROFILE_PATH.search(path):
            return False

        # keep_blank_values so that "utm_source=" (empty value) is detected too.
        query_params = parse_qs(parsed_url.query, keep_blank_values=True)
        return not any(param.startswith('utm_') for param in query_params)

    def extract_page_data(self, url_data):
        """Fetch one entry and return the follow-up entries it produces.

        Args:
            url_data: ``[url, num_redirects]`` pair.

        Returns:
            A list of ``[url, num_redirects]`` pairs: the valid links found on
            a 200 page, the single redirect target for a redirect response, or
            an empty list for any other status, a failed request, an exhausted
            redirect budget or an invalid redirect target.

        Side effects:
            Appends the page to ``self.data`` when the response is 200 and the
            URL has not been stored before; updates ``self.current_delay``.
        """
        url, num_redirects = url_data[0], url_data[1]
        headers = {'User-Agent': random.choice(USER_AGENTS)}
        try:
            # Redirects are followed manually (one frontier entry per hop) so
            # that the URL stored with the content is the final one.
            response = requests.get(url, headers=headers, timeout=10, allow_redirects=False)
        except requests.RequestException as exc:
            # A single unreachable page must not abort the whole crawl.
            logging.warning(f"Request failed for {url}: {exc}")
            return []

        self.adjust_delay(response.elapsed.total_seconds())

        if response.status_code == 200:
            content = response.text
            if url not in self.visited_urls:
                self.data.append({'URL': url, 'Status_Code': response.status_code, 'Content': content})
                logging.info(f"Crawled: {url} with status code {response.status_code}")
            links = {urljoin(url, href) for href in self._HREF.findall(content)}
            return [[link, 0] for link in links if self.is_valid_url(link)]

        if response.status_code in self._REDIRECT_CODES:
            if num_redirects >= MAX_REDIRECTS:
                logging.info("Reached max redirects. Stopping...")
                return []
            location = response.headers.get('Location')
            if not location:
                return []
            # urljoin leaves absolute targets untouched and resolves relative ones.
            target = urljoin(url, location)
            if not self.is_valid_url(target):
                logging.info(f"Ignoring redirect to out-of-scope URL: {target}")
                return []
            logging.info(f"Redirected to: {target}")
            return [[target, num_redirects + 1]]

        return []

    def adjust_delay(self, response_time):
        """Adapt the delay between requests to the server's response time.

        Args:
            response_time: duration of the last request in seconds. Below 2
                the delay shrinks by 10% (not below MIN_REQUEST_DELAY); above
                5 it grows by 10% (not above MAX_REQUEST_DELAY).
        """
        if response_time < 2:
            self.current_delay = max(MIN_REQUEST_DELAY, self.current_delay * 0.9)
        elif response_time > 5:
            self.current_delay = min(MAX_REQUEST_DELAY, self.current_delay * 1.1)

    def crawl(self):
        """Process pending entries until MAX_PAGES rows are stored or none remain.

        Every SAVE_INTERVAL processed entries the collected pages are written
        to the database. When MAX_PAGES entries have been processed but the
        database still holds fewer than MAX_PAGES rows (redirects and failed
        requests do not produce rows), the counter is reset and crawling
        continues.
        """
        with tqdm(total=MAX_PAGES, desc="Crawling progress", unit="page") as pbar:
            while self.pages_crawled < MAX_PAGES and self.crawling_strategy.has_next():
                current_url_data = self.crawling_strategy.get_next()
                current_url = current_url_data[0]
                if current_url in self._fetched_urls:
                    # The same link is usually discovered on many pages; do
                    # not request it again.
                    continue
                self._fetched_urls.add(current_url)

                print(f"Current url data e {current_url_data}")
                logging.info(f"Currently processing: {current_url}")
                new_links = self.extract_page_data(current_url_data)
                self.crawling_strategy.add_links([
                    link for link in new_links
                    if link[0] not in self.visited_urls and link[0] not in self._fetched_urls
                ])
                self.visited_urls.add(current_url)
                self.pages_crawled += 1
                pbar.update(1)
                time.sleep(self.current_delay)

                if self.pages_crawled % SAVE_INTERVAL == 0:
                    self.save_data()

                if self.pages_crawled >= MAX_PAGES:
                    # Flush first so the row count below includes pending pages.
                    self.save_data()
                    row_count = count_rows_in_db()
                    logging.info(f"Processed entries: {self.pages_crawled}, Rows in DB: {row_count}")
                    if row_count < MAX_PAGES:
                        logging.info(f"Less than {MAX_PAGES} rows in the database, resetting pages_crawled.")
                        self.pages_crawled = 0  # Reset pages_crawled

            if self.data:
                self.save_data(final=True)

    def save_data(self, final=False):
        """Write the collected pages to the database and clear the buffer.

        Args:
            final: True for the last flush at the end of a crawl; only affects
                logging.
        """
        if not self.data:
            return
        df = pd.DataFrame(self.data)
        write_to_db(df)
        self.data = []  # Reset data to avoid duplication
        if final:
            logging.info("Final batch saved.")

def main():
    """Ask for a crawling strategy, then run the crawler.

    The crawl starts from the last URL returned by ``get_urls()`` or, when
    the table is empty, from the default learning-spaces page. Any answer
    other than ``BFS`` (case-insensitive, surrounding whitespace ignored)
    selects the depth-first strategy.
    """
    create_db_and_table_if_not_exists()
    choice = input("Enter crawling strategy (BFS/DFS): ").strip().upper()
    _stored_urls, resume_url = get_urls()
    # Frontier entries are [url, num_redirects] pairs.
    first_entry = [resume_url or 'https://kalicube.com/learning-spaces/', 0]
    strategy_cls = BFSCrawlingStrategy if choice == 'BFS' else DFSCrawlingStrategy
    WebCrawler(strategy_cls(first_entry)).crawl()


if __name__ == "__main__":
    main()


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Enter crawling strategy (BFS/DFS): BFS


Crawling progress:   0%|          | 0/700 [00:00<?, ?page/s]

Current url data e ['https://kalicube.com/learning-spaces/', 0]
url data e ['https://kalicube.com/learning-spaces/', 0], url e https://kalicube.com/learning-spaces/, a num redirects e 0


Crawling progress:   0%|          | 1/700 [00:00<04:16,  2.73page/s]

Current url data e ['https://kalicube.com/learning-spaces/knowledge-graph-update/', 0]
url data e ['https://kalicube.com/learning-spaces/knowledge-graph-update/', 0], url e https://kalicube.com/learning-spaces/knowledge-graph-update/, a num redirects e 0


Crawling progress:   0%|          | 2/700 [00:02<16:38,  1.43s/page]

Current url data e ['https://kalicube.com/learning-spaces/faq/brand-serps/how-does-the-kalicube-process-work/', 0]
url data e ['https://kalicube.com/learning-spaces/faq/brand-serps/how-does-the-kalicube-process-work/', 0], url e https://kalicube.com/learning-spaces/faq/brand-serps/how-does-the-kalicube-process-work/, a num redirects e 0


Crawling progress:   0%|          | 3/700 [00:04<21:39,  1.86s/page]

Current url data e ['https://kalicube.com/learning-spaces/faq/seo-glossary/how-kalicube-implements-the-kalicube-process/', 0]
url data e ['https://kalicube.com/learning-spaces/faq/seo-glossary/how-kalicube-implements-the-kalicube-process/', 0], url e https://kalicube.com/learning-spaces/faq/seo-glossary/how-kalicube-implements-the-kalicube-process/, a num redirects e 0


Crawling progress:   1%|          | 4/700 [00:06<19:35,  1.69s/page]

Current url data e ['https://kalicube.com/learning-spaces/knowledge-nuggets/', 0]
url data e ['https://kalicube.com/learning-spaces/knowledge-nuggets/', 0], url e https://kalicube.com/learning-spaces/knowledge-nuggets/, a num redirects e 0


Crawling progress:   1%|          | 5/700 [00:08<21:26,  1.85s/page]

Current url data e ['https://kalicube.com/learning-spaces/knowledge-nuggets/entity-seo/how-does-consistency-contribute-to-machine-understandability-virtual-antics-podcast/', 0]
url data e ['https://kalicube.com/learning-spaces/knowledge-nuggets/entity-seo/how-does-consistency-contribute-to-machine-understandability-virtual-antics-podcast/', 0], url e https://kalicube.com/learning-spaces/knowledge-nuggets/entity-seo/how-does-consistency-contribute-to-machine-understandability-virtual-antics-podcast/, a num redirects e 0


Crawling progress:   1%|          | 6/700 [00:10<22:34,  1.95s/page]

Current url data e ['https://kalicube.com/learning-spaces/faq/brand-serps/', 0]
url data e ['https://kalicube.com/learning-spaces/faq/brand-serps/', 0], url e https://kalicube.com/learning-spaces/faq/brand-serps/, a num redirects e 0


Crawling progress:   1%|          | 7/700 [00:12<23:43,  2.05s/page]

Current url data e ['https://kalicube.com/learning-spaces/faq/knowledge-panels/', 0]
url data e ['https://kalicube.com/learning-spaces/faq/knowledge-panels/', 0], url e https://kalicube.com/learning-spaces/faq/knowledge-panels/, a num redirects e 0


Crawling progress:   1%|          | 8/700 [00:14<23:28,  2.04s/page]

Current url data e ['https://kalicube.com/learning-spaces/knowledge-nuggets/brand-seo/how-does-googles-representation-of-you-shape-your-personal-brand-tailoring-talk-podcast/', 0]
url data e ['https://kalicube.com/learning-spaces/knowledge-nuggets/brand-seo/how-does-googles-representation-of-you-shape-your-personal-brand-tailoring-talk-podcast/', 0], url e https://kalicube.com/learning-spaces/knowledge-nuggets/brand-seo/how-does-googles-representation-of-you-shape-your-personal-brand-tailoring-talk-podcast/, a num redirects e 0


Crawling progress:   1%|▏         | 9/700 [00:16<23:00,  2.00s/page]

Current url data e ['https://kalicube.com/learning-spaces/knowledge-nuggets/brand-seo/', 0]
url data e ['https://kalicube.com/learning-spaces/knowledge-nuggets/brand-seo/', 0], url e https://kalicube.com/learning-spaces/knowledge-nuggets/brand-seo/, a num redirects e 0


Crawling progress:   1%|▏         | 10/700 [00:18<22:58,  2.00s/page]

Current url data e ['https://kalicube.com/learning-spaces/how-does-consistency-contribute-to-machine-understandability/', 0]
url data e ['https://kalicube.com/learning-spaces/how-does-consistency-contribute-to-machine-understandability/', 0], url e https://kalicube.com/learning-spaces/how-does-consistency-contribute-to-machine-understandability/, a num redirects e 0


Crawling progress:   2%|▏         | 11/700 [00:21<23:39,  2.06s/page]

Current url data e ['https://kalicube.com/learning-spaces/faq/seo-glossary/', 0]
url data e ['https://kalicube.com/learning-spaces/faq/seo-glossary/', 0], url e https://kalicube.com/learning-spaces/faq/seo-glossary/, a num redirects e 0


Crawling progress:   2%|▏         | 12/700 [00:22<23:18,  2.03s/page]

Current url data e ['https://kalicube.com/learning-spaces/knowledge-nuggets/brand-seo/how-does-focusing-on-a-niche-impact-your-brand-and-business-growth-virtual-antics-podcast/', 0]
url data e ['https://kalicube.com/learning-spaces/knowledge-nuggets/brand-seo/how-does-focusing-on-a-niche-impact-your-brand-and-business-growth-virtual-antics-podcast/', 0], url e https://kalicube.com/learning-spaces/knowledge-nuggets/brand-seo/how-does-focusing-on-a-niche-impact-your-brand-and-business-growth-virtual-antics-podcast/, a num redirects e 0


Crawling progress:   2%|▏         | 13/700 [00:24<20:43,  1.81s/page]

Current url data e ['https://kalicube.com/learning-spaces/how-does-the-kalicube-process-stay-ahead-of-technological-advancements/', 0]
url data e ['https://kalicube.com/learning-spaces/how-does-the-kalicube-process-stay-ahead-of-technological-advancements/', 0], url e https://kalicube.com/learning-spaces/how-does-the-kalicube-process-stay-ahead-of-technological-advancements/, a num redirects e 0


Crawling progress:   2%|▏         | 15/700 [00:27<18:33,  1.63s/page]

Current url data e ['https://kalicube.com/learning-spaces/', 0]
url data e ['https://kalicube.com/learning-spaces/', 0], url e https://kalicube.com/learning-spaces/, a num redirects e 0
Current url data e ['https://kalicube.com/learning-spaces/page/2/', 0]
url data e ['https://kalicube.com/learning-spaces/page/2/', 0], url e https://kalicube.com/learning-spaces/page/2/, a num redirects e 0


Crawling progress:   2%|▏         | 15/700 [00:29<22:09,  1.94s/page]


KeyboardInterrupt: 

### **Read the DB to check results**

In [None]:
# Import necessary libraries
from google.colab import drive
import sqlite3
import pandas as pd

# Mount Google Drive; the database path below lives under this mount point
drive.mount('/content/drive')

# Define the path to the database file on Google Drive
# (same file the crawler cell writes to)
db_path = '/content/drive/My Drive/db_data.db'  # Update with your actual path

# Function to read the database and return a DataFrame
def read_database(db_path):
    """Load the whole ``crawled_data`` table into a pandas DataFrame.

    Args:
        db_path: path of the SQLite database file.

    Returns:
        DataFrame with the columns ``URL``, ``Status_Code`` and ``Content``.
    """
    query = "SELECT URL, Status_Code, Content FROM crawled_data"

    conn = sqlite3.connect(db_path)
    try:
        return pd.read_sql_query(query, conn)
    finally:
        # Close the connection even when the query fails (e.g. missing table).
        conn.close()

# Call the function and display the data
df = read_database(db_path)
# Show the URL and raw content of the first 10 rows
print(df[['URL', 'Content']].head(10))
# Total number of stored rows
print(len(df.index))


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
                                                 URL  \
0              https://kalicube.com/learning-spaces/   
1       https://kalicube.com/learning-spaces/page/2/   
2         https://kalicube.com/learning-spaces/feed/   
3      https://kalicube.com/learning-spaces/page/45/   
4  https://kalicube.com/learning-spaces/faq/perso...   
5  https://kalicube.com/learning-spaces/faq/seo-g...   
6  https://kalicube.com/learning-spaces/why-shoul...   
7  https://kalicube.com/learning-spaces/knowledge...   
8  https://kalicube.com/learning-spaces/faq/perso...   
9  https://kalicube.com/learning-spaces/faq/perso...   

                                             Content  
0  <!doctype html>\n<html lang="en-US" class="no-...  
1  <!doctype html>\n<html lang="en-US" class="no-...  
2  <?xml version="1.0" encoding="UTF-8"?><rss ver...  
3  <!doctype html>\n<html lang="en

### **Cleaning the content**: stripping the HTML from each DB row's Content and saving the result into that row's Cleaned_Content column

In [None]:
import sqlite3
from bs4 import BeautifulSoup
from google.colab import drive

# Mount Google Drive; the database file used at the bottom of this cell lives under this mount point
drive.mount('/content/drive')

# Function to clean HTML content
def clean_html(content):
    """Return the visible text of an HTML document, one non-blank line per row.

    ``script``, ``style``, ``header``, ``footer`` and ``nav`` elements are
    removed before the text is extracted; each remaining line is stripped of
    leading/trailing whitespace and blank lines are dropped.

    Args:
        content: HTML markup as a string; ``None`` or an empty string yields
            an empty result instead of raising.

    Returns:
        The cleaned text, lines joined with ``\\n``.
    """
    if not content:
        # A NULL Content column arrives here as None, which BeautifulSoup
        # cannot parse.
        return ''

    soup = BeautifulSoup(content, "html.parser")

    # Drop code, styling and page chrome so only the main text remains.
    for element in soup(["script", "style", "header", "footer", "nav"]):
        element.decompose()

    stripped_lines = (line.strip() for line in soup.get_text().splitlines())
    return '\n'.join(line for line in stripped_lines if line)

# Function to check if the Cleaned_Content column exists
def add_cleaned_content_column_if_not_exists(cursor):
    """Add a ``Cleaned_Content`` TEXT column to ``crawled_data`` when missing.

    Args:
        cursor: open sqlite3 cursor on the database that holds the table.
    """
    cursor.execute("PRAGMA table_info(crawled_data)")
    # The second field of every table_info row is the column name.
    existing_columns = {info[1] for info in cursor.fetchall()}

    if 'Cleaned_Content' in existing_columns:
        return
    cursor.execute("ALTER TABLE crawled_data ADD COLUMN Cleaned_Content TEXT")

# Function to update the database with cleaned content
def update_database(db_file):
    """Fill ``Cleaned_Content`` for every row of ``crawled_data``.

    The column is created when it does not exist yet. Each row's ``Content``
    is passed through ``clean_html`` and the result is stored in the same
    row. All updates are committed together; if anything fails, nothing is
    committed.

    Args:
        db_file: path of the SQLite database file.
    """
    conn = sqlite3.connect(db_file)
    try:
        cursor = conn.cursor()

        # Ensure the Cleaned_Content column exists
        add_cleaned_content_column_if_not_exists(cursor)

        cursor.execute("SELECT rowid, URL, Content FROM crawled_data")
        updates = [
            (clean_html(content), rowid)
            for rowid, _url, content in cursor.fetchall()
        ]

        cursor.executemany(
            "UPDATE crawled_data SET Cleaned_Content = ? WHERE rowid = ?",
            updates
        )
        conn.commit()
    finally:
        # Close the connection even when cleaning or updating fails.
        conn.close()

# Run the cleaning pass when this cell/script is executed directly
if __name__ == "__main__":
    # Google Drive path to your database file
    db_file = '/content/drive/My Drive/db_data.db'  # Adjust the path accordingly
    update_database(db_file)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


  k = self.parse_starttag(i)
