<a href="https://colab.research.google.com/github/patrickhamzaokello/ColabFavourites/blob/main/Create_Intents_from_Website.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [27]:
# Install spaCy and download the English model:
# pip install spacy
# python -m spacy download en_core_web_sm

In [28]:
import requests
from bs4 import BeautifulSoup
import markdown
import os
import time
from urllib.parse import urljoin, urlparse
import re
from typing import Set, List, Dict

class BankWebsiteScraper:
    def __init__(self, base_url: str, output_dir: str = "bank_content"):
        self.base_url = base_url
        self.output_dir = output_dir
        self.visited_urls: Set[str] = set()
        self.session = requests.Session()
        # Add common browser headers to avoid being blocked
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })

        # Create output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)

    def is_valid_url(self, url: str) -> bool:
        """Check if URL belongs to the same domain and is a valid page"""
        parsed_base = urlparse(self.base_url)
        parsed_url = urlparse(url)
        return (parsed_url.netloc == parsed_base.netloc and
                not any(ext in url.lower() for ext in ['.pdf', '.jpg', '.png', '.gif', '.css', '.js']) and
                '#' not in url)

    def clean_text(self, text: str) -> str:
        """Clean and format text content"""
        # Remove extra whitespace and newlines
        text = ' '.join(text.split())
        # Remove special characters
        text = re.sub(r'[^\w\s\-.,?!()]', '', text)
        return text.strip()

    def html_to_markdown(self, soup: BeautifulSoup, url: str) -> str:
        """Convert HTML content to markdown format"""
        markdown_content = []

        # Add page title
        if soup.title:
            markdown_content.append(f"# {soup.title.string.strip()}\n")

        # Add URL reference
        markdown_content.append(f"Source: {url}\n")

        # Process main content
        main_content = soup.find('main') or soup.find('article') or soup.find('div', {'class': ['content', 'main-content']})
        if not main_content:
            main_content = soup

        # Process headings
        for heading in main_content.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
            level = int(heading.name[1])
            markdown_content.append(f"{'#' * level} {self.clean_text(heading.text)}\n")

        # Process paragraphs
        for p in main_content.find_all('p'):
            text = self.clean_text(p.text)
            if text:
                markdown_content.append(f"{text}\n")

        # Process lists
        for ul in main_content.find_all(['ul', 'ol']):
            for li in ul.find_all('li'):
                text = self.clean_text(li.text)
                if text:
                    markdown_content.append(f"- {text}\n")

        return '\n'.join(markdown_content)

    def scrape_page(self, url: str) -> List[str]:
        """Scrape single page and return new URLs to visit"""
        if url in self.visited_urls:
            return []

        self.visited_urls.add(url)
        print(f"Scraping: {url}")

        try:
            response = self.session.get(url, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            # Convert content to markdown
            markdown_content = self.html_to_markdown(soup, url)

            # Save to file
            filename = f"{len(self.visited_urls):04d}_{urlparse(url).path.replace('/', '_')}.md"
            if filename == '0000_.md':
                filename = '0000_home.md'

            with open(os.path.join(self.output_dir, filename), 'w', encoding='utf-8') as f:
                f.write(markdown_content)

            # Find new URLs
            new_urls = []
            for link in soup.find_all('a', href=True):
                new_url = urljoin(url, link['href'])
                if self.is_valid_url(new_url) and new_url not in self.visited_urls:
                    new_urls.append(new_url)

            time.sleep(1)  # Be nice to the server
            return new_urls

        except Exception as e:
            print(f"Error scraping {url}: {str(e)}")
            return []

    def scrape_website(self):
        """Scrape entire website"""
        urls_to_visit = [self.base_url]

        while urls_to_visit:
            url = urls_to_visit.pop(0)
            new_urls = self.scrape_page(url)
            urls_to_visit.extend(new_urls)

class IntentTrainer:
    def __init__(self, markdown_dir: str):
        self.markdown_dir = markdown_dir
        self.content_by_category: Dict[str, List[str]] = {}

    def load_markdown_files(self):
        """Load and categorize markdown content"""
        for filename in os.listdir(self.markdown_dir):
            if filename.endswith('.md'):
                with open(os.path.join(self.markdown_dir, filename), 'r', encoding='utf-8') as f:
                    content = f.read()

                    # Extract categories based on headings and content
                    categories = self.identify_categories(content)
                    for category, text in categories.items():
                        if category not in self.content_by_category:
                            self.content_by_category[category] = []
                        self.content_by_category[category].extend(text)

    def identify_categories(self, content: str) -> Dict[str, List[str]]:
        """Identify banking categories from content"""
        categories = {}
        current_category = "general"
        current_text = []

        # Define category keywords
        category_keywords = {
            'account': ['account', 'savings', 'current', 'deposit'],
            'payments': ['bill', 'payment', 'utility', 'electricity', 'water'],
            'transfers': ['transfer', 'send money', 'remittance'],
            'loans': ['loan', 'credit', 'mortgage', 'financing'],
            'cards': ['card', 'credit card', 'debit card', 'atm'],
            'investments': ['investment', 'fixed deposit', 'mutual fund'],
            'digital_banking': ['online', 'mobile', 'internet banking', 'app']
        }

        for line in content.split('\n'):
            # Check if line is a heading
            if line.startswith('#'):
                # Save previous category
                if current_text:
                    if current_category not in categories:
                        categories[current_category] = []
                    categories[current_category].extend(current_text)
                    current_text = []

                # Determine new category based on heading
                heading = line.strip('#').strip().lower()
                current_category = "general"
                for category, keywords in category_keywords.items():
                    if any(keyword in heading for keyword in keywords):
                        current_category = category
                        break
            else:
                current_text.append(line.strip())

        # Save last category
        if current_text:
            if current_category not in categories:
                categories[current_category] = []
            categories[current_category].extend(current_text)

        return categories

    def generate_intents(self) -> Dict[str, List[str]]:
        """Generate intent patterns from categorized content"""
        intent_patterns = {}

        for category, content in self.content_by_category.items():
            patterns = self.extract_patterns(content)
            intent_patterns[category] = patterns

        return intent_patterns

    def extract_patterns(self, content: List[str]) -> List[str]:
        """Extract likely user intent patterns from content"""
        patterns = []

        # Common question patterns
        question_starters = ['how to', 'what is', 'where can', 'when does', 'can i']
        action_verbs = ['get', 'apply', 'open', 'start', 'view', 'check', 'pay', 'transfer']

        for line in content:
            line = line.lower()
            # Extract questions
            for starter in question_starters:
                if starter in line:
                    question = re.search(f"{starter}.*?[.?!]", line)
                    if question:
                        patterns.append(question.group(0))

            # Extract action statements
            for verb in action_verbs:
                if verb in line:
                    action = re.search(f"{verb}.*?[.?!]", line)
                    if action:
                        patterns.append(action.group(0))

        return list(set(patterns))  # Remove duplicates


In [29]:
def main():
    # Initialize scraper
    scraper = BankWebsiteScraper('https://www.gtbank.co.ug/')

    # Scrape website
    scraper.scrape_website()

if __name__ == "__main__":
    main()

Scraping: https://www.gtbank.co.ug/
Scraping: https://www.gtbank.co.ug/locate/branches
Scraping: https://www.gtbank.co.ug/locate/atms
Scraping: https://www.gtbank.co.ug/media-centre
Scraping: https://www.gtbank.co.ug/help-centre
Scraping: https://www.gtbank.co.ug/security-centre
Scraping: https://www.gtbank.co.ug/personal-banking
Scraping: https://www.gtbank.co.ug/personal-banking/accounts
Scraping: https://www.gtbank.co.ug/personal-banking/accounts/gtsave
Scraping: https://www.gtbank.co.ug/personal-banking/accounts/gtsave-plus
Scraping: https://www.gtbank.co.ug/personal-banking/accounts/sks-account
Scraping: https://www.gtbank.co.ug/personal-banking/accounts/gt-target
Scraping: https://www.gtbank.co.ug/personal-banking/accounts/gt-seniors-account
Scraping: https://www.gtbank.co.ug/personal-banking/accounts/gt-salary-account
Scraping: https://www.gtbank.co.ug/personal-banking/accounts/gtcrea8-account
Scraping: https://www.gtbank.co.ug/personal-banking/ways-to-bank
Scraping: https://www

In [30]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import pipeline
import torch
from sentence_transformers import SentenceTransformer
import spacy



In [31]:
class IntentClassifier:
    def __init__(self, intents, default_intent='menu'):
        self.intents = intents
        self.default_intent = default_intent

        # Initialize models
        self.sentence_transformer = SentenceTransformer('all-MiniLM-L6-v2')
        self.nlp = spacy.load('en_core_web_sm')
        self.tfidf = TfidfVectorizer()

        # Prepare intent embeddings
        self.prepare_intent_embeddings()

    def prepare_intent_embeddings(self):
        # Flatten intents for embedding
        self.all_examples = []
        self.intent_map = []
        for intent, examples in self.intents.items():
            self.all_examples.extend(examples)
            self.intent_map.extend([intent] * len(examples))

        # Create embeddings for all examples
        self.example_embeddings = self.sentence_transformer.encode(self.all_examples)

        # Create TF-IDF matrix
        self.tfidf_matrix = self.tfidf.fit_transform(self.all_examples)

    def classify_with_transformer(self, user_message, threshold=0.6):
        # Get message embedding
        message_embedding = self.sentence_transformer.encode([user_message])

        # Calculate similarities
        similarities = cosine_similarity(message_embedding, self.example_embeddings)[0]

        # Get best match
        best_match_idx = np.argmax(similarities)
        best_similarity = similarities[best_match_idx]

        if best_similarity >= threshold:
            return self.intent_map[best_match_idx], best_similarity
        return self.default_intent, best_similarity

    def classify_with_tfidf(self, user_message, threshold=0.3):
        # Transform user message
        message_tfidf = self.tfidf.transform([user_message])

        # Calculate similarities
        similarities = cosine_similarity(message_tfidf, self.tfidf_matrix)[0]

        # Get best match
        best_match_idx = np.argmax(similarities)
        best_similarity = similarities[best_match_idx]

        if best_similarity >= threshold:
            return self.intent_map[best_match_idx], best_similarity
        return self.default_intent, best_similarity

    def classify_with_spacy(self, user_message, threshold=0.7):
        doc = self.nlp(user_message.lower())

        best_similarity = 0
        best_intent = self.default_intent

        for intent, examples in self.intents.items():
            for example in examples:
                example_doc = self.nlp(example.lower())
                similarity = doc.similarity(example_doc)

                if similarity > best_similarity:
                    best_similarity = similarity
                    best_intent = intent

        if best_similarity >= threshold:
            return best_intent, best_similarity
        return self.default_intent, best_similarity

    def ensemble_classify(self, user_message, weights={'transformer': 0.5, 'tfidf': 0.3, 'spacy': 0.2}):
        # Get classifications from all methods
        transformer_intent, transformer_score = self.classify_with_transformer(user_message)
        tfidf_intent, tfidf_score = self.classify_with_tfidf(user_message)
        spacy_intent, spacy_score = self.classify_with_spacy(user_message)

        # Weight the scores
        weighted_scores = {
            transformer_intent: transformer_score * weights['transformer'],
            tfidf_intent: tfidf_score * weights['tfidf'],
            spacy_intent: spacy_score * weights['spacy']
        }

        # Get the intent with highest weighted score
        final_intent = max(weighted_scores.items(), key=lambda x: x[1])[0]

        return final_intent


# **Intent Generator from Scraped Content**

This script will:

1. Process your scraped content and extract meaningful sentences
2. Categorize content into banking-related categories
3. Generate various intent patterns from the content
4. Remove duplicates and cluster similar intents
5. Organize everything into a structured dictionary

In [32]:
import os
import re
from typing import Dict, List, Set
from collections import defaultdict
import spacy
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

class IntentGenerator:
    def __init__(self, content_dir: str):
        self.content_dir = content_dir
        # Load spaCy model for NLP tasks
        self.nlp = spacy.load('en_core_web_sm')

        # Common banking terms and their categories
        self.banking_categories = {
            'account': ['account', 'savings', 'current', 'deposit', 'open', 'create', 'balance'],
            'payments': ['pay', 'bill', 'utility', 'electricity', 'water', 'tax', 'fees'],
            'transfers': ['transfer', 'send', 'receive', 'remit', 'swift', 'wire'],
            'loans': ['loan', 'borrow', 'credit', 'mortgage', 'financing'],
            'cards': ['card', 'credit', 'debit', 'atm', 'pin', 'contactless'],
            'investments': ['invest', 'fixed deposit', 'mutual fund', 'stocks', 'bonds'],
            'digital_banking': ['online', 'mobile', 'internet', 'app', 'password', 'login'],
            'customer_service': ['help', 'support', 'complaint', 'issue', 'problem', 'assist']
        }

        # Common question and action patterns
        self.patterns = {
            'question_starters': ['how', 'what', 'where', 'when', 'can', 'do'],
            'action_verbs': ['want', 'need', 'get', 'apply', 'check', 'view', 'make', 'send'],
            'intent_indicators': ['i want to', 'i need to', 'how do i', 'how can i', 'where do i']
        }

    def load_content(self) -> List[str]:
        """Load all content from scraped files"""
        content = []
        for filename in os.listdir(self.content_dir):
            if filename.endswith(('.txt', '.md', '.html')):
                with open(os.path.join(self.content_dir, filename), 'r', encoding='utf-8') as f:
                    content.append(f.read())
        return content

    def extract_sentences(self, text: str) -> List[str]:
        """Extract and clean sentences from text"""
        doc = self.nlp(text)
        sentences = []
        for sent in doc.sents:
            # Clean and normalize sentence
            clean_sent = re.sub(r'\s+', ' ', sent.text).strip()
            clean_sent = re.sub(r'[^\w\s\-.,?!()]', '', clean_sent)
            if clean_sent:
                sentences.append(clean_sent.lower())
        return sentences

    def categorize_sentence(self, sentence: str) -> str:
        """Determine the banking category of a sentence"""
        sentence = sentence.lower()
        max_matches = 0
        best_category = 'general'

        for category, keywords in self.banking_categories.items():
            matches = sum(1 for keyword in keywords if keyword in sentence)
            if matches > max_matches:
                max_matches = matches
                best_category = category

        return best_category

    def generate_intent_variations(self, sentence: str) -> List[str]:
        """Generate different variations of an intent"""
        variations = set()
        doc = self.nlp(sentence)

        # Original sentence
        variations.add(sentence)

        # Generate question form
        for starter in self.patterns['question_starters']:
            if not sentence.startswith(starter):
                variations.add(f"{starter} {sentence}")

        # Generate action form
        for verb in self.patterns['action_verbs']:
            if not sentence.startswith(f"i {verb}"):
                variations.add(f"i {verb} to {sentence}")

        # Remove duplicate words and clean up
        clean_variations = set()
        for var in variations:
            clean_var = ' '.join(dict.fromkeys(var.split()))
            clean_var = re.sub(r'\s+', ' ', clean_var).strip()
            if len(clean_var.split()) > 2:  # Only keep phrases with 3+ words
                clean_variations.add(clean_var)

        return list(clean_variations)

    def cluster_similar_intents(self, intents: List[str], n_clusters: int = 5) -> List[str]:
        """Cluster similar intents and select representatives"""
        if len(intents) <= n_clusters:
            return intents

        # Create TF-IDF matrix
        vectorizer = TfidfVectorizer()
        tfidf_matrix = vectorizer.fit_transform(intents)

        # Perform clustering
        kmeans = KMeans(n_clusters=n_clusters, random_state=42)
        clusters = kmeans.fit_predict(tfidf_matrix)

        # Select representative intents from each cluster
        representatives = []
        for i in range(n_clusters):
            cluster_intents = [intent for j, intent in enumerate(intents) if clusters[j] == i]
            if cluster_intents:
                # Select the shortest intent as representative
                representative = min(cluster_intents, key=len)
                representatives.append(representative)

        return representatives

    def generate_intents(self) -> Dict[str, List[str]]:
        """Generate comprehensive intents from content"""
        # Load and process content
        content = self.load_content()
        all_sentences = []
        for text in content:
            all_sentences.extend(self.extract_sentences(text))

        # Organize sentences by category
        categorized_sentences = defaultdict(list)
        for sentence in all_sentences:
            category = self.categorize_sentence(sentence)
            categorized_sentences[category].append(sentence)

        # Generate final intents
        final_intents = {}
        for category, sentences in categorized_sentences.items():
            # Generate variations for each sentence
            all_variations = []
            for sentence in sentences:
                variations = self.generate_intent_variations(sentence)
                all_variations.extend(variations)

            # Remove duplicates
            unique_variations = list(set(all_variations))

            # Cluster similar intents to reduce redundancy
            if len(unique_variations) > 20:
                final_intents[category] = self.cluster_similar_intents(
                    unique_variations,
                    n_clusters=min(20, len(unique_variations))
                )
            else:
                final_intents[category] = unique_variations

        return final_intents

def main():
    # Initialize generator
    generator = IntentGenerator('/content/bank_content')

    # Generate intents
    intents = generator.generate_intents()

    # Print results
    for category, intent_list in intents.items():
        print(f"\n{category.upper()} ({len(intent_list)} intents):")
        for intent in intent_list:
            print(f"- {intent}")
        print("-" * 50)

if __name__ == "__main__":
    main()


DIGITAL_BANKING (20 intents):
- career gtbank uganda source httpswww.gtbank.co.ugcareer online banking .
- corporate governance gtbank uganda source httpswww.gtbank.co.ugaboutour-companycorporate-governance online banking our company brand affiliations personal business about us help security quick links history people recognition and awards financial information pillar 3 market discipline disclosures guaranty trust bank (uganda) limited is committed to adhering the highest standards of good at all levels its operations.
- help centre gtbank uganda source httpswww.gtbank.co.ughelp-centre online banking topics personal business about us security quick links 2025 guaranty trust bank, uganda.
- these rules signify our collective belief and guide approach to work.
- internship graduates application process navigate learn more personal banking business about us help security quick links
- it is an internet-based solution that enables online payments using local as well international cards 

In [33]:
def main():
    # Initialize classifier
    classifier = IntentClassifier(intents)

    # Example messages
    test_messages = [
        "I want to open a new account",
        "My Yaka payment failed but money was deducted",
        "How do I receive money from USA?",
        "Need to pay my electricity bill",
        "i need a card",
        "is there hospital payments"
    ]

    # Test classification
    for message in test_messages:
        result = classifier.ensemble_classify(message)
        print(f"Message: {message}")
        print(f"Classified Intent: {result}\n")

if __name__ == "__main__":
    main()

  similarity = doc.similarity(example_doc)


Message: I want to open a new account
Classified Intent: open_account



  similarity = doc.similarity(example_doc)


Message: My Yaka payment failed but money was deducted
Classified Intent: bill_payment



  similarity = doc.similarity(example_doc)


Message: How do I receive money from USA?
Classified Intent: transfer_money



  similarity = doc.similarity(example_doc)


Message: Need to pay my electricity bill
Classified Intent: bill_payment



  similarity = doc.similarity(example_doc)


Message: i need a card
Classified Intent: menu



  similarity = doc.similarity(example_doc)


Message: is there hospital payments
Classified Intent: general

