In [None]:
import requests
from bs4 import BeautifulSoup
import re
import nltk
from nltk.corpus import stopwords

# Download the NLTK stop-word corpus, then collect the English entries into a
# set so later membership checks are cheap.
nltk.download('stopwords')
stop_words = {*stopwords.words('english')}

def _remove_stop_words(text):
    """Return `text` with every whitespace-separated word found in `stop_words` removed."""
    return ' '.join(word for word in text.split() if word.lower() not in stop_words)


def extract_claims_from_url(url, timeout=30):
    """Download a patent page and return its claims with stop words removed.

    Every element carrying the CSS class ``claim-text`` is visited in document
    order. An element whose text starts with ``<digits>.`` opens a new claim;
    any other element is appended to the claim currently being built.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds passed to ``requests.get`` as its timeout so a stalled
            server cannot block the notebook forever.

    Returns:
        A list with one stop-word-filtered string per claim, or ``None`` when
        the page could not be fetched (network error or non-200 status).
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Report network failures the same way as a bad status code instead of
        # aborting the whole run.
        print(f"Failed to fetch HTML content from {url}. Error: {exc}")
        return None

    if response.status_code != 200:
        print(f"Failed to fetch HTML content from {url}. Status code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    claims = soup.find_all(class_='claim-text')

    claim_texts = []
    current_claim_text = ""
    for claim in claims:
        text = claim.get_text(strip=True)
        # NOTE(review): this only skips elements that themselves carry the class
        # 'dependent'; confirm against the page markup that dependent claims are
        # really tagged this way on the 'claim-text' element.
        if 'dependent' in claim.get('class', []):
            continue
        if re.match(r'^\d+\.', text):
            # A numbered element starts a new claim: flush the previous one.
            if current_claim_text:
                claim_texts.append(_remove_stop_words(current_claim_text))
            current_claim_text = text
        else:
            # Continuation fragment: belongs to the claim being accumulated.
            current_claim_text += " " + text

    # Flush the claim that was still being accumulated when the loop ended.
    if current_claim_text:
        claim_texts.append(_remove_stop_words(current_claim_text))

    return claim_texts

def _extract_claims_paired_with_urls(urls):
    """Return ``(url, claims)`` pairs for every URL that produced at least one claim."""
    paired = []
    for url in urls:
        claims = extract_claims_from_url(url)
        # Skip URLs whose fetch failed (None) or that yielded no claims ([]).
        if claims:
            paired.append((url, claims))
    return paired


def extract_claims_from_multiple_urls(urls):
    """Collect the claims of several patent pages.

    Args:
        urls: Iterable of page addresses handed to ``extract_claims_from_url``.

    Returns:
        A list of claim lists, one per URL that produced claims. URLs that
        failed or had no claims are omitted, so the result may be shorter than
        ``urls`` and must not be zipped against it.
    """
    return [claims for _, claims in _extract_claims_paired_with_urls(urls)]


# Example usage
urls = [
    "https://patents.google.com/patent/GB2478972A/en?q=(phone)&oq=phone",
    "https://patents.google.com/patent/US9634864B2/en?oq=US9634864B2",
    "https://patents.google.com/patent/US9980046B2/en?oq=US9980046B2"
]  # Replace these with the patent URLs you want to analyse

# Keep each claim list paired with the URL it came from. Zipping `urls` against
# the filtered result would mislabel every patent after the first failed URL.
claims_by_url = _extract_claims_paired_with_urls(urls)
all_claims = [claims for _, claims in claims_by_url]

for url, claims in claims_by_url:
    print(url)
    for claim_text in claims:
        print(claim_text)
        print("--------------")
print(len(all_claims))



In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from Kmeans import Kmeans


# Merge the per-patent claim lists into one flat list of claim strings
all_claims_flat = [claim_text for patent_claims in all_claims for claim_text in patent_claims]

# Lower-cased copy of every claim, in the same order as all_claims_flat
all_claims_lower = list(map(str.lower, all_claims_flat))


# Cluster count used for this run
num_clusters = 5
normal_kmeans = Kmeans(num_clusters, all_claims_lower, all_claims_flat)
normal_kmeans.print_clustered_claims()
normal_kmeans.plot()


kmeans_cluster_distances = normal_kmeans.distance_between_clusters()
print("distances of clusters\n", kmeans_cluster_distances)

kmeans_cluster_density = normal_kmeans.cluster_density()
print("clusters density\n", kmeans_cluster_density)


# Cluster titles (author's note: produced using TF-IDF)

kmeans_cluster_titles = normal_kmeans.get_clusters_titles()
print(kmeans_cluster_titles)

In [None]:

from GMMClustering import GMMClustering

# Reuse the cluster count configured for the K-means run so all models are
# compared on the same number of clusters (previously a separate literal 5).
gmm = GMMClustering(num_clusters, all_claims_lower, all_claims_flat)
gmm.plot()
gmm.print_clustered_claims()

print("distances of clusters\n", gmm.distance_between_clusters())

print("clusters density\n", gmm.cluster_density())

# Cluster titles (author's note: generated with the czearing title model)

print(gmm.get_clusters_titles())


In [None]:
from GPT2Kmeans import GPT2Kmeans
# Build the GPT-2 based clustering from the original-case claims
gpt2_kmeans = GPT2Kmeans(num_clusters, all_claims_flat)

# Show the cluster plot first, then list the claims of each cluster
gpt2_kmeans.plot()
gpt2_kmeans.print_clustered_claims()

# Report the same two metrics as the other models
gpt2_cluster_distances = gpt2_kmeans.distance_between_clusters()
print("distances of clusters\n", gpt2_cluster_distances)

gpt2_cluster_density = gpt2_kmeans.cluster_density()
print("clusters density\n", gpt2_cluster_density)

# Cluster titles (author's note: derived from embeddings)
gpt2_cluster_titles = gpt2_kmeans.get_clusters_titles()
print(gpt2_cluster_titles)

In [None]:
# I would choose the GMM with the title model because it uses a smarter clustering algorithm together with a language model that creates
# titles based on other examples.
# The biggest advantage of that approach is the "accuracy" of the cluster titles, but it is a really slow method that might require resources
# like a GPU, and even then it might take a lot of time on larger data.

# In terms of performance and accuracy I would probably choose GPT-2 with embeddings. Even though it also uses an LLM to generate titles,
# it uses the LLM in order to cluster and then uses the embeddings to get a title (embeddings taken from an LLM promise good quality and are much quicker).

# Overall winner: GPT-2 with embeddings.