In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import AgglomerativeClustering
import networkx as nx
import matplotlib.pyplot as plt
from collections import Counter

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

def preprocess_text(text):
    # Tokenize
    tokens = word_tokenize(text.lower())
    
    # Remove stopwords and non-alphabetic tokens
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token.isalpha() and token not in stop_words]
    
    # Lemmatize
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    return tokens

def cluster_text(text, n_clusters=5):
    tokens = preprocess_text(text)
    
    # Create TF-IDF matrix
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([' '.join(tokens)])
    
    # Perform hierarchical clustering
    clustering = AgglomerativeClustering(n_clusters=n_clusters)
    clustering.fit(tfidf_matrix.toarray())
    
    # Get words for each cluster
    feature_names = vectorizer.get_feature_names_out()
    clusters = {}
    for i in range(n_clusters):
        cluster_words = [feature_names[j] for j in range(len(feature_names)) if clustering.labels_[j] == i]
        clusters[i] = cluster_words
    
    return clusters

def create_cooccurrence_network(tokens, window_size=5):
    word_pairs = []
    for i in range(len(tokens)):
        for j in range(i+1, min(i+window_size, len(tokens))):
            word_pairs.append((tokens[i], tokens[j]))
    
    # Count co-occurrences
    cooccurrence = Counter(word_pairs)
    
    # Create graph
    G = nx.Graph()
    for (word1, word2), count in cooccurrence.items():
        G.add_edge(word1, word2, weight=count)
    
    return G

def visualize_network(G, clusters):
    plt.figure(figsize=(12, 8))
    
    # Set node colors based on clusters
    color_map = plt.cm.get_cmap('viridis')
    node_colors = []
    for node in G.nodes():
        for cluster_id, words in clusters.items():
            if node in words:
                node_colors.append(color_map(cluster_id / len(clusters)))
                break
        else:
            node_colors.append('gray')
    
    # Draw the graph
    pos = nx.spring_layout(G)
    nx.draw(G, pos, node_color=node_colors, with_labels=True, node_size=1000, font_size=8)
    
    # Draw edge labels
    edge_labels = nx.get_edge_attributes(G, 'weight')
    nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels)
    
    plt.title("Patent Text Co-occurrence Network")
    plt.axis('off')
    plt.tight_layout()
    plt.show()

def analyze_patent_text(text, n_clusters=5):
    tokens = preprocess_text(text)
    clusters = cluster_text(text, n_clusters)
    G = create_cooccurrence_network(tokens)
    visualize_network(G, clusters)
    
    return clusters

# Example usage
patent_text = """
A method and system for enhancing the efficiency of solar panels through the application of 
nanotechnology. The invention involves coating the surface of solar cells with a layer of 
carbon nanotubes, which increases light absorption and improves electron transport. 
This novel approach results in significantly higher energy conversion rates compared to 
traditional solar panels. The method also includes a process for uniformly depositing 
the nanotubes onto the solar cell surface, ensuring consistent performance across the entire panel.
"""

clusters = analyze_patent_text(patent_text)
print("Clustered words:")
for cluster_id, words in clusters.items():
    print(f"Cluster {cluster_id}: {', '.join(words)}")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tmina\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tmina\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\tmina\AppData\Roaming\nltk_data...


ValueError: Found array with 1 sample(s) (shape=(1, 43)) while a minimum of 2 is required by AgglomerativeClustering.

In [6]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import AgglomerativeClustering
import networkx as nx
from pyvis.network import Network
from collections import Counter

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

def preprocess_text(text):
    # Tokenize
    tokens = word_tokenize(text.lower())
    
    # Remove stopwords and non-alphabetic tokens
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token.isalpha() and token not in stop_words]
    
    # Lemmatize
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    return tokens

def cluster_text(text, n_clusters=5):
    tokens = preprocess_text(text)
    
    # Create TF-IDF matrix
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([' '.join(tokens)])
    
    # Check if we have enough unique terms for clustering
    if tfidf_matrix.shape[1] < n_clusters:
        print(f"Warning: Not enough unique terms ({tfidf_matrix.shape[1]}) for {n_clusters} clusters. Adjusting number of clusters.")
        n_clusters = max(2, tfidf_matrix.shape[1] - 1)
    
    # Perform hierarchical clustering
    if tfidf_matrix.shape[0] > 1:
        clustering = AgglomerativeClustering(n_clusters=n_clusters)
        clustering.fit(tfidf_matrix.toarray())
        labels = clustering.labels_
    else:
        print("Warning: Only one sample available. Skipping clustering.")
        labels = np.zeros(tfidf_matrix.shape[1], dtype=int)
    
    # Get words for each cluster
    feature_names = vectorizer.get_feature_names_out()
    clusters = {}
    for i in range(n_clusters):
        cluster_words = [feature_names[j] for j in range(len(feature_names)) if labels[j] == i]
        clusters[i] = cluster_words
    
    return clusters

def create_cooccurrence_network(tokens, window_size=5):
    word_pairs = []
    for i in range(len(tokens)):
        for j in range(i+1, min(i+window_size, len(tokens))):
            word_pairs.append((tokens[i], tokens[j]))
    
    # Count co-occurrences
    cooccurrence = Counter(word_pairs)
    
    # Create graph
    G = nx.Graph()
    for (word1, word2), count in cooccurrence.items():
        G.add_edge(word1, word2, weight=count)
    
    return G

def visualize_network_pyvis(G, clusters):
    # Create a pyvis network
    net = Network(notebook=True, height="500px", width="100%", bgcolor="#ffffff", font_color="black")
    
    # Define a color palette
    color_palette = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#FFA07A', '#98D8C8', '#F7DC6F', '#85C1E9', '#D7BDE2']
    
    # Add nodes to the network
    for node in G.nodes():
        # Find which cluster the node belongs to
        cluster_id = next((i for i, words in clusters.items() if node in words), -1)
        
        # Add node with color based on cluster
        color = color_palette[cluster_id % len(color_palette)] if cluster_id != -1 else '#CCCCCC'
        net.add_node(node, label=node, title=f"Cluster {cluster_id}", color=color)
    
    # Add edges to the network
    for edge in G.edges(data=True):
        net.add_edge(edge[0], edge[1], value=edge[2]['weight'], title=f"Weight: {edge[2]['weight']}")
    
    # Set physics layout
    net.set_options('''
    var options = {
      "physics": {
        "forceAtlas2Based": {
          "gravitationalConstant": -50,
          "centralGravity": 0.01,
          "springLength": 100,
          "springConstant": 0.08
        },
        "maxVelocity": 50,
        "minVelocity": 0.1,
        "solver": "forceAtlas2Based",
        "timestep": 0.35
      }
    }
    ''')
    
    # Save and show the network
    net.show("patent_network.html")

def analyze_patent_text(text, n_clusters=5):
    tokens = preprocess_text(text)
    clusters = cluster_text(text, n_clusters)
    G = create_cooccurrence_network(tokens)
    visualize_network_pyvis(G, clusters)
    
    return clusters

# Example usage
patent_text = """
A method and system for enhancing the efficiency of solar panels through the application of 
nanotechnology. The invention involves coating the surface of solar cells with a layer of 
carbon nanotubes, which increases light absorption and improves electron transport. 
This novel approach results in significantly higher energy conversion rates compared to 
traditional solar panels. The method also includes a process for uniformly depositing 
the nanotubes onto the solar cell surface, ensuring consistent performance across the entire panel.
"""

clusters = analyze_patent_text(patent_text)
print("Clustered words:")
for cluster_id, words in clusters.items():
    print(f"Cluster {cluster_id}: {', '.join(words)}")

#FF6B6B 0
#FF6B6B 0
#FF6B6B 0
#FF6B6B 0
#FF6B6B 0
#FF6B6B 0
#FF6B6B 0
#FF6B6B 0
#FF6B6B 0
#FF6B6B 0
#FF6B6B 0
#FF6B6B 0
#FF6B6B 0
#FF6B6B 0
#FF6B6B 0
#FF6B6B 0
#FF6B6B 0
#FF6B6B 0
#FF6B6B 0
#FF6B6B 0
#FF6B6B 0
#FF6B6B 0
#FF6B6B 0
#FF6B6B 0
#FF6B6B 0
#FF6B6B 0
#FF6B6B 0
#FF6B6B 0
#FF6B6B 0
#FF6B6B 0
#FF6B6B 0
#FF6B6B 0
#FF6B6B 0
#FF6B6B 0
#FF6B6B 0
#FF6B6B 0
#FF6B6B 0
#FF6B6B 0
#FF6B6B 0
#FF6B6B 0
#FF6B6B 0
#FF6B6B 0
#FF6B6B 0
patent_network.html
Clustered words:
Cluster 0: absorption, across, also, application, approach, carbon, cell, coating, compared, consistent, conversion, depositing, efficiency, electron, energy, enhancing, ensuring, entire, higher, improves, includes, increase, invention, involves, layer, light, method, nanotechnology, nanotube, novel, onto, panel, performance, process, rate, result, significantly, solar, surface, system, traditional, transport, uniformly
Cluster 1: 
Cluster 2: 
Cluster 3: 
Cluster 4: 


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tmina\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tmina\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\tmina\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
