In [1]:
import pandas as pd

# Load the labelled patent corpus from the Excel workbook.
df = pd.read_excel("corpus_dataset_with_label.xlsx")

# Summary statistics first, then a preview of the first rows.
description = df.describe()
for table in (description, df.head()):
    print(table)


              PRLI
count  1831.000000
mean     30.410287
std      31.522799
min       0.000000
25%       5.075094
50%      22.966550
75%      42.631923
max     211.415429
            Patent#       PRLI label  \
0     JP2002207516A   0.000000    Q4   
1      JP03770383B2   0.000000    Q4   
2      CN115026807A  19.524426    Q3   
3  DE202013105677U1   0.000000    Q4   
4    WO2015086722A1  34.045956    Q2   

                                              Corpus  
0  Working robot such as robot for construction, ...  
1  Working robot such as robot for construction, ...  
2  Working method for a construction robot.; The ...  
3  Working fixture for use during work-piece asse...  
4  Working fixture for use during work-piece asse...  


# To perform NLP (Natural Language Processing) tasks using the topicwizard library with your corpus dataset, you can follow these steps:

1. First, make sure you have the topicwizard library installed. You can install it using pip if you haven't already.
2. Next, use the code below to perform topic modeling on your corpus dataset with topicwizard.
3. This code performs topic modeling using NMF on your corpus dataset and visualizes the topics using the topicwizard library. You can adjust the number of topics (`n_components`) as needed for your analysis.

Make sure to replace "corpus_dataset_with_label.xlsx" with the actual path to your Excel file, and if you have specific group labels, replace the `group_labels` list accordingly.

In [2]:
# Uncomment and run once if topicwizard is not installed; %pip installs into the running kernel's environment.
# %pip install topicwizard


In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import NMF
import topicwizard
import pandas as pd
from topicwizard.figures import topic_barcharts, word_map, group_topic_barcharts

# Load the labelled corpus workbook into a DataFrame.
file_path = "corpus_dataset_with_label.xlsx"
df = pd.read_excel(file_path)

# Pull the text, score and label columns out as plain Python lists.
corpus, PRLI, labels = (df[column].tolist() for column in ('Corpus', 'PRLI', 'label'))

# Build the bag-of-words vectorizer and the NMF model; neither is fitted in this cell.
vectorizer = CountVectorizer(stop_words="english", min_df=10, max_df=0.8)
nmf = NMF(n_components=15)  # change n_components to adjust the number of topics

# Sanity checks: corpus size, first rows of the frame, and the first raw document.
print("corpus length: ", len(corpus))
print(df.head())
print(corpus[0])


corpus length:  1831
            Patent#       PRLI label  \
0     JP2002207516A   0.000000    Q4   
1      JP03770383B2   0.000000    Q4   
2      CN115026807A  19.524426    Q3   
3  DE202013105677U1   0.000000    Q4   
4    WO2015086722A1  34.045956    Q2   

                                              Corpus  
0  Working robot such as robot for construction, ...  
1  Working robot such as robot for construction, ...  
2  Working method for a construction robot.; The ...  
3  Working fixture for use during work-piece asse...  
4  Working fixture for use during work-piece asse...  
Working robot such as robot for construction, robot for agrochemical, security robot and walk guide robot for visually handicapped, aged person, guide dog robot, self driven robot.; Enables driving and controlling working robot easily.; The figure shows the block diagram of working robot. (Drawing includes non-English language text).; Working robot such as security robot, robot for construction, generates

To preprocess the corpus dataset using NLTK (Natural Language Toolkit) in Python, you can perform common text preprocessing tasks such as tokenization, stopword removal, and stemming. Here is sample code that preprocesses the "Corpus" column of your dataset:

In [4]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# Fetch the NLTK packages this cell relies on: 'stopwords' and 'punkt'.
for resource in ('stopwords', 'punkt'):
    nltk.download(resource)

# Reload the labelled corpus workbook into a fresh DataFrame.
file_path = "corpus_dataset_with_label.xlsx"
df = pd.read_excel(file_path)

# Keep the score, label and patent-number columns as plain Python lists.
PRLIs, labels, Patents = (df[column].tolist() for column in ('PRLI', 'label', 'Patent#'))

# Define a function for text preprocessing
from functools import lru_cache


@lru_cache(maxsize=None)
def _english_stopwords():
    """Return the NLTK English stopword list as a frozenset, built on first use only."""
    return frozenset(stopwords.words('english'))


@lru_cache(maxsize=None)
def _porter_stemmer():
    """Return one shared PorterStemmer instance, created on first use only."""
    return PorterStemmer()


def preprocess_text(text):
    """Tokenize a document, drop English stopwords, and Porter-stem what remains.

    The stopword set and the stemmer are created once and reused across calls
    instead of being rebuilt for every document, which matters when this
    function is applied to a whole DataFrame column.

    Parameters
    ----------
    text : object
        The raw document. Anything that is not a ``str`` (for example a NaN
        from an empty spreadsheet cell) is treated as missing.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces, or ``''`` when ``text``
        is not a string. Punctuation tokens produced by the tokenizer are
        kept, because only stopwords are filtered out.
    """
    if not isinstance(text, str):
        return ''  # Non-string values (missing cells) become an empty document

    stop_words = _english_stopwords()
    stem = _porter_stemmer().stem

    # Stopword matching is case-insensitive; the original token is what gets stemmed.
    return ' '.join(
        stem(word)
        for word in word_tokenize(text)
        if word.lower() not in stop_words
    )

# Clean every document and store the result alongside the raw text.
df['Cleaned_Corpus'] = df['Corpus'].map(preprocess_text)

# Preview the cleaned text.
print(df['Cleaned_Corpus'].head())

# Narrow the frame to the identifier, score, label and cleaned-text columns.
keep_columns = ['Patent#', 'PRLI', 'label', 'Cleaned_Corpus']
cleaned_df = df[keep_columns]

# Preview the narrowed frame.
print(cleaned_df.head())


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Eric\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Eric\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


0    work robot robot construct , robot agrochem , ...
1    work robot robot construct , robot agrochem , ...
2    work method construct robot . ; problem whole ...
3    work fixtur use work-piec assembl process . ; ...
4    work fixtur use work-piec assembl process . ; ...
Name: Cleaned_Corpus, dtype: object
            Patent#       PRLI label  \
0     JP2002207516A   0.000000    Q4   
1      JP03770383B2   0.000000    Q4   
2      CN115026807A  19.524426    Q3   
3  DE202013105677U1   0.000000    Q4   
4    WO2015086722A1  34.045956    Q2   

                                      Cleaned_Corpus  
0  work robot robot construct , robot agrochem , ...  
1  work robot robot construct , robot agrochem , ...  
2  work method construct robot . ; problem whole ...  
3  work fixtur use work-piec assembl process . ; ...  
4  work fixtur use work-piec assembl process . ; ...  


In [16]:
%%capture
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import CountVectorizer
import topicwizard
from topicwizard.pipeline import make_topic_pipeline
import numpy as np
from topicwizard.pipeline import make_topic_pipeline
from topicwizard.figures import topic_barcharts, word_map, group_topic_barcharts

# Documents to model (the cleaned text) and one group label per document,
# taken from the 'label' column.
corpus = df['Cleaned_Corpus'].tolist()
group_labels = df['label'].tolist()

# Bag-of-words features; max_df / min_df prune very common and very rare terms.
bow_vectorizer = CountVectorizer(max_df=0.8, min_df=10, stop_words="english")

# 15 topics; adjust n_components as needed. random_state is fixed so that
# re-running the notebook reproduces the same topics.
nmf = NMF(n_components=15, random_state=42)

# Chain vectorizer and topic model, then fit them on the cleaned corpus.
pipeline = make_topic_pipeline(bow_vectorizer, nmf)
pipeline.fit(corpus)


In [6]:
# Launch the interactive topicwizard app for the fitted pipeline, passing the per-document group labels.
# NOTE: per the output below, this starts a Dash server on http://127.0.0.1:8050 that runs until interrupted (CTRL+C).
topicwizard.visualize(corpus=corpus, pipeline=pipeline, group_labels=group_labels)

Preprocessing
Dash is running on http://127.0.0.1:8050/

 * Serving Flask app 'topicwizard.app'
 * Debug mode: off


 * Running on http://127.0.0.1:8050
Press CTRL+C to quit
127.0.0.1 - - [04/Sep/2023 11:42:51] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [04/Sep/2023 11:42:52] "GET /_dash-dependencies HTTP/1.1" 200 -
127.0.0.1 - - [04/Sep/2023 11:42:52] "GET /_dash-layout HTTP/1.1" 200 -
127.0.0.1 - - [04/Sep/2023 11:42:52] "GET /_dash-component-suites/dash/dcc/async-graph.js HTTP/1.1" 304 -
127.0.0.1 - - [04/Sep/2023 11:42:52] "GET /_dash-component-suites/dash/dcc/async-dropdown.js HTTP/1.1" 304 -
127.0.0.1 - - [04/Sep/2023 11:42:52] "GET /_dash-component-suites/dash/dcc/async-plotlyjs.js HTTP/1.1" 304 -
127.0.0.1 - - [04/Sep/2023 11:42:52] "POST /_dash-update-component HTTP/1.1" 204 -

divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log

127.0.0.1 - - [04/Sep/2023 11:42:53] "POST /_dash-update-component HTTP/1.1" 200 -
127.0.0.1 - - [04/Sep/2023 11:42:54] "POST /_dash-update-component HTTP/1.1" 200 -
127.0.0.1 - - [0

In [7]:
# Bar charts for the fitted topics.
# NOTE(review): top_n=10 presumably limits each chart to the ten highest-ranked words — confirm in the topicwizard docs.
topic_barcharts(corpus=corpus, pipeline=pipeline, top_n=10)



divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered 

In [8]:
from topicwizard.figures import topic_wordclouds

# Word clouds for the fitted topics.
# NOTE(review): the effect of alpha=1.0 is not visible from this notebook — confirm in the topicwizard docs.
topic_wordclouds(corpus=corpus, pipeline=pipeline, alpha=1.0)


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered 

In [9]:

# Visualize topics using topic_barcharts.
# A notebook cell only renders its final expression automatically, so this
# figure is passed to display() explicitly; without it the bar charts would be
# built and then silently discarded. (display is provided by IPython in
# notebook sessions without an import.)
display(topic_barcharts(cleaned_df['Cleaned_Corpus'], pipeline=pipeline, top_n=5))

# Visualize word maps for topics (rendered as the cell's final expression).
word_map(cleaned_df['Cleaned_Corpus'], pipeline=pipeline)

# If you have group labels, you can visualize topics for groups
# group_labels = df['Patent#'].tolist()  # Replace with your actual group labels if needed
# group_topic_barcharts(corpus, group_labels, pipeline=pipeline, top_n=5)



divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered 

In [13]:
from topicwizard.figures import  word_association_barchart
# Association bar chart for the query word "site" under the fitted pipeline.
# NOTE(review): the corpus was Porter-stemmed before fitting, so query words presumably need to be in stemmed form — confirm.
word_association_barchart(["site"], corpus=corpus, pipeline=pipeline)