In [None]:
# https://blog.mlreview.com/topic-modeling-with-scikit-learn-e80d33668730
from sklearn.datasets import fetch_20newsgroups

# Load the 20 Newsgroups posts, shuffled with a fixed seed, with headers,
# footers and quoted text removed from every post.
dataset = fetch_20newsgroups(
    remove=('headers', 'footers', 'quotes'),
    shuffle=True,
    random_state=1,
)
# Keep a direct handle on the raw post texts for the vectorizers.
documents = dataset.data

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Vocabulary size cap shared by both vectorizers.
no_features = 1000

# NMF is able to use tf-idf
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(documents)
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since 1.0) is its replacement and is indexable the same way.
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()

# LDA can only use raw term counts because it is a probabilistic graphical model
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tf = tf_vectorizer.fit_transform(documents)
tf_feature_names = tf_vectorizer.get_feature_names_out()

In [None]:
from sklearn.decomposition import NMF, LatentDirichletAllocation

# Number of topics extracted by both models.
no_topics = 20

# Run NMF
# The single `alpha` argument was deprecated in scikit-learn 1.0 and removed in
# 1.2 in favour of alpha_W / alpha_H. The new penalties are multiplied by
# n_features (for W) and n_samples (for H), so dividing the old value by those
# sizes keeps the regularisation strength that alpha=.1 used to give.
nmf = NMF(
    n_components=no_topics,
    random_state=1,
    alpha_W=.1 / tfidf.shape[1],
    alpha_H=.1 / tfidf.shape[0],
    l1_ratio=.5,
    init='nndsvd',
).fit(tfidf)

# Run LDA
lda = LatentDirichletAllocation(
    n_components=no_topics,
    max_iter=5,
    learning_method='online',
    learning_offset=50.,
    random_state=0,
).fit(tf)


In [None]:
def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    Args:
        model: Object exposing ``components_``, iterated as one array of
            term weights per topic.
        feature_names: Indexable collection mapping a term index to its text.
        no_top_words: Number of top terms printed for each topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        print("Topic %d:" % (topic_idx))
        # Sort ascending, flip to descending, then keep the leading entries.
        ranked = weights.argsort()[::-1][:no_top_words]
        top_terms = [feature_names[term_id] for term_id in ranked]
        print(" ".join(top_terms))

# How many top terms to print per topic.
no_top_words = 10
# Print the NMF topics (tf-idf vocabulary) first, then the LDA topics
# (raw-count vocabulary); each model is paired with its own vectorizer's terms.
display_topics(nmf, tfidf_feature_names, no_top_words)
display_topics(lda, tf_feature_names, no_top_words)

In [None]:
# Check what kind of object fetch_20newsgroups returned.
type(dataset)

In [None]:
import pandas as pd

# Wrap the raw posts in a DataFrame for inspection; later cells read the
# text back through column label 0.
df = pd.DataFrame(dataset.data)
# Preview the first rows.
df.head()

In [None]:
# Text at row label 0 of column 0, fetched with a single label-based lookup.
df.loc[0, 0]

In [None]:
# Text at row label 4 of column 0, fetched with a single label-based lookup.
df.loc[4, 0]

In [None]:
# (rows, columns) of the documents frame.
df.shape

LDA is for Topic Modeling. Topic Modeling is for larger/longer documents, and you can assign categories to documents to group them. This is not what I'm trying to do.

I need to use Named Entity Recognition (NER). This can take a single sentence and break down the entities within it. This is frequently used for chatbots.

Example: <i>Summer Hirst went to Ravensbourne University in 2010 and met Emily Victor there.</i>
NER can recognize Summer Hirst as a person, Ravensbourne University as a university, 2010 as a date, and Emily Victor as another person.

This needs intensive labeling to understand separate words and the categories they belong to. Apart from labeling, the model also needs to understand the context to remove ambiguity. Once the ambiguity is removed, it can be used for extracting information from unstructured text.

In [None]:
import nltk
import spacy

In [None]:
!pip3 install spacy

In [None]:
# https://nanonets.com/blog/named-entity-recognition-with-nltk-and-spacy/
# Step One: Import nltk and download necessary packages

import nltk
# Download each NLTK resource used by the tokenizing, tagging and chunking calls below.
for resource in ('punkt', 'averaged_perceptron_tagger', 'maxent_ne_chunker', 'words'):
    nltk.download(resource)

# Step Two: Load Data

sentence = "WASHINGTON -- In the wake of a string of abuses by New York police officers in the 1990s, Loretta E. Lynch, the top federal prosecutor in Brooklyn, spoke forcefully about the pain of a broken trust that African-Americans felt and said the responsibility for repairing generations of miscommunication and mistrust fell to law enforcement."

# Step Three: Tokenise, find parts of speech and chunk words

for sent in nltk.sent_tokenize(sentence):
    for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
        # Chunks without a `label` attribute are not printed.
        if not hasattr(chunk, 'label'):
            continue
        # Print the chunk's label followed by the first item of each of its members.
        print(chunk.label(), ' '.join(c[0] for c in chunk))

In [None]:
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

# Raw text of two files from NLTK's state_union corpus: one handed to the
# sentence tokenizer's constructor, one that the tokenizer is then run on.
train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

# Build a Punkt sentence tokenizer from train_text (presumably the constructor
# trains on it — confirm against the NLTK docs).
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)

# Split the sample text; the result is sliced and iterated by process_content().
tokenized = custom_sent_tokenizer.tokenize(sample_text)

def process_content(sentences=None, start=5):
    """Tag, NE-chunk and draw each sentence from ``start`` onwards.

    Args:
        sentences: Sequence of sentence strings to process. When omitted,
            the notebook-level ``tokenized`` variable is used, so the
            original no-argument call keeps working.
        start: Index of the first sentence to process; earlier sentences
            are skipped. Defaults to 5, the value previously hard-coded.

    Returns:
        None. Any exception raised while processing is printed rather than
        propagated, and stops the remaining sentences from being processed.
    """
    try:
        # Resolved inside the try so a missing `tokenized` is reported the
        # same way as any other failure (printed, not raised).
        if sentences is None:
            sentences = tokenized
        for sentence in sentences[start:]:
            words = nltk.word_tokenize(sentence)
            tagged = nltk.pos_tag(words)
            # binary=True: per the NLTK docs, entities are marked only as NE
            # rather than by type (PERSON, ORGANIZATION, ...).
            named_entities = nltk.ne_chunk(tagged, binary=True)
            named_entities.draw()
    except Exception as e:
        print(str(e))


# Run the tag / chunk / draw pass over the tokenized sample text.
process_content()

[Blog](https://blog.hubspot.com/website/python-ai-chat-bot#:~:text=Exploring%20Natural%20Language%20Processing%20(NLP)%20in%20Python&text=The%20ultimate%20objective%20of%20NLP,user%20queries%20in%20human%20language.)

Python chatbot libraries:

- ChatterBot
- Rasa
- DialogFlow


In [None]:
# https://www.analyticsvidhya.com/blog/2021/10/complete-guide-to-build-your-ai-chatbot-with-nlp-in-python/
import transformers

# nlp = transformers.pipeline("conversational", model="microsoft/DialoGPT-medium")

# input_text = "hello!"
# nlp(transformers.Conversation(input_text), pad_token_id=50256)

In [None]:
# Report the interpreter the kernel itself is running. A shell `python3` may
# resolve to a different installation than the one this notebook imports from.
import sys

print(sys.version)
print(sys.executable)

In [None]:
# https://rubikscode.net/2022/04/25/text-summarization-with-huggingface-transformers/


In [2]:
import chatterbot

ModuleNotFoundError: No module named 'chatterbot'

In [3]:
!pip install chatterbot

[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621[0m[33m
[0mCollecting chatterbot
  Using cached ChatterBot-1.0.5-py2.py3-none-any.whl (67 kB)
Collecting mathparse<0.2,>=0.1 (from chatterbot)
  Using cached mathparse-0.1.2-py3-none-any.whl (7.2 kB)
Collecting nltk<4.0,>=3.2 (from chatterbot)
  Using cached nltk-3.8.1-py3-none-any.whl.metadata (2.8 kB)
Collecting pint>=0.8.1 (from chatterbot)
  Using cached Pint-0.23-py3-none-any.whl.metadata (8.1 kB)
Collecting pymongo<4.0,>=3.3 (from chatterbot)
  Downloading pymongo-3.13.0-cp39-cp39-macosx_10_9_universal2.whl.metadata (9.8 kB)
Collecting python-dateutil<2.8,>=2.7 (from chatterbot)
  Using cached python_dateutil-2.7.5-py2.py3-none-any.whl (225 kB)
Collecting pyyaml<5.2,>=5.1 (from chatterbot)
  Using cached PyYAML-5.1.2.

  [31m   [0m       clang -c /private/var/folders/92/89sb2k757pdb4xcbw7nw7rph0000gn/T/pip-install-g3kb7kww/blis_1a6f5b5ae9274f15b78d135446e713d8/blis/_src/ref_kernels/ind/bli_gemm4m1_ref.c -o /var/folders/92/89sb2k757pdb4xcbw7nw7rph0000gn/T/tmpe3fi166y/bli_gemm4m1_steamroller_ref.o -O2 -fomit-frame-pointer -mfpmath=sse -mavx -mfma -mno-fma4 -march=bdver3 -fPIC -std=c99 -D_POSIX_C_SOURCE=200112L -DBLIS_VERSION_STRING="0.5.0-6" -DBLIS_CNAME=steamroller -DBLIS_IS_BUILDING_LIBRARY -Iinclude/darwin-x86_64 -I./frame/3/ -I./frame/ind/ukernels/ -I./frame/1m/ -I./frame/1f/ -I./frame/1/ -I./frame/include -I/private/var/folders/92/89sb2k757pdb4xcbw7nw7rph0000gn/T/pip-install-g3kb7kww/blis_1a6f5b5ae9274f15b78d135446e713d8/blis/_src/include/darwin-x86_64
  [31m   [0m       clang -c /private/var/folders/92/89sb2k757pdb4xcbw7nw7rph0000gn/T/pip-install-g3kb7kww/blis_1a6f5b5ae9274f15b78d135446e713d8/blis/_src/ref_kernels/ind/bli_gemm4mb_ref.c -o /var/folders/92/89sb2k757pdb4xcbw7nw7rph0000gn/T/tmpe

[?25h

In [2]:
!pip3 install chatterbot==1.0.4

Collecting chatterbot==1.0.4
  Using cached ChatterBot-1.0.4-py2.py3-none-any.whl (66 kB)
Collecting chatterbot-corpus<1.3,>=1.2 (from chatterbot==1.0.4)
  Using cached chatterbot_corpus-1.2.0-py2.py3-none-any.whl (117 kB)
Collecting mathparse<0.2,>=0.1 (from chatterbot==1.0.4)
  Using cached mathparse-0.1.2-py3-none-any.whl (7.2 kB)
Collecting nltk<4.0,>=3.2 (from chatterbot==1.0.4)
  Using cached nltk-3.8.1-py3-none-any.whl.metadata (2.8 kB)
Collecting pymongo<4.0,>=3.3 (from chatterbot==1.0.4)
  Using cached pymongo-3.13.0-cp312-cp312-macosx_11_0_arm64.whl
Collecting python-dateutil<2.8,>=2.7 (from chatterbot==1.0.4)
  Using cached python_dateutil-2.7.5-py2.py3-none-any.whl (225 kB)
Collecting sqlalchemy<1.3,>=1.2 (from chatterbot==1.0.4)
  Using cached SQLAlchemy-1.2.19-cp312-cp312-macosx_11_0_arm64.whl
Collecting pint>=0.8.1 (from chatterbot==1.0.4)
  Using cached Pint-0.23-py3-none-any.whl.metadata (8.1 kB)
Collecting PyYAML<4.0,>=3.12 (from chatterbot-corpus<1.3,>=1.2->chatterbo

In [4]:
!pip install chatterbot_corpus

[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621[0m[33m
[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621[0m[33m
[0m

In [6]:
!pip3 install chatterbot



In [1]:
import chatterbot

ModuleNotFoundError: No module named 'chatterbot'

In [2]:
import transformers

ModuleNotFoundError: No module named 'transformers'