In [None]:
# https://blog.mlreview.com/topic-modeling-with-scikit-learn-e80d33668730
from sklearn.datasets import fetch_20newsgroups

# Load the 20 Newsgroups corpus with a fixed shuffle seed; headers, footers and
# quoted replies are removed from every post before it is returned.
dataset = fetch_20newsgroups(
    shuffle=True,
    random_state=1,
    remove=('headers', 'footers', 'quotes'),
)
documents = dataset.data

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

no_features = 1000

# NMF is able to use tf-idf
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(documents)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement (it returns an array of terms
# that is indexed the same way as the old list).
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()

# LDA is a probabilistic graphical model over word counts, so it is fitted on raw term counts
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tf = tf_vectorizer.fit_transform(documents)
tf_feature_names = tf_vectorizer.get_feature_names_out()

In [None]:
from sklearn.decomposition import NMF, LatentDirichletAllocation

no_topics = 20

# Run NMF
# The single `alpha` keyword was deprecated in scikit-learn 1.0 and removed in
# 1.2 in favour of `alpha_W` / `alpha_H`. In the new objective the W penalty is
# multiplied by n_features and the H penalty by n_samples, so the original
# strength of 0.1 is divided by those sizes to keep the regularization
# comparable to the old call.
# NOTE(review): rescaling follows the objective documented for NMF — confirm
# against the installed scikit-learn version.
nmf = NMF(
    n_components=no_topics,
    random_state=1,
    alpha_W=.1 / tfidf.shape[1],
    alpha_H=.1 / tfidf.shape[0],
    l1_ratio=.5,
    init='nndsvd',
).fit(tfidf)

# Run LDA
lda = LatentDirichletAllocation(
    n_components=no_topics,
    max_iter=5,
    learning_method='online',
    learning_offset=50.,
    random_state=0,
).fit(tf)


In [None]:
def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    Args:
        model: Object exposing ``components_``; each row is treated as one
            topic's weights over the vocabulary.
        feature_names: Sequence mapping a column index to its term.
        no_top_words: How many of the largest-weight terms to list per topic.

    Prints a ``Topic <idx>:`` heading followed by the space-separated terms,
    ordered from largest to smallest weight.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending, so reverse it and keep the leading entries.
        ranked = weights.argsort()[::-1]
        top_terms = [feature_names[term_idx] for term_idx in ranked[:no_top_words]]
        print("Topic %d:" % (topic_idx))
        print(" ".join(top_terms))

no_top_words = 10

# Both models print identical "Topic N:" headings, so a banner is emitted
# before each listing to make clear which model produced which topics.
print("NMF topics")
display_topics(nmf, tfidf_feature_names, no_top_words)
print("\nLDA topics")
display_topics(lda, tf_feature_names, no_top_words)

In [None]:
# Check what kind of container fetch_20newsgroups returned.
type(dataset)

In [None]:
import pandas as pd

# One row per document; the single unnamed column gets the label 0.
df = pd.DataFrame(data=dataset.data)
df.head()

In [None]:
# Text of the document in row 0 (column label 0), read with one explicit lookup.
df.loc[0, 0]

In [None]:
# Text of the document in row 4 (column label 0), read with one explicit lookup.
df.loc[4, 0]

In [None]:
# (rows, columns) of the document frame.
df.shape

<h3>Thoughts</h3>

LDA is for Topic Modeling. Topic Modeling is for larger/longer documents, and you can assign categories to documents to group them. This is not what I'm trying to do.

I need to use Named Entity Recognition (NER). This can take a single sentence and break down the entities within it. This is frequently used for chatbots.

Example: 

<i>Summer Hirst went to Ravensbourne University in 2010 and met Emily Victor there.</i>

NER can recognize Summer Hirst as a person, Ravensbourne University as a university, 2010 as a date, and Emily Victor as another person.

This needs intensive labeling to understand separate words and the categories they belong to. Apart from labeling, the model also needs to understand the context to remove ambiguity. Once the ambiguity is removed, it can be used for extracting information from unstructured text.

In [None]:
import nltk
import spacy

In [None]:
!pip3 install spacy

In [None]:
# https://nanonets.com/blog/named-entity-recognition-with-nltk-and-spacy/
# Step One: Import nltk and download necessary packages

import nltk
# The first four ids are the legacy resource names; the last three are the
# names NLTK >= 3.9 looks up for the same tokenizer, tagger and chunker.
# Requesting both keeps the cell working across NLTK versions.
# NOTE(review): assumes nltk.download() reports an unknown id and returns
# False rather than raising — confirm against the installed NLTK.
for resource in (
    'punkt',
    'averaged_perceptron_tagger',
    'maxent_ne_chunker',
    'words',
    'punkt_tab',
    'averaged_perceptron_tagger_eng',
    'maxent_ne_chunker_tab',
):
    nltk.download(resource)

# Step Two: Load Data

sentence = "WASHINGTON -- In the wake of a string of abuses by New York police officers in the 1990s, Loretta E. Lynch, the top federal prosecutor in Brooklyn, spoke forcefully about the pain of a broken trust that African-Americans felt and said the responsibility for repairing generations of miscommunication and mistrust fell to law enforcement."

# Step Three: Tokenise, find parts of speech and chunk words

for sent in nltk.sent_tokenize(sentence):
    for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
        # Only items exposing .label() are printed, as "<label> <joined words>";
        # everything else in the chunked result is skipped.
        if hasattr(chunk, 'label'):
            print(chunk.label(), ' '.join(c[0] for c in chunk))

In [None]:
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

# The State of the Union corpus is not downloaded anywhere else in this
# notebook; fetch it so the cell also runs on a fresh environment.
nltk.download('state_union')

train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

# The 2005 address is passed to the tokenizer's constructor; the resulting
# tokenizer is then used to split the 2006 address into sentences.
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)

tokenized = custom_sent_tokenizer.tokenize(sample_text)

def process_content(sentences=None, start=5):
    """Chunk named entities sentence by sentence and draw each resulting tree.

    Args:
        sentences: Sequence of sentence strings to process. When omitted, the
            module-level ``tokenized`` list is used, which is what the
            zero-argument call has always done.
        start: Index of the first sentence to process; earlier sentences are
            skipped. Defaults to 5, the value previously hard-coded.

    Returns:
        None.

    Any exception raised while processing is caught and its message printed;
    the first failure ends the loop, so later sentences are not processed.
    """
    try:
        if sentences is None:
            # Fall back to the notebook-level sentence list.
            sentences = tokenized
        for sentence_text in sentences[start:]:
            words = nltk.word_tokenize(sentence_text)
            tagged = nltk.pos_tag(words)
            # NOTE(review): binary=True presumably collapses all entity types
            # into a single NE label — confirm in the nltk.ne_chunk docs.
            named_entities = nltk.ne_chunk(tagged, binary=True)
            named_entities.draw()
    except Exception as e:
        print(str(e))


# Run the chunker over the sentences of the 2006 address held in `tokenized`;
# draw() is called once per processed sentence.
process_content()

[HubSpot blog post on building a Python AI chatbot](https://blog.hubspot.com/website/python-ai-chat-bot#:~:text=Exploring%20Natural%20Language%20Processing%20(NLP)%20in%20Python&text=The%20ultimate%20objective%20of%20NLP,user%20queries%20in%20human%20language.)

Python ChatBot libraries:
* ChatterBot
* Rasa
* DialogFlow


In [None]:
# https://www.analyticsvidhya.com/blog/2021/10/complete-guide-to-build-your-ai-chatbot-with-nlp-in-python/
import transformers

# nlp = transformers.pipeline("conversational", model="microsoft/DialoGPT-medium")

# input_text = "hello!"
# nlp(transformers.Conversation(input_text), pad_token_id=50256)

In [None]:
# https://rubikscode.net/2022/04/25/text-summarization-with-huggingface-transformers/


In [1]:
import chatterbot

ModuleNotFoundError: No module named 'chatterbot'

In [2]:
import transformers

ModuleNotFoundError: No module named 'transformers'

In [22]:
from datetime import datetime
import pytz

# Current wall-clock time in the US/Pacific zone, rendered as a plain
# "YYYY-MM-DD HH:MM:SS" string (no zone suffix).
now = (
    datetime
    .now(pytz.timezone('US/Pacific'))
    .strftime("%Y-%m-%d %H:%M:%S")
)
now

'2024-02-29 14:20:04'

In [9]:
# `current_dateTime` was never assigned in any cell of this notebook, so on a
# fresh kernel this cell raised NameError. The recorded output is a naive
# datetime, so it presumably came from datetime.now() — TODO confirm.
current_dateTime = datetime.now()
current_dateTime

datetime.datetime(2024, 2, 29, 15, 15, 45, 948278)

In [28]:
# Reminder payload: what to be reminded of and when (timestamp kept as text).
event_dict = dict(
    event="call mom",
    timestamp="2024-03-02 12:00:00 PST",
)

In [48]:
# Build the confirmation message from the event description and its timestamp text.
response = f"You'll get a reminder to {event_dict['event']} at {event_dict['timestamp']}"

In [49]:
# Show the confirmation message.
response

"You'll get a reminder to call mom at 2024-03-02 12:00:00 PST"