In [12]:
# Using BertTopic for topic modeling
import pandas as pd
import os
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer

from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer


In [2]:
from sklearn.datasets import fetch_20newsgroups

In [8]:
df = fetch_20newsgroups(subset='all',  
                             shuffle=False, remove=('headers', 'footers', 'quotes'))

<h2> Apply the BERT Topic Model with default Parameters </h2>

In [16]:
topic_model = BERTopic()

In [18]:
topics, probs = topic_model.fit_transform(df['data'])

In [20]:
topic_model.get_topic(0)

[('game', 0.010474642339263639),
 ('team', 0.009115333048998012),
 ('games', 0.007258182019625589),
 ('he', 0.007153846122913083),
 ('players', 0.00638202023585328),
 ('season', 0.0062998866888781735),
 ('hockey', 0.00614911538257575),
 ('play', 0.005835085098680811),
 ('25', 0.005707349759313482),
 ('year', 0.0056932956350379425)]

In [24]:
topics_info = topic_model.get_topic_info()


In [34]:
topics_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,6841,-1_to_the_of_and,"[to, the, of, and, is, for, you, in, it, that]",[Brian Kendig first states:\n\n\nI ask:\n\n\nB...
1,0,1832,0_game_team_games_he,"[game, team, games, he, players, season, hocke...","[\n\n""Deeply rooted rivalry?"" Ahem, Jokerit ha..."
2,1,577,1_key_clipper_chip_encryption,"[key, clipper, chip, encryption, keys, escrow,...",[\nI am not an expert in the cryptography scie...
3,2,526,2_ites_cheek_yep_huh,"[ites, cheek, yep, huh, ken, forget, why, lets...","[Ken\n, \n \n ..."
4,3,473,3_israel_israeli_jews_arab,"[israel, israeli, jews, arab, jewish, arabs, p...",[From: Center for Policy Research <cpr>\nSubje...
...,...,...,...,...,...
207,206,12,206_life_you_kendigianism_your,"[life, you, kendigianism, your, my, master, do...",[= In article <1993Apr21.231552.24869@organpip...
208,207,11,207_dock_apple_macs_duo,"[dock, apple, macs, duo, cpu, powerpc, rockets...",[\n >>The info I am about to give is not a r...
209,208,11,208_professors_university_teaching_phds,"[professors, university, teaching, phds, tas, ...","[speaking of the sick bastard, i noticed he at..."
210,209,11,209_ear_hearing_wax_ears,"[ear, hearing, wax, ears, syringe, widex, vida...",[\nVida> Can one develop inner-ear problems fr...


In [42]:
topic_model.get_document_info(df['data'])['Representative_document'].unique()

array([False,  True])

In [37]:
topic_model.get_document_info(df['data']).iloc[1]['Document'], topic_model.get_document_info(df['data']).iloc[1]['Representation']

('Well, I just got my Centris 610 yesterday.  It took just over two \nweeks from placing the order.  The dealer (Rutgers computer store) \nappologized because Apple made a substitution on my order.  I ordered\nthe one without ethernet, but they substituted one _with_ ethernet.\nHe wanted to know if that would be "alright with me"!!!  They must\nbe backlogged on Centri w/out ethernet so they\'re just shipping them\nwith!  \n\n\tAnyway, I\'m very happy with the 610 with a few exceptions.  \nBeing nosy, I decided to open it up _before_ powering it on for the first\ntime.  The SCSI cable to the hard drive was only partially connected\n(must have come loose in shipping).  No big deal, but I would have been\npissed if I tried to boot it and it wouldn\'t come up!\n\tThe hard drive also has an annoying high pitched whine.  I\'ve\nheard apple will exchange it if you complain, so I might try to get\nit swapped.\n\tI am also dissappionted by the lack of soft power-on/off.  This\nwasn\'t mentioned

In [225]:
topic_model.get_document_info(df['data'])[['Document','Representation','Topic']]

Unnamed: 0,Document,Representation,Topic
0,\nmorgan and guzman will have era's 1 run high...,"[game, team, games, he, players, season, hocke...",0
1,"Well, I just got my Centris 610 yesterday. It...","[mhz, clock, speed, fpu, cpu, 040, bus, operat...",11
2,Archive-name: cryptography-faq/part10\nLast-mo...,"[key, des, ripem, cipher, ciphers, cryptograph...",88
3,To the best of my knowledge there aren't any p...,"[drive, scsi, drives, ide, disk, controller, h...",4
4,\n\nI think that domestication will change beh...,"[moral, morality, objective, immoral, hudson, ...",16
...,...,...,...
18841,\nWhy are circuit boards green? The material ...,"[boards, solder, green, mask, board, leds, blu...",176
18842,\n\nAnybody who drove into somebody like that ...,"[lane, car, behind, bike, mph, lanes, mirrors,...",71
18843,We were told that the resolution on the 5FGe c...,"[to, the, of, and, is, for, you, in, it, that]",-1
18844,CAD Setup For Sale:\n\nG486PLB Local Bus Mothe...,"[to, the, of, and, is, for, you, in, it, that]",-1


<h2>Evaluate topics generated by BERT topic with LLM Chat GPT</h2>

In [49]:
import bs4
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [51]:
LANGSMITH_TRACING=True
LANGSMITH_ENDPOINT="https://api.smith.langchain.com"
LANGSMITH_API_KEY=""#Use your API key
LANGSMITH_PROJECT="pr-unnatural-stab-76"
OPENAI_API_KEY = ""#Use your API key

In [53]:
import os
os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_ENDPOINT'] = LANGSMITH_ENDPOINT
os.environ['LANGCHAIN_API_KEY'] = LANGSMITH_API_KEY
os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY

In [55]:
from langchain.prompts import ChatPromptTemplate

<h3> Each topic has three represenative documents. To cover the issue with max tokens provided only the first represenatative document as the input or context to the LLM</h3>

In [247]:


# Final call to llm with the actual query and the new document set

template = """I have topic that contains the following documents: \n{documents}
The topic is described by the following keywords: {keywords}

Please rate how related the following key words are to each other and to the documents on a scale from 1 to 3 (1 = not very related, 2 = moderately
related, 3 = very related). Reply with only a single number, indicating the overall appropriateness of the key words, document and topic
"""
prompt = ChatPromptTemplate.from_template(template)

for index,row in topics_info.iterrows():
    documents = row['Representative_Docs'][0]
    keywords = row['Representation']
    chain = ( prompt | ChatOpenAI(temperature = 0)| StrOutputParser())
    try:
        cv[index] = (int(chain.invoke({"documents": documents, "keywords": keywords})))
    
    except Exception as e:
        print("Skipping Topic ",index,e)
        cv[index] =0
        continue
  

Skipping Topic  99 Error code: 400 - {'error': {'message': "This model's maximum context length is 16385 tokens. However, your messages resulted in 17339 tokens. Please reduce the length of the messages.", 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_length_exceeded'}}
Skipping Topic  116 Error code: 400 - {'error': {'message': "This model's maximum context length is 16385 tokens. However, your messages resulted in 43329 tokens. Please reduce the length of the messages.", 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_length_exceeded'}}


In [277]:
sum([i for i in cv.values()])/len([i for i in cv.values()])

2.6745283018867925

<h2>Using Langchain to invoke Open AI to produce themes, by passing the document along with the keywords or topic words identified by bert model</h2>

In [59]:
# Calling the above using a list of documents
from langchain.load import dumps, loads


In [61]:
docs = [topic_model.get_document_info(df['data']).iloc[i]['Document'] for i in range(5)]
keywords =  [topic_model.get_document_info(df['data']).iloc[i]['Representation'] for i in range(5)]

In [63]:
documents = list(set(docs))
keywords = list(set(docs))

In [98]:


# Final call to llm with the actual query and the new document set

template = """I have topic that contains the following documents: \n{documents}
The topic is described by the following keywords: {keywords}

Based on the above information, can you give multiple theme for the topic?
"""
prompt = ChatPromptTemplate.from_template(template)

chain = ( prompt | ChatOpenAI(temperature = 0)| StrOutputParser())
chain.invoke({"documents": documents, 
     "keywords": keywords})

'1. Domestication and behavior change in animals\n2. Cryptography and encryption methods\n3. Technical issues with SCSI transfers\n4. Baseball predictions and player performance\n5. Review and experience with the Centris 610 computer'