In [None]:
from bertopic import BERTopic
import pandas as pd
from pathlib import Path

# Root directory that every model/data path below is resolved against.
repo_path = Path('/home/krajda/anticipatio/')

# Load the serialized BERTopic model from disk.
model_file = repo_path / 'models/pca10_kmeans200_cv.pkl'
topic_model = BERTopic.load(model_file)

# Unpickle the tweets object and extract its 'txt' entries as a plain list.
tweets_file = repo_path / 'data/final.pkl'
tweets = pd.read_pickle(tweets_file)
docs = tweets['txt'].tolist()

In [None]:
# Build the topic visualization from the loaded model and render it.
fig = topic_model.visualize_topics()
fig.show()

In [None]:
# Last expression of the cell, so the notebook displays whatever
# visualize_hierarchy() returns.
topic_model.visualize_hierarchy()

In [None]:
# Per-topic summary table and representative documents, as exposed by the model.
topic_info = topic_model.get_topic_info()
representative_docs = topic_model.representative_docs_

# For each topic representation keep only the first element of every pair
# (the word) and drop the second, following the mapping's iteration order.
representative_words = [
    [word for word, _score in pairs]
    for pairs in topic_model.topic_representations_.values()
]

In [None]:
# Each topic's share of the summed 'Count' column, in percent, rounded to two decimals.
share_pct = topic_info['Count'] / topic_info['Count'].sum() * 100
topic_info['Share [%]'] = share_pct.round(2)

# Attach the keyword lists; a list is assigned positionally, row by row.
topic_info['Relevant words'] = representative_words

# LLM Labeling

In [None]:
from langchain import PromptTemplate
from langchain.prompts.chat import ChatPromptTemplate, HumanMessagePromptTemplate
from langchain.schema import SystemMessage
from langchain import LLMChain
from langchain.chat_models import ChatOpenAI

def get_gpt4_answer(question: str, model_name:str = "gpt-4-1106-preview"):
    """Send ``question`` to an OpenAI chat model and return the reply text.

    The chat prompt consists of an empty system message followed by one human
    message whose jinja2 template is just the ``question`` variable.

    Args:
        question: Full prompt text to send as the human message.
        model_name: Model name handed to ``ChatOpenAI``.

    Returns:
        The ``"text"`` entry of the chain's output.
    """
    human_message = HumanMessagePromptTemplate(
        prompt=PromptTemplate(
            template="{{ question }}",
            template_format="jinja2",
            input_variables=["question"],
        )
    )
    chat_prompt = ChatPromptTemplate(
        input_variables=["question"],
        messages=[SystemMessage(content=""), human_message],
    )

    chain = LLMChain(
        llm=ChatOpenAI(model_name=model_name, temperature=0.1),
        prompt=chat_prompt,
    )
    return chain({"question": question})["text"]


# json.loads is used below; import it here so this cell works on a fresh
# kernel instead of relying on an import that only appears in a later cell.
import json

# Maps topic index -> {'raw': <model reply>, optionally 'desc' and 'label'}.
answers = {}

# NOTE(review): only the first 100 indices are labeled; confirm this bound
# against the number of topics in the model.
for i in range(0, 100):

    prompt = f"""You are an Data Analyst within Social Media Department of analitycal company. You analyze internet discussion about coherent topic and provide meaningful descriptions and labels to the topic.

        The topic is described by the following keywords: {representative_words[i]}

        This topic has also 3 most representative documents:
        - {representative_docs[i][0]}
        - {representative_docs[i][1]}
        - {representative_docs[i][2]}

        Based on the information above, extract a short topic description and a topic label.
        Return results as a JSON array with two fields:
        {{
        'topic_description': '...',
        'topic_label': '...'
        }}
        """

    # Request failures are best-effort: report and move on, leaving no entry
    # for this index. `Exception` (not a bare except) keeps Ctrl-C working.
    try:
        raw_answer = get_gpt4_answer(prompt)
    except Exception as exc:
        print(f"{i}: request failed: {exc!r}")
        continue

    # Always keep the raw reply so it can be re-parsed later.
    answers[i] = {'raw': raw_answer}

    # The reply is not guaranteed to be bare JSON; when it is not, keep only
    # 'raw' and report which index needs a second look.
    try:
        answer = json.loads(raw_answer)
        answers[i]['desc'] = answer['topic_description']
        answers[i]['label'] = answer['topic_label']
    except (json.JSONDecodeError, KeyError, TypeError) as exc:
        print(f"{i}: could not parse answer as JSON: {exc!r}")
    


In [None]:
import json

# Persist the collected answers as JSON. The context manager closes the file
# deterministically, even if serialization raises part-way through.
with open(repo_path / 'data/labels.json', 'w') as labels_file:
    json.dump(answers, labels_file)

In [None]:
# Markers that delimit the JSON payload inside a fenced model reply.
FENCE_OPEN = '```json\n'
FENCE_CLOSE = '\n```'

# `formatted` collects one parsed dict per entry of `answers`, in iteration
# order; `errors` maps the `answers` key to the payload that failed to parse.
# NOTE(review): later cells address `formatted` by list position, which only
# matches the `answers` key when no key is missing — confirm.
formatted = []
errors = {}
for i, x in answers.items():
    raw = x['raw']

    fence_at = raw.find(FENCE_OPEN)
    if fence_at == -1:
        # No fenced block: treat the whole reply as the candidate payload
        # instead of aborting the cell with ValueError.
        payload = raw
    else:
        close_at = raw.find(FENCE_CLOSE, fence_at)
        body_start = fence_at + len(FENCE_OPEN)
        # A missing closing fence means "take everything after the opener".
        payload = raw[body_start:close_at] if close_at != -1 else raw[body_start:]

    # Same normalization as before: drop newlines and 4-space indentation.
    payload = payload.replace('\n', '').replace('    ', '')

    try:
        formatted.append(json.loads(payload))
    except ValueError:
        # json.JSONDecodeError is a ValueError; keep a placeholder so list
        # positions stay in step with `answers`, and record the bad payload.
        formatted.append({'topic_description': '', 'topic_label': ''})
        errors[i] = payload
    

In [None]:
# Display the payloads that failed to parse, keyed by their key in `answers`.
errors

In [None]:
# Hand-written replacements for two entries of `formatted`, addressed by
# list position.
manual_fixes = {
    56: {
        'topic_description': 'Discussion surrounding emerging blockchain technologies, focusing on Web3 innovations, NFTs, and potential updates or versions indicated by the numbers.',
        'topic_label': 'Blockchain Technology & Web3',
    },
    69: {
        'topic_description': 'Online social media interactions and expressions of gratitude or requests for information, characterized by the use of internet shorthand and conversational phrases.',
        'topic_label': 'Social Media Communication',
    },
}

for position, fields in manual_fixes.items():
    for field_name, field_value in fields.items():
        formatted[position][field_name] = field_value

In [None]:
# One row per entry of `formatted`; columns come from the dict keys.
df = pd.DataFrame(formatted)

# Copy the parsed fields into the topic table. Series assignment aligns on
# index labels between `df` and `topic_info`.
for target_col, source_col in (
    ('Label', 'topic_label'),
    ('Description', 'topic_description'),
):
    topic_info[target_col] = df[source_col]

In [None]:
# Write the topic table to CSV, leaving out the index column.
topic_info_csv = repo_path / 'data/topic_info.csv'
topic_info.to_csv(topic_info_csv, index=False)