#### Graph Generation

This is the code responsible for calling the LLaMA 3 LLM to infer, for each post, the problem, its causes, and the possible solutions (if any) to the problem.

In [27]:
# Libraries to be imported
import json
import os
import re

In [28]:
# Writing in a dedicated JSON file
def write_json(file_path, dict):
    """Serialize ``dict`` as pretty-printed JSON into ``file_path``.

    Args:
        file_path: Destination path; an existing file is overwritten.
        dict: JSON-serializable object to dump. The name shadows the builtin
            but is kept so that keyword callers keep working.

    Missing parent directories are created first, so writing the first result
    into a folder that does not exist yet no longer fails.
    """
    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    # Explicit UTF-8 keeps the file independent of the platform's default encoding
    with open(file_path, 'w', encoding='utf-8') as file:
        json.dump(dict, file, indent=4, separators=(',', ': '))

In [29]:
# Instruction sent together with every post: what to extract and the reply layout to use
prompt = (
    "I provide you a JSON file that contains a post from Reddit containing problems and solutions to them."
    " Identify for this post: causes of the problem, the problem and solutions to the problem."
    " Reply in this way: Problem: <p1,...,pn>; Causes: <c1,.., cn>; Solutions: <s1,...,sn>."
    " The reply must contain only short keywords."
)

In [30]:
import nltk
from nltk.tokenize import word_tokenize

def count_tokens(text):
    """Return how many tokens NLTK's ``word_tokenize`` produces for ``text``."""
    return len(word_tokenize(text))

In [31]:
def llm_output_parser(text):
    """Parse an LLM reply shaped like ``Problem: ...; Causes: ...; Solutions: ...``.

    Args:
        text: Raw reply returned by the model.

    Returns:
        A dict holding only the sections found in ``text``: ``'Problem'`` (str),
        ``'Causes'`` (list of str) and ``'Solutions'`` (list of str).
        A section that cannot be found is simply absent from the dict.
    """
    def split_keywords(section):
        # Drop blanks so that an empty section or a stray/trailing comma does
        # not produce '' entries in the keyword list.
        return [item.strip() for item in section.split(',') if item.strip()]

    result = {}
    # Problem and Causes stop at the first ';' ('.' does not cross newlines);
    # Solutions runs up to the end of its line.
    problem_match = re.search(r'Problem:\s*(.*?);', text)
    causes_match = re.search(r'Causes:\s*(.*?);', text)
    solutions_match = re.search(r'Solutions:\s*(.*)', text)

    if problem_match:
        result['Problem'] = problem_match.group(1).strip()

    if causes_match:
        result['Causes'] = split_keywords(causes_match.group(1))

    if solutions_match:
        # The reply may close with '.' or ';'; strip that terminator so it does
        # not stay glued to the last keyword.
        result['Solutions'] = split_keywords(solutions_match.group(1).rstrip(' \t\r;.'))

    return result

In [32]:
# Scan the whole set of documents and find the maximum number of tokens

# Open the files in the collection directory one by one and add the content of each file to the data list
def load_json_files(directory):
    """Load every ``*.json`` file in ``directory`` and concatenate their items.

    Args:
        directory: Folder that holds the collected JSON files.

    Returns:
        One flat list with the elements of every file, in file-name order.
    """
    data = []
    # os.listdir() returns names in arbitrary order; sorting makes the resulting
    # list (and any position-based processing of it) reproducible between runs.
    for filename in sorted(os.listdir(directory)):
        if filename.endswith(".json"):
            with open(os.path.join(directory, filename), 'r', encoding='utf-8') as f:
                # extend() adds the parsed content element by element;
                # presumably each file holds a top-level JSON array — verify.
                data.extend(json.load(f))
    return data

# Load every collected post into memory
dir = "./dataCollection"
posts = load_json_files(dir)

# Serialize each post compactly (no whitespace after ',' and ':') and count the
# tokens of the full request text, i.e. prompt + " " + post
token_nums = [
    count_tokens(prompt + " " + json.dumps(post, separators=(',', ':'), ensure_ascii=False))
    for post in posts
]
max_token_num = max(token_nums)
max_token_num

4773

From this analysis, I conclude that the maximum number of tokens in my dataset (counted with NLTK's word tokenizer) is 4773. So, I'll use a "Context Length" of 5000 in my local LLaMA 3 server instance.

In [33]:
from openai import OpenAI
# System message asking for a plain reply without list markup or line breaks.
# Note the leading space in the second fragment: without it the two words fuse into "likeputting".
sys_prompt = "Reply directly according to what instructed. Don't take initiatives like"
sys_prompt += " putting -, * or newline characters."
# Using LLAMA3 LLM
def prompt_issueing_llm(prompt, document_string):
    """Send ``prompt`` followed by ``document_string`` to the local LLM server.

    Args:
        prompt: Instruction text placed before the document.
        document_string: The post serialized as a string.

    Returns:
        The message content of the first completion choice.
    """
    user_content = prompt + " " + document_string
    chat_messages = [
        {"role": "system", "content": sys_prompt},
        {"role": "user", "content": user_content},
    ]
    # OpenAI-compatible endpoint served on this machine
    local_client = OpenAI(base_url="http://localhost:1234/v1", api_key="lm-studio")
    # NOTE(review): the model id starts with "mstudio-community"; possibly meant
    # "lmstudio-community" — confirm against the server's model list.
    response = local_client.chat.completions.create(
        model="mstudio-community/Meta-Llama-3-8B-Instruct-GGUF",
        messages=chat_messages,
        temperature=0.7,
    )
    return response.choices[0].message.content

In [34]:
# Iterating each document and converting into a plain string
def graph_inference(start=1, end=None):
    """Ask the LLM for problem/causes/solutions of each post and save one JSON file per post.

    Args:
        start: 1-based position of the first post to process.
        end: 1-based position of the last post to process (inclusive). ``None``
            means "up to the last post" and is resolved when the function is
            called, so it follows a ``posts`` list reloaded after this definition.

    Each result is written to ``./llm_output/graph_data_<position>.json``, where
    ``<position>`` is the 1-based position of the post in ``posts``; an
    interrupted run can therefore be resumed by passing ``start``.
    """
    if end is None:
        end = len(posts)
    # Clamp to valid slice bounds: start <= 0 begins at the first post, end <= 0 processes nothing
    first_index = max(start - 1, 0)
    last_index = max(end, 0)

    os.makedirs("./llm_output", exist_ok=True)

    for position, document in enumerate(posts[first_index:last_index], start=first_index + 1):
        # Compact serialization: no whitespace after ',' and ':'
        document_string = json.dumps(document, separators=(',', ':'), ensure_ascii=False)
        llm_response = prompt_issueing_llm(prompt, document_string)
        llm_dict = llm_output_parser(llm_response or "")
        # The parser omits the sections it cannot find; fall back to empty values
        # so that one malformed reply does not abort the whole batch with a KeyError.
        # NOTE(review): "numComents" is misspelled but kept, since files already
        # written (and whatever reads them) may rely on this key.
        dict_to_json = {
            "url": document['post_url'],
            "score": document['post_score'],
            "numComents": document['post_numComments'],
            "problem": llm_dict.get("Problem", ""),
            "causes": llm_dict.get("Causes", []),
            "solutions": llm_dict.get("Solutions", []),
        }
        write_json(f"./llm_output/graph_data_{position}.json", dict_to_json)

In [48]:
# Process the posts with the default range (one LLM request and one output file per post)
graph_inference()

In [3]:
# Code to be executed when graph_inference has completed its job
import os
import json

# Merge every per-post file found in llm_output into one JSON array
folder_path = './llm_output'
all_documents = []

# os.listdir() has no guaranteed order. Sorting by (length, name) is deterministic
# and places graph_data_2.json before graph_data_10.json, so the merged array
# follows the numbering of the files.
json_filenames = sorted(
    (name for name in os.listdir(folder_path) if name.endswith('.json')),
    key=lambda name: (len(name), name),
)

for filename in json_filenames:
    file_path = os.path.join(folder_path, filename)
    with open(file_path, 'r', encoding='utf-8') as file:
        document = json.load(file)
        all_documents.append(document)

output_file_path = 'graph_data.json'

# Write all the documents into a single JSON file, indented for readability
with open(output_file_path, 'w', encoding='utf-8') as output_file:
    json.dump(all_documents, output_file, ensure_ascii=False, indent=4)

print(f'Merging successful. Written in: {output_file_path}')


Merging successful. Written in: graph_data.json
