# Clustering of issues

In [12]:
from utils import load_openai_key

# Run the credential helper imported from the local `utils` module before any
# model client is created.
# NOTE(review): presumably this exposes the OpenAI API key to the process
# (e.g. via an environment variable) — confirm in utils.py.
load_openai_key()

In [53]:
from langchain_core.prompts import ChatPromptTemplate

# Raw template text for the clustering request. Doubled braces are literal
# braces in the rendered prompt; {input} is the only placeholder.
CLUSTERING_TEMPLATE = """
    You are given a json of format {{id: ["issue1", "issue2", ....]}}

    I want you to cluster those issues into overarching critique points.

    Json with issues:
    {input}
"""

# Prompt object used as the first stage of the chain.
prompt_msg = ChatPromptTemplate.from_template(CLUSTERING_TEMPLATE)

In [54]:
from langchain_core.pydantic_v1 import BaseModel, Field

class IssuesWithId(BaseModel):
    # One critique (identified by `id`) together with the subset of its issues
    # that was assigned to the enclosing cluster.
    # Documented with comments instead of a class docstring so the schema
    # derived from this model is left exactly as it was.
    id: int = Field(
        ...,
        description="ID of the issue",
    )
    issues: list[str] = Field(
        ...,
        description="list of issues from that critique that were associated with this cluster. Not all issues mentioned might belong to the same cluster",
    )

    def __str__(self):
        """Render as ``<id>: <list of issues>``."""
        return "{}: {}".format(self.id, self.issues)

class ClusterRequest(BaseModel):
    # A single cluster: its name, a short description, and the per-critique
    # issues that were assigned to it.
    # Documented with comments instead of a class docstring so the schema
    # derived from this model is left exactly as it was.
    name: str = Field(..., description="Name of the cluster")
    description: str = Field(..., description="Brief description of what defines the cluster")
    ids: list[IssuesWithId] = Field(..., description="List of issues with their respective IDs")

    def __str__(self):
        """Return ``<name>: <description>`` followed by one line per entry in ``ids``."""
        # Interpolating the list itself would format every element with repr(),
        # bypassing IssuesWithId.__str__; convert each entry explicitly instead.
        entries = "\n".join(str(entry) for entry in self.ids)
        return f"{self.name}: {self.description}\n{entries}"

class ClustersResponse(BaseModel):
    # Top-level structured output: the full list of clusters.
    # Documented with comments instead of a class docstring so the schema
    # derived from this model is left exactly as it was.
    clusters: list[ClusterRequest] = Field(..., description="List of clusters")

    def __str__(self):
        """Return the string form of every cluster, separated by blank lines."""
        # A list interpolated into an f-string shows repr() of its elements, so
        # each cluster is converted with str() to honour ClusterRequest.__str__.
        return "\n\n".join(str(cluster) for cluster in self.clusters)

In [67]:
from langchain_openai import ChatOpenAI
# Chat model at temperature 0, wrapped so that its replies are returned as
# ClustersResponse objects rather than free text.
llm = ChatOpenAI(
    model="gpt-4o",
    temperature=0,
).with_structured_output(ClustersResponse)

In [68]:
# Compose the pipeline: the filled-in prompt is passed to the structured-output model.
chain = prompt_msg | llm

In [69]:
import json


# Read the previously extracted issues. The context manager closes the file
# handle as soon as parsing is done instead of leaving it open in the kernel.
# NOTE(review): the file is opened with the platform default encoding, as
# before — confirm how extracted_issues.json was written before pinning one.
with open("extracted_issues.json") as issues_file:
    extracted_issues = json.load(issues_file)

In [70]:
# The prompt announces JSON input, so serialize with json.dumps: str() on the
# loaded object would produce a Python repr (single-quoted), which is not JSON.
# ensure_ascii=False keeps non-ASCII characters as-is instead of \uXXXX escapes.
response = chain.invoke({"input": json.dumps(extracted_issues, ensure_ascii=False)})

In [72]:
# Display the clusters returned by the model for a quick manual inspection.
response.clusters

[ClusterRequest(name='Durability Issues', description='Issues related to the product breaking or wearing out quickly', ids=[IssuesWithId(id=0, issues=['Die geklebte Folie löst sich ab']), IssuesWithId(id=2, issues=['Es war nach wenigen Minuten kaputt']), IssuesWithId(id=4, issues=['am nächsten Tag schon gerissen']), IssuesWithId(id=8, issues=['nicht stabil', 'ziemlich schnell hinüber']), IssuesWithId(id=10, issues=['nur eine leere Packung erhalten', 'nichts drin außer das Handbuch und die Verpackung']), IssuesWithId(id=11, issues=['Die Folie auf dem Würfel ist nach zwei Monaten gerissen']), IssuesWithId(id=14, issues=['Folie löst sich nach einem Monat', 'Folie komplett abgerissen nach mehreren Wochen', 'Würfel unbrauchbar']), IssuesWithId(id=16, issues=['Folie geht auseinander', 'Teile halten nur mit Folie zusammen']), IssuesWithId(id=21, issues=['Verbindung brach schnell nach', 'nur noch äußerst vorsichtig genutzt werden kann']), IssuesWithId(id=23, issues=['Würfel verreist noch nicht

In [74]:
!pip install pandas

Collecting pandas
  Downloading pandas-2.2.2-cp311-cp311-win_amd64.whl.metadata (19 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2024.1-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2024.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.2.2-cp311-cp311-win_amd64.whl (11.6 MB)
   ---------------------------------------- 0.0/11.6 MB ? eta -:--:--
   ---------------------------------------- 0.1/11.6 MB 4.2 MB/s eta 0:00:03
   - -------------------------------------- 0.5/11.6 MB 4.8 MB/s eta 0:00:03
   -- ------------------------------------- 0.8/11.6 MB 5.7 MB/s eta 0:00:02
   --- ------------------------------------ 1.1/11.6 MB 5.9 MB/s eta 0:00:02
   ---- ----------------------------------- 1.3/11.6 MB 5.7 MB/s eta 0:00:02
   ---- ----------------------------------- 1.3/11.6 MB 5.7 MB/s eta 0:00:02
   ------ --------------------------------- 1.9/11.6 MB 5.7 MB/s eta 0:00:02
   -------- ---------------------

In [78]:
import pandas as pd

# Clusters returned by the model.
clusters = response.clusters

# Flatten cluster -> critique -> issue into one [cluster name, review id, issue]
# row per individual issue.
data = [
    [cluster.name, issue_with_id.id, issue]
    for cluster in clusters
    for issue_with_id in cluster.ids
    for issue in issue_with_id.issues
]

# Tabular view of the flattened rows.
df = pd.DataFrame(data, columns=['Cluster Name', 'Review ID', 'Issue'])

In [79]:
# Preview the first rows of the flattened table.
df.head()

Unnamed: 0,Cluster Name,Review ID,Issue
0,Durability Issues,0,Die geklebte Folie löst sich ab
1,Durability Issues,2,Es war nach wenigen Minuten kaputt
2,Durability Issues,4,am nächsten Tag schon gerissen
3,Durability Issues,8,nicht stabil
4,Durability Issues,8,ziemlich schnell hinüber


In [80]:
# Write the flattened table to clusters.csv; index=False leaves the row index out of the file.
df.to_csv("clusters.csv", index=False)

The following prompt worked in the ChatGPT web interface:

You are given a json of format {id: ["issue1", "issue2", ....]}

I want you to cluster those issues into overarching critique points. Respond with:

[{"name": <name of cluster>, "description": "<brief description>", "ids": [{id: <critique id>, issues: [<list of issues from that critique that were associated with this cluster. Not all issues mentioned might belong to the same cluster>]}]}]