In [1]:
import pandas as pd  # type: ignore
from openai import OpenAI
from dotenv import dotenv_values
import json
from pycaret.clustering import load_model, predict_model, setup, create_model, assign_model, save_model

In [3]:
# Read key/value pairs from the local .env file so the API key is not hardcoded in the notebook.
env = dotenv_values(".env")
# Subscript access means a missing OPENAI_API_KEY entry fails here (KeyError) rather than at request time.
openai_client = OpenAI(api_key=env["OPENAI_API_KEY"])

# Survey data file, resolved relative to the working directory; fields are separated by ';'.
DATA = 'welcome_survey_simple_v2.csv'
all_df = pd.read_csv(DATA, sep=';')

In [4]:
# Initialise the PyCaret clustering experiment on the survey data.
# NOTE(review): session_id is presumably the random seed that makes the run repeatable - confirm in PyCaret docs.
setup(all_df, session_id=901)
# Fit a k-means model with a fixed number of 8 clusters.
kmeans = create_model('kmeans', num_clusters=8)
# NOTE(review): per PyCaret docs assign_model returns the training data with cluster labels attached,
# so `model` is presumably a DataFrame rather than an estimator; no later visible cell reads it.
model = assign_model(kmeans)

Unnamed: 0,Description,Value
0,Session id,901
1,Original data shape,"(229, 5)"
2,Transformed data shape,"(229, 21)"
3,Categorical features,5
4,Rows with missing values,13.1%
5,Preprocess,True
6,Imputation type,simple
7,Numeric imputation,mean
8,Categorical imputation,mode
9,Maximum one-hot encoding,-1


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.1972,25.6451,1.7094,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
# Run the fitted k-means model over the full survey frame.
# The result is expected to carry a 'Cluster' column, which the following cell filters on.
final_df = predict_model(kmeans, data=all_df)

In [6]:
def _describe_cluster(cluster_df):
    """Return a text profile of one cluster: one line per feature column.

    Each line has the form "<column> - <value>: <count>, <value>: <count>, ..."
    and is terminated by a newline. The 'Cluster' label column itself is skipped.
    Counts come from ``Series.value_counts()`` with its default settings.
    """
    lines = []
    for column in cluster_df:
        if column == 'Cluster':
            continue
        counts = cluster_df[column].value_counts()
        counts_text = ', '.join(f"{idx}: {cnt}" for idx, cnt in counts.items())
        lines.append(f"{column} - {counts_text}\n")
    return "".join(lines)


# Map every cluster label (in order of first appearance) to its text profile.
cluster_descriptions = {
    cluster_id: _describe_cluster(final_df[final_df['Cluster'] == cluster_id])
    for cluster_id in final_df['Cluster'].unique()
}

In [9]:
# The prompt has three parts: a one-line preamble, one section per cluster
# profile, and the instructions that pin the expected JSON response shape.
prompt_sections = [
    f"\n\nKlaster {cluster_id}:\n{description}"
    for cluster_id, description in cluster_descriptions.items()
]

prompt_instructions = """
Wygeneruj najlepsze nazwy dla każdego z klasterów oraz ich opisy

Użyj formatu JSON. Przykładowo:
{
    "Cluster 0": {
        "name": "Klaster 0",
        "description": "W tym klastrze znajdują się osoby, które..."
    },
    "Cluster 1": {
        "name": "Klaster 1",
        "description": "W tym klastrze znajdują się osoby, które..."
    }
}
"""

prompt = "Użyliśmy algorytmu klastrowania." + "".join(prompt_sections) + prompt_instructions

In [10]:
# Ask the model to name and describe each cluster.
# temperature=0 keeps the output as repeatable as the API allows.
# response_format enables JSON mode so the reply is a single JSON object that
# json.loads can consume directly; JSON mode requires the word "JSON" to appear
# in the messages, which the prompt's instructions already satisfy.
response = openai_client.chat.completions.create(
    model="gpt-4o",
    temperature=0,
    response_format={"type": "json_object"},
    messages=[
        {
            "role": "user",
            "content": [{"type": "text", "text": prompt}],
        }
    ],
)

In [11]:
# Pull the JSON object out of the model reply and parse it.
# The reply may be wrapped in a Markdown code fence or surrounded by prose, so
# we slice from the first '{' to the last '}' instead of deleting every "```"
# (which would also strip backticks that legitimately occur inside a description).
raw_content = response.choices[0].message.content
if raw_content is None:
    # The API can return a message without text (e.g. a refusal); fail with a clear reason.
    raise ValueError("The model reply has no text content; cannot parse cluster names.")

json_start = raw_content.find("{")
json_end = raw_content.rfind("}")
if json_start == -1 or json_end < json_start:
    raise ValueError(f"No JSON object found in the model reply: {raw_content[:200]!r}")

result = raw_content[json_start:json_end + 1]
# json.JSONDecodeError (a ValueError subclass) still propagates if the object is malformed.
cluster_names_and_descriptions = json.loads(result)

In [12]:
# Persist the generated names/descriptions so they can be reused without
# calling the API again; json.dump streams the same default-formatted JSON
# text straight into the file handle.
with open("cluster_names_and_descriptions.json", "w") as output_file:
    json.dump(cluster_names_and_descriptions, output_file)