<a href="https://colab.research.google.com/github/natanaelwgm/2025w-PromedUI-NLPCC-Ganjil20242025/blob/main/nlpcc_2025_week4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:

# Step 1: Install the OpenAI Python library (quietly)
!pip install openai -q

In [3]:
# @title Say Hello to GPT-4o via OpenAI API
#
# After the install cell above has run, this code block will:
# 1. Import the OpenAI Python library and the Colab helpers.
# 2. Ask for your OpenAI API key (safely, using Colab secrets if available).
# 3. Send a "Hello" message to the GPT-4o model using the Responses API.
# 4. Print the model's response.


# Step 2: Import necessary libraries
import openai
import os
from google.colab import userdata # For Colab secrets
import getpass # For securely getting password if secret is not set

# Step 3: Get OpenAI API Key
# Read the key from Colab secrets first; a key must never be hard-coded in the
# notebook, because notebooks are routinely shared and committed.
# userdata.get() raises SecretNotFoundError when the secret does not exist and
# NotebookAccessError when this notebook has not been granted access to it.
try:
    api_key = userdata.get('OPENAI_API_KEY')
    if not api_key:
        # An empty secret is treated like a missing one: fall back to manual input
        raise KeyError("OPENAI_API_KEY not found or is empty in Colab secrets.")
    print("✅ OpenAI API Key loaded successfully from Colab secrets.")
except (KeyError, userdata.SecretNotFoundError, userdata.NotebookAccessError):
    print("⚠️ OpenAI API Key not found in Colab secrets.")
    print("   You can add it by clicking the '🔑' (key) icon in the left sidebar,")
    print("   then 'Add new secret' with the name 'OPENAI_API_KEY'.")
    print("\nAlternatively, please paste your OpenAI API key here (less secure):")
    # getpass hides the typed value so the key does not end up in the cell output
    api_key = getpass.getpass('Enter your OpenAI API key: ')
    if api_key:
        print("✅ OpenAI API Key received.")
    else:
        print("❌ No API key provided. Please provide an API key to proceed.")
        # Execution continues on purpose: the `if api_key:` guard in the next
        # step reports that the API call cannot proceed without a key.

# Step 4: Set the API key for the OpenAI library
# The OpenAI client picks the key up from the OPENAI_API_KEY environment variable;
# passing it directly also works: client = openai.OpenAI(api_key=api_key)
if api_key:
    os.environ['OPENAI_API_KEY'] = api_key

    # Step 5: Initialize the OpenAI client and make the API call
    try:
        client = openai.OpenAI()  # API key is read from the environment variable

        model_id = "gpt-4o"
        user_input = "Hi can you tell me about Universitas Indonesia"

        print(f"\n🚀 Sending request to OpenAI API with model: {model_id} and input: '{user_input}'...")

        # Using the "Responses" API as recommended for new projects
        # POST https://api.openai.com/v1/responses
        response = client.responses.create(
            model=model_id,
            input=user_input
        )

        print("\n✅ API call successful!")

        # Extract the text reply. Prefer the SDK convenience property `output_text`;
        # if it is missing or empty, walk every output item and every content part
        # instead of assuming the text lives in output[0].content[0] (other item
        # kinds may precede the message item).
        assistant_reply = getattr(response, 'output_text', None) or ""
        if not assistant_reply:
            text_parts = []
            for output_item in (response.output or []):
                if getattr(output_item, 'type', None) != "message":
                    continue
                for content_part in (getattr(output_item, 'content', None) or []):
                    if getattr(content_part, 'type', None) == "output_text" and hasattr(content_part, 'text'):
                        text_parts.append(content_part.text)
            assistant_reply = "".join(text_parts)

        if assistant_reply:
            print("\n💬 GPT-4o says:")
            print(assistant_reply)
        else:
            # The full object is dumped right after this message, hence "below"
            print("\n⚠️ Could not extract a text reply from the response. Full response printed below for inspection.")
            print("\n🔍 Full API Response Object:")
            print(response)

    # Specific SDK errors come first so each gets a targeted hint; the generic
    # handler at the end is the last-resort boundary for this demo cell.
    except openai.APIConnectionError as e:
        print("❌ API Connection Error: The server could not be reached.")
        print(f"   Error details: {e.__cause__}")
    except openai.RateLimitError as e:
        print("❌ Rate Limit Error: You have exceeded your API quota or rate limit.")
        print(f"   Error details: {e}")
    except openai.AuthenticationError as e:
        print("❌ Authentication Error: Your API key is incorrect or invalid.")
        print(f"   Error details: {e}")
    except openai.APIStatusError as e:
        print(f"❌ OpenAI API returned an API Error (Status {e.status_code}):")
        print(f"   Error details: {e.message}")
        # print(f"   Full response: {e.response}") # Can be very verbose
    except Exception as e:
        print(f"❌ An unexpected error occurred: {e}")
        print("   Make sure your API key is correctly set and has permissions for gpt-4o.")

else:
    print("\n❌ API call cannot proceed without an API key.")

✅ OpenAI API Key loaded successfully from Colab secrets.

🚀 Sending request to OpenAI API with model: gpt-4o and input: 'Hi can you tell me about Universitas Indonesia'...

✅ API call successful!

💬 GPT-4o says:
Universitas Indonesia (UI) is one of the most prestigious and oldest educational institutions in Indonesia. Located in Depok, just south of Jakarta, it serves as a leading center for research and higher education in the country. Here are some key points about UI:

1. **History and Establishment**: UI traces its origins to 1849, making it one of Indonesia's oldest universities. Its modern form was established in 1950.

2. **Campuses**: The university has two main campuses—one in Salemba, Central Jakarta, and the primary one in Depok. The Depok campus is known for its lush green environment and expansive facilities.

3. **Faculties and Programs**: Universitas Indonesia offers a wide range of academic programs through its faculties, including humanities, social sciences, engineeri

In [4]:
# @title Block 1: Process Excel and Generate Embeddings
#
# This block will:
# 1. Install pandas, openpyxl, openai, and scikit-learn.
# 2. Get your OpenAI API key.
# 3. Prompt you to upload an Excel file.
# 4. Read the 'text' column from the Excel file.
# 5. Generate embeddings for each text.
# 6. Store texts and their embeddings.

# Step 1: Install libraries
!pip install pandas openpyxl openai scikit-learn -q

In [12]:


# Step 2: Import necessary libraries
import openai
import os
from google.colab import userdata, files
import getpass
import pandas as pd
import time # To add slight delays and avoid hitting rate limits too quickly if processing many rows

print("Libraries installed and imported.")

# Step 3: Get OpenAI API Key
# Read the key from Colab secrets instead of hard-coding it in the notebook.
# userdata.get() raises SecretNotFoundError when the secret does not exist and
# NotebookAccessError when this notebook has not been granted access to it.
try:
    api_key = userdata.get('OPENAI_API_KEY')
    if not api_key:
        # An empty secret is treated like a missing one: fall back to manual input
        raise KeyError("OPENAI_API_KEY not found or is empty in Colab secrets.")
    print("✅ OpenAI API Key loaded successfully from Colab secrets.")
except (KeyError, userdata.SecretNotFoundError, userdata.NotebookAccessError):
    print("⚠️ OpenAI API Key not found in Colab secrets.")
    # getpass hides the typed value so the key does not end up in the cell output
    api_key = getpass.getpass('Enter your OpenAI API key: ')
    if api_key:
        print("✅ OpenAI API Key received.")
    else:
        print("❌ No API key provided. Exiting.")
        # Nothing below can work without a key, so stop the cell here
        import sys
        sys.exit()

# The OpenAI client reads the key from this environment variable
os.environ['OPENAI_API_KEY'] = api_key
client = openai.OpenAI()
print("OpenAI client initialized.")

# Step 4: Ask the user for the Excel workbook
print("\nPlease upload your Excel file ('texts_to_embed.xlsx' with a 'text' column):")
uploaded = files.upload()

if uploaded:
    # Only the first uploaded file is processed
    filename = next(iter(uploaded))
    print(f"\n✅ File '{filename}' uploaded successfully.")

    # Step 5: Load the workbook and pull out the 'text' column
    try:
        df = pd.read_excel(filename)
        if 'text' in df.columns:
            # Drop missing cells, then force everything to str so .strip() is safe
            non_null_texts = df['text'].dropna()
            texts_to_embed = non_null_texts.astype(str).tolist()
            total_texts = len(texts_to_embed)
            print(f"✅ Found {total_texts} texts in the 'text' column.")

            # Step 6: Embed every text; results are kept as
            # {'text': original_text, 'embedding': vector} for the later cells
            texts_with_embeddings = []

            embedding_model = "text-embedding-3-small"  # Efficient and good for most cases
            # embedding_model = "text-embedding-ada-002" # Older model

            print(f"\n⚙️ Generating embeddings using model: {embedding_model}")
            for position, raw_text in enumerate(texts_to_embed, start=1):
                # Whitespace-only cells carry no content worth embedding
                if raw_text.strip() == "":
                    print(f"   Skipping empty text at row {position}.")
                    continue

                preview = raw_text[:50]
                try:
                    print(f"   Processing text {position}/{total_texts}: \"{preview}...\"")
                    response = client.embeddings.create(
                        model=embedding_model,
                        input=raw_text,
                        encoding_format="float",  # Get float vectors directly
                    )
                    texts_with_embeddings.append({
                        'text': raw_text,
                        'embedding': response.data[0].embedding,
                    })
                    # Optional: Add a small delay to be kind to the API for very large files
                    # time.sleep(0.1)
                except openai.APIError as api_error:
                    # Best effort: one failed text must not abort the whole run
                    print(f"   ❌ OpenAI API Error for text {position}: {api_error}")
                    print(f"      Skipping this text: \"{preview}...\"")
                except Exception as unexpected_error:
                    print(f"   ❌ An unexpected error occurred for text {position}: {unexpected_error}")
                    print(f"      Skipping this text: \"{preview}...\"")

            embedded_count = len(texts_with_embeddings)
            if embedded_count > 0:
                print(f"\n✅ Successfully generated and stored embeddings for {embedded_count} texts.")
                print("   You can now run Block 2 to perform a similarity search.")
            else:
                print("\n⚠️ No embeddings were generated. Check your Excel file or API errors.")
        else:
            print(f"❌ Error: The Excel file '{filename}' must contain a column named 'text'.")
            print(f"   Found columns: {df.columns.tolist()}")

    except FileNotFoundError:
        print(f"❌ Error: File '{filename}' not found after upload. This shouldn't happen.")
    except Exception as e:
        print(f"❌ An error occurred while processing the Excel file: {e}")
else:
    print("❌ No file uploaded. Please run the cell again and upload a file.")

Libraries installed and imported.
✅ OpenAI API Key loaded successfully from Colab secrets.
OpenAI client initialized.

Please upload your Excel file ('texts_to_embed.xlsx' with a 'text' column):


Saving week4_reviews.xlsx to week4_reviews.xlsx

✅ File 'week4_reviews.xlsx' uploaded successfully.
✅ Found 100 texts in the 'text' column.

⚙️ Generating embeddings using model: text-embedding-3-small
   Processing text 1/100: "The packaging feels really cheap and flimsy...."
   Processing text 2/100: "Getting the product out is a nightmare, the pump i..."
   Processing text 3/100: "My product leaked everywhere because the lid doesn..."
   Processing text 4/100: "The compact mirror is tiny and basically useless...."
   Processing text 5/100: "It's impossible to get the last bit of product out..."
   Processing text 6/100: "The lid cracked after only a couple of uses...."
   Processing text 7/100: "Too much product comes out with just a slight sque..."
   Processing text 8/100: "The closure mechanism is stiff and hard to open qu..."
   Processing text 9/100: "This packaging is bulkier than it needs to be...."
   Processing text 10/100: "The design of the packaging is awkward to hold an

In [14]:
# @title Block 2: Query and Find Top 5 Similar Texts
#
# This block will:
# 1. Take a query text as input.
# 2. Generate an embedding for the query.
# 3. Calculate cosine similarity against the stored embeddings.
# 4. Display the top 5 most similar texts.
#
# Make sure you have run Block 1 successfully first!

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Block 2 depends on the embeddings stored by Block 1
if 'texts_with_embeddings' not in globals() or not texts_with_embeddings:
    print("❌ Stored embeddings not found. Please run Block 1 successfully first.")
else:
    print(f"✅ Found {len(texts_with_embeddings)} stored texts with embeddings from Block 1.")

    query_text = input("\nEnter your search query: ")

    if not query_text.strip():
        print("❌ Query cannot be empty.")
    else:
        try:
            # Step 1: Generate embedding for the query
            print(f"\n⚙️ Generating embedding for query: \"{query_text}\"")
            embedding_model = "text-embedding-3-small"  # Must match the model used in Block 1
            response = client.embeddings.create(
                model=embedding_model,
                input=query_text,
                encoding_format="float"
            )
            query_embedding = response.data[0].embedding
            print("✅ Query embedding generated.")

            # Step 2: Calculate cosine similarities in one call.
            # scikit-learn expects 2D inputs: the query becomes a (1, dim) row and
            # the stored vectors are stacked into an (n_texts, dim) matrix, so the
            # result is a (1, n_texts) array whose first row holds every score.
            query_matrix = np.array(query_embedding).reshape(1, -1)
            stored_matrix = np.array([item['embedding'] for item in texts_with_embeddings])
            scores = cosine_similarity(query_matrix, stored_matrix)[0]

            similarities = [
                {'text': item['text'], 'score': score}
                for item, score in zip(texts_with_embeddings, scores)
            ]

            # Step 3: Sort by similarity score in descending order
            similarities_sorted = sorted(similarities, key=lambda x: x['score'], reverse=True)

            # Step 4: Display top 5 results. The list cannot be empty here because
            # the guard at the top of this block rejects an empty store.
            print(f"\n🏆 Top 5 most similar texts to \"{query_text}\":")
            for i, sim_item in enumerate(similarities_sorted[:5]):
                print(f"   {i+1}. Score: {sim_item['score']:.4f} - Text: \"{sim_item['text']}\"")

        except openai.APIError as e:
            print(f"❌ OpenAI API Error during query processing: {e}")
        except Exception as e:
            print(f"❌ An unexpected error occurred during query processing: {e}")

✅ Found 100 stored texts with embeddings from Block 1.

Enter your search query: berminyak

⚙️ Generating embedding for query: "berminyak"
✅ Query embedding generated.

🏆 Top 5 most similar texts to "berminyak":
   1. Score: 0.2964 - Text: "Leaves a slick, persistent oily feeling on the skin's surface."
   2. Score: 0.2822 - Text: "Didn't expect it to be this oily; it's quite disappointing."
   3. Score: 0.2755 - Text: "It has a distinctly oily, unpleasant finish."
   4. Score: 0.2740 - Text: "Creates a shiny, oily appearance almost instantly upon application."
   5. Score: 0.2615 - Text: "Feels like I just rubbed cooking oil on my face."


In [15]:


# Step 2: Import necessary libraries
import openai
import os
from google.colab import userdata, files
import getpass
import pandas as pd
import time # To add slight delays and avoid hitting rate limits too quickly if processing many rows

print("Libraries installed and imported.")

# Step 3: Get OpenAI API Key
# Read the key from Colab secrets instead of hard-coding it in the notebook.
# userdata.get() raises SecretNotFoundError when the secret does not exist and
# NotebookAccessError when this notebook has not been granted access to it.
try:
    api_key = userdata.get('OPENAI_API_KEY')
    if not api_key:
        # An empty secret is treated like a missing one: fall back to manual input
        raise KeyError("OPENAI_API_KEY not found or is empty in Colab secrets.")
    print("✅ OpenAI API Key loaded successfully from Colab secrets.")
except (KeyError, userdata.SecretNotFoundError, userdata.NotebookAccessError):
    print("⚠️ OpenAI API Key not found in Colab secrets.")
    # getpass hides the typed value so the key does not end up in the cell output
    api_key = getpass.getpass('Enter your OpenAI API key: ')
    if api_key:
        print("✅ OpenAI API Key received.")
    else:
        print("❌ No API key provided. Exiting.")
        # Nothing below can work without a key, so stop the cell here
        import sys
        sys.exit()

# The OpenAI client reads the key from this environment variable
os.environ['OPENAI_API_KEY'] = api_key
client = openai.OpenAI()
print("OpenAI client initialized.")

# Step 4: Ask the user for the Excel workbook
print("\nPlease upload your Excel file ('texts_to_embed.xlsx' with a 'text' column):")
uploaded = files.upload()

if uploaded:
    # Only the first uploaded file is processed
    filename = next(iter(uploaded))
    print(f"\n✅ File '{filename}' uploaded successfully.")

    # Step 5: Load the workbook and pull out the 'text' column
    try:
        df = pd.read_excel(filename)
        if 'text' in df.columns:
            # Drop missing cells, then force everything to str so .strip() is safe
            non_null_texts = df['text'].dropna()
            texts_to_embed = non_null_texts.astype(str).tolist()
            total_texts = len(texts_to_embed)
            print(f"✅ Found {total_texts} texts in the 'text' column.")

            # Step 6: Embed every text; results are kept as
            # {'text': original_text, 'embedding': vector} for the later cells
            texts_with_embeddings = []

            embedding_model = "text-embedding-3-small"  # Efficient and good for most cases
            # embedding_model = "text-embedding-ada-002" # Older model

            print(f"\n⚙️ Generating embeddings using model: {embedding_model}")
            for position, raw_text in enumerate(texts_to_embed, start=1):
                # Whitespace-only cells carry no content worth embedding
                if raw_text.strip() == "":
                    print(f"   Skipping empty text at row {position}.")
                    continue

                preview = raw_text[:50]
                try:
                    print(f"   Processing text {position}/{total_texts}: \"{preview}...\"")
                    response = client.embeddings.create(
                        model=embedding_model,
                        input=raw_text,
                        encoding_format="float",  # Get float vectors directly
                    )
                    texts_with_embeddings.append({
                        'text': raw_text,
                        'embedding': response.data[0].embedding,
                    })
                    # Optional: Add a small delay to be kind to the API for very large files
                    # time.sleep(0.1)
                except openai.APIError as api_error:
                    # Best effort: one failed text must not abort the whole run
                    print(f"   ❌ OpenAI API Error for text {position}: {api_error}")
                    print(f"      Skipping this text: \"{preview}...\"")
                except Exception as unexpected_error:
                    print(f"   ❌ An unexpected error occurred for text {position}: {unexpected_error}")
                    print(f"      Skipping this text: \"{preview}...\"")

            embedded_count = len(texts_with_embeddings)
            if embedded_count > 0:
                print(f"\n✅ Successfully generated and stored embeddings for {embedded_count} texts.")
                print("   You can now run Block 2 to perform a similarity search.")
            else:
                print("\n⚠️ No embeddings were generated. Check your Excel file or API errors.")
        else:
            print(f"❌ Error: The Excel file '{filename}' must contain a column named 'text'.")
            print(f"   Found columns: {df.columns.tolist()}")

    except FileNotFoundError:
        print(f"❌ Error: File '{filename}' not found after upload. This shouldn't happen.")
    except Exception as e:
        print(f"❌ An error occurred while processing the Excel file: {e}")
else:
    print("❌ No file uploaded. Please run the cell again and upload a file.")

Libraries installed and imported.
✅ OpenAI API Key loaded successfully from Colab secrets.
OpenAI client initialized.

Please upload your Excel file ('texts_to_embed.xlsx' with a 'text' column):


Saving week4_news.xlsx to week4_news.xlsx

✅ File 'week4_news.xlsx' uploaded successfully.
✅ Found 30 texts in the 'text' column.

⚙️ Generating embeddings using model: text-embedding-3-small
   Processing text 1/30: "Lakers Dominate Pacers in Game 3 Win..."
   Processing text 2/30: "Rising Star Rookie Breaks Scoring Record in Thrill..."
   Processing text 3/30: "Veteran Guard Announces Retirement After Champions..."
   Processing text 4/30: "Team Trades Key Player Ahead of Deadline Buzz..."
   Processing text 5/30: "Coach Explains Strategic Foul Call in Final Second..."
   Processing text 6/30: "College Hoops Upset Shocks Nation..."
   Processing text 7/30: "Injuries Plague Frontcourt as Playoffs Approach..."
   Processing text 8/30: "NBA Investigating Altercation Between Players..."
   Processing text 9/30: "New Arena Unveiled with Exhibition Game..."
   Processing text 10/30: "Fantasy Basketball Draft: Who to Pick Early..."
   Processing text 11/30: "Local Man Charged in Apparent Ro

In [16]:
# --- BLOK KODE UNTUK VISUALISASI T-SNE (SETELAH BLOK EMBEDDING DARI EXCEL) ---
# Pastikan library yang dibutuhkan sudah terinstall jika belum
# !pip install scikit-learn plotly pandas numpy # numpy & pandas mungkin sudah dari blok sebelumnya

import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
import plotly.express as px

# t-SNE visualisation of the embeddings produced by the Excel embedding cell.
import inspect  # stdlib; used to detect which iteration keyword TSNE accepts

# Make sure 'texts_with_embeddings' exists from the previous cell
if 'texts_with_embeddings' not in locals() or not texts_with_embeddings:
    print("❌ Error: Variabel 'texts_with_embeddings' tidak ditemukan atau kosong.")
    print("   Pastikan blok kode sebelumnya (untuk embedding dari Excel) sudah dijalankan dengan sukses.")
else:
    print(f"✅ Ditemukan {len(texts_with_embeddings)} item dalam 'texts_with_embeddings'.")

    # Step 1: Split the list of {'text': ..., 'embedding': ...} dicts into two
    # parallel lists so row i of the embedding matrix matches label i.
    all_embeddings_list = []
    all_texts_list = []

    for item in texts_with_embeddings:
        if 'embedding' in item and 'text' in item:
            all_embeddings_list.append(item['embedding'])
            all_texts_list.append(item['text'])
        else:
            print(f"⚠️ Peringatan: Item dalam 'texts_with_embeddings' tidak memiliki 'embedding' atau 'text': {item}")

    if not all_embeddings_list or not all_texts_list:
        print("❌ Error: Tidak ada embedding atau teks yang valid yang bisa diekstrak.")
    else:
        # Convert the list of vectors into a 2D NumPy array (samples x dimensions)
        embeddings_np = np.array(all_embeddings_list)

        print(f"   Berhasil mengekstrak {embeddings_np.shape[0]} embeddings dengan dimensi {embeddings_np.shape[1]}.")
        print(f"   Berhasil mengekstrak {len(all_texts_list)} label teks.")

        # Step 2: Reduce dimensionality with t-SNE
        print("\n🔄 Melakukan reduksi dimensi dengan t-SNE (pengaturan default)...")

        n_samples = embeddings_np.shape[0]

        if n_samples <= 1:
            print(f"❌ Error: t-SNE membutuhkan setidaknya 2 sampel. Ditemukan: {n_samples}")
        else:
            # Cap perplexity at n_samples - 1 so it stays strictly below the sample
            # count. n_samples >= 2 in this branch, so the value is at least 1.0.
            perplexity_value = min(30.0, float(n_samples - 1))
            if perplexity_value < 5:
                print(f"   ⚠️ Jumlah sampel ({n_samples}) kecil. Menggunakan perplexity={perplexity_value:.1f}.")

            # scikit-learn 1.5 renamed TSNE's `n_iter` to `max_iter` and 1.7 removed
            # the old name, so use whichever keyword the installed version accepts.
            if 'max_iter' in inspect.signature(TSNE).parameters:
                iteration_kwarg = 'max_iter'
            else:
                iteration_kwarg = 'n_iter'

            tsne_model = TSNE(
                n_components=2,
                perplexity=perplexity_value,
                learning_rate='auto',
                init='pca',
                random_state=42,  # Fixed seed for reproducible layouts
                verbose=0,
                **{iteration_kwarg: 1000}
            )
            tsne_results = tsne_model.fit_transform(embeddings_np)
            print(f"✅ Reduksi dimensi t-SNE selesai. Shape hasil: {tsne_results.shape}")

            # Step 3: Build the DataFrame consumed by Plotly
            df_tsne = pd.DataFrame({
                'tsne_x': tsne_results[:, 0],
                'tsne_y': tsne_results[:, 1],
                'text_label': all_texts_list  # Original text, shown on hover
            })

            # Category colouring is intentionally disabled. Re-attaching a
            # 'category' column from the original `df` is unreliable when some
            # texts were skipped or duplicated during embedding; the safer route
            # is to store the category next to each embedding (for example
            # item['category']) and read it from there.
            color_column_name = None

            print("\n📊 Membuat visualisasi t-SNE interaktif dengan Plotly...")

            # Step 4: Interactive scatter plot with Plotly
            fig = px.scatter(
                df_tsne,
                x='tsne_x',
                y='tsne_y',
                hover_name='text_label',  # Show the text from 'text_label' on hover
                color=color_column_name,  # None means a single colour for all points
                title='t-SNE Visualization of Text Embeddings from Excel',
                labels={'tsne_x': 't-SNE Component 1', 'tsne_y': 't-SNE Component 2'}
            )

            fig.update_traces(
                marker=dict(size=8, line=dict(width=1, color='DarkSlateGrey')),
                selector=dict(mode='markers')
            )
            fig.update_layout(
                hovermode='closest',
                legend_title_text='Category' if color_column_name else '',
                width=900,
                height=700
            )

            fig.show()

            print("\n🎉 Visualisasi selesai! Arahkan kursor ke titik-titik untuk melihat label teksnya.")
            print("   Kelompok titik yang berdekatan menunjukkan teks dengan embedding yang serupa.")

# If 'texts_with_embeddings' is missing (for example the previous cell was not
# run), the if/else above has already printed an error message.

✅ Ditemukan 30 item dalam 'texts_with_embeddings'.
   Berhasil mengekstrak 30 embeddings dengan dimensi 1536.
   Berhasil mengekstrak 30 label teks.

🔄 Melakukan reduksi dimensi dengan t-SNE (pengaturan default)...




✅ Reduksi dimensi t-SNE selesai. Shape hasil: (30, 2)

📊 Membuat visualisasi t-SNE interaktif dengan Plotly...



🎉 Visualisasi selesai! Arahkan kursor ke titik-titik untuk melihat label teksnya.
   Kelompok titik yang berdekatan menunjukkan teks dengan embedding yang serupa.


In [None]:
# ... (kode sebelumnya untuk mendapatkan embeddings_np dan all_texts_list) ...

# Experiment cell: re-run t-SNE with custom parameters on `embeddings_np` and
# `all_texts_list`, both of which are created by the previous t-SNE cell.
import inspect  # stdlib; used to detect which iteration keyword TSNE accepts

n_samples = embeddings_np.shape[0]

if n_samples <= 1:
    print(f"❌ Error: t-SNE membutuhkan setidaknya 2 sampel. Ditemukan: {n_samples}")
else:
    # --- START OF THE SECTION MODIFIED FOR THE EXPERIMENT ---
    print("\n🧪 Eksperimen dengan parameter t-SNE:")

    # Option 1: lower perplexity (try values between 5 and n_samples - 1)
    custom_perplexity = 10
    if custom_perplexity >= n_samples:
        # Fall back to a value that is valid for very small datasets
        custom_perplexity = max(1.0, float(n_samples - 1) / 2.0)

    tsne_iterations = 2500  # Raised from the 1000 used in the previous cell

    print(f"   Menggunakan perplexity: {custom_perplexity}")
    print(f"   Menggunakan n_iter: {tsne_iterations}")
    # print(f"   Menggunakan early_exaggeration: 15.0")

    # scikit-learn 1.5 renamed TSNE's `n_iter` to `max_iter` and 1.7 removed the
    # old name, so use whichever keyword the installed version accepts.
    if 'max_iter' in inspect.signature(TSNE).parameters:
        iteration_kwarg = 'max_iter'
    else:
        iteration_kwarg = 'n_iter'

    tsne_model = TSNE(
        n_components=2,
        perplexity=custom_perplexity,   # Changed
        learning_rate='auto',           # 'auto' or 200.0 is usually fine
        init='pca',
        # early_exaggeration=15.0,      # Optional: try changing this as well
        random_state=42,
        verbose=0,
        **{iteration_kwarg: tsne_iterations}  # Changed (increased)
    )
    # --- END OF THE MODIFIED SECTION ---

    tsne_results = tsne_model.fit_transform(embeddings_np)
    print(f"✅ Reduksi dimensi t-SNE selesai. Shape hasil: {tsne_results.shape}")

    # The DataFrame and Plotly code below mirror the previous t-SNE cell
    df_tsne = pd.DataFrame({
        'tsne_x': tsne_results[:, 0],
        'tsne_y': tsne_results[:, 1],
        'text_label': all_texts_list
    })

    # No category colouring: every point uses the same colour
    color_column_name = None

    print("\n📊 Membuat visualisasi t-SNE interaktif dengan Plotly...")
    fig = px.scatter(
        df_tsne,
        x='tsne_x',
        y='tsne_y',
        hover_name='text_label',
        color=color_column_name,
        title='t-SNE Visualization of Text Embeddings (Custom Parameters)',
        labels={'tsne_x': 't-SNE Component 1', 'tsne_y': 't-SNE Component 2'}
    )
    fig.update_traces(
        marker=dict(size=8, line=dict(width=1, color='DarkSlateGrey')),
        selector=dict(mode='markers')
    )
    fig.update_layout(
        hovermode='closest',
        legend_title_text='Category' if color_column_name else '',
        width=900,
        height=700
    )
    fig.show()
    print("\n🎉 Visualisasi selesai!")

In [None]:
# --- BLOK 1: EMBEDDING DARI EXCEL ---

# Step 1: Install libraries (jika belum)
# !pip install openai pandas openpyxl # openpyxl dibutuhkan untuk pd.read_excel

# Step 2: Import necessary libraries
import openai
import os
from google.colab import userdata, files # Untuk Colab
import getpass # Untuk input API key jika tidak di Colab secrets
import pandas as pd
import time # Untuk delay opsional
import numpy as np # Untuk t-SNE
from sklearn.manifold import TSNE # Untuk t-SNE
import plotly.express as px # Untuk visualisasi interaktif
import sys # Untuk keluar jika ada error fatal

print("Libraries installed and imported.")

# Step 3: Get the OpenAI API key.
# The key must never be hard-coded in the notebook. It is read from Colab
# secrets first; if that is not possible the user is prompted for it with
# getpass (input is not echoed). The result is stored in `api_key`.

API_KEY_NAME_IN_SECRETS = 'Apikey'  # Change this if your secret has a different name

try:
    # userdata.get() raises SecretNotFoundError when no secret with this name
    # exists and NotebookAccessError when the notebook has not been granted
    # access to it; both cases fall through to the manual prompt below.
    api_key = userdata.get(API_KEY_NAME_IN_SECRETS)
    if not api_key:
        # The secret exists but is empty: treat it like a missing secret.
        print(f"⚠️ {API_KEY_NAME_IN_SECRETS} ditemukan di Colab secrets tapi kosong.")
        raise KeyError(API_KEY_NAME_IN_SECRETS)
    print(f"✅ OpenAI API Key loaded successfully from Colab secrets ('{API_KEY_NAME_IN_SECRETS}').")
except (KeyError, userdata.SecretNotFoundError, userdata.NotebookAccessError):
    print(f"⚠️ OpenAI API Key ('{API_KEY_NAME_IN_SECRETS}') not found in Colab secrets.")
    print("   You can add it by clicking the '🔑' (key) icon in the left sidebar,")
    print(f"   then 'Add new secret' with the name '{API_KEY_NAME_IN_SECRETS}'.")
    print("\nAlternatively, please paste your OpenAI API key here (less secure):")
    api_key = getpass.getpass('Enter your OpenAI API key: ')
    if api_key:
        print("✅ OpenAI API Key received via manual input.")
    else:
        # Nothing below can work without a key, so stop the cell here.
        print("❌ No API key provided. Exiting.")
        sys.exit("API Key is required to proceed.")


# Publish the key through the environment so that openai.OpenAI(), which is
# called without arguments below, can find it.
os.environ['OPENAI_API_KEY'] = api_key
try:
    client = openai.OpenAI()
    # Optional early validation: uncomment to make a simple test call.
    # client.models.list()
    print("✅ OpenAI client initialized successfully.")
except openai.AuthenticationError:
    print(
        "❌ OpenAI Authentication Error: Your API key is invalid or has been revoked.",
        "   Please check your API key and try again.",
        sep="\n",
    )
    sys.exit("Exiting due to Authentication Error.")
except Exception as init_error:
    # Any other failure while building the client also ends the run.
    print(f"❌ Failed to initialize OpenAI client: {init_error}")
    sys.exit("Exiting due to OpenAI client initialization failure.")


# Step 4: Ask the user to upload the Excel workbook.
print("\nPlease upload your Excel file (e.g., 'texts_to_embed.xlsx' with a 'text' column):")
# files.upload() needs an environment that provides it, such as Google Colab.
try:
    uploaded = files.upload()
except NameError:
    # `files` is not defined in this session, so the upload widget cannot be used.
    print(
        "⚠️ `files.upload()` is not available. This suggests you are not running in Google Colab.",
        "   Please provide the file path manually if running locally.",
        sep="\n",
    )
    # A local fallback is intentionally not implemented in this demo. A sketch:
    #     filename_manual = input("Enter the full path to your Excel file: ")
    #     if os.path.exists(filename_manual):
    #         filename = filename_manual
    #         uploaded = {filename: None}  # mimic the shape of `uploaded`
    #     else:
    #         print(f"❌ File not found at path: {filename_manual}")
    #         uploaded = None
    sys.exit("File upload mechanism not available or handled.")


# Steps 5-6: read the uploaded workbook, clean its 'text' column and request
# one embedding per text. The result is `texts_with_embeddings_and_category`,
# a list of dicts with keys 'text', 'embedding' and (when the workbook has a
# 'category' column) 'category'.
if not uploaded:
    print("❌ No file uploaded. Please run the cell again and upload a file.")
    sys.exit("Exiting because no file was uploaded.")

# `uploaded` is keyed by file name; only the first uploaded file is processed.
filename = next(iter(uploaded))
print(f"\n✅ File '{filename}' uploaded successfully.")

# Step 5: Read the Excel file. Only the read itself is guarded so that
# unrelated errors further down are not misreported as file problems.
df = None
try:
    df = pd.read_excel(filename)
except FileNotFoundError:
    print(f"❌ Error: File '{filename}' not found after upload (should not happen if upload was successful).")
    sys.exit("Exiting due to FileNotFoundError.")
except pd.errors.EmptyDataError:
    print(f"❌ Error: The Excel file '{filename}' is empty or has no data.")
    sys.exit("Exiting due to EmptyDataError.")
except Exception as e:
    print(f"❌ An error occurred while processing the Excel file or generating embeddings: {e}")
    sys.exit(f"Exiting due to an unexpected error: {e}")

if 'text' not in df.columns:
    print(f"❌ Error: The Excel file '{filename}' must contain a column named 'text'.")
    print(f"   Found columns: {df.columns.tolist()}")
    sys.exit("Exiting due to missing 'text' column.")

# Clean once, on an explicit copy (assigning into the result of dropna()
# directly triggers pandas' SettingWithCopyWarning): drop NaN texts, convert
# to str, strip whitespace, then drop rows whose text became empty.
# Texts and categories are both taken from this single frame, so they are
# aligned row by row by construction.
valid_texts_df = df.dropna(subset=['text']).copy()
valid_texts_df['text'] = valid_texts_df['text'].astype(str).str.strip()
valid_texts_df = valid_texts_df[valid_texts_df['text'] != '']
texts_to_embed = valid_texts_df['text'].tolist()

if not texts_to_embed:
    print("❌ No valid, non-empty texts found in the 'text' column after cleaning.")
    sys.exit("Exiting as no texts to embed.")
print(f"✅ Found {len(texts_to_embed)} non-empty texts in the 'text' column to process.")

# Optional: one category label per kept text. A missing category becomes
# "N/A" rather than the literal string 'nan'.
categories_for_texts = None
if 'category' in valid_texts_df.columns:
    categories_for_texts = [
        "N/A" if pd.isna(value) else str(value)
        for value in valid_texts_df['category']
    ]
    print(f"✅ Found {len(categories_for_texts)} corresponding categories.")

# Step 6: Generate embeddings and store them
texts_with_embeddings_and_category = []

embedding_model = "text-embedding-3-small"
print(f"\n⚙️ Generating embeddings using model: {embedding_model}")

total_texts = len(texts_to_embed)
for i, text_content in enumerate(texts_to_embed):
    # Short single-line preview of the text for the progress log.
    preview = text_content[:60].replace('\n', ' ').replace('\r', ' ')
    print(f"   Processing text {i+1}/{total_texts}: \"{preview}...\"")
    try:
        response = client.embeddings.create(
            model=embedding_model,
            input=text_content,
            encoding_format="float"
        )
        embedding = response.data[0].embedding
    except openai.APIError as e:
        # Best effort: a failed text is reported and skipped, not fatal.
        print(f"   ❌ OpenAI API Error for text {i+1} (\"{text_content[:30]}...\"): {e}")
        print("      Skipping this text.")
        continue
    except Exception as e:
        print(f"   ❌ An unexpected error occurred for text {i+1} (\"{text_content[:30]}...\"): {e}")
        print("      Skipping this text.")
        continue

    item_to_store = {'text': text_content, 'embedding': embedding}
    if categories_for_texts is not None:
        # Index i is valid: categories_for_texts has one entry per text.
        item_to_store['category'] = categories_for_texts[i]
    texts_with_embeddings_and_category.append(item_to_store)

if texts_with_embeddings_and_category:
    print(f"\n✅ Successfully generated and stored embeddings for {len(texts_with_embeddings_and_category)} texts.")
else:
    print("\n⚠️ No embeddings were generated. Check your Excel file content or API errors.")
    sys.exit("Exiting as no embeddings could be generated.")


# --- BLOK 2: VISUALISASI T-SNE DENGAN PARAMETER DISESUAIKAN ---

# Visualize the embeddings in 2-D with t-SNE. This only runs when a non-empty
# `texts_with_embeddings_and_category` list (dicts with 'text', 'embedding'
# and optionally 'category') exists in this namespace.
if 'texts_with_embeddings_and_category' in locals() and texts_with_embeddings_and_category:
    print("\n--- Starting t-SNE Visualization ---")
    print(f"✅ Using {len(texts_with_embeddings_and_category)} items with embeddings.")

    # Step 1: Extract embeddings, texts and (if present) categories.
    all_embeddings_list = []
    all_texts_list = []
    all_categories_list = []  # used for coloring
    has_categories = False

    for item in texts_with_embeddings_and_category:
        if 'embedding' not in item or 'text' not in item:
            print(f"⚠️ Skipping item due to missing 'embedding' or 'text': {str(item)[:100]}")
            continue
        all_embeddings_list.append(item['embedding'])
        all_texts_list.append(item['text'])
        # Append exactly one label per kept item so the list always stays
        # aligned with the texts, wherever the first categorized item appears.
        all_categories_list.append(item.get('category', "N/A"))
        has_categories = has_categories or 'category' in item

    if not has_categories:
        # No item carried a category: leave the list empty.
        all_categories_list = []

    if not all_embeddings_list or not all_texts_list:
        print("❌ Error: No valid embedding/text data extracted for t-SNE.")
        sys.exit("Exiting as no data for t-SNE.")

    embeddings_np = np.array(all_embeddings_list)
    print(f"   Extracted {embeddings_np.shape[0]} embeddings with dimension {embeddings_np.shape[1]}.")
    print(f"   Extracted {len(all_texts_list)} text labels.")
    if has_categories:
        print(f"   Extracted {len(all_categories_list)} category labels for coloring.")

    # Step 2: Dimensionality reduction with t-SNE (custom parameters)
    print("\n🔄 Performing dimensionality reduction with t-SNE (custom parameters)...")

    n_samples = embeddings_np.shape[0]
    if n_samples <= 1:
        print(f"❌ Error: t-SNE requires at least 2 samples. Found: {n_samples}")
        sys.exit("Exiting due to insufficient samples for t-SNE.")

    # --- Custom t-SNE parameters ---
    custom_perplexity = 10.0  # Try values between 5 and n_samples-1; lower for small datasets.
    if custom_perplexity >= n_samples:
        # Perplexity must be smaller than the number of samples.
        custom_perplexity = max(1.0, float(n_samples - 1) / 2.0)
        print(f"   Adjusted perplexity to {custom_perplexity:.1f} due to small sample size ({n_samples}).")

    custom_n_iter = 2500  # more iterations than the library default
    custom_learning_rate = 'auto'  # or try an explicit value (e.g. 100)

    print(f"   Using perplexity: {custom_perplexity:.1f}")
    print(f"   Using n_iter: {custom_n_iter}")
    print(f"   Using learning_rate: {custom_learning_rate}")

    # scikit-learn 1.5 renamed TSNE's `n_iter` to `max_iter` and deprecated
    # the old name (scheduled for removal), so use whichever keyword the
    # installed version exposes.
    iter_param_name = 'max_iter' if 'max_iter' in TSNE().get_params() else 'n_iter'

    tsne_model = TSNE(
        n_components=2,
        perplexity=custom_perplexity,
        learning_rate=custom_learning_rate,
        init='pca',       # PCA initialization instead of a random start
        random_state=42,  # fixed seed for repeatable results
        verbose=0,        # set to 1 to see progress
        **{iter_param_name: custom_n_iter},
    )
    tsne_results = tsne_model.fit_transform(embeddings_np)
    print(f"✅ t-SNE dimensionality reduction complete. Result shape: {tsne_results.shape}")

    # Step 3: Build the DataFrame for Plotly
    df_tsne = pd.DataFrame({
        'tsne_x': tsne_results[:, 0],
        'tsne_y': tsne_results[:, 1],
        'text_label': all_texts_list
    })

    color_param_for_plotly = None
    if has_categories:
        df_tsne['category'] = all_categories_list
        color_param_for_plotly = 'category'
        print("   DataFrame for Plotly includes 'category' column for coloring.")
    else:
        print("   Categories will not be used for coloring in the plot.")

    print("\n📊 Creating interactive t-SNE visualization with Plotly...")

    # Step 4: Interactive scatter plot; hovering a point shows its text.
    fig = px.scatter(
        df_tsne,
        x='tsne_x',
        y='tsne_y',
        hover_name='text_label',
        color=color_param_for_plotly,  # None when no categories are available
        title=f't-SNE Visualization of Text Embeddings (Perplexity: {custom_perplexity:.0f}, Iter: {custom_n_iter})',
        labels={'tsne_x': 't-SNE Component 1', 'tsne_y': 't-SNE Component 2'},
        width=1000,
        height=800
    )

    fig.update_traces(
        marker=dict(size=9, line=dict(width=1, color='DarkSlateGrey')),
        selector=dict(mode='markers')
    )
    fig.update_layout(
        hovermode='closest',
        legend_title_text='Category' if color_param_for_plotly else ''
    )

    fig.show()

    print("\n🎉 Visualization complete! Hover over points to see their text labels.")
    print("   Clusters of points suggest texts with similar embeddings.")
else:
    print("\nℹ️ Skipping t-SNE visualization because 'texts_with_embeddings_and_category' is not available or empty.")
    print("   Ensure the Excel processing and embedding generation (Block 1) completed successfully.")