# PokeMBTI-LLM


In [1]:
# Change to correct working directory in Colab-Jupyter
import os
os.chdir('/tf-ojw/pokellm')
os.getcwd()

'/tf-ojw/pokellm'

In [None]:
# https://github.com/HandsOnLLM/Hands-On-Large-Language-Models/blob/main/chapter05/Chapter%205%20-%20Text%20Clustering%20and%20Topic%20Modeling.ipynb
from sentence_transformers import SentenceTransformer

# Create an embedding for each abstract
embedding_model = SentenceTransformer('thenlper/gte-small')
embeddings = embedding_model.encode(abstracts, show_progress_bar=True)

In [None]:
# Check the dimensions of the resulting embeddings
embeddings.shape

In [None]:
from umap import UMAP

# Reduce the dimensionality of input embeddings from 384 dimenions to 5 dimenions
umap_model = UMAP(
    n_components=5, min_dist=0.0, metric='cosine', random_state=42
)
reduced_embeddings = umap_model.fit_transform(embeddings)


In [None]:
# Cluster the reduced embeddings
from hdbscan import HDBSCAN

# Fit the model and extract the clusters
hdbscan_model = HDBSCAN(
    min_cluster_size=50, metric='euclidean', cluster_selection_method='eom'
).fit(reduced_embeddings)
clusters = hdbscan_model.labels_

# How many clusters were generated?
len(set(clusters))


In [None]:
import numpy as np

# Print first three documents in cluster 0
cluster = 0
for index in np.where(clusters==cluster)[0][:3]:
    print(abstracts[index][:300] + "... \n")


In [None]:
import pandas as pd

# Reduce 384-dimensional embeddings to 2 dimensions for easier visualization
reduced_embeddings = UMAP(
    n_components=2, min_dist=0.0, metric='cosine', random_state=42
).fit_transform(embeddings)

# Create dataframe
df = pd.DataFrame(reduced_embeddings, columns=["x", "y"])
df["title"] = titles
df["cluster"] = [str(c) for c in clusters]

# Select outliers and non-outliers (clusters)
clusters_df = df.loc[df.cluster != "-1", :]
outliers_df = df.loc[df.cluster == "-1", :]

In [None]:
import matplotlib.pyplot as plt

# Plot outliers and non-outliers seperately
plt.scatter(outliers_df.x, outliers_df.y, alpha=0.05, s=2, c="grey")
plt.scatter(
    clusters_df.x, clusters_df.y, c=clusters_df.cluster.astype(int),
    alpha=0.6, s=2, cmap='tab20b'
)
plt.axis('off')
# plt.savefig("matplotlib.png", dpi=300)  # Uncomment to save the graph as a .png

In [29]:
pip list

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Package                      Version
---------------------------- --------------------
absl-py                      1.4.0
accelerate                   0.22.0
aiohappyeyeballs             2.4.4
aiohttp                      3.10.11
aiosignal                    1.3.1
anyio                        4.0.0
argon2-cffi                  23.1.0
argon2-cffi-bindings         21.2.0
arrow                        1.2.3
asttokens                    2.4.0
astunparse                   1.6.3
async-lru                    2.0.4
async-timeout                5.0.1
attrs                        23.1.0
Babel                        2.12.1
backcall                     0.2.0
beautifulsoup4               4.12.2
bleach                       6.0.0
cachetools                   5.3.1
certifi                      2019.11.28
cffi                         1.15.1
chardet                      3.0.4
charset-normalizer           3.2.0
click                        8.1.8
cmake                        3.27

In [None]:
# pip install sentence-transformers==2.4.0            # 2.4.0 to run 'intfloat/multilingual-e5-large-instruct' # 2.2.2 to avoid using 'datasets'
# pip install huggingface_hub==0.25.0                 # originally  0.30.2
# pip install datasets      # latest versions of sentence-transformers use datasets.Dataset under the hood to wrap and iterate over training data

# to use 'intfloat/multilingual-e5-large-instruct':
 "sentence_transformers": "2.4.0.dev0",
    "transformers": "4.37.0",
    "pytorch": "2.1.0+cu121"

In [None]:
# Delete this portion and keep just the custom similarity pairs
"""
from sentence_transformers import InputExample, losses

train_examples = [InputExample(texts=[u, p], label=score) for u, p, score in training_data]
train_dataset = SentencesDataset(train_examples, model.get_tokenizer())
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=16)

train_loss = losses.CosineSimilarityLoss(model)
model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=3)
"""
# Result: Model learns subjective tone-to-description alignment that cosine alone cannot achieve.


# custom similarity pairs

# Calm / Nature / Kind
InputExample(
    texts=[
        "I’m introverted and enjoy quiet walks through nature. I’m dependable and grounded.",
        "Bulbasaur is a calm and grounded Pokémon who enjoys nature and steady growth."
    ],
    label=0.95
),

# Firey / Brave / Independent
InputExample(
    texts=[
        "I love challenges, I’m intense and passionate about what I do.",
        "Charizard is confident, powerful, and breathes fire with great pride."
    ],
    label=0.95
),

# Sleepy / Easy-going
InputExample(
    texts=[
        "I like to take things easy and avoid conflict. I’d rather nap than argue.",
        "Snorlax is a sleepy giant that is chill, relaxed, and rarely gets bothered."
    ],
    label=1.0
),

# Adaptive / Curious
InputExample(
    texts=[
        "I’m flexible and like trying different things. I go with the flow.",
        "Eevee is an adaptable Pokémon with many potential forms."
    ],
    label=0.9
),

# Playful / Energetic
InputExample(
    texts=[
        "I’m energetic, social, and love being around friends.",
        "Pikachu is an energetic and friendly electric mouse known for loyalty."
    ],
    label=0.95
),

# Negative / Mismatched Pairs (Important for contrast)
InputExample(
    texts=[
        "I’m slow-paced and calm. I dislike being rushed.",
        "Charizard is aggressive and proud, breathing fire to overwhelm opponents."
    ],
    label=0.2
),
InputExample(
    texts=[
        "I’m quiet and introverted.",
        "Pikachu is loud, playful, and energetic."
    ],
    label=0.3
),
InputExample(
    texts=[
        "I enjoy peaceful meditation and spiritual thought.",
        "Machamp is a physical fighter who punches mountains for training."
    ],
    label=0.1
),




In [3]:
from sentence_transformers import SentenceTransformer, util, InputExample, losses, SentencesDataset
import pandas as pd
import torch
from torch.utils.data import DataLoader
# from torch.nn.functional import cosine_similarity
# from datasets import Dataset


# Load model
#MODEL_NAME = 'all-MiniLM-L6-v2'   # 'all-MiniLM-L6-v2' is great for general similarity (news, reviews), but not optimized for abstract personality-tone alignment
#MODEL_NAME = 'all-mpnet-base-v2'   # 'all-mpnet-base-v2' is larger, more nuanced, better with sentence-level semantics
MODEL_NAME = 'intfloat/multilingual-e5-large-instruct'

"""
Consider other models: https://huggingface.co/spaces/mteb/leaderboard
https://huggingface.co/intfloat/multilingual-e5-large-instruct - very lightweight but great performance
'intfloat/multilingual-e5-large-instruct'


"""

"""
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

"""
"""
# Prefix prompt (required for E5 instruct models)
sentence = "passage: I love nature and I'm very calm."

inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True)
with torch.no_grad():
    outputs = model(**inputs)

# Mean Pooling
embeddings = outputs.last_hidden_state.mean(dim=1)
"""
"""
# Test program for using AutoTokenizer instead of SentenceTransformer
# Function to generate embedding
def get_embedding(text, prefix="passage"):
    full_text = f"{prefix}: {text}"
    inputs = tokenizer(full_text, return_tensors="pt", padding=True, truncation=True).to(device)
    with torch.no_grad():
        output = model(**inputs)
    return output.last_hidden_state.mean(dim=1)  # shape: (1, hidden_dim)

# Generate Pokémon embeddings (passages)
print("Embedding Pokémon descriptions...")
pokemon_embeddings = []
for desc in df["biology"]:
    emb = get_embedding(desc, prefix="passage")
    pokemon_embeddings.append(emb)

# Stack into one tensor
pokemon_tensor = torch.vstack(pokemon_embeddings)  # shape: (num_pokemon, hidden_dim)

# Get user input
user_input = '
My favorite color is baby blue. My dream job is to work remotely as a machine learning engineer while working on my DIY hobby as a side hustle.
'
user_emb = get_embedding(user_input, prefix="query")

# Compute cosine similarities
similarities = cosine_similarity(user_emb, pokemon_tensor)[0]  # shape: (num_pokemon,)
df["similarity"] = similarities.cpu().numpy()

# Sort by similarity
top_matches = df.sort_values(by="similarity", ascending=False).head(6)
print(top_matches[["name", "similarity"]])
df.sort_values(by="similarity", ascending=False).head(n=10)

"""


model = SentenceTransformer(MODEL_NAME)



  from .autonotebook import tqdm as notebook_tqdm
2025-05-06 07:23:15.567180: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-06 07:23:15.734790: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
You try to use a model that was created with version 2.4.0.dev0, however, your version is 2.4.0. This might cause unexpected behavior or errors. In that case, try to update to the latest version.



  return torch._C._cuda_getDeviceCount() > 0


In [4]:
# Fine-Tuning SentenceTransformer on Custom Similarity Pairs

train_data = [
    InputExample(texts=["calm and caring", "Bulbasaur is peaceful"], label=0.9),
    InputExample(texts=["fiery and competitive", "Charizard is aggressive"], label=0.95),
    InputExample(texts=["lazy and sleepy", "Snorlax loves to sleep"], label=1.0),
    InputExample(texts=["energetic and active", "Snorlax is lazy"], label=0.1),
]

#train_data = SentencesDataset(train_data, model.get_tokenizer())

# Train
train_dataset = SentencesDataset(train_data, model)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=4)
train_loss = losses.CosineSimilarityLoss(model)

model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=3, show_progress_bar=True)
model.save("custom-pokemon-matcher")


Epoch:   0%|          | 0/3 [00:00<?, ?it/s]
Iteration:   0%|          | 0/1 [00:00<?, ?it/s][A
Iteration: 100%|██████████| 1/1 [00:04<00:00,  4.89s/it]
Epoch:  33%|███▎      | 1/3 [00:04<00:09,  4.89s/it]
Iteration:   0%|          | 0/1 [00:00<?, ?it/s][A
Iteration: 100%|██████████| 1/1 [00:03<00:00,  3.68s/it]
Epoch:  67%|██████▋   | 2/3 [00:08<00:04,  4.18s/it]
Iteration:   0%|          | 0/1 [00:00<?, ?it/s][A
Iteration: 100%|██████████| 1/1 [00:03<00:00,  3.52s/it]
Epoch: 100%|██████████| 3/3 [00:12<00:00,  4.04s/it]


In [8]:
# Save pokemon embeddings
#df = pd.read_csv("pokemon_data.csv", sep='^([^,]+),')
EMBEDDINGS_PATH = 'pokemon_embeddings.pt'
embeddings = model.encode(list(df['biology'] + df['auxiliary']), convert_to_tensor=True)   #
# embeddings = model.encode(df['biology'].tolist(), convert_to_tensor=True)   # torch.Size([151, 384]) for 151 pokemon
torch.save({'names': df['name'].tolist(), 'embeddings': embeddings}, EMBEDDINGS_PATH)

#user_text = "I'm a chill dude."
#user_vec = model.encode(user_text, convert_to_tensor=True)
#pokemon_vecs = model.encode(embeddings, convert_to_tensor=True)
#scores = util.cos_sim(user_vec, pokemon_vecs)
#scores

In [11]:
# What is your favorite color?
# What is your dream job?
# What is your favorite food?
# What hobbies do you have?
# How do you like to spend your free time?
user_text = "My favorite color is baby blue. My dream job is to work remotely as a machine learning engineer while working on my DIY hobby as a side hustle. My favorite food would probably have to be asian cuisines like Korean, Japanese, Chinese, Vietnamese and Thai food. I love meats, spicy food and clean and healthy, but stimulating and appetizing cuisines - I like to be surprised! My hobbies are tennis, reading, coding projects, learning, working out at the gym, trying new delicious food - both eating and cooking recipes - and any sort of creative endeavors like drawing, writing, crafting, etc. I like to spend my free time relaxing, playing games like Genshin Impact, playing tennis, surfing through social media and YouTube for interesting content, enjoying a good weather with good food, and such."
#user_text = "My favorite color is sage. My dream job is to be a professor. My favorite food is tteokbokki, hotpot, gopchang, and pastry. I like to bake and swim as my hobbies. In my free time, I like to hang out with my friends."
user_vec = model.encode(user_text, convert_to_tensor=True)




In [12]:
########### Trying running with & without text preprocessing #################

import re
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')

def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z0-9.,;!?()\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in text.split() if word not in stop_words]
    text = " ".join(tokens)

    return text

user_text = clean_text(user_text)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
# find top matches
def find_top_matches(user_input: str, top_k: int = 6):
    # Load model + precomputed Pokémon embeddings
    user_embedding = model.encode(user_input, convert_to_tensor=True)

    data = torch.load(EMBEDDINGS_PATH, weights_only=True)  # weights_only=False uses default pickle module implicitly, which can construct malicious pickle data that will execute arbitrary code during unpickling.
    names = data['names']
    pokemon_embeddings = data['embeddings']   # 'description'?

    # Cosine similarity
    cosine_scores = util.cos_sim(user_embedding, pokemon_embeddings)[0]

    # Get top matches
    top_results = torch.topk(cosine_scores, k=top_k)

    results = []
    for idx, score in zip(top_results.indices, top_results.values):
        results.append({
            'name': names[idx],
            'score': round(score.item(), 4)
        })

    return results

print('Top matches')
matches = find_top_matches(user_text)
for m in matches:
  print(f"{m['name']}: {m['score']*100:.2f}%")

Top matches
Nidoran♀: 78.69%
Nidoran♂: 78.49%
Slowpoke: 78.21%
Hitmonchan: 78.13%
Doduo: 77.96%
Seadra: 77.80%


In [16]:
import numpy as np
from sklearn.neighbors import NearestNeighbors
knn = NearestNeighbors(n_neighbors=6, metric='cosine')
knn.fit(embeddings)
distances, indices = knn.kneighbors(np.array([user_vec.numpy()]))# [user_vec])

print('Proximity')
for d, i in zip(distances.flatten(), indices.flatten()):
    print(f'{df.name.iloc[i]}: {d:.4f}')

Proximity
Nidoran♀: 0.2188
Slowpoke: 0.2200
Nidoran♂: 0.2211
Articuno: 0.2218
Seaking: 0.2226
Vaporeon: 0.2243


## Web crawler

In [None]:
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd

BASE_URL = "https://bulbapedia.bulbagarden.net"
START_URL = "/wiki/Bulbasaur_(Pok%C3%A9mon)"
# "User-Agent" - a string your browser (or bot) sends to a server to identify itself
# "Mozilla/5.0" - pretends to be a regular browser
HEADERS = {"User-Agent": "Mozilla/5.0"}   # alternatives: "Chrome/91.0", "Safari/537.36"

def clean_text(text):
    return ' '.join(text.strip().replace('\n', ' ').split())

def get_pokemon_data(page_url):
    url = BASE_URL + page_url
    print(f"Scraping: {url}")
    res = requests.get(url, headers=HEADERS)
    # res.text - raw HTML string returned from the webpage
    soup = BeautifulSoup(res.text, "html.parser")   # converts that HTML string into a navigable object tree

    data = {}

    # Hyperlink
    data["url"] = url

    # Name
    # .strip() - removes whitespace in front & back
    data["name"] = soup.find("h1", {"id": "firstHeading"}).text.strip()[:-10]   # Slice off ' (Pokémon)'

    # Biology
    bio_heading = soup.find("span", id="Biology")
    if bio_heading:
        # find_parent() - finds the closest ancestor element (parent tag) of the current element
        # find_next_sibling("p") - finds the next sibling tag (same level in the DOM) of type <p> after the current element
        biology = bio_heading.find_parent().find_next_sibling("p")   # find the first ancestor which name is p
        data["biology"] = clean_text(biology.text) if biology else ""

    # Type and Abilities
    infobox = soup.find("table", class_="roundy")
    if infobox:
        # title=lambda x: x and "Type" in x checks that the title attribute exists and contains "Type"
        type_row = infobox.find("a", title=lambda x: x and "Type" in x)
        if type_row:
            # find_all_next() - finds all matching elements that appear after the current tag (not just siblings—anywhere forward)
            types = type_row.find_all_next("a", title=lambda x: x and "(type)" in x)
            data["types"] = [t.text for t in types[:2]]  # max 2 types

        abilities = infobox.find_all("a", title=lambda x: x and "Ability" in x)
        data["abilities"] = list(set([a.text for a in abilities]))

        # Leveling rate
        leveling_rate = ""
        try:
            exp_td_all = soup.find_all("td", class_="roundy")
            for exp_td in exp_td_all:
                # Find the correct 'td' with experience table
                if exp_td and exp_td.find("a", title="Experience"):
                    leveling_rate = exp_td.find("table", class_="roundy").find(
                              "tbody").find("tr").find("td").text.strip()
                    """inner_table = exp_td.find("table", class_="roundy")
                    if inner_table:
                        cell = inner_table.find("tbody").find("tr").find("td")
                        if cell:
                            leveling_rate = cell.text.strip()"""
        except Exception as e:
            print(f"⚠️ Failed to extract leveling rate for {data['name']}: {e}")

        data["leveling_rate"] = leveling_rate

    # Link to next Pokémon
    next_link = None
    for a_tag in soup.find_all("a", href=True):
        # Find link with href containing "(Pok%C3%A9mon)" and child span with → arrow
        if "(Pok%C3%A9mon)" in a_tag['href']:
            span = a_tag.find("span", style="color:#000;")
            if span and "→" in span.text:
                next_link = a_tag['href']
                break

    data["next_link"] = next_link

    return data

def crawl_pokemon(limit=10):
    all_data = []
    current_url = START_URL
    visited = set()

    for _ in range(limit):
        if current_url in visited:
            break
        visited.add(current_url)

        try:
            data = get_pokemon_data(current_url)
            all_data.append(data)
            current_url = data.get("next_link")
            # https://bulbapedia.bulbagarden.net/robots.txt crawl-delay 5
            time.sleep(5)
        except Exception as e:
            print(f"❌ Error: {e}")
            break

        if not current_url:
            break

    return all_data

if __name__ == "__main__":
    pokemon_data = crawl_pokemon(limit=1)
    df = pd.DataFrame(pokemon_data)
    df.to_csv("test_pokemon_data.csv", index=False)   ############# change to pokemon_data.csv
    print("✅ Saved pokemon_data.csv")


Scraping: https://bulbapedia.bulbagarden.net/wiki/Bulbasaur_(Pok%C3%A9mon)
Grass
<class 'str'>
[<a href="/wiki/Ability" title="Ability"><span style="color:#000;">Abilities</span></a>, <a href="/wiki/Overgrow_(Ability)" title="Overgrow (Ability)"><span style="color:#000;">Overgrow</span></a>, <a href="/wiki/Cacophony_(Ability)" title="Cacophony (Ability)"><span style="color:#000;">Cacophony</span></a>, <a href="/wiki/Cacophony_(Ability)" title="Cacophony (Ability)"><span style="color:#000;">Cacophony</span></a>, <a href="/wiki/Chlorophyll_(Ability)" title="Chlorophyll (Ability)"><span style="color:#000;">Chlorophyll</span></a>, <a href="/wiki/Cacophony_(Ability)" title="Cacophony (Ability)"><span style="color:#000;">Cacophony</span></a>, <a href="/wiki/Cacophony_(Ability)" title="Cacophony (Ability)"><span style="color:#000;">Cacophony</span></a>, <a href="/wiki/Cacophony_(Ability)" title="Cacophony (Ability)"><span style="color:#000;">Cacophony</span></a>]
<class 'bs4.element.ResultSet

## Data Processing

In [6]:
import ast

# Remove (Pokemon) from name
df = pd.read_csv('pokemon_data.csv')

# Convert string containing literal to list
df['types'] = df['types'].apply(lambda x: ast.literal_eval(x))
df['abilities'] = df['abilities'].apply(lambda x: ast.literal_eval(x))

for i in range(len(df)):
    # Remove ' (Pokemon)' from 'name'
    df['name'][i] = df.name.iloc[i][:-10]
    # Remove "Abilities" from "abilities"
    if "Abilities" in df['abilities'][i]:
        df['abilities'][i].remove("Abilities")

# Synthesize auxiliary data using other columns
df['auxiliary'] = df.apply(lambda x:
                f"{x['name']} is a Pokémon of type {x['types']}, with main abilities {x['abilities']} and leveling rate of {x['leveling_rate']}.", axis=1)



df.head(n=3)


Unnamed: 0,url,name,biology,types,abilities,leveling_rate,next_link,auxiliary
0,https://bulbapedia.bulbagarden.net/wiki/Bulbas...,Bulbasaur,"Bulbasaur is a small, quadrupedal amphibian Po...","[Grass, Poison]","[Overgrow, Cacophony, Chlorophyll]",Medium Slow,/wiki/Ivysaur_(Pok%C3%A9mon),"Bulbasaur is a Pokémon of type ['Grass', 'Pois..."
1,https://bulbapedia.bulbagarden.net/wiki/Ivysau...,Ivysaur,Ivysaur is a quadrupedal amphibian Pokémon tha...,"[Grass, Poison]","[Overgrow, Cacophony, Chlorophyll]",Medium Slow,/wiki/Venusaur_(Pok%C3%A9mon),"Ivysaur is a Pokémon of type ['Grass', 'Poison..."
2,https://bulbapedia.bulbagarden.net/wiki/Venusa...,Venusaur,"Venusaur is a squat, quadrupedal amphibian Pok...","[Grass, Poison]","[Overgrow, Cacophony, Thick Fat, Chlorophyll]",Medium Slow,/wiki/Charmander_(Pok%C3%A9mon),"Venusaur is a Pokémon of type ['Grass', 'Poiso..."


In [None]:
# Webscrape pokedex's pokemon biology description (and some numerical data)
# Use
# Things to try 20250504:
  # try it with auxiliary data on embeddings & compare performance
  # try w & w/o text preprocessing & compare performance

  # cosine similarity pairs
  # new models: all-mpnet-base-v2 -> instruct

