# LNG320 Gen Z Slang Similarity Analysis


In [20]:
%pip install -qU datasets pinecone-client umap-learn hdbscan scikit-learn plotly tqdm python-dotenv kaleido

/Users/beam/Workspace/Project/lng320-genz-slag-embedded/.venv/bin/python: No module named pip
Note: you may need to restart the kernel to use updated packages.


In [None]:
import os
import uuid
from getpass import getpass

import numpy as np
from datasets import load_dataset
from dotenv import load_dotenv
from IPython.display import display
from pinecone import Pinecone
from sklearn.decomposition import PCA
from tqdm import tqdm

import hdbscan
import plotly.express as px

try:
    import umap  # type: ignore
except ImportError:
    import umap.umap_ as umap

load_dotenv()


True

In [2]:
os.environ["PINECONE_API_KEY"] = os.getenv("PINECONE_API_KEY") or getpass(
    "Enter your Pinecone API key: "
)

pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])

## Load and Prepare Dataset


In [3]:
ds = load_dataset("MLBtrio/genz-slang-dataset", split="train")
df = ds.to_pandas()

df.columns = [c.lower().strip() for c in df.columns]
slang_col = "slang"

df["input_for_embedding"] = (
    df[slang_col].astype(str) + " is a slang term that means " + df["description"]
)

display(df.head())
print(f"Dataset shape: {df.shape}")


Unnamed: 0,slang,description,example,context,input_for_embedding
0,W,Shorthand for win,"Got the job today, big W!",Typically used in conversations to celebrate s...,W is a slang term that means Shorthand for win
1,L,Shorthand for loss/losing,"I forgot my wallet at home, that’s an L.",Often used when referring to a failure or mish...,L is a slang term that means Shorthand for los...
2,L+ratio,Response to a comment or action on the interne...,Your tweet got 5 likes and 100 replies calling...,Popularized on social media platforms to signi...,L+ratio is a slang term that means Response to...
3,Dank,excellent or of very high quality,That meme is so dank!,Commonly used in internet slang to refer to me...,Dank is a slang term that means excellent or o...
4,Cheugy,Derogatory term for Millennials. Used when mil...,"That phrase is so cheugy, no one says that any...",Used to refer to things that were once popular...,Cheugy is a slang term that means Derogatory t...


Dataset shape: (1779, 5)


In [4]:
df["id"] = [str(uuid.uuid5(uuid.NAMESPACE_DNS, str(s))) for s in df["slang"]]
print("Sample IDs:")
display(df[["slang", "id"]].head())


Sample IDs:


Unnamed: 0,slang,id
0,W,de7bed5e-22b7-516e-abdd-b6702d14f2c2
1,L,d0f78763-dafa-50e6-98cf-80e37b9ff0ac
2,L+ratio,acc7c7d5-9aaa-5d7d-9176-c8963f4ed424
3,Dank,99ca01c1-e02d-5c30-93f2-843624ab74bd
4,Cheugy,ea5729af-abec-59be-9fa5-2fb1403d92a1


## Connect to Pinecone Index


In [5]:
index_name = "lng320-genz-slang-qwen3-8b"
index = pc.Index(index_name)
print(f"Connected to index '{index_name}'")


Connected to index 'lng320-genz-slang-qwen3-8b'


## Retrieve Embeddings from Pinecone


In [6]:
fetched_vectors: dict[str, list[float]] = {}
batch_size = 100

for start in tqdm(range(0, len(df), batch_size), desc="Fetching embeddings"):
    batch_ids = df["id"].iloc[start : start + batch_size].tolist()
    response = index.fetch(ids=batch_ids)
    fetched_vectors.update(
        {vid: vec["values"] for vid, vec in response.vectors.items()}
    )

df["values"] = [fetched_vectors.get(row.id) for row in df.itertuples(index=False)]
missing = df[df["values"].isna()]["slang"].tolist()
if missing:
    raise RuntimeError(f"Missing vectors for {len(missing)} terms: {missing[:5]}...")

print(f"Fetched {len(fetched_vectors)} embeddings")


Fetching embeddings: 100%|██████████| 18/18 [03:07<00:00, 10.43s/it]

Fetched 1605 embeddings





In [7]:
embedding_matrix = np.vstack(df["values"].to_numpy())
print(f"Embedding matrix shape: {embedding_matrix.shape}")

Embedding matrix shape: (1779, 4096)


## PCA 3D Visualization

Visualize the high-dimensional embeddings in 3D using Principal Component Analysis.


In [None]:
pca_3d = PCA(n_components=3, random_state=42)
pca_coords = pca_3d.fit_transform(embedding_matrix)

pca_df = df[["slang", "description"]].copy()
pca_df["PC1"] = pca_coords[:, 0]
pca_df["PC2"] = pca_coords[:, 1]
pca_df["PC3"] = pca_coords[:, 2]

explained_var = pca_3d.explained_variance_ratio_
print(
    f"Explained variance ratio: PC1={explained_var[0]:.3f}, PC2={explained_var[1]:.3f}, PC3={explained_var[2]:.3f}"
)
print(f"Total explained variance: {sum(explained_var):.3f}")


Explained variance ratio: PC1=0.025, PC2=0.024, PC3=0.022
Total explained variance: 0.072


In [27]:
fig_pca = px.scatter_3d(
    pca_df,
    x="PC1",
    y="PC2",
    z="PC3",
    hover_data=["slang", "description"],
    title="PCA 3D Projection of Gen Z Slang Embeddings",
    labels={
        "PC1": f"PC1 ({explained_var[0]:.1%})",
        "PC2": f"PC2 ({explained_var[1]:.1%})",
        "PC3": f"PC3 ({explained_var[2]:.1%})",
    },
)

fig_pca.update_traces(marker=dict(size=4, opacity=0.7))
fig_pca.update_layout(
    width=1000,
    height=800,
    font=dict(family="Times New Roman", size=12),
    title=dict(font=dict(size=16)),
    scene=dict(
        xaxis=dict(title=dict(font=dict(size=12))),
        yaxis=dict(title=dict(font=dict(size=12))),
        zaxis=dict(title=dict(font=dict(size=12))),
    ),
)

fig_pca.show()


## Dimensionality Reduction & Clustering

Using optimized parameters based on BERTopic best practices for text embeddings:

- **UMAP**: n_neighbors=15 (local structure), min_dist=0.0 (tight clusters), n_components=5 for clustering
- **HDBSCAN**: min_cluster_size=15, min_samples=10 for balanced cluster detection


In [8]:
umap_cluster = umap.UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric="cosine",
    random_state=42,
)
umap_embeddings = umap_cluster.fit_transform(embedding_matrix)
print(f"UMAP clustering embeddings shape: {umap_embeddings.shape}")

  warn(


UMAP clustering embeddings shape: (1779, 5)


### HDBSCAN Clustering


In [9]:
clusterer = hdbscan.HDBSCAN(
    min_cluster_size=15,
    min_samples=10,
    cluster_selection_method="eom",
    metric="euclidean",
    cluster_selection_epsilon=0.0,
    prediction_data=True,
)
cluster_labels = clusterer.fit_predict(umap_embeddings)

n_clusters = len(set(cluster_labels)) - (1 if -1 in cluster_labels else 0)
n_noise = (cluster_labels == -1).sum()

print(f"Number of clusters: {n_clusters}")
print(f"Noise points: {n_noise} ({n_noise / len(cluster_labels) * 100:.1f}%)")

Number of clusters: 30
Noise points: 368 (20.7%)




### 2D Projection for Visualization


In [10]:
umap_viz = umap.UMAP(
    n_neighbors=15,
    n_components=2,
    min_dist=0.1,
    metric="cosine",
    random_state=42,
)
umap_2d = umap_viz.fit_transform(embedding_matrix)

viz_df = df[["id", "slang", "description"]].copy()
viz_df[["umap_x", "umap_y"]] = umap_2d
viz_df["cluster"] = cluster_labels
viz_df["cluster_prob"] = clusterer.probabilities_

print(f"Visualization embeddings shape: {umap_2d.shape}")
viz_df.head()


  warn(


Visualization embeddings shape: (1779, 2)


Unnamed: 0,id,slang,description,umap_x,umap_y,cluster,cluster_prob
0,de7bed5e-22b7-516e-abdd-b6702d14f2c2,W,Shorthand for win,-1.586983,6.195272,29,1.0
1,d0f78763-dafa-50e6-98cf-80e37b9ff0ac,L,Shorthand for loss/losing,5.152213,6.530412,7,0.534892
2,acc7c7d5-9aaa-5d7d-9176-c8963f4ed424,L+ratio,Response to a comment or action on the interne...,5.337737,6.624959,7,0.494982
3,99ca01c1-e02d-5c30-93f2-843624ab74bd,Dank,excellent or of very high quality,2.778757,6.843239,26,0.970819
4,ea5729af-abec-59be-9fa5-2fb1403d92a1,Cheugy,Derogatory term for Millennials. Used when mil...,2.72159,7.790856,26,1.0


In [23]:
fig_umap = px.scatter(
    viz_df,
    x="umap_x",
    y="umap_y",
    hover_data=["slang", "description"],
    title="UMAP 2D Projection of Gen Z Slang Embeddings",
    labels={"umap_x": "UMAP 1", "umap_y": "UMAP 2"},
)

fig_umap.update_traces(marker=dict(size=6, opacity=0.6, color="#636EFA"))
fig_umap.update_layout(
    width=1000,
    height=700,
    font=dict(family="Times New Roman", size=12),
    title=dict(font=dict(size=16)),
    xaxis=dict(title=dict(font=dict(size=14))),
    yaxis=dict(title=dict(font=dict(size=14))),
)

fig_umap.show()


### Cluster Analysis


In [11]:
cluster_counts = viz_df["cluster"].value_counts().sort_index()
print("Cluster distribution:")
display(cluster_counts)

print("\nExemplar slang terms per cluster:")
for cluster_id in sorted(viz_df["cluster"].unique()):
    cluster_terms = viz_df[viz_df["cluster"] == cluster_id]
    top_terms = cluster_terms.nlargest(5, "cluster_prob")[
        ["slang", "description", "cluster_prob"]
    ]
    label = "Noise" if cluster_id == -1 else f"Cluster {cluster_id}"
    print(f"\n{label} ({len(cluster_terms)} terms):")
    display(top_terms)


Cluster distribution:


cluster
-1     368
 0      24
 1      18
 2      36
 3      16
 4      81
 5      79
 6      32
 7      91
 8      22
 9      25
 10     78
 11     21
 12     19
 13     41
 14     37
 15     24
 16     44
 17    107
 18     29
 19     29
 20     69
 21     78
 22     36
 23     83
 24     26
 25     90
 26     83
 27     29
 28     16
 29     48
Name: count, dtype: int64


Exemplar slang terms per cluster:

Noise (368 terms):


Unnamed: 0,slang,description,cluster_prob
6,Woke,being politically aware,0.0
12,Stan,Supporting something. Specifically used in som...,0.0
17,Iykyk,"Acronym for ""If you know, you know."" Used to d...",0.0
18,Rent free,To be constantly thinking or upset about somet...,0.0
40,I’m weak,"Similar to ""I'm dead"", this is just another te...",0.0



Cluster 0 (24 terms):


Unnamed: 0,slang,description,cluster_prob
139,ILY,I love you,1.0
175,ILU,ILU: I Love You,1.0
219,ILY,I love you,1.0
247,1432,I love you too,1.0
923,ICFILWU,I could fall in love with you,1.0



Cluster 1 (18 terms):


Unnamed: 0,slang,description,cluster_prob
68,On god,"Short for ""I swear to god."" Means that a perso...",1.0
1218,OGIM,"Oh God, it’s Monday",1.0
1220,OHHEMMGEE,Oh My God,1.0
1227,OM,Old man,1.0
1228,OM,"Oh, my",1.0



Cluster 2 (36 terms):


Unnamed: 0,slang,description,cluster_prob
61,I oop,"Used to express shock, embarrassment, and or a...",1.0
287,86,Over,1.0
1208,O,Opponent,1.0
1209,O,hugs,1.0
1210,O,Over,1.0



Cluster 3 (16 terms):


Unnamed: 0,slang,description,cluster_prob
145,AFK,Away From Keyboard,1.0
147,ATK,At The Keyboard,1.0
150,BAK,Back At Keyboard,1.0
307,AATK,Always at the keyboard,1.0
339,AFC,Away from computer,1.0



Cluster 4 (81 terms):


Unnamed: 0,slang,description,cluster_prob
146,ASAP,As Soon As Possible,1.0
149,A3,"Anytime, Anywhere, Anyplace",1.0
293,A3,"Anytime, anywhere, anyplace",1.0
294,AA,Alcoholics Anonymous,1.0
296,AA,Ask about,1.0



Cluster 5 (79 terms):


Unnamed: 0,slang,description,cluster_prob
124,G2G,Got to go,1.0
261,2G2BT,Too good to be true,1.0
793,G2G,Got to go,1.0
795,G2R,Got to run,1.0
796,G2TU,Got to tell you,1.0



Cluster 6 (32 terms):


Unnamed: 0,slang,description,cluster_prob
19,Catch these hands,Used to threaten physical combat with someone,1.0
564,COH,City of Heroes,1.0
879,HAND,Have a nice day,1.0
904,HVH,Heroic Violet Hold,1.0
1606,V,Very,1.0



Cluster 7 (91 terms):


Unnamed: 0,slang,description,cluster_prob
183,LOL,Laughing Out Loud,1.0
215,LOL,Laughing out loud,1.0
361,ALOL,Actually laughing out loud,1.0
1063,LHO,Laughing head off,1.0
1081,LOA,List of acronyms,1.0



Cluster 8 (22 terms):


Unnamed: 0,slang,description,cluster_prob
240,@TEOTD,At the end of the day,1.0
389,ATEOTD,At the end of the day,1.0
686,ED,Erase display,1.0
699,EOD,End of day,1.0
700,EOD,End of discussion,1.0



Cluster 9 (25 terms):


Unnamed: 0,slang,description,cluster_prob
25,e-boy,"similar to emo or goth culture, but they use t...",1.0
26,e-girl,"similar to emo or goth culture, but they use t...",1.0
100,Sleeping on,"When you ""sleep on"" something, you overlook or...",1.0
143,Zaddy,"Refers to an attractive, well-dressed man. Oft...",1.0
144,Zombie-ing,"This happens after someone ghosts you, and the...",1.0



Cluster 10 (78 terms):


Unnamed: 0,slang,description,cluster_prob
98,Pick me girl,Pick me girls are the type of girls who claim ...,1.0
192,PRW,Parents Are Watching,1.0
290,9,Parent is watching,1.0
545,CD9,parents are around,1.0
723,F2P,Free to play,1.0



Cluster 11 (21 terms):


Unnamed: 0,slang,description,cluster_prob
163,FAQ,Frequently Asked Questions,1.0
193,QPSA?,Que Pasa?,1.0
706,EQ,EverQuest,1.0
730,FAQ,Frequently asked questions,1.0
1333,Q,Queue,1.0



Cluster 12 (19 terms):


Unnamed: 0,slang,description,cluster_prob
95,Mom,It just means to admire or look up to someone ...,1.0
1106,M$,Microsoft,1.0
1108,MB,Mamma’s boy,1.0
1109,MBS,Mom behind shoulder,1.0
1110,MC,Merry Christmas,1.0



Cluster 13 (41 terms):


Unnamed: 0,slang,description,cluster_prob
173,IC,I See,1.0
189,OIC,Oh I See,1.0
908,IA8,I already ate,1.0
909,IAAA,I am an accountant,1.0
910,IAAD,I am a doctor,1.0



Cluster 14 (37 terms):


Unnamed: 0,slang,description,cluster_prob
93,Karen,A stereotype for women who behave extremely un...,1.0
180,KISS,"Keep It Simple, Stupid",1.0
210,7K,Sick:-D Laugher,1.0
284,7K,Sick,1.0
1001,K,Okay,1.0



Cluster 15 (24 terms):


Unnamed: 0,slang,description,cluster_prob
218,IDC,I don't care,1.0
222,IDC,I don't care,1.0
233,?,I don’t understand what you mean,1.0
551,CID,Crying in disgrace,1.0
552,CID,Consider it done,1.0



Cluster 16 (44 terms):


Unnamed: 0,slang,description,cluster_prob
618,DEGT,Dear daughter,1.0
620,DF,Don’t even go there,1.0
622,DGA,Don’t go anywhere,1.0
624,DGT,Don’t go there,1.0
625,DGTG,"Don’t go there, girlfriend",1.0



Cluster 17 (107 terms):


Unnamed: 0,slang,description,cluster_prob
104,TBH,short for “to be honest” and is usually used w...,1.0
200,THX,Thank You,1.0
201,TTFN,Ta-Ta For Now!,1.0
202,TTYL,Talk To You Later,1.0
225,TIME,Tears in my eyes,1.0



Cluster 18 (29 terms):


Unnamed: 0,slang,description,cluster_prob
160,CU,See You,1.0
161,CUL8R,See You Later,1.0
162,CYA,See You,1.0
586,CU,See you too,1.0
587,CU,See you,1.0



Cluster 19 (29 terms):


Unnamed: 0,slang,description,cluster_prob
187,M8,Mate,1.0
288,88,Bye-bye,1.0
289,88,Hugs and kisses,1.0
669,DV8,Deviate,1.0
875,H&K,Hugs & kisses,1.0



Cluster 20 (69 terms):


Unnamed: 0,slang,description,cluster_prob
97,NGL,Not Gonna Lie,1.0
1153,N1,Nice one,1.0
1154,N2M,Nothing too much,1.0
1158,NBD,No big deal,1.0
1160,NC,Nice crib,1.0



Cluster 21 (78 terms):


Unnamed: 0,slang,description,cluster_prob
134,Benching,"Just like in sports, benching someone in datin...",1.0
259,2B,To be,1.0
409,B,Back,1.0
410,B,Be,1.0
411,B&,Banned,1.0



Cluster 22 (36 terms):


Unnamed: 0,slang,description,cluster_prob
151,BBL,Be Back Later,1.0
152,BBS,Be Back Soon,1.0
156,BRT,Be Right There,1.0
157,BTW,By The Way,1.0
439,BBIAB,Be back in a bit,1.0



Cluster 23 (83 terms):


Unnamed: 0,slang,description,cluster_prob
1580,UGTBK,You’ve got to be kidding,1.0
1581,UHGTBSM,You have got to be shitting me!,1.0
1722,Y?,Why?,1.0
1723,Y,Meaning Yawn,1.0
1725,YA,Your,1.0



Cluster 24 (26 terms):


Unnamed: 0,slang,description,cluster_prob
130,JK,Just kidding,1.0
217,JK,Just kidding,1.0
897,HOAS,Hold on a second,1.0
980,j00r,Your,1.0
981,JAC,Just a second,1.0



Cluster 25 (90 terms):


Unnamed: 0,slang,description,cluster_prob
24,Ffs,For fuck sake,1.0
89,Fr,For real,1.0
726,FAB,Fabulous,1.0
728,FAF,Funny as freak,1.0
749,FEITCTAJ,Freak them if they can’t take a joke,1.0



Cluster 26 (83 terms):


Unnamed: 0,slang,description,cluster_prob
4,Cheugy,Derogatory term for Millennials. Used when mil...,1.0
11,Glow up,a makeover or transformation from bad to good.,1.0
13,Ghosting,common amongst the earlier talking stages of a...,1.0
16,Drip,"Another way of saying swag, drip is a term for...",1.0
20,Drag,"If you drag someone, you're criticizing or mak...",1.0



Cluster 27 (29 terms):


Unnamed: 0,slang,description,cluster_prob
203,U,You,1.0
204,U2,You Too,1.0
1572,^URS,Up yours,1.0
1574,UDI,Unidentified drinking injury,1.0
1575,UDM,U (You) da (the) man,1.0



Cluster 28 (16 terms):


Unnamed: 0,slang,description,cluster_prob
209,WUF,Where Are You From?,1.0
1467,SUP,What’s up?,1.0
1642,WAYF,Where are you from?,1.0
1646,WBU,What about you?,1.0
1678,WRU,Where are you?,1.0



Cluster 29 (48 terms):


Unnamed: 0,slang,description,cluster_prob
0,W,Shorthand for win,1.0
292,*w*,wink,1.0
1628,W/,With,1.0
1647,WC,Welcome,1.0
1648,WC,Who cares,1.0


### Cluster Visualization


In [15]:
viz_df["cluster_label"] = viz_df["cluster"].apply(
    lambda x: "Noise" if x == -1 else f"Cluster {x}"
)

cluster_order = ["Noise"] + [f"Cluster {i}" for i in range(n_clusters)]
viz_df["cluster_label"] = viz_df["cluster_label"].astype("category")
viz_df["cluster_label"] = viz_df["cluster_label"].cat.set_categories(
    [c for c in cluster_order if c in viz_df["cluster_label"].values], ordered=True
)
viz_df_sorted = viz_df.sort_values("cluster_label")

fig = px.scatter(
    viz_df_sorted,
    x="umap_x",
    y="umap_y",
    color="cluster_label",
    hover_data=["slang", "description", "cluster_prob"],
    title="UMAP + HDBSCAN Clustering of Gen Z Slang",
    labels={"umap_x": "UMAP 1", "umap_y": "UMAP 2", "cluster_label": "Cluster"},
    category_orders={
        "cluster_label": [
            c for c in cluster_order if c in viz_df["cluster_label"].values
        ]
    },
)

fig.update_traces(marker=dict(size=6, opacity=0.8))
fig.update_layout(
    width=1200,
    height=800,
    font=dict(family="Times New Roman", size=12),
    title=dict(font=dict(size=16)),
    legend=dict(
        title=dict(text="Cluster", font=dict(size=14)),
        orientation="h",
        yanchor="top",
        y=-0.15,
        xanchor="center",
        x=0.5,
        font=dict(size=10),
        itemwidth=30,
    ),
    margin=dict(b=150),
    xaxis=dict(title=dict(font=dict(size=14))),
    yaxis=dict(title=dict(font=dict(size=14))),
)

fig.show()

In [13]:
viz_df.to_csv("viz_df.csv", index=False)
print("Exported viz_df.csv")

Exported viz_df.csv
