In [13]:
import pandas as pd
import re
import nltk
import numpy as np
import string
from collections import Counter
from sklearn.cluster import KMeans
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.collocations import BigramCollocationFinder, TrigramCollocationFinder
from nltk.metrics import BigramAssocMeasures, TrigramAssocMeasures
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from IPython.display import Markdown, display

display(Markdown("# Downloads Needed Dependencies and Data"))
# Download NLTK resources
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')



# Downloads Needed Dependencies and Data

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [14]:
display(Markdown("# Reads Lscs Member Data"))
# lscs members
lscs = pd.read_csv(
    "lscs_members.csv",
    on_bad_lines="warn"
)

lscs.head()

# Reads Lscs Member Data


  lscs = pd.read_csv(


Unnamed: 0,id,full_name,nickname,email,telegram,position_id,committee_id,college,program,discord,interests,contact_number,fb_link
0,11825782,"Bernaldez, Leo Mikhail B.",,leo_bernaldez@dlsu.edu.ph,,MEM,MEM,GCOE,BS - Chemical Engineering,,,09052898843,https://www.facebook.com/beltran.bernaldez
1,11941820,"De Jesus, Matthew Joseph M.",,matthew.dejesus@dlsu.edu.ph,,MEM,MEM,College of Computer Studies (CCS),BS - Computer Science Major in Network and Inf...,,"Gaming, Editing",0917 618 8375,https://www.facebook.com/matti42/
2,12033391,"Palencia, Ralph Dwanne V",,ralph_palencia@dlsu.edu.ph,,MEM,MEM,SOE,BS Applied Industrial Economics,,"Tech, games",09567349274,fb.com/dwanne.palencia
3,12038660,"Fong, Hannah Regine C.",,hannah_regine_fong@dlsu.edu.ph,,MEM,MEM,CCS,BSMS - Computer Science,,,09177012643,fb.com/fong.hannah
4,12073067,"de Ramos, Ghrazielle Rei A.",,ghrazielle_deramos@dlsu.edu.ph,,MEM,MEM,CCS,BS Computer Science Major in Software Technology,,"Basketball, Sketching, Coding, Volunteering",0933 862 0716,https://web.facebook.com/ghrazielle/


In [15]:
display(Markdown("# Defines House Data"))
# Define the houses
houses = {
    "Red": {"House Name": "House of Jasper", "Gem/Crystal": "Jasper", "Signature Color/Look": "earthy red/brown-red"},
    "Orange": {"House Name": "House of Carnelian", "Gem/Crystal": "Carnelian", "Signature Color/Look": "fiery orange"},
    "Yellow": {"House Name": "House of Citrine", "Gem/Crystal": "Citrine", "Signature Color/Look": "golden yellow"},
    "Green": {"House Name": "House of Aventurine", "Gem/Crystal": "Aventurine", "Signature Color/Look": "shimmering green"},
    "Light Blue": {"House Name": "House of Amazonite", "Gem/Crystal": "Amazonite", "Signature Color/Look": "turquoise blue-green"},
    "Indigo/Dark Blue": {"House Name": "House of Lapis Lazuli", "Gem/Crystal": "Lapis Lazuli", "Signature Color/Look": "deep royal blue"},
    "Purple": {"House Name": "House of Amethyst", "Gem/Crystal": "Amethyst", "Signature Color/Look": "violet to purple"},
    "White": {"House Name": "House of Moonstone", "Gem/Crystal": "Moonstone", "Signature Color/Look": "milky white with rainbow sheen"},
    "Black": {"House Name": "House of Obsidian", "Gem/Crystal": "Obsidian / Onyx", "Signature Color/Look": "jet black (volcanic glass)"},
    "Pink/Brown": {"House Name": "House of Rose Quartz", "Gem/Crystal": "Serpentine", "Signature Color/Look": "pink"}
}

# Convert to DataFrame
house_df = pd.DataFrame.from_dict(houses, orient="index").reset_index()

# Rename columns
house_df.rename(columns={"index": "Color Category"}, inplace=True)

# Add numeric ID column
house_df.insert(0, "House ID", range(len(house_df)))

# Save to CSV
house_df.to_csv("houses.csv", index=False)

house_df


# Defines House Data

Unnamed: 0,House ID,Color Category,House Name,Gem/Crystal,Signature Color/Look
0,0,Red,House of Jasper,Jasper,earthy red/brown-red
1,1,Orange,House of Carnelian,Carnelian,fiery orange
2,2,Yellow,House of Citrine,Citrine,golden yellow
3,3,Green,House of Aventurine,Aventurine,shimmering green
4,4,Light Blue,House of Amazonite,Amazonite,turquoise blue-green
5,5,Indigo/Dark Blue,House of Lapis Lazuli,Lapis Lazuli,deep royal blue
6,6,Purple,House of Amethyst,Amethyst,violet to purple
7,7,White,House of Moonstone,Moonstone,milky white with rainbow sheen
8,8,Black,House of Obsidian,Obsidian / Onyx,jet black (volcanic glass)
9,9,Pink/Brown,House of Rose Quartz,Serpentine,pink


In [16]:
display(Markdown("# Clean your 'interests' column"))

# Clean your "interests" column
lscs["cleaned_interests"] = (
    lscs["interests"]
    .astype(str)
    .replace(r"[&/;\n|*+-]", ",", regex=True)  # added + and -
    .str.replace(r",+", ",", regex=True)        # collapse multiple commas
    .str.strip(", ")                            # strip leading/trailing commas/spaces
)

initial = lscs['cleaned_interests'].sort_values()
initial.to_csv("initial_keywords.csv")
get_nan = lscs[lscs['cleaned_interests'].str.contains('nan',case=False)]
initial

# Clean your 'interests' column

Unnamed: 0,cleaned_interests
746,1. Coding (I worked on a platform game with tw...
789,3D Modeling and Video Editing
5,?
708,"ACADEMIC,Math, Science, History, Philosophy,MU..."
777,"Anime, Deepwoken, Overwatch, Pokémon, Genshin ..."
...,...
37,what if ayaw ko
56,writing
802,"writing, drawing, tech, games, music"
456,"writing, graphic design, walking"


In [17]:
display(Markdown("# Extracts Keywords from Interests"))
# Build phrase map automatically from dataframe
stop_words = set(stopwords.words('english'))

# Collect all words from the dataframe
all_words = []
for text in lscs['cleaned_interests']:
    tokens = [w.lower() for w in word_tokenize(str(text)) if w not in string.punctuation and w not in stop_words]
    all_words.extend(tokens)

# Detect top bigrams
bigram_finder = BigramCollocationFinder.from_words(all_words)
top_bigrams = bigram_finder.nbest(BigramAssocMeasures.likelihood_ratio, 20)  # top 20 bigrams

# Detect top trigrams
trigram_finder = TrigramCollocationFinder.from_words(all_words)
top_trigrams = trigram_finder.nbest(TrigramAssocMeasures.likelihood_ratio, 10)  # top 10 trigrams

# Create phrase map: join words with _ so they become single tokens
phrase_map = {" ".join(b): "_".join(b) for b in top_bigrams + top_trigrams}

# Define keyword extraction function with nouns and verbs
def extract_keywords_auto(sentence):
    sentence_lower = str(sentence).lower()

    # Apply phrase mapping
    for phrase, replacement in phrase_map.items():
        if phrase in sentence_lower:
            sentence_lower = sentence_lower.replace(phrase, replacement)

    # Tokenize
    words = word_tokenize(sentence_lower)
    # Remove punctuation and stopwords
    words = [w for w in words if w not in string.punctuation and w not in stop_words]

    # POS tagging
    tagged = pos_tag(words)

    # Keep nouns and verbs
    keywords = [word for word, tag in tagged if tag.startswith('NN') or tag.startswith('VB')]

    # Return unique keywords
    return list(set(keywords))

# Apply to DataFrame
lscs["keywords_list"] = lscs["cleaned_interests"].apply(extract_keywords_auto)

# Save and inspect
keywords = lscs['keywords_list'].sort_values()
keywords.to_csv('keywords.csv', index=False)

# Count empty keyword rows
empty_keywords = lscs['keywords_list'].apply(lambda x: len(x) == 0).sum()
print("Number of empty keyword rows: ", empty_keywords)

# Get rows with empty keywords
empty_rows = lscs[lscs['keywords_list'].apply(lambda x: len(x) == 0)]

# Display their names (or any column you want)
empty_names = empty_rows['full_name'].tolist()
print("Empty keywords found for these names:")
print(empty_names)


# Extracts Keywords from Interests

Number of empty keyword rows:  2
Empty keywords found for these names:
['Rejano, Hans Martin F.', 'Arous, Adam']


In [18]:
# Convert list of keywords into a comma-separated string
lscs["keywords"] = lscs["keywords_list"].apply(lambda x: ", ".join(x) if x else "")

# Sort the keywords column alphabetically
lscs_sorted = lscs.sort_values(by="keywords")

# Save the sorted CSV
lscs_sorted[['keywords']].to_csv("keywords_sorted.csv", index=False)

# If you want to save the original list column too
lscs_sorted['keywords_list'].to_csv("keywords_list_sorted.csv", index=False)

lscs_sorted['keywords_list']

Unnamed: 0,keywords_list
5,[]
642,[]
124,"[,cybersecurity, cello, piano, managing, model..."
375,"[activities, leadership, programming, journali..."
564,"[activities, rock, dressing, house, sports, hi..."
...,...
433,"[work, public_speaking, volunteer, reading, si..."
718,[writing]
56,[writing]
694,"[writing, coding, listening, biking, music, pr..."


In [19]:
display(Markdown("# Flatten and create a DataFrame of Keywords"))
# Flatten and create a DataFrame
keywords_df = lscs['keywords_list'].explode().dropna().to_frame(name="keywords_list")

keywords_df = keywords_df.sort_values(by="keywords_list").reset_index(drop=True)
keywords_df

# Flatten and create a DataFrame of Keywords

Unnamed: 0,keywords_list
0,'m
1,'ve
2,",2."
3,",cybersecurity"
4,",video_games"
...,...
2754,’
2755,’
2756,👍
2757,👽


In [20]:
display(Markdown("# Cleans Data"))

def clean_keyword(word):
    if not isinstance(word, str):
        return None
    # Lowercase + strip spaces
    word = word.lower().strip()
    # Remove leading/trailing punctuation
    word = re.sub(r'^[\W_]+|[\W_]+$', '', word)
    # Remove if empty or just numbers/punctuation
    if word == "" or re.fullmatch(r"[\W_]+", word) or word.isdigit():
        return None
    return word

# Apply cleaning
keywords_df["keywords_list"] = keywords_df["keywords_list"].apply(clean_keyword)

# Drop None/empty
keywords_df = keywords_df.dropna().drop_duplicates().reset_index(drop=True)
keywords_df = keywords_df.dropna(subset=["keywords_list"])
keywords_df = keywords_df[~keywords_df['keywords_list'].isin(["nan"])].reset_index(drop=True)


print(keywords_df.shape[0])
print(keywords_df.head(20))


# Cleans Data

749
    keywords_list
0               m
1              ve
2   cybersecurity
3     video_games
4           acads
5          action
6      activities
7     advertising
8      aesthetics
9     aftermarket
10             ai
11            ako
12       analysis
13      analytics
14         animal
15       animanga
16      animation
17          anime
18          animo
19       annoying


In [21]:
display(Markdown("""
# 🔑 Flatten and Cluster Keywords

We are using **SentenceTransformers** to embed the keywords and then applying **KMeans clustering** with `n_clusters = 10`.

Steps performed:
1. Flatten the keywords and create a DataFrame.
2. Generate embeddings using `all-MiniLM-L6-v2`.
3. Apply KMeans clustering on embeddings.
4. Assign cluster labels to each keyword.
5. Display sorted keywords and cluster counts.
"""))

model = SentenceTransformer("all-mpnet-base-v2")

# Get embeddings for words
embeddings = model.encode(keywords_df["keywords_list"].tolist(), convert_to_tensor=False)

# Cluster on embeddings
n_clusters = 10
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
keywords_df["cluster"] = kmeans.fit_predict(embeddings)

print(keywords_df.sort_values('keywords_list'))
print(keywords_df["cluster"].value_counts())


# 🔑 Flatten and Cluster Keywords

We are using **SentenceTransformers** to embed the keywords and then applying **KMeans clustering** with `n_clusters = 10`.

Steps performed:
1. Flatten the keywords and create a DataFrame.
2. Generate embeddings using `all-MiniLM-L6-v2`.
3. Apply KMeans clustering on embeddings.
4. Assign cluster labels to each keyword.
5. Display sorted keywords and cluster counts.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

    keywords_list  cluster
4           acads        1
5          action        0
6      activities        8
7     advertising        4
8      aesthetics        7
..            ...      ...
741         years        0
742       youtube        1
743       zenless        3
744          zone        9
745           ˆᗜˆ        9

[749 rows x 2 columns]
cluster
7    118
9     94
1     92
5     82
3     74
0     74
4     74
6     52
2     50
8     39
Name: count, dtype: int64


In [22]:
clustered_df = (
    keywords_df.groupby("cluster")["keywords_list"]
    .apply(list)
    .reset_index()
    .rename(columns={"keywords_list": "words"})
)

clustered_df

Unnamed: 0,cluster,words
0,0,"[action, anything, appreciation, arrangements,..."
1,1,"[cybersecurity, acads, aftermarket, applicatio..."
2,2,"[archery, badminton, basketball, bball, beginn..."
3,3,"[m, ve, ako, animanga, animo, ayaw, branched, ..."
4,4,"[advertising, baking, binge, bingewatching, bi..."
5,5,"[ai, analysis, analytics, animation, architect..."
6,6,"[video_games, billiards, cello, chess, csgo, c..."
7,7,"[aesthetics, animal, anime, art, articles, art..."
8,8,"[activities, bike, biking, bouldering, camping..."
9,9,"[annoying, assist, atm, baker, based, bass, be..."


In [23]:
n_clusters = kmeans.n_clusters
n_members = len(lscs_sorted)
embedding_dim = kmeans.cluster_centers_.shape[1]

# Encode all members safely
vectors_list = []
for kw in lscs_sorted["keywords_list"]:
    if kw:
        vec = model.encode(kw, convert_to_tensor=False)
        vectors_list.append(np.mean(vec, axis=0) if len(vec) > 0 else np.zeros(embedding_dim))
    else:
        vectors_list.append(np.zeros(embedding_dim))

vectors = np.vstack(vectors_list)

# Compute distances to centroids
distances = np.linalg.norm(vectors[:, None, :] - kmeans.cluster_centers_[None, :, :], axis=2)

# Sort preferences (closest centroid first)
preferences = np.argsort(distances, axis=1)

# Initialize assignment
assigned = np.full(n_members, -1)
cluster_counts = np.zeros(n_clusters, dtype=int)

# Define max allowed difference between largest and smallest cluster
max_diff = 20
ideal_size = n_members // n_clusters

# Greedy assignment with max difference check
for member_idx in range(n_members):
    for pref in preferences[member_idx]:
        if cluster_counts[pref] <= min(cluster_counts) + max_diff:
            assigned[member_idx] = pref
            cluster_counts[pref] += 1
            break
    # fallback: assign to cluster with fewest members if still unassigned
    if assigned[member_idx] == -1:
        min_cluster = np.argmin(cluster_counts)
        assigned[member_idx] = min_cluster
        cluster_counts[min_cluster] += 1

lscs_sorted["assigned_cluster"] = assigned
lscs_sorted["assigned_cluster"]

Unnamed: 0,assigned_cluster
5,9
642,9
124,1
375,7
564,8
...,...
433,9
718,9
56,9
694,8


In [25]:
lscs_sorted["keywords_list"]

Unnamed: 0,keywords_list
5,[]
642,[]
124,"[,cybersecurity, cello, piano, managing, model..."
375,"[activities, leadership, programming, journali..."
564,"[activities, rock, dressing, house, sports, hi..."
...,...
433,"[work, public_speaking, volunteer, reading, si..."
718,[writing]
56,[writing]
694,"[writing, coding, listening, biking, music, pr..."
