In [80]:
#!pip install transformers
#!pip install torch
!pip install faiss-cpu




In [81]:
# necessary imports

import pandas as pd
import faiss
import numpy as np

from sklearn.metrics.pairwise import cosine_similarity

from transformers import AutoTokenizer, AutoModel
import torch

In [82]:
# loading test data

user_data = [
    ["13.12.2023 17:14:56", 4.76, 5.04, 4.48, 3.64, 4.48, 5.32],
    ["13.12.2023 17:21:03", 3.64, 3.92, 3.92, 3.64, 3.64, 3.92],
    ["13.12.2023 17:47:13", 5.88, 6.44, 6.65, 7.00, 5.60, 5.32],
    ["13.12.2023 18:03:34", 2.52, 1.40, 5.88, 2.24, 4.20, 1.96],
    ["13.12.2023 18:05:36", 5.88, 7.00, 5.04, 4.76, 7.00, 4.76],
    ["13.12.2023 18:09:50", 3.36, 4.76, 7.00, 3.64, 3.36, 4.48],
    ["13.12.2023 18:42:14", 6.44, 5.88, 3.64, 3.64, 4.76, 5.32],
    ["13.12.2023 19:18:19", 4.76, 5.04, 2.24, 4.48, 3.36, 4.20],
    ["13.12.2023 21:09:46", 5.04, 6.16, 6.44, 5.88, 5.88, 5.60],
    ["16.12.2023 11:36:37", 4.20, 3.92, 3.92, 4.48, 4.48, 5.04],
    ["16.12.2023 13:02:24", 5.88, 7.00, 7.00, 7.00, 7.00, 7.00],
    ["16.12.2023 13:07:36", 4.20, 4.76, 4.48, 4.20, 4.20, 4.76],
    ["16.12.2023 16:12:31", 5.04, 7.00, 3.92, 5.60, 7.00, 6.72],
    ["16.12.2023 16:16:46", 4.76, 6.44, 4.76, 4.76, 7.00, 5.32],
    ["16.12.2023 16:17:51", 6.44, 6.16, 4.48, 5.04, 5.04, 5.88],
    ["16.12.2023 16:33:55", 4.20, 6.72, 7.00, 3.36, 4.76, 4.76],
    ["16.12.2023 18:38:24", 3.92, 4.20, 4.48, 5.88, 5.88, 4.20],
    ["16.12.2023 18:58:30", 4.20, 5.32, 5.88, 5.32, 4.48, 5.60],
    ["17.12.2023 09:45:31", 5.32, 6.44, 5.32, 3.64, 2.80, 6.72],
    ["17.12.2023 09:48:34", 5.32, 4.20, 1.96, 1.96, 3.08, 3.92],
    ["17.12.2023 22:22:43", 4.20, 5.32, 4.48, 5.04, 5.04, 5.88],
    ["17.12.2023 23:17:10", 5.04, 6.44, 5.32, 6.44, 5.88, 5.60],
    ["18.12.2023 14:25:16", 4.48, 4.76, 3.64, 4.48, 4.76, 5.04],
    ["23.12.2023 07:38:09", 3.64, 5.88, 2.52, 3.08, 4.76, 5.60],
]

columns = ["Timestamp", "Realistic", "Investigative", "Artistic", "Social", "Enterprising", "Conventional"]

user_data = pd.DataFrame(user_data, columns=columns)

#print(user_data)

print(np.array(user_data.iloc[0, 1:]))


[4.76 5.04 4.48 3.64 4.48 5.32]


In [83]:
def preprocess_to_faiss(df):

    # normalize
    df = df / np.linalg.norm(df, axis=1, keepdims=True)

    index = faiss.IndexFlatIP(df.shape[1]) # Inner Product for cosine similarity)
    index.add(df)

    return index


def find_similar_with_faiss(user_riasec, index, top_k=100): # quickly sort and retrieve top 100 results

    # normalize
    user_riasec = user_riasec / np.linalg.norm(user_riasec)

    # top_k most similar professions
    distances, indices = index.search(user_riasec.reshape(1, -1), top_k)

    return distances.flatten(), indices.flatten()



In [84]:
# loading riasec data

jd = pd.read_excel("Processed_Interests.xlsx")

print(jd.head)

riasec_columns = ["R_score", "I_score", "A_score", "S_score", "E_score", "C_score"]
processed_jd = jd[riasec_columns]

# Preprocess tto the faiss
jd_faiss = preprocess_to_faiss(processed_jd)

# get similar professions
distances, indices = find_similar_with_faiss(np.array(user_data.iloc[0, 1:]), jd_faiss, top_k=100)
answer = np.column_stack((jd.iloc[indices, 0], distances))

print(answer)



<bound method NDFrame.head of                                     profession  R_score  I_score  A_score  \
0                             Chief Executives     1.33     2.00     2.67   
1                Chief Sustainability Officers     1.00     4.33     2.67   
2              General and Operations Managers     1.33     1.33     1.00   
3                                  Legislators     1.00     3.67     3.67   
4          Advertising and Promotions Managers     1.67     2.00     5.33   
..                                         ...      ...      ...      ...   
969    Pump Operators, Except Wellhead Pumpers     7.00     4.00     1.00   
970                           Wellhead Pumpers     7.00     3.67     1.00   
971  Refuse and Recyclable Material Collectors     7.00     1.33     1.00   
972                 Mine Shuttle Car Operators     7.00     1.33     1.00   
973          Tank Car, Truck, and Ship Loaders     7.00     3.00     1.00   

     S_score  E_score  C_score  HP_1  HP_2  H

In [85]:
# prepare LLM

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModel.from_pretrained("distilbert-base-uncased")

In [None]:
# processing text-based answers embeddings

def extract_embeddings(texts, max_length=128):

    encoded_inputs = tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt"
    )

    with torch.no_grad():
        output = model(**encoded_inputs)

    embeddings = output.last_hidden_state[:, 0, :]  # Shape: (batch_size, hidden_size)
    return embeddings

In [109]:
# extracting insights from the user answer

base_words = ["math", "chemistry", "physical education", "history", "literature", "biology", "physics"] # no need to specify all of them
school_subject = "school subject"

base_embeddings = extract_embeddings(base_words)
school_subject_embedding = extract_embeddings([school_subject])

def clean_text(user_input):

    # tokenize the user input and make embeddings
    user_input_tokens = tokenizer.tokenize(user_input.lower())

    user_input_embeddings = extract_embeddings(user_input_tokens)

    relevant_subjects = []

    # compare each user input token to the base words and school subject
    for token, token_embedding in zip(user_input_tokens, user_input_embeddings):
        similarities = []

        for base_word, base_embedding in zip(base_words, base_embeddings):

            cos_sim = cosine_similarity(token_embedding
                                        .detach()
                                        .numpy()
                                        .reshape(1, -1),

                                        base_embedding
                                        .detach()
                                        .numpy()
                                        .reshape(1, -1)
                                        )

            similarities.append(cos_sim[0][0])

        cos_sim_subject = cosine_similarity(token_embedding
                                            .detach()
                                            .numpy()
                                            .reshape(1, -1),

                                            school_subject_embedding
                                            .detach()
                                            .numpy()
                                            .reshape(1, -1)
                                            )

        similarities.append(cos_sim_subject[0][0])

        # check if the token is sufficiently similar to any base word
        max_similarity = max(similarities)
        if max_similarity > 0.98:
            relevant_subjects.append(token)

    # Remove duplicates using set datatype property
    return list(set(relevant_subjects))

# Example usage
user_input = "I love studying math and physical education, but chemistry is my favorite."
cleaned_subjects = clean_subject_input(user_input)
cleaned_subjects.append("informatics")
cleaned_subjects.remove("physical")
cleaned_subjects.remove("chemistry")
print(cleaned_subjects)

subjects_embedding = extract_embeddings(cleaned_subjects)
print(subjects_embedding.shape)

['math', 'informatics']
torch.Size([2, 768])


In [110]:
def combine_score(profession_embedding, subject_embedding, original_score, weight=0.7):

    # cosine similarity between the embeddings
    cos_sim = cosine_similarity(profession_embedding
                                .detach()
                                .numpy()
                                .reshape(1, -1)
                                ,

                                subject_embedding
                                .detach()
                                .numpy()
                                .reshape(1, -1)
                                )

    # combine score with predefined weight
    combined_score = (weight * cos_sim[0][0]) + ((1 - weight) * original_score)

    return combined_score

In [115]:
def combine_subjects_and_riasec(user_input, answer, top_k=20):

    # process user input
    cleaned_subjects = clean_text(user_input)

    if not cleaned_subjects:
        return []

    # generate embeddings for the cleaned subjects
    subject_embeddings = extract_embeddings(cleaned_subjects)

    # extract professions and their embeddings
    profession_names = [item[0] for item in answer]

    #print(profession_names[0:5])

    original_scores = [item[1] for item in answer]
    profession_embeddings = extract_embeddings(profession_names)

    # calculate the combined scores for each profession

    combined_scores = []

    for i, profession_embedding in enumerate(profession_embeddings):

        score = 0
        for subject_embedding in subject_embeddings:
            #print(profession_names[i], subject_embedding.shape, original_scores[i])
            score += combine_score(profession_embedding, subject_embedding, original_scores[i])

        # Average the scores if there are multiple subjects
        score /= len(subject_embeddings)
        combined_scores.append((profession_names[i], score))

    #print(combined_scores[0:5])

    # sort the results based on the combined score
    sorted_results = sorted(combined_scores, key=lambda x: x[1], reverse=True)

    # return the top_k results
    return sorted_results[:top_k]

user_input = "I like math and humanities, but math I like more"
top_20_professions = combine_subjects_and_riasec(user_input, answer)


for profession, score in top_20_professions:
    print(f"{profession}: {score}")


Web Developers: 0.949806171655655
Computer Network Architects: 0.9480694949626922
Computer Programmers: 0.9449951827526093
Environmental Engineers: 0.9440422177314758
Validation Engineers: 0.9434145450592041
Database Architects: 0.9433716416358948
Database Administrators: 0.9411588549613952
Computer Systems Analysts: 0.9411425590515137
Operations Research Analysts: 0.9405641376972198
Transportation Planners: 0.9404208362102509
Computer User Support Specialists: 0.9404193699359893
Materials Scientists: 0.9398408591747285
Nanosystems Engineers: 0.9390329360961913
Industrial Engineers: 0.9386328995227813
Archeologists: 0.9383579552173614
Web Administrators: 0.9382719278335572
Materials Engineers: 0.9380653619766236
Biomedical Engineers: 0.9379156708717347
Intelligence Analysts: 0.9376263260841369
Magnetic Resonance Imaging Technologists: 0.9375298142433166
