# Feature Extraction using Word2Vec Word Embeddings

### General Libraries and settings

In [1]:
import pandas as pd
import re
import numpy as np
import contractions


# Widen pandas' display limits so long text cells and wide frames are not truncated.
pd.set_option("display.max_colwidth", 1000)
pd.set_option("display.width", 10000)
# None removes the row limit entirely (every row is printed).
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", 1000)

### Feature Extraction Lib

In [2]:
# import gensim
from gensim.models import Word2Vec
# import gensim.downloader as api
from itertools import product
from sklearn.metrics.pairwise import cosine_similarity
import ast

## Feature Extraction using Word2Vec Word Embeddings

### Train a custom Word2Vec Model using 2 datasets 
1. Read data and merge to corpus
2. Tune the Word2Vec hyperparameters (Word2Vec is an unsupervised learning algorithm, so no train/test split is needed)

#### Read Datasets

In [3]:
# Load the pre-normalized program table and user-preference table.
university_df = pd.read_csv("../university_data_normalized.csv")
user_pref_df = pd.read_csv("../user_preferences_normalized.csv")

# The token columns come back from CSV as strings; parse each cell back into
# a Python object with ast.literal_eval (safe: evaluates literals only).
# presumably each cell is the repr of a list of tokens -- verify against the producer notebook
university_df['merge_normalize_tokenize'] = university_df['merge_normalize_tokenize'].map(ast.literal_eval)
user_pref_df['normalize_tokenize'] = user_pref_df['normalize_tokenize'].map(ast.literal_eval)

In [4]:
# vector_size (int, optional) – Dimensionality of the word vectors.
# window (int, optional) – Maximum distance between the current and predicted word within a sentence.
# min_count (int, optional) – Ignores all words with total frequency lower than this.
# workers (int, optional) – Use these many worker threads to train the model (=faster training with multicore machines).
# w2v_model = Word2Vec(documents['tokenized'], vector_size=300, window=8, workers=5, min_count=1)

# Join both token columns into one corpus: program rows first, then user rows.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it the
# program and user rows keep their original labels, so the same label would appear
# twice and label-based lookups on `documents` would return two unrelated rows.
documents = pd.concat(
    [university_df['merge_normalize_tokenize'], user_pref_df['normalize_tokenize']],
    ignore_index=True,
).to_frame(name='tokenized')

# Plain list of token lists, the input format used for Word2Vec training.
corpus = documents['tokenized'].tolist()

#### Hyperparameter Tuning (uncomment the cells below to rebuild the model)
- Train a model for every hyperparameter combination
- Save the score of every combination to a CSV file
- Choose the best model
- Save the best model to a `.model` file

In [5]:
# --- Disabled cell: exhaustive grid search over Word2Vec hyperparameters ---
# Trains one model per combination in param_grid (3 * 3 * 2 * 2 * 2 * 2 = 144 models)
# and scores each one by summing model.wv.similarity over the word pairs in
# intrinsic_pairs. Results are collected as (params, score) tuples in `results`.
# NOTE(review): model.wv.similarity presumably raises KeyError when either word is
# missing from the vocabulary (possible with min_count=2) -- verify before re-enabling.
# # Base model Word2Vec
# # base_model = Word2Vec(corpus, vector_size=100, window=5, min_count=1, sg=0, epochs=10)

# # Hyperparameter combinations
# param_grid = {
#     'vector_size': [100, 200, 300],
#     'window': [3, 5, 10],
#     'min_count': [1, 2],
#     'sg': [0, 1],
#     'epochs': [15, 25],
#     'alpha': [0.025, 0.05]
# }
# # Generate all combinations of hyperparameters
# param_combinations = list(product(
#     param_grid['vector_size'],
#     param_grid['window'],
#     param_grid['min_count'],
#     param_grid['sg'],
#     param_grid['epochs'],
#     param_grid['alpha']
# ))

# # Store results
# results = []
# intrinsic_pairs = [('computer','science'),('economics','finance'),('engineering','technology'),('history','sociology'),('psychology','neuroscience'),('philosophy','ethic'),('art','design'),('business','management'),('medicine','healthcare'),('education','teaching')]
# for params in param_combinations:
#     vector_size, window, min_count, sg, epochs, alpha = params
#     print(f"Training model with: vector_size={vector_size}, window={window}, min_count={min_count}, sg={sg}, epochs={epochs}, alpha={alpha}")
#     model = Word2Vec(corpus, vector_size=vector_size, window=window, min_count=min_count, sg=sg, epochs=epochs, alpha=alpha, workers=5)
    
#     # Evaluate the model (example: intrinsic evaluation or similarity tasks)
#     score = 0
#     for a, b in intrinsic_pairs:
#         score += model.wv.similarity(a,b)  # Example similarity tasks
#     results.append((params, score))

In [6]:
# --- Disabled cell: persist the grid-search results ---
# # Save all hyperparams to csv file
# NOTE(review): the second column holds the score summed over *all* intrinsic pairs,
# so the label 'computer-science' is misleading. The file is also opened in append
# mode, so re-running this cell presumably adds a second header row plus duplicate
# results to Word2Vec_hyperparam.csv -- confirm and switch to mode 'w' if unintended.
# results_df = pd.DataFrame(results)
# results_df.columns = ['params', 'computer-science']
# with open('Word2Vec_hyperparam.csv', 'a') as file:
#     results_df.to_csv(file, index=False)

In [7]:
# --- Disabled cell: pick the highest-scoring combination, retrain, and save ---
# # Find the best parameters
# best_params, best_score = max(results, key=lambda x: x[1])
# print(f"Best parameters: {best_params} with score: {best_score}")

# # Retrain with best model
# NOTE(review): the winning model is retrained from scratch rather than reused;
# with workers=5 the retrained model may not reproduce best_score exactly -- verify.
# vector_size, window, min_count, sg, epochs, alpha = best_params
# # better core can go up to 16 worker
# best_model = Word2Vec(corpus, vector_size=vector_size, window=window, min_count=min_count, sg=sg, epochs=epochs, alpha=alpha, workers=5)

# # Save the model
# The saved file is the one read back by Word2Vec.load("word2vec_best_model.model").
# best_model.save("word2vec_best_model.model")

### Load Word2Vec Model

In [8]:
# Load the model when needed
# Reads the Word2Vec model from "word2vec_best_model.model" in the working directory.
# That file is produced by the (commented-out) tuning cell via best_model.save(...),
# so it must already exist on disk for this cell to run.
model = Word2Vec.load("word2vec_best_model.model")

# Alternative (disabled): train a model directly on `corpus` with fixed
# hyperparameters instead of loading the saved one.
# model = Word2Vec(
#     corpus,
#     vector_size=200, 
#     window=10, 
#     min_count=1, 
#     sg=0, 
#     epochs=5, 
#     alpha=0.025, 
#     workers=5)

### Compute TF-IDF Weights

- Emphasizes important words: words with higher TF-IDF scores contribute more to the sentence vector.
- Reduces noise: less important words have less impact on the representation.\
Example: in a sentence about studying computer science, words like "computer" and "science" will have higher weights, while words like "gain" and "in" will have less influence.

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Combine all text for TF-IDF computation.
# Despite its name, `all_text_contracted` is the raw text *before* contractions.fix
# is applied; the processed text is `all_text`.
all_text_contracted = pd.concat([user_pref_df['preference'], university_df['merge_raw']])
all_text = all_text_contracted.apply(contractions.fix)

# Init TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer()
# Only the learned vocabulary and IDF values are used below, so fit() is enough;
# fit_transform() would also build a document-term matrix that is never used.
tfidf_vectorizer.fit(all_text)
feature_names = tfidf_vectorizer.get_feature_names_out()

# Create a dictionary mapping each word to its IDF weight.
# NOTE(review): the vocabulary is learned from raw text with TfidfVectorizer's default
# tokenization, while lookups are done with the normalized tokens. Tokens that never
# occur in that exact form in the raw text (for example hyphenated tokens such as
# 'cutting-edge', or lemmas that differ from the surface word) have no entry here and
# are skipped when sentence vectors are built. Consider fitting on the tokenized
# corpus instead -- confirm intent.
tfidf_dict = dict(zip(feature_names, tfidf_vectorizer.idf_))

### Compute Sentence Vectors with TF-IDF Weighted Averaging

In [10]:
def tfidf_weighted_vector(tokens, model, tfidf_dict):
    """Return the weight-averaged word vector of a token sequence.

    Each token found in both ``model.wv`` and ``tfidf_dict`` contributes its word
    vector multiplied by its weight; the sum is divided by the total weight.
    Tokens missing from either lookup are ignored. A token that occurs several
    times contributes once per occurrence.

    Args:
        tokens: Iterable of token strings.
        model: Object exposing ``wv`` (supports ``in`` and ``[]`` by token) and
            ``vector_size`` (length of the zero vector returned as fallback).
        tfidf_dict: Mapping from token to numeric weight.

    Returns:
        A 1-D NumPy array. It is all zeros with length ``model.vector_size`` when
        no token is usable or when the usable weights sum to zero.
    """
    weighted_vectors = []
    total_weight = 0
    for word in tokens:
        # Ensure the word is in both Word2Vec and the weight table
        if word in model.wv and word in tfidf_dict:
            weight = tfidf_dict[word]
            weighted_vectors.append(model.wv[word] * weight)
            total_weight += weight

    # No usable token, or all weights zero: return zeros rather than the NaN
    # vector that a 0/0 division would produce.
    if not weighted_vectors or total_weight == 0:
        return np.zeros(model.vector_size)
    return np.sum(weighted_vectors, axis=0) / total_weight
    
# Embed every document of the combined corpus with the weighted average of its word vectors.
documents['sentence_vector'] = documents['tokenized'].map(
    lambda doc_tokens: tfidf_weighted_vector(doc_tokens, model, tfidf_dict)
)

## Compute Similarities with cosine similarity --- NLP-Based CBF

- TESTING: score one user's preference against every program and inspect the top matches

In [None]:
# edit test_user_id to try other users (positional row number, used with .iloc)
test_user_id = 462
test_user = user_pref_df['normalize_tokenize'].iloc[test_user_id]

# Weighted sentence vector of the user's preference, as a single-row matrix
# of shape (1, vector dimension) so it can be passed to cosine_similarity.
test_user_vector = np.array(
    tfidf_weighted_vector(test_user, model, tfidf_dict)
).reshape(1, -1)

# Weighted sentence vector of every program, kept on the frame and also
# stacked into one (n_programs, vector dimension) matrix.
university_df['tfidf_vector'] = university_df['merge_normalize_tokenize'].map(
    lambda program_tokens: tfidf_weighted_vector(program_tokens, model, tfidf_dict)
)
program_vectors = np.stack(university_df['tfidf_vector'].to_numpy())

# Similarity of the test user to each program, flattened to one score per program.
similarities = cosine_similarity(test_user_vector, program_vectors).ravel()

In [50]:
# Rank programs by similarity to the test user, best match first.
test_df = pd.DataFrame({'similarity': similarities})
test_df_sorted = test_df.sort_values(by='similarity', ascending=False)
# Row positions of the five most similar programs.
rankingID_top = test_df_sorted.index[:5].to_list()

# Columns shown for each recommended program, selected by position.
shown_columns = university_df.columns[[0, 2, 3, 5, 6]]
top_programs = university_df[shown_columns].iloc[rankingID_top]
test_user_preference_text = user_pref_df['preference'].iloc[test_user_id]

print(f"User {test_user_id} Preference: {test_user_preference_text}")
print(f"Top 5: {top_programs}")

User 462 Preference: I'm interested in degrees in public policy and governance, particularly those with a focus on equity.
Top 5:                                                     program_name                    university  location               degreeType language
8070  Policy Management, Public Policy and Public Administration  University of Duisburg-Essen  Duisburg           Master of Arts   German
8602                                               Public Policy          University of Erfurt    Erfurt  Master of Public Policy  English
6448                                     Master of Public Policy  University of Duisburg-Essen  Duisburg  Master of Public Policy   German
4690   Governance and Public Policy - Political Theory (Science)          University of Passau    Passau                   Master   German
6450                               Master of Public Policy (MPP)                 Hertie School    Berlin                 Master's  English


## Map Similarity Scores to Synthetic Ratings

### Use similarity + introduce noise for synthetic ratings

In [117]:
def map_similarity_to_rating(similarities, noise_std=0.5, rng=None):
    """Turn similarity scores into noisy synthetic ratings on a 1-5 scale.

    The mapping is ``rating = similarity * 4 + 1`` plus Gaussian noise, clipped to
    [1, 5] and rounded to 2 decimals. A similarity of 0 maps to 1 and a similarity
    of 1 maps to 5 before noise; negative similarities end up clipped to 1.

    Args:
        similarities: Array-like of similarity scores (any shape).
        noise_std: Standard deviation of the added zero-mean Gaussian noise.
            The default of 0.5 matches the previous hard-coded value.
        rng: Optional random generator exposing ``normal(loc, scale, size=...)``,
            e.g. ``np.random.default_rng(seed)``. When None, NumPy's global
            ``np.random.normal`` is used, exactly as before.

    Returns:
        NumPy array of ratings with the same shape as ``similarities``.
    """
    similarities = np.asarray(similarities)  # also accept plain lists
    # Linear rescale; this creates a new array, so the caller's input is not modified.
    ratings = similarities * 4 + 1
    # Introduce randomness
    if rng is None:
        noise = np.random.normal(0, noise_std, size=similarities.shape)
    else:
        noise = rng.normal(0, noise_std, size=similarities.shape)
    ratings += noise
    # Ensure ratings stay within bounds [1, 5]
    ratings = np.clip(ratings, 1, 5)
    # Round to 2 decimal places
    ratings = np.round(ratings, 2)
    return ratings

### Generate Synthetic Ratings

In [118]:
# Seed NumPy's global RNG so the noise drawn inside map_similarity_to_rating, and
# therefore the exported CSV, is identical on every Restart & Run All.
RATING_SEED = 42
np.random.seed(RATING_SEED)

# Build the program matrix here so this cell does not depend on the optional
# single-user TESTING cell having been run first.
program_vectors = np.stack([
    tfidf_weighted_vector(program_tokens, model, tfidf_dict)
    for program_tokens in university_df['merge_normalize_tokenize']
])

user_columns = []
# One row label per program, numbered by position in university_df.
uni_id = [f'uni_id_{i}' for i in range(program_vectors.shape[0])]

for user_index, user_preference in enumerate(user_pref_df['normalize_tokenize']):
    # Compute the weighted sentence vector for this user's preference
    user_vector = tfidf_weighted_vector(user_preference, model, tfidf_dict)
    user_vector = np.array(user_vector).reshape(1, -1) # (1, vector dimension)

    # Use a loop-local name so the test user's `similarities` is not overwritten
    # with the last user's scores.
    user_similarities = cosine_similarity(user_vector, program_vectors)[0]

    rating = map_similarity_to_rating(user_similarities)
    user_columns.append(pd.Series(rating, index=uni_id, name=f'userid_{user_index}'))

# Rows are programs (uni_id_*), columns are users (userid_*).
user_ratings = pd.concat(user_columns, axis=1)
user_ratings.to_csv('../university_user_ratings.csv',index=True)

interested
master
science
computer
science
berlin
gain
exposure
cutting-edge
ai
technology
master
art
business
administration
international
focus
would
align
career
goal
global
management
look
master
science
environmental
science
hamburg
one
emphasize
sustainability
climate
research
study
medium
informatics
munich
international
master
’
program
would
help
merge
creativity
technology
keen
pursue
master
engineering
maritime
engineering
kiel
want
specialize
ship
design
ocean
technology
english-taught
master
science
data
science
berlin
would
perfect
hone
analytical
program
skill
would
love
enroll
master
art
linguistics
cologne
program
explore
multilingualism
communication
pursue
master
education
educational
technology
hamburg
would
help
integrate
innovative
tool
modern
teaching
practice
part-time
master
science
business
analytics
frankfurt
would
allow
study
continue
work
current
role
fascinate
idea
study
aeronautical
engineering
munich
master
’
program
focus
future
aircraft
technology
sear