## Numerical Matrix Factorization

In [1]:
import numpy as np

# Example user-item interaction matrix (ratings matrix)
# Rows represent users, columns represent items, and values represent ratings
R = np.array([
    [5, 3, 0, 1],
    [4, 0, 0, 1],
    [1, 1, 0, 5],
    [1, 0, 0, 4],
    [0, 1, 5, 4],
])

# Number of latent features
k = 2

# Perform Singular Value Decomposition
U, sigma, Vt = np.linalg.svd(R, full_matrices=False)

# Keep only the top k singular values
U_k = U[:, :k]
sigma_k = np.diag(sigma[:k])
Vt_k = Vt[:k, :]

# Reconstruct the matrix using only the top k singular values
R_k = np.dot(np.dot(U_k, sigma_k), Vt_k)

print("Original Matrix (R):")
print(R)
print("\nReconstructed Matrix (R_k) using top {} latent features:".format(k))
print(R_k)

# Prediction for a specific user-item pair
user_index = 0  # First user
item_index = 3  # Third item
predicted_rating = np.dot(np.dot(U_k[user_index, :], sigma_k), Vt_k[:, item_index])
print("\nPredicted rating for user {} and item {}:".format(user_index, item_index))
print(predicted_rating)


Original Matrix (R):
[[5 3 0 1]
 [4 0 0 1]
 [1 1 0 5]
 [1 0 0 4]
 [0 1 5 4]]

Reconstructed Matrix (R_k) using top 2 latent features:
[[ 5.13406479  1.90612125 -0.72165061  1.5611261 ]
 [ 3.43308995  1.28075331 -0.45629689  1.08967559]
 [ 1.54866643  1.0449763   1.78873709  3.96755551]
 [ 1.17598269  0.80359806  1.40136891  3.08786154]
 [-0.44866693  0.5443561   3.09799526  5.15263893]]

Predicted rating for user 0 and item 3:
1.56112610151169


## Query-LLM Performance Matrix

In [2]:
import numpy as np

# Sample queries
queries = [
    "What is the capital of France?",
    "Explain quantum computing",
    "Write a poem about spring",
    "Summarize the plot of Hamlet",
    "How does photosynthesis work?"
]

# Sample LLMs
llms = ["GPT-4", "BERT", "T5", "LLaMA"]

# Hypothetical function to evaluate LLM performance on a query
def evaluate_llm_performance(query, llm):
    # This function should return a performance score for the given query and LLM
    # For demonstration, we'll use random scores
    np.random.seed(hash(query + llm) % (2**32))
    return np.random.randint(1, 11)  # Random score between 1 and 10

# Generate the performance matrix
performance_matrix = np.zeros((len(queries), len(llms)))

for i, query in enumerate(queries):
    for j, llm in enumerate(llms):
        performance_matrix[i, j] = evaluate_llm_performance(query, llm)

print("Performance Matrix:")
print(performance_matrix)


Performance Matrix:
[[ 2.  3.  3.  3.]
 [ 1.  8.  4.  1.]
 [ 3.  9.  4.  2.]
 [ 5.  8.  6.  5.]
 [ 5.  4. 10.  3.]]


## Predicting Query-LLM Performance - Matrix Factorization

In [3]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Convert queries to TF-IDF vectors
vectorizer = TfidfVectorizer()
query_vectors = vectorizer.fit_transform(queries)

# Number of latent factors
k = 2

# Perform SVD on the performance matrix
U, sigma, Vt = np.linalg.svd(performance_matrix, full_matrices=False)

# Keep only top k factors
U_k = U[:, :k]
sigma_k = np.diag(sigma[:k])
Vt_k = Vt[:k, :]

# Function to predict performance for a new query
def predict_performance(new_query):
    new_vector = vectorizer.transform([new_query])
    similarities = cosine_similarity(new_vector, query_vectors)
    query_factors = np.dot(similarities, U_k)
    predicted_scores = np.dot(np.dot(query_factors, sigma_k), Vt_k)
    return predicted_scores[0]

# Example usage
new_query = "Explain the theory of relativity"
predicted_scores = predict_performance(new_query)

print("Predicted performance scores for the query:")
print(f"'{new_query}'")
for llm, score in zip(llms, predicted_scores):
    print(f"{llm}: {score:.2f}")

# Route to the best LLM
best_llm = llms[np.argmax(predicted_scores)]
print(f"\nRouting to: {best_llm}")


Predicted performance scores for the query:
'Explain the theory of relativity'
GPT-4: 3.29
BERT: 7.54
T5: 5.35
LLaMA: 2.85

Routing to: BERT


## Alternating Least Squares (ALS) for matrix factorization

In [4]:
import numpy as np
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

def als_matrix_factorization(R, k, num_iterations=10, lambda_reg=0.1):
    """
    Perform matrix factorization using Alternating Least Squares (ALS).
    
    Args:
    R: The input matrix (queries x LLMs)
    k: Number of latent factors
    num_iterations: Number of iterations to perform
    lambda_reg: Regularization parameter
    
    Returns:
    P, Q: Factorized matrices such that R ≈ P * Q.T
    """
    m, n = R.shape
    P = np.random.rand(m, k)
    Q = np.random.rand(n, k)
    
    # Create a mask for non-zero elements
    mask = R != 0
    
    for _ in range(num_iterations):
        # Update P
        for i in range(m):
            if np.sum(mask[i]) > 0:
                Q_i = Q[mask[i], :]
                R_i = R[i, mask[i]]
                A = Q_i.T.dot(Q_i) + lambda_reg * np.eye(k)
                b = Q_i.T.dot(R_i)
                P[i] = np.linalg.solve(A, b)
        
        # Update Q
        for j in range(n):
            if np.sum(mask[:, j]) > 0:
                P_j = P[mask[:, j], :]
                R_j = R[mask[:, j], j]
                A = P_j.T.dot(P_j) + lambda_reg * np.eye(k)
                b = P_j.T.dot(R_j)
                Q[j] = np.linalg.solve(A, b)
    
    return P, Q

# Sample queries and LLMs
queries = [
    "What is the capital of France?",
    "Explain quantum computing",
    "Write a poem about spring",
    "Summarize the plot of Hamlet",
    "How does photosynthesis work?"
]

llms = ["GPT-4", "BERT", "T5", "LLaMA"]

# Sample performance matrix (queries x LLMs)
R = np.array([
    [9, 7, 5, 8],
    [8, 6, 7, 9],
    [9, 4, 6, 8],
    [8, 5, 7, 8],
    [7, 8, 6, 7]
])

# Number of latent factors
k = 2

# Perform matrix factorization
P, Q = als_matrix_factorization(R, k)

# Create TF-IDF vectorizer
vectorizer = TfidfVectorizer()
query_vectors = vectorizer.fit_transform(queries)

# Use TruncatedSVD to reduce dimensionality of TF-IDF vectors to k
svd = TruncatedSVD(n_components=k)
query_latent_factors = svd.fit_transform(query_vectors)

# Function to convert a new query to latent factor space
def query_to_latent_factor(new_query):
    new_vector = vectorizer.transform([new_query])
    return svd.transform(new_vector)

# Function to predict performance for a new query
def predict_performance(query_vector, Q):
    return np.dot(query_vector, Q.T)

# Example: Predict performance for a new query
new_query = "Explain the theory of relativity"
new_query_vector = query_to_latent_factor(new_query)
predicted_scores = predict_performance(new_query_vector, Q)

print("Predicted performance scores for the query:")
print(f"'{new_query}'")
for llm, score in zip(llms, predicted_scores[0]):
    print(f"{llm}: {score:.2f}")

# Route to the best LLM
best_llm = llms[np.argmax(predicted_scores)]
print(f"\nRouting to: {best_llm}")

# Demonstrate with multiple new queries
new_queries = [
    "What are the main causes of climate change?",
    "How does machine learning work?",
    "Explain the process of photosynthesis"
]

print("\nPredictions for multiple new queries:")
for query in new_queries:
    query_vector = query_to_latent_factor(query)
    scores = predict_performance(query_vector, Q)
    best_llm = llms[np.argmax(scores)]
    print(f"\nQuery: '{query}'")
    print(f"Routed to: {best_llm}")
    print("Scores:")
    for llm, score in zip(llms, scores[0]):
        print(f"  {llm}: {score:.2f}")


Predicted performance scores for the query:
'Explain the theory of relativity'
GPT-4: 0.55
BERT: -0.19
T5: 0.39
LLaMA: 0.50

Routing to: GPT-4

Predictions for multiple new queries:

Query: 'What are the main causes of climate change?'
Routed to: GPT-4
Scores:
  GPT-4: 0.75
  BERT: -0.27
  T5: 0.54
  LLaMA: 0.69

Query: 'How does machine learning work?'
Routed to: BERT
Scores:
  GPT-4: 0.30
  BERT: 1.54
  T5: 0.28
  LLaMA: 0.37

Query: 'Explain the process of photosynthesis'
Routed to: GPT-4
Scores:
  GPT-4: 0.56
  BERT: 0.33
  T5: 0.42
  LLaMA: 0.54


# Sleep and Car Data - Matrix Factorization

In [52]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse.linalg import svds
import joblib

# Load datasets
sleep_data = pd.read_json('../data/raw/training_qna_sleep.json')
car_data = pd.read_json('../data/raw/training_qna_car.json')

# Extract questions and add categories
sleep_data['question'] = sleep_data['qna'].apply(lambda x: x['question'])
sleep_data['category'] = 'sleep'
car_data['question'] = car_data['qna'].apply(lambda x: x['question'])
car_data['category'] = 'car'

# Combine datasets
data = pd.concat([sleep_data, car_data], ignore_index=True)

# Create TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_features=1000)
tfidf_matrix = vectorizer.fit_transform(data['question'])

# Create user-item interaction matrix
interaction_matrix = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())
interaction_matrix['category'] = data['category'].apply(lambda x: 1 if x == 'sleep' else 0)

# Split data into training and testing sets
train_data, test_data = train_test_split(interaction_matrix, test_size=0.2, random_state=42)

# Perform matrix factorization using SVD
k = 10  # number of latent factors
U, sigma, Vt = svds(train_data.drop('category', axis=1).values, k=k)

# Convert sigma to a diagonal matrix
sigma = np.diag(sigma)

# Save the components
joblib.dump(vectorizer, '../models/vectorizer.pkl')
joblib.dump(U, '../models/U.pkl')
joblib.dump(sigma, '../models/sigma.pkl')
joblib.dump(Vt, '../models/Vt.pkl')


['models/Vt.pkl']

In [53]:
import numpy as np

def classify_query(query, U, sigma, Vt, vectorizer, train_data):
    query_vector = vectorizer.transform([query])
    
    # Project the query vector into the latent space
    query_latent = query_vector.dot(Vt.T)
    
    # Compute the mean latent vectors for sleep and car categories
    sleep_latent = train_data[train_data['category'] == 1].drop('category', axis=1).values.mean(axis=0).dot(Vt.T)
    car_latent = train_data[train_data['category'] == 0].drop('category', axis=1).values.mean(axis=0).dot(Vt.T)
    
    # Ensure all vectors are 2D
    query_latent = query_latent.reshape(1, -1)
    sleep_latent = sleep_latent.reshape(1, -1)
    car_latent = car_latent.reshape(1, -1)
    
    # Compute the similarity scores using cosine similarity
    sleep_score = np.dot(query_latent, sleep_latent.T) / (np.linalg.norm(query_latent) * np.linalg.norm(sleep_latent))
    car_score = np.dot(query_latent, car_latent.T) / (np.linalg.norm(query_latent) * np.linalg.norm(car_latent))
    
    return 'sleep' if sleep_score > car_score else 'car'

# Test the classifier
test_query = "What is the impact of sleep deprivation on driving?"
classification = classify_query(test_query, U, sigma, Vt, vectorizer, train_data)
print(f"Query: {test_query}")
print(f"Classification: {classification}")


Query: What is the impact of sleep deprivation on driving?
Classification: sleep


In [54]:
queries = [
    "What's the impact of sleep deprivation on cognitive function?",
    "Can you explain the history of the internal combustion engine?",
    "How does REM sleep affect memory consolidation?",
    "What are the main components of an electric vehicle's drivetrain?",
]

# Test the classifier
for query in queries:
    result = classify_query(query, U, sigma, Vt, vectorizer, train_data)
    print(f"Query: {query}")
    print(f"Classified as: {result}\n")

Query: What's the impact of sleep deprivation on cognitive function?
Classified as: sleep

Query: Can you explain the history of the internal combustion engine?
Classified as: car

Query: How does REM sleep affect memory consolidation?
Classified as: sleep

Query: What are the main components of an electric vehicle's drivetrain?
Classified as: car

