There are two files attached containing the following data:

- input-data.csv: details of 50 items from some user input.
- activities.csv: a list of activities representing emission factors. Each activity is composed by a name (“activity_name” column), and is grouped by a set of categories (“category” column).

Note: All additional information has been striped for simplicity, but for sake of completeness: these activities contain metadata about their respective emissions.

Task: You are tasked with developing a system that finds the most relevant match for each of the input data strings from input-data.csv to activity names in activities.csv.

Deliverable: Submit your solution as a Python script or a Jupyter notebook, including the following requirements:
- Build a prototype model, including data processing, labeling and evaluation.
- For each unstructured input record, return the best-matched activity (name and category) along with a score.

## Data Exploration

In [1]:
import pandas as pd

input_data_df = pd.read_csv("input-data.csv")
activities_df = pd.read_csv("activities.csv")

In [2]:
print(f"Input Data Shape: {input_data_df.shape}")
print(f"Activites Shape: {activities_df.shape}")

Input Data Shape: (49, 1)
Activites Shape: (7133, 2)


In [3]:
activities_df = activities_df.drop_duplicates(subset=["activity_name", "category"])

In [4]:
print(f"Activites Shape: {activities_df.shape}")

Activites Shape: (7133, 2)


In [5]:
input_data_df.head(10)

Unnamed: 0,item
0,Drill bits
1,Light bulb filaments
2,PACKAGING PALLET S7A
3,Steel alloys for rails
4,hex bolt
5,Catalysts in chemical reactions
6,"t-joint, steel"
7,Aircraft structures
8,Thermometers
9,Airplane fuselages


In [6]:
activities_df.head(10)

Unnamed: 0,activity_name,category
0,Gas condensing boiler 120-400kW (upright unit),Electrical Equipment
1,Spinach (market for spinach),Arable Farming
2,Potatoes crisps,Food/Beverages/Tobacco
3,Tax preparation services,Professional Services and Activities
4,Aluminium - North American,Metals
5,Savings institutions,Financial Services
6,Hydrogen gaseous low pressure (market for hydr...,Fuel
7,Other commercial buildings,Real Estate
8,Aircraft,Vehicles
9,All other insurance related activities,Insurance Services


In [7]:
num_categories = len(activities_df["category"].unique())
print(f"{num_categories} categories")

69 categories


In [8]:
category_to_activity = {}
category_lengths = []
for category in activities_df["category"].unique():
    category_to_activity[category] = activities_df.index[activities_df["category"] == category].to_list()
    category_lengths.append(len(category_to_activity[category]))

print(f"Max category length: {max(category_lengths)}, Min category length: {min(category_lengths)}")
print(f"Median category length: {pd.Series(category_lengths).median()}")
print(f"Mean category length: {pd.Series(category_lengths).mean()}")


Max category length: 820, Min category length: 1
Median category length: 34.0
Mean category length: 103.3768115942029


## Pre-Evaluation

In [9]:
category = None
activity_target = [" rod"]
for index, row in activities_df.iterrows():
    found_activity = True
    if activity_target is not None:
        found_activity = False
        for t in activity_target:
            if t in row["activity_name"].lower():
                found_activity = True
                break
    if found_activity and (category is None or row["category"] in category):
        print(index, row["activity_name"])

251 Aluminium - not alloyed - bars / rods
471 Iron non-alloy steel - bars / rods / long
848 Steel - Wire rod
1424 Iron non-alloy steel - bars rods hot-rolled drawn extruded forgings
1520 Steel - hollow drill bars rods - forgings
1864 Iron non-alloy steel - bars rods hot-rolled coils
2311 Stainless steel - shapes sections bars rods long
2761 Iron non-alloy steel - bars rods nec
3845 Aluminium - alloys - bars / rods
4140 Steel alloy - bars rods hot-rolled coils
5459 Stainless steel - bars rods hot-rolled coils
6026 Iron non-alloy steel - hot-rolled drawn extruded - bars rods long
6084 Steel - hollow bars rods long


In [10]:
activities_df.iloc[6671]

activity_name             Light bulbs
category         Electrical Equipment
Name: 6671, dtype: object

In [11]:
"""
The following is in the format of:
"input item": [(activity, score), (activity, score), ...]

Items for which ground truth rankings were obtained:
"Control rods in nuclear ractors": []
"""

test_set_relevant_mappings = {
    13: [2930, 471, 2311],
    27: [349, 770, 3745],
    14: [3213, 4975],
    45: [4277, 5433, 993],
    38: [103, 1052],
    10: [6946, 2635, 2307, 5035, 5537],
    36: [25, 3044, 5329, 5484, 6765],
    1: [3501, 6671],
    42: [467],
    24: [5807]
}

In [12]:
from typing import Dict, List

def average_precision_at_k(predicted_indices: List[int], relevant_indices: List[int], k: int) -> float:
    num_relevant = 0
    sum_precision_at_k = 0
    for i, index in enumerate(predicted_indices):
        if index in relevant_indices:
            num_relevant += 1
            sum_precision_at_k += num_relevant / (i + 1) # add precision at k
        if i + 1 >= k:
            break
    return sum_precision_at_k / min(k, len(relevant_indices))


def mean_average_precision_at_k(
        item_to_predicted_indices: Dict[int, List[List[int]]],
        test_set_relevant_mappings: Dict[int, List[List[int]]],
        k: int
) -> float:
    sum_average_precision_at_k = 0
    for item_index, predicted_indices in item_to_predicted_indices.items():
        sum_average_precision_at_k += average_precision_at_k(predicted_indices, test_set_relevant_mappings[item_index], k)
    return sum_average_precision_at_k / len(item_to_predicted_indices)

In [13]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def get_closest_matches(query_embedding: List[float], target_embeddings: List[List[float]], k: int = 5) -> List[int]:
    similarities = cosine_similarity([query_embedding], target_embeddings)
    return np.argsort(similarities)[0][-5:][::-1]


def get_test_set_performance(
        query_embeddings: List[List[float]],
        target_embeddings: List[List[float]],
        test_set_relevant_mappings: Dict[int, List[int]],
        k: List[int]
) -> float:
    """
    Given the predicted query and target embeddings, return the mean average precision at all k values
    provided.
    """
    item_to_predicted_activity_indices = {}
    for test_index in test_set_relevant_mappings:
        test_embedding = query_embeddings[test_index]
        closest_matches = get_closest_matches(test_embedding, target_embeddings)
        item_to_predicted_activity_indices[test_index] = closest_matches

    map_at_k = []
    for k_val in k:
        map_at_k.append(mean_average_precision_at_k(item_to_predicted_activity_indices, test_set_relevant_mappings, k_val))
    return map_at_k

In [14]:
activities_df["combined_string"] = activities_df.apply(lambda x: x["category"] + ": " + x["activity_name"], axis=1)

## TF-IDF

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Convert text to TF-IDF features
vectorizer = TfidfVectorizer(ngram_range=(1,2), stop_words="english") # Use bigrams on top of single words
query_embeddings = vectorizer.fit_transform(input_data_df["item"].tolist()).toarray()
target_embeddings = vectorizer.transform(activities_df["combined_string"].tolist())
map_at_k = get_test_set_performance(query_embeddings, target_embeddings, test_set_relevant_mappings, [1, 5])
print(f"MAP@1: {map_at_k[0]}, MAP@5: {map_at_k[1]}")


MAP@1: 0.3, MAP@5: 0.22333333333333333


## Pre-trained Sentence Transformer Model Approach

### all-MiniLM-L6-v2

In [16]:
from sentence_transformers import SentenceTransformer

mini_lm_model = SentenceTransformer("all-MiniLM-L6-v2")

target_embeddings = mini_lm_model.encode(activities_df["combined_string"].tolist())
query_embeddings = mini_lm_model.encode(input_data_df["item"].tolist())

map_at_k = get_test_set_performance(query_embeddings, target_embeddings, test_set_relevant_mappings, [1, 5])
print(f"MAP@1: {map_at_k[0]}, MAP@5: {map_at_k[1]}")

  from .autonotebook import tqdm as notebook_tqdm


MAP@1: 0.4, MAP@5: 0.29833333333333334


### intfloat/e5-base

In [17]:
infloat_model = SentenceTransformer("intfloat/e5-base")
target_embeddings = infloat_model.encode(activities_df["combined_string"].tolist())
query_embeddings = infloat_model.encode(input_data_df["item"].tolist())

map_at_k = get_test_set_performance(query_embeddings, target_embeddings, test_set_relevant_mappings, [1, 5])
print(f"MAP@1: {map_at_k[0]}, MAP@5: {map_at_k[1]}")

MAP@1: 0.5, MAP@5: 0.37416666666666665


### all-mpnet-base-v2

In [18]:
mpnet_base_model = SentenceTransformer("multi-qa-mpnet-base-dot-v1")
target_embeddings = mpnet_base_model.encode(activities_df["combined_string"].tolist())
query_embeddings = mpnet_base_model.encode(input_data_df["item"].tolist())

map_at_k = get_test_set_performance(query_embeddings, target_embeddings, test_set_relevant_mappings, [1, 5])
print(f"MAP@1: {map_at_k[0]}, MAP@5: {map_at_k[1]}")

MAP@1: 0.4, MAP@5: 0.2833333333333333


# Use intfloat/e5-base for final prediction

In [19]:
infloat_model = SentenceTransformer("intfloat/e5-base")
target_embeddings = infloat_model.encode(activities_df["combined_string"].tolist())
query_embeddings = infloat_model.encode(input_data_df["item"].tolist())

output_activity = []
output_category = []
output_scores = []
for query_embedding in query_embeddings:
    similarities = cosine_similarity([query_embedding], target_embeddings)
    best_match_index = np.argmax(similarities)
    output_activity.append(activities_df.iloc[best_match_index]["activity_name"])
    output_category.append(activities_df.iloc[best_match_index]["category"])
    output_scores.append(similarities[0][best_match_index])

output_df = pd.DataFrame({
    "item": input_data_df["item"],
    "activity": output_activity,
    "category": output_category,
    "score": output_scores
})
output_df.to_csv("output-data.csv", index=False)
    

## Fine tuning intfloat/e5-base

In [20]:
import torch
from transformers import AutoModel, AutoTokenizer

model_name = "intfloat/e5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

In [25]:
from datasets import Dataset

train_data = [
    {"query": "query: example query 1", "positive": "similar to example query 1", "negative": "not similar to example query 1"},
    {"query": "query: example query 2", "positive": "similar to example query 2", "negative": "not similar to example query 2"},
    {"query": "query: similar to example query 3", "positive": "similar to example query 3", "negative": "not similar to example query 3"},
]

train_dataset = Dataset.from_list(train_data) # huggingface dataset

In [28]:
def tokenize_triplets(examples):
    anchor = tokenizer(examples["query"], padding="max_length", truncation=True, max_length=128)
    positive = tokenizer(examples["positive"], padding="max_length", truncation=True, max_length=128)
    negative = tokenizer(examples["negative"], padding="max_length", truncation=True, max_length=128)

    return {
        "input_ids_query": anchor["input_ids"],
        "attention_mask_query": anchor["attention_mask"],
        "input_ids_positive": positive["input_ids"],
        "attention_mask_positive": positive["attention_mask"],
        "input_ids_negative": negative["input_ids"],
        "attention_mask_negative": negative["attention_mask"],
    }

# Apply tokenization
train_dataset = train_dataset.map(tokenize_triplets, batched=True, remove_columns=["query", "positive", "negative"])
train_dataset.set_format(type="torch")


Map: 100%|██████████| 3/3 [00:00<00:00, 724.07 examples/s]


In [29]:
import torch.nn as nn

class TripletLossModel(nn.Module):
    def __init__(self, model, margin=1.0):
        super().__init__()
        self.model = model
        self.loss_fn = nn.TripletMarginLoss(margin=margin, p=2)  # Euclidean distance

    def forward(self, input_ids_anchor, attention_mask_anchor, 
                      input_ids_positive, attention_mask_positive, 
                      input_ids_negative, attention_mask_negative):
        
        anchor_emb = self.model(input_ids_anchor, attention_mask=attention_mask_anchor).last_hidden_state[:, 0, :]
        positive_emb = self.model(input_ids_positive, attention_mask=attention_mask_positive).last_hidden_state[:, 0, :]
        negative_emb = self.model(input_ids_negative, attention_mask=attention_mask_negative).last_hidden_state[:, 0, :]

        loss = self.loss_fn(anchor_emb, positive_emb, negative_emb)
        return loss

In [24]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./e5_finetuned",
    per_device_train_batch_size=64,
    num_train_epochs=5,
    logging_dir="./logs",
    logging_steps=10,
    save_strategy="epoch",
    evaluation_strategy="no",
    fp16=True,
    report_to="none"
)

model = TripletLossModel(model)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)


# trainer.train()



ValueError: fp16 mixed precision requires a GPU (not 'mps').