In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas()

import warnings
warnings.filterwarnings('ignore')

import os
os.chdir('/kaggle/input/')

from sklearn.neighbors import NearestNeighbors
import torch

In [10]:
import torch
from transformers import AutoTokenizer, AutoModel
%env TOKENIZERS_PARALLELISM=true

top_n = 10

device = "cuda" if torch.cuda.is_available() else "cpu"
model_paths = ['/kaggle/input/paraphrasemultilingualmpnetbasev2','/kaggle/input/d/tomoyayanagi/all-minilm-l6-v2']

env: TOKENIZERS_PARALLELISM=true


In [3]:
# Load the datasets
topics_df = pd.read_csv("learning-equality-curriculum-recommendations/topics.csv")
content_df = pd.read_csv("learning-equality-curriculum-recommendations/content.csv")
corr_df = pd.read_csv("learning-equality-curriculum-recommendations/correlations.csv")
submission = pd.read_csv("learning-equality-curriculum-recommendations/sample_submission.csv")

In [4]:
content_df.columns = ["content_"+ column for column in content_df.columns]

corr_df['content_ids'] = corr_df['content_ids'].str.split()
corr_df = corr_df.explode('content_ids').reset_index(drop = True)
corr_df = corr_df.rename(columns = {'content_ids':'content_id'})
corr_df.head()

Unnamed: 0,topic_id,content_id
0,t_00004da3a1b2,c_1108dd0c7a5d
1,t_00004da3a1b2,c_376c5a8eb028
2,t_00004da3a1b2,c_5bc0e1e2cba0
3,t_00004da3a1b2,c_76231f9d0b5e
4,t_00068291e9a4,c_639ea2ef9c95


In [5]:
topics_df_topic_tree = pd.DataFrame()

for channel in tqdm(topics_df["channel"].unique()):
    channel_df = topics_df[(topics_df["channel"] == channel)].reset_index(drop = True)
    for level in sorted(channel_df.level.unique()):
        
        #For level 0, it first creates a topic tree column which is the title of that topic.            
        if level == 0:
            topic_tree = channel_df[channel_df["level"] == level]["title"].astype(str)
            topic_tree_df = pd.DataFrame([channel_df[channel_df["level"] == level][["id"]],topic_tree.values]).T
            topic_tree_df.columns = ["child_id","topic_tree"]
            channel_df = channel_df.merge(topic_tree_df, left_on = "id", right_on = "child_id", how = "left").drop(["child_id"], axis = 1)
        
        #Once the topic tree column has been created, the parent node and child node is merged on parent_id = child_id
        topic_df_parent = channel_df[channel_df["level"] == level][["id","title","parent","topic_tree"]]
        topic_df_parent.columns = "parent_" + topic_df_parent.columns
        
        topic_df_child = channel_df[channel_df["level"] == level + 1][["id","title","parent","topic_tree"]]
        topic_df_child.columns = "child_" + topic_df_child.columns
        
        topic_df_merged = topic_df_parent.merge(topic_df_child, left_on = "parent_id", right_on = "child_parent")[["child_id","parent_id","parent_title","child_title","parent_topic_tree"]]

        #Topic tree is parent topic tree + title of the current child on that level
        topic_tree = topic_df_merged["parent_topic_tree"].astype(str) + " is the parent of " + topic_df_merged["child_title"].astype(str)
        
        topic_tree_df = pd.DataFrame([topic_df_merged["child_id"].values,topic_tree.values]).T
        topic_tree_df.columns = ["child_id","topic_tree"]
        
        channel_df = channel_df.merge(topic_tree_df, left_on = "id", right_on = "child_id", how = "left").drop(["child_id"], axis = 1)
        if "topic_tree_y" in list(channel_df.columns):
            channel_df["topic_tree"] = channel_df["topic_tree_x"].combine_first(channel_df["topic_tree_y"])
            channel_df = channel_df.drop(["topic_tree_x","topic_tree_y"], axis = 1)
        
    topics_df_topic_tree = pd.concat([topics_df_topic_tree,channel_df])

topics_df_topic_tree = topics_df_topic_tree.reset_index(drop = True)

topics_df_topic_tree.columns = ["topic_"+ column for column in topics_df_topic_tree.columns]
topics_df_topic_tree = topics_df_topic_tree.rename(columns = {"topic_topic_tree":"topic_tree"})

100%|██████████| 171/171 [00:14<00:00, 11.61it/s]


In [6]:
import re
def clean_text(text):
    text = str(text).lower()
    text = re.sub('\[.*?@\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?@>+', '', text)
    text = re.sub('\n', ' ', text)
    text = re.sub('\@', '', text)
    text = re.sub('\_', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

In [7]:
topics_df_topic_tree["topic_description"] = (topics_df_topic_tree["topic_description"].fillna("") + ". " + topics_df_topic_tree["topic_tree"].fillna("")).progress_apply(clean_text)
content_df["content_description"] = (content_df["content_title"].fillna("") + ". " + content_df["content_description"].fillna("") + ". " + content_df["content_text"].fillna("")).progress_apply(clean_text)

100%|██████████| 76972/76972 [00:04<00:00, 18157.81it/s]
100%|██████████| 154047/154047 [01:51<00:00, 1378.73it/s]


In [8]:
#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
    sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    return (sum_embeddings / sum_mask).squeeze(0).cpu().numpy()

def get_embeddings(tokenizer, model, sentences):
        
    embeddings = []
    for sentence in tqdm(sentences):
        encoded_input = tokenizer(sentence, padding=True, truncation=True, max_length=128, return_tensors='pt').to(device)
        with torch.no_grad():
            model_output = model(**encoded_input)

        #Perform pooling. In this case, mean pooling
        vec = mean_pooling(model_output, encoded_input['attention_mask'])    
        embeddings.append(vec)
    return embeddings

In [11]:
model_en = AutoModel.from_pretrained(model_paths[1])
model_en.eval()
model_en.to(device)
tokenizer_en = AutoTokenizer.from_pretrained(model_paths[1])

model_not_en = AutoModel.from_pretrained(model_paths[0])
model_not_en.eval()
model_not_en.to(device)
tokenizer_not_en = AutoTokenizer.from_pretrained(model_paths[0])

In [12]:
%%time
english_content_df = content_df[content_df['content_language'] == 'en'].reset_index(drop = True)
english_content = list(english_content_df["content_description"].values)

other_than_english_content_df = content_df[content_df['content_language'] != 'en'].reset_index(drop = True)
other_than_english_content = list(other_than_english_content_df["content_description"].values)

content_vectors_en = get_embeddings(tokenizer_en, model_en, english_content)
content_vectors_not_en = get_embeddings(tokenizer_not_en, model_not_en, other_than_english_content)

english_content_ids = list(english_content_df["content_id"].values)
other_than_english_content_ids = list(other_than_english_content_df["content_id"].values)

100%|██████████| 65939/65939 [10:13<00:00, 107.44it/s]
100%|██████████| 88108/88108 [20:42<00:00, 70.92it/s] 

CPU times: user 30min 40s, sys: 20.1 s, total: 31min
Wall time: 30min 56s





In [13]:
%%time
nbrs_en = NearestNeighbors(n_neighbors = 50, metric = 'cosine').fit(content_vectors_en)
nbrs_not_en = NearestNeighbors(n_neighbors = 50, metric = 'cosine').fit(content_vectors_not_en)

CPU times: user 300 ms, sys: 87.8 ms, total: 388 ms
Wall time: 390 ms


In [14]:
# calculate the mean F2 over submission topics
f2_scores = []
submission_topics = list(submission.topic_id.values)

for submission_topic in tqdm(submission_topics):
    
    topic_index = topics_df_topic_tree[topics_df_topic_tree['topic_id'] == submission_topic].index.values[0]
    tlang = topics_df_topic_tree[topics_df_topic_tree['topic_id'] == submission_topic]['topic_language'].values[0]
    topic_description = topics_df_topic_tree[topics_df_topic_tree['topic_id'] == submission_topic]['topic_description'].values[0]
    
    if tlang == 'en':

        topic_vector = get_embeddings(tokenizer_en, model_en, [topic_description])
    
        # calculate the nearest neighbors for the target topic
        dist, nb = nbrs_en.kneighbors(topic_vector)
        
        # get the set of content IDs returned by the nearest neighbors model
        # (skipping over any content items where the language does not match)
        pred_content_ids = []
        for cindex in nb[0]:
            cid = english_content_ids[cindex]
            clang = english_content_df[english_content_df['content_id'] == cid]['content_language'].values[0]
            if clang == tlang:
                pred_content_ids.append(cid)
                
    else:
        topic_vector = get_embeddings(tokenizer_not_en, model_not_en, [topic_description])
    
        # calculate the nearest neighbors for the target topic
        dist, nb = nbrs_not_en.kneighbors(topic_vector)
        
        # get the set of content IDs returned by the nearest neighbors model
        # (skipping over any content items where the language does not match)
        pred_content_ids = []
        for cindex in nb[0]:
            cid = other_than_english_content_ids[cindex]
            clang = other_than_english_content_df[other_than_english_content_df['content_id'] == cid]['content_language'].values[0]
            if clang == tlang:
                pred_content_ids.append(cid)
        

    # trim to only the top 20 results
    pred_content_ids = set(pred_content_ids[:top_n])

    pred_col = ' '.join([pred_content_id for pred_content_id in list(pred_content_ids)])
    submission.loc[submission['topic_id'] == submission_topic,'content_ids'] = pred_col

    # get the set of ground truth content IDs correlated to the target topic
    true_content_ids = set(corr_df.loc[corr_df['topic_id'] == submission_topic,'content_id'])
    
    
    # calculate the confusion matrix variables
    tp = len(true_content_ids.intersection(pred_content_ids))
    fp = len(pred_content_ids - true_content_ids)
    fn = len(true_content_ids - pred_content_ids)

    # calculate the F2 score
    if len(true_content_ids) != 0:        
        if pred_content_ids:
            precision = tp / (tp + fp)
            recall = tp / (tp + fn)
            f2 = tp / (tp + 0.2 * fp + 0.8*fn)
        else:
            f2 = 0
    else:
        f2 = 0

    f2_scores.append(f2)
    
print("Average F2:", np.mean(f2_scores))

  0%|          | 0/5 [00:00<?, ?it/s]
100%|██████████| 1/1 [00:00<00:00, 65.27it/s]
 20%|██        | 1/5 [00:00<00:03,  1.13it/s]
100%|██████████| 1/1 [00:00<00:00, 64.25it/s]
 40%|████      | 2/5 [00:02<00:03,  1.07s/it]
100%|██████████| 1/1 [00:00<00:00, 84.07it/s]
 60%|██████    | 3/5 [00:02<00:01,  1.04it/s]
100%|██████████| 1/1 [00:00<00:00, 69.50it/s]
 80%|████████  | 4/5 [00:03<00:00,  1.09it/s]
100%|██████████| 1/1 [00:00<00:00, 68.65it/s]
100%|██████████| 5/5 [00:04<00:00,  1.15it/s]

Average F2: 0.12698412698412698





In [15]:
submission

Unnamed: 0,topic_id,content_ids
0,t_00004da3a1b2,c_c677789a0df4
1,t_00068291e9a4,c_88b5b091fd7b c_e6df0d885284 c_52dfafd40731 c...
2,t_00069b63a70a,c_02436b17b918 c_3695c5dc1df6 c_8577c06c226a c...
3,t_0006d41a73a8,c_29117d57eff7
4,t_4054df11a74e,c_52f9df7e611a c_3695c5dc1df6 c_8577c06c226a c...


In [18]:
submission.to_csv('/kaggle/working/submission.csv', index = False)