In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas()

import warnings
warnings.filterwarnings('ignore')

import os
os.chdir('/kaggle/input/')

In [2]:
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import fbeta_score
from annoy import AnnoyIndex

In [3]:
max_length = 512
top_n = 10
method  = 'tf-idf' #tf-idf, count vectorizer, transformers

In [4]:
if method == 'transformers':
    import torch
    from transformers import AutoTokenizer, AutoModel
    %env TOKENIZERS_PARALLELISM=true

    device = "cuda"
    model_path = '/kaggle/input/paraphrasemultilingualmpnetbasev2'

    model = AutoModel.from_pretrained(model_path)
    model.eval()
    model.to(device)

    tokenizer = AutoTokenizer.from_pretrained(model_path)

In [5]:
# Load the datasets
topics_df = pd.read_csv("learning-equality-curriculum-recommendations/topics.csv")
content_df = pd.read_csv("learning-equality-curriculum-recommendations/content.csv")
corr_df = pd.read_csv("learning-equality-curriculum-recommendations/correlations.csv")
submission = pd.read_csv("learning-equality-curriculum-recommendations/sample_submission.csv")

In [6]:
content_df.columns = ['content_'+ column for column in content_df.columns]

In [7]:
corr_df['content_ids'] = corr_df['content_ids'].str.split()
corr_df = corr_df.explode('content_ids').reset_index(drop = True)
corr_df = corr_df.rename(columns = {'content_ids':'content_id'})
corr_df.head()

Unnamed: 0,topic_id,content_id
0,t_00004da3a1b2,c_1108dd0c7a5d
1,t_00004da3a1b2,c_376c5a8eb028
2,t_00004da3a1b2,c_5bc0e1e2cba0
3,t_00004da3a1b2,c_76231f9d0b5e
4,t_00068291e9a4,c_639ea2ef9c95


In [8]:
topics_df_topic_tree = pd.DataFrame()

for channel in tqdm(topics_df['channel'].unique()):
    channel_df = topics_df[(topics_df['channel'] == channel)].reset_index(drop = True)
    for level in sorted(channel_df.level.unique()):
        #For level 0, it first creates a topic tree column which is the title of that topic.            
        if level == 0:
            topic_tree = channel_df[channel_df['level'] == level]['title'].astype(str)
            topic_tree_df = pd.DataFrame([channel_df[channel_df['level'] == level][['id']],topic_tree.values]).T
            topic_tree_df.columns = ['child_id','topic_tree']
            channel_df = channel_df.merge(topic_tree_df, left_on = 'id', right_on = 'child_id', how = 'left').drop(['child_id'], axis = 1)
        
        #Once the topic tree column has been created, the parent node and child node is merged on parent_id = child_id
        topic_df_parent = channel_df[channel_df['level'] == level][['id','title','parent','topic_tree']]
        topic_df_parent.columns = 'parent_' + topic_df_parent.columns
        
        topic_df_child = channel_df[channel_df['level'] == level + 1][['id','title','parent','topic_tree']]
        topic_df_child.columns = 'child_' + topic_df_child.columns
        
        topic_df_merged = topic_df_parent.merge(topic_df_child, left_on = 'parent_id', right_on = 'child_parent')[['child_id','parent_id','parent_title','child_title','parent_topic_tree']]

        #Topic tree is parent topic tree + title of the current child on that level
        topic_tree = topic_df_merged['parent_topic_tree'].astype(str) + ' >> ' + topic_df_merged['child_title'].astype(str)
        
        topic_tree_df = pd.DataFrame([topic_df_merged['child_id'].values,topic_tree.values]).T
        topic_tree_df.columns = ['child_id','topic_tree']
        
        channel_df = channel_df.merge(topic_tree_df, left_on = 'id', right_on = 'child_id', how = 'left').drop(['child_id'], axis = 1)
        if 'topic_tree_y' in list(channel_df.columns):
            channel_df['topic_tree'] = channel_df['topic_tree_x'].combine_first(channel_df['topic_tree_y'])
            channel_df = channel_df.drop(['topic_tree_x','topic_tree_y'], axis = 1)
        
    topics_df_topic_tree = pd.concat([topics_df_topic_tree,channel_df])

topics_df_topic_tree = topics_df_topic_tree.reset_index(drop = True)

topics_df_topic_tree.columns = ['topic_'+ column for column in topics_df_topic_tree.columns]
topics_df_topic_tree = topics_df_topic_tree.rename(columns = {'topic_topic_tree':'topic_tree'})

100%|██████████| 171/171 [00:14<00:00, 11.56it/s]


In [9]:
topics_df_topic_tree['topic_description'] = (topics_df_topic_tree['topic_description'].fillna('') + '[SEP]' + topics_df_topic_tree['topic_tree'].fillna(''))
content_df['content_description'] = content_df['content_description'].fillna('') + content_df['content_text'].fillna('') + content_df['content_title'].fillna('')

In [10]:
topics_df_topic_tree = topics_df_topic_tree[topics_df_topic_tree.topic_has_content].reset_index(drop = True)
topics_df_topic_tree = topics_df_topic_tree[topics_df_topic_tree['topic_language'] == 'en'].reset_index(drop = True)
content_df = content_df[content_df['content_language'] == 'en'].reset_index(drop = True)

In [11]:
import re
def clean_text(text):
    text = str(text).lower()
    text = re.sub('\[.*?@\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?@>+', '', text)
    text = re.sub('\n', ' ', text)
    text = re.sub('\@', '', text)
    text = re.sub('\_', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

In [12]:
if method != 'transformers':
    topics_df_topic_tree['topic_description_cleaned'] = topics_df_topic_tree['topic_description'].progress_apply(clean_text)
    content_df['content_description_cleaned'] = content_df['content_description'].progress_apply(clean_text)

100%|██████████| 28053/28053 [00:00<00:00, 35263.23it/s]
100%|██████████| 65939/65939 [00:43<00:00, 1527.41it/s]


In [13]:
from nltk.corpus import stopwords
if method != 'transformers':
    stop_words = stopwords.words('english')

    def remove_stopwords(text):
        text = ' '.join(word for word in text.split(' ') if word not in stop_words)
        return text

    topics_df_topic_tree['topic_description_cleaned'] = topics_df_topic_tree['topic_description_cleaned'].progress_apply(remove_stopwords)
    content_df['content_description_cleaned'] = content_df['content_description_cleaned'].progress_apply(remove_stopwords)
    content_df.head()

100%|██████████| 28053/28053 [00:02<00:00, 13078.63it/s]
100%|██████████| 65939/65939 [02:21<00:00, 466.94it/s]


In [14]:
import nltk

if method != 'transformers':
    stemmer = nltk.SnowballStemmer("english")

    def stemm_text(text):
        text = ' '.join(stemmer.stem(word) for word in text.split(' '))
        return text

    topics_df_topic_tree['topic_description_cleaned'] = topics_df_topic_tree['topic_description_cleaned'].progress_apply(stemm_text)
    content_df['content_description_cleaned'] = content_df['content_description_cleaned'].progress_apply(stemm_text)
    content_df.head()

100%|██████████| 28053/28053 [00:07<00:00, 3943.38it/s]
100%|██████████| 65939/65939 [07:14<00:00, 151.71it/s]


In [15]:
def get_embeddings(list_text):
    embeddings = []
    
    for text in tqdm(list_text):
        tok = torch.tensor(tokenizer.encode(text, padding = 'max_length', truncation = True)).to(device).unsqueeze(0)
        with torch.no_grad():
            vec = model(tok).last_hidden_state.squeeze(0).mean(0).cpu().numpy()
        embeddings.append(vec)
        
    return embeddings

In [16]:
%%time
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

if method == 'tf-idf':
    vectorizer = TfidfVectorizer()
    vectorizer.fit(content_df['content_description_cleaned'])
    content_vectors = vectorizer.transform(content_df['content_description_cleaned']).toarray()
    topic_vectors = vectorizer.transform(topics_df_topic_tree['topic_description_cleaned']).toarray()
    
elif method == 'count vectorizer':
    vectorizer = CountVectorizer()
    vectorizer.fit(content_df['content_description_cleaned'])
    content_vectors = vectorizer.transform(content_df['content_description_cleaned']).toarray()
    topic_vectors = vectorizer.transform(topics_df_topic_tree['topic_description_cleaned']).toarray()

elif method == 'transformers':
    content_vectors = get_embeddings(list(content_df["content_description"].values))
    topic_vectors = get_embeddings(list(topics_df_topic_tree["topic_description"].values))

CPU times: user 46 s, sys: 10.1 s, total: 56.1 s
Wall time: 56.2 s


In [17]:
%%time
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD

if method != 'transformers':
    content_vectors = csr_matrix(content_vectors)
    topic_vectors = csr_matrix(topic_vectors)
    svd = TruncatedSVD(n_components = 10, random_state=42)
    content_vectors = svd.fit_transform(content_vectors)
    topic_vectors = svd.fit_transform(topic_vectors)

CPU times: user 3min 27s, sys: 23.2 s, total: 3min 50s
Wall time: 3min 42s


In [18]:
nbrs = NearestNeighbors(n_neighbors = 50, metric = 'cosine').fit(content_vectors)

In [19]:
topic_ids = list(topics_df_topic_tree['topic_id'].values)
content_ids = list(content_df['content_id'].values)

In [21]:
# specify the target topic to use
topic_id = 't_06bdfbed8171'
topic_index = topics_df_topic_tree[topics_df_topic_tree['topic_id'] == topic_id].index.values[0]
tlang = topics_df_topic_tree[topics_df_topic_tree['topic_id'] == topic_id]['topic_language'].values[0]
topic_tree = topics_df_topic_tree[topics_df_topic_tree['topic_id'] == topic_id]['topic_tree'].values[0]

topic_vector = topic_vectors[topic_index]
dist, nb = nbrs.kneighbors([topic_vector])

# get the set of ground truth content IDs correlated to the target topic
true_content_ids = set(corr_df.loc[corr_df['topic_id'] == topic_id,'content_id'])

# get the set of content IDs returned by the nearest neighbors model
# (skipping over any content items where the language does not match)
pred_content_ids = []
for cindex in nb[0]:
    cid = content_ids[cindex]
    clang = content_df[content_df['content_id'] == cid]['content_language'].values[0]
    if  clang == tlang:
        pred_content_ids.append(cid)
        
# trim to only the top k results
k = top_n
pred_content_ids = set(pred_content_ids[:k])

# display the ground truth and predicted content item titles
print("True content:")
for cid in true_content_ids:
    content_title = content_df[content_df['content_id'] == cid].content_title.values[0]
    print("  ", cid, "\t", content_title)
    
print("Predicted content:")
if pred_content_ids:
    for cid in pred_content_ids:
        content_title = content_df[content_df['content_id'] == cid].content_title.values[0]
        print("  ", cid, "\t", content_title)

# calculate the confusion matrix variables
tp = len(true_content_ids.intersection(pred_content_ids))
fp = len(pred_content_ids - true_content_ids)
fn = len(true_content_ids - pred_content_ids)

print("Ground truth count:", len(true_content_ids))
print("Predicted count:", len(pred_content_ids))
print("True positives:", tp)
print("False positives:", fp)

# calculate the F2 score
if tp or (fp and fn):
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f2 = tp / (tp + 0.2 * fp + 0.8 * fn)
    print(f"F2: {round(f2*100,2)}%")

print(f'Precision at {k}: {tp/k * 100}%')

True content:
   c_3a94b8b50c48 	 Chapter 1. Foundations
   c_afab13ecb3d5 	 Chapter 2. Solving Linear Equations and Inequalities
   c_c3cd764a6335 	 Chapter 6. Percents
Predicted content:
   c_78fafdb40be0 	 The Mean 2
   c_1401ecd2e9de 	 Level 2: The Mean
   c_16a87e620109 	 Simulation showing value of t statistic
   c_93242d63f027 	 The Mean of Means
   c_a0dada46d589 	 Level 1: The Mean
   c_9753c88fad23 	 6.1 The meaning of locus
   c_11dfc6a3bd5b 	 Level 3: The Mean
   c_7fb892d6095a 	 Social theories overview (part 1)
   c_a5855dc771b8 	 The Mean
   c_e72e1a5d79f0 	 Mean
Ground truth count: 3
Predicted count: 10
True positives: 0
False positives: 10
F2: 0.0%
Precision at 10: 0.0%


In [24]:
# calculate the mean F2 over submission topics
f2_scores = []
precision_at_k = []
avg_precision_at_k = []

for topic_id in tqdm(topic_ids[:1000]):
    
    topic_index = topics_df_topic_tree[topics_df_topic_tree['topic_id'] == topic_id].index.values[0]
    tlang = topics_df_topic_tree[topics_df_topic_tree['topic_id'] == topic_id]['topic_language'].values[0]
    topic_description = topics_df_topic_tree[topics_df_topic_tree['topic_id'] == topic_id]['topic_description'].values[0]
    
    topic_vector = topic_vectors[topic_index]

    # calculate the nearest neighbors for the target topic
    dist, nb = nbrs.kneighbors([topic_vector])

    # get the set of ground truth content IDs correlated to the target topic
    true_content_ids = set(corr_df.loc[corr_df['topic_id'] == topic_id,'content_id'])

    # get the set of content IDs returned by the nearest neighbors model
    # (skipping over any content items where the language does not match)
    pred_content_ids = []
    for cindex in nb[0]:
        cid = content_ids[cindex]
        clang = content_df[content_df['content_id'] == cid]['content_language'].values[0]
        if clang == tlang:
            pred_content_ids.append(cid)

    # trim to only the top 20 results
    pred_content_ids = set(pred_content_ids[:top_n])

    # calculate the confusion matrix variables
    tp = len(true_content_ids.intersection(pred_content_ids))
    fp = len(pred_content_ids - true_content_ids)
    fn = len(true_content_ids - pred_content_ids)

    # calculate the F2 score
    if len(true_content_ids) != 0:        
        if pred_content_ids:
            precision = tp / (tp + fp)
            recall = tp / (tp + fn)
            f2 = tp / (tp + 0.2 * fp + 0.8*fn)
        else:
            f2 = 0
    else:
        f2 = 0

    f2_scores.append(f2)

    precision_at_k.append(tp/k)
    avg_precision_at_k.append(sum([tp/i for i in range(1,k+1)])/k)

print("Average F2:", np.mean(f2_scores))
print("Average Precision at K:", np.mean(avg_precision_at_k))

100%|██████████| 1000/1000 [04:52<00:00,  3.42it/s]

Average F2: 0.0
Average Precision at K: 0.0



