In [1]:
#!pip install polars

# import

In [2]:
import gc
import os
import json
import sys

sys.path.append("/kaggle/input/sentence-transformers-v222/sentence-transformers-master")

import cupy as cp
import numpy as np
import pandas as pd
#import pytorch_lightning as pl
#import polars as pl
import sklearn.metrics as metrics
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from cuml.metrics import pairwise_distances
from cuml.neighbors import NearestNeighbors
from pytorch_lightning import Trainer
from pytorch_lightning import seed_everything
from pytorch_lightning.loggers import WandbLogger
from sklearn.model_selection import StratifiedGroupKFold
from transformers import (
    AutoConfig,
    AutoModel,
    AutoTokenizer,
)

from tqdm import tqdm
tqdm.pandas()

from sentence_transformers import SentenceTransformer, LoggingHandler
from sentence_transformers import models, util, datasets, evaluation, losses

# Config

In [3]:
cfg = {
    "general": {
        "project_name": "LECR",
        "input_path": "/kaggle/input",
        "seed": 42,
        "cv": True,
        #"wandb_desabled": False,
        "fold": [0, 1, 2, 3], # list (0-idx start) or null. set one element list, hold-out mode.
        #"n_splits": 15,
    },
    
    "task": {
        "select_top_n": 10,
        "metric": "cosine",
    },
    
    #"model": {
    #    "model_name_other": "/kaggle/input/lecr-retriever-model/alllang",
    #    "model_name_en": "/kaggle/input/lecr-retriever-model/english",
    #    "model_name_es": "/kaggle/input/lecr-retriever-model/spanish",
    #},
}

# Function

In [4]:
def f2_score(y_true, y_pred):
    y_true = y_true.apply(lambda x: set(x.split()))
    y_pred = y_pred.apply(lambda x: set(x.split()))
    tp = np.array([len(x[0] & x[1]) for x in zip(y_true, y_pred)])
    fp = np.array([len(x[1] - x[0]) for x in zip(y_true, y_pred)])
    fn = np.array([len(x[0] - x[1]) for x in zip(y_true, y_pred)])
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f2 = tp / (tp + 0.2 * fp + 0.8 * fn)
    return round(f2.mean(), 4)

def get_pos_score(y_true, y_pred):
    y_true = y_true.apply(lambda x: set(x.split()))
    y_pred = y_pred.apply(lambda x: set(x.split()))
    int_true = np.array([len(x[0] & x[1]) / len(x[0]) for x in zip(y_true, y_pred)])
    return round(np.mean(int_true), 5)

In [5]:
def build_training_set(topic, content):
    # Create lists for training
    topic_ids = []
    content_ids = []
    topic_inputs = []
    content_inputs = []
    language = []
    targets = []
    dists = []
    # Iterate over each topic
    for k in tqdm(range(len(topic))):
        row = topic.iloc[k]
        topic_id = row["id"]
        topic_input = row["topic_inputs"]
        lang = row["language"]
        predictions = row["predicts"].split(" ")
        ground_truth = row["content_ids"].split(" ")
        dist = row["dists"]
        if predictions[0] == "":
            #topic_ids.append(topic_id)
            #content_ids.append(np.nan)
            #topic_inputs.append(topic_inputs)
            #content_inputs.append(np.nan)
            #targets.append(np.nan)
            #dists.append(np.nan)
            continue
        for i, pred in enumerate(predictions):
            content_input = content.loc[pred, "content_inputs"]
            topic_ids.append(topic_id)
            content_ids.append(pred)
            topic_inputs.append(topic_input)
            content_inputs.append(content_input)
            language.append(lang)
            # If pred is in ground truth, 1 else 0
            if pred in ground_truth:
                targets.append(1)
            else:
                targets.append(0)
            dists.append(dist[i])
    train = pd.DataFrame(
        {"topic_ids": topic_ids, 
         "content_ids": content_ids, 
         "topic_inputs": topic_inputs, 
         "content_inputs": content_inputs, 
         "target": targets,
         "language": language,
         "dists": dists,
        }
    )
    # Release memory
    del topic_ids, content_ids, topic_inputs, content_inputs, targets
    gc.collect()
    return train

# polars
def build_training_set_pl(topic, content):
    # Create lists for training
    topic_ids = []
    content_ids = []
    topic_inputs = []
    content_inputs = []
    language = []
    targets = []
    dists = []
    # Iterate over each topic
    for k in tqdm(range(len(topic))):
        row = topic[k]
        topic_id = row["id"][0]
        topic_input = row["topic_inputs"][0]
        lang = row["language"][0]
        predictions = row["predicts"][0].split(" ")
        ground_truth = row["content_ids"][0].split(" ")
        dist = row["dists"]
        if predictions[0] == "":
            #topic_ids.append(topic_id)
            #content_ids.append(np.nan)
            #topic_inputs.append(topic_inputs)
            #content_inputs.append(np.nan)
            #targets.append(np.nan)
            #dists.append(np.nan)
            continue
        for pred in predictions:
            content_input = content.filter(pl.col("id")==pred)["content_inputs"][0]
            topic_ids.append(topic_id)
            content_ids.append(pred)
            topic_inputs.append(topic_input)
            content_inputs.append(content_input)
            language.append(lang)
            # If pred is in ground truth, 1 else 0
            if pred in ground_truth:
                targets.append(1)
            else:
                targets.append(0)
            dists.append(dist)
    train = pl.DataFrame(
        {"topic_ids": topic_ids, 
         "content_ids": content_ids, 
         "topic_inputs": topic_inputs, 
         "content_inputs": content_inputs, 
         "target": targets,
         "language": language,
         "dists": dists,
        }
    )
    # Release memory
    del topic_ids, content_ids, topic_inputs, content_inputs, targets
    gc.collect()
    return train

## Read Data

In [6]:
def read_data(fold_n, mode="train"):
    content_df = pd.read_pickle(f"{cfg['general']['input_path']}/lecr-datapkl/content.pkl")
    correlations_df = pd.read_csv(f"{cfg['general']['input_path']}/learning-equality-curriculum-recommendations/correlations.csv")
    topics_df = pd.read_pickle(f"{cfg['general']['input_path']}/lecr-datapkl/topics.pkl")
    sub_df = pd.read_csv(f"{cfg['general']['input_path']}/learning-equality-curriculum-recommendations/sample_submission.csv")
    fold = pd.read_pickle(f"{cfg['general']['input_path']}/lecr-datapkl/fold.pkl")
    
    print(f"fold_{fold_n}")
    topics_df_fold = topics_df.merge(fold, how="inner", left_on="id", right_on="topic_id")
    topics_df_fold = topics_df_fold.drop(["id"], axis=1)
    if mode=="train":
        topics_df_fold_indices = topics_df_fold["fold"]!=fold_n
    elif mode=="valid":
        topics_df_fold_indices = topics_df_fold["fold"]==fold_n
    topics_df_fold = topics_df[topics_df_fold_indices]

    topics_df_fold.rename(columns=lambda x: "topic_" + x, inplace=True)
    content_df.rename(columns=lambda x: "content_" + x, inplace=True)

    correlations = correlations_df.copy()
    correlations.content_ids = correlations.content_ids.str.split()
    correlations = correlations.explode("content_ids").rename(columns={"content_ids": "content_id"})
    correlations = correlations.merge(topics_df_fold, how="inner", on="topic_id")
    correlations = correlations.merge(content_df, how="inner", on="content_id")
    correlations.rename(columns={"content_content_inputs" : "content_inputs"}, inplace=True)
    correlations.rename(columns={"topic_topic_inputs" : "topic_inputs"}, inplace=True)

    topics_df = topics_df_fold.rename(columns=lambda x: x[6:]).reset_index(drop=True)
    content_df = content_df.rename(columns=lambda x: x[8:]).reset_index(drop=True)
    content_df = content_df[content_df["id"].isin(correlations["content_id"].tolist())].reset_index(drop=True)
    return topics_df, content_df, correlations_df

## valid data

In [7]:
for fold_n in cfg["general"]["fold"]:
    topics_df, content_df, correlations_df = read_data(fold_n, mode="valid")
    
    topics_en_df = topics_df[topics_df["language"]=="en"].reset_index(drop=True)
    content_en_df = content_df[content_df["language"]=="en"].reset_index(drop=True)
    topics_es_df = topics_df[topics_df["language"]=="es"].reset_index(drop=True)
    content_es_df = content_df[content_df["language"]=="es"].reset_index(drop=True)
    topics_other_df = topics_df[(topics_df["language"]!="en") & (topics_df["language"]!="es")].reset_index(drop=True)
    content_other_df = content_df[(content_df["language"]!="en") & (content_df["language"]!="es")].reset_index(drop=True)
    topics_df = pd.concat([topics_en_df, topics_es_df, topics_other_df]).reset_index(drop=True)
    content_df = pd.concat([content_en_df, content_es_df, content_other_df]).reset_index(drop=True)
    
    seed_everything(cfg["general"]["seed"], workers=True)

    # english
    model = SentenceTransformer(f"/kaggle/input/lecr-retriever-model/LECR_retriever_fold{fold_n}_model/english")
    topics_embeddings_en = model.encode(topics_en_df["topic_inputs"].tolist()) #.cpu().detach().numpy()
    content_embeddings_en = model.encode(content_en_df["content_inputs"].tolist()) #.cpu().detach().numpy()
    del model
    torch.cuda.empty_cache()
    gc.collect()

    # spanish
    model = SentenceTransformer(f"/kaggle/input/lecr-retriever-model/LECR_retriever_fold{fold_n}_model/spanish")
    topics_embeddings_es = model.encode(topics_es_df["topic_inputs"].tolist()) #.cpu().detach().numpy()
    content_embeddings_es = model.encode(content_es_df["content_inputs"].tolist()) #.cpu().detach().numpy()
    del model
    torch.cuda.empty_cache()
    gc.collect()

    # other
    model = SentenceTransformer(f"/kaggle/input/lecr-retriever-model/LECR_retriever_fold{fold_n}_model/alllang")
    topics_embeddings_other = model.encode(topics_other_df["topic_inputs"].tolist()) #.cpu().detach().numpy()
    content_embeddings_other = model.encode(content_other_df["content_inputs"].tolist()) #.cpu().detach().numpy()
    del model
    torch.cuda.empty_cache()
    gc.collect()

    topics_embeddings_en = cp.array(topics_embeddings_en)
    topics_embeddings_es = cp.array(topics_embeddings_es)
    topics_embeddings_other = cp.array(topics_embeddings_other)
    content_embeddings_en = cp.array(content_embeddings_en)
    content_embeddings_es = cp.array(content_embeddings_es)
    content_embeddings_other = cp.array(content_embeddings_other)
    
    #k_list = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 100]
    k_list = [5, 10, 50, 100]

    # english
    if len(topics_embeddings_en) != 0:
        n_neighbors = min(len(topics_embeddings_en), 1000)
        n_neighbors = n_neighbors if n_neighbors != 0 else 1
        nn = NearestNeighbors(n_neighbors, metric=cfg["task"]["metric"])
        nn.fit(content_embeddings_en)
        dists_en, indices_en = nn.kneighbors(topics_embeddings_en)
        del topics_embeddings_en, content_embeddings_en, nn
        torch.cuda.empty_cache()
        gc.collect()

    # spanish
    if len(topics_embeddings_es) != 0:
        n_neighbors = min(len(topics_embeddings_es), 1000)
        n_neighbors = n_neighbors if n_neighbors != 0 else 1
        nn = NearestNeighbors(n_neighbors, metric=cfg["task"]["metric"])
        nn.fit(content_embeddings_es)
        dists_es, indices_es = nn.kneighbors(topics_embeddings_es)
        del topics_embeddings_es, content_embeddings_es, nn
        torch.cuda.empty_cache()
        gc.collect()

    # other
    if len(topics_embeddings_other) != 0:
        n_neighbors = min(len(topics_embeddings_other), 1000)
        n_neighbors = n_neighbors if n_neighbors != 0 else 1
        nn = NearestNeighbors(n_neighbors, metric=cfg["task"]["metric"])
        nn.fit(content_embeddings_other)
        dists_other, indices_other = nn.kneighbors(topics_embeddings_other)
        del topics_embeddings_other, content_embeddings_other, nn
        torch.cuda.empty_cache()
        gc.collect()
        new_dists_other = []
        new_indices_other = []
        content_other_df["dists"] = np.nan
        for i in tqdm(topics_other_df.index):
            content_other_df.loc[indices_other[i].get(), "dists"] = dists_other[i].get()
            lang = topics_other_df.iloc[i]["language"]
            _content_df = content_other_df.iloc[indices_other[i].get()]
            _dists, _indices = _content_df[_content_df["language"]==lang]["dists"], _content_df[_content_df["language"]==lang].index
            new_dists_other.append(_dists.tolist())
            new_indices_other.append(_indices.tolist())
        content_other_df = content_other_df.drop(["dists"], axis=1)
        del _content_df, _indices, indices_other
        gc.collect()

    for k in k_list:
        print(f"[{k}]")

        # english
        predicts_en = []
        predicts_en_dists = []
        if len(topics_en_df) != 0:
            for i, d in tqdm(zip(indices_en, dists_en)):
                _content_id = content_en_df["id"].to_numpy()[i.get()].tolist()
                predicts_en.append(_content_id[:k])
                predicts_en_dists.append(d[:k].get())
        predicts_en = pd.Series(predicts_en)
        predicts_en_dists = pd.Series(predicts_en_dists)
        predicts_en = predicts_en.apply(lambda x: ' '.join(x))

        # spanish
        predicts_es = []
        predicts_es_dists = []
        if len(topics_es_df) != 0:
            for i, d in tqdm(zip(indices_es, dists_es)):
                _content_id = content_es_df["id"].to_numpy()[i.get()].tolist()
                predicts_es.append(_content_id[:k])
                predicts_es_dists.append(d[:k].get())
        predicts_es = pd.Series(predicts_es)
        predicts_es_dists = pd.Series(predicts_es_dists)
        predicts_es = predicts_es.apply(lambda x: ' '.join(x))

        # other
        predicts_other = []
        predicts_other_dists = []
        if len(topics_other_df) != 0:
            for i, d in tqdm(zip(new_indices_other, new_dists_other)):
                _content_id = content_other_df["id"].to_numpy()[i].tolist()
                predicts_other.append(_content_id[:k])
                predicts_other_dists.append(d[:k])
        predicts_other = pd.Series(predicts_other)
        predicts_other_dists = pd.Series(predicts_other_dists)
        predicts_other = predicts_other.apply(lambda x: ' '.join(x))

        # concat
        predicts = pd.concat([predicts_en, predicts_es, predicts_other]).reset_index(drop=True)
        dists = pd.concat([predicts_en_dists, predicts_es_dists, predicts_other_dists]).reset_index(drop=True)
        del predicts_en, predicts_es, predicts_other, predicts_en_dists, predicts_es_dists, predicts_other_dists
        gc.collect()
        print(predicts)

        _topics_df = topics_df.copy()
        _topics_df["predicts"] = predicts
        _topics_df["dists"] = dists
        _topics_df = _topics_df.merge(correlations_df, how="left", left_on=["id"], right_on=["topic_id"])
        _topics_df = _topics_df.dropna(subset=["topic_id", "content_ids"])
        score = f2_score(_topics_df["content_ids"], _topics_df["predicts"])
        print(score)
        pos_score = get_pos_score(_topics_df["content_ids"], _topics_df["predicts"])
        print(f"Our max positive score is {pos_score}")
        if k == 5 or k == 10 or k == 50 or k == 100:
            print("save 2nd valid data")
            content_df_copy = content_df.copy()
            content_df_copy.set_index("id", inplace = True)
            valid = build_training_set(_topics_df, content_df_copy)
            print(f"Our validation set has {len(valid)} rows")
            valid.to_pickle(f"valid_fold_k{k}_fold{fold_n}.pkl")
            print(valid.head())
            del content_df_copy, valid

        torch.cuda.empty_cache()
        gc.collect()

fold_0


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Batches:   0%|          | 0/168 [00:00<?, ?it/s]

Batches:   0%|          | 0/341 [00:00<?, ?it/s]

Batches:   0%|          | 0/47 [00:00<?, ?it/s]

Batches:   0%|          | 0/107 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 50/50 [00:00<00:00, 536.94it/s]


[5]


5375it [00:00, 7794.44it/s]
1477it [00:00, 7767.78it/s]
50it [00:00, 24989.90it/s]


0       c_b8d730238789 c_d26e23e98356 c_3a2bf4a358da c...
1       c_404baaa0a2fe c_830fb82cddde c_13ab18f45197 c...
2       c_83b06ac8a3ca c_6f1196a4e8f5 c_ca56e164a2af c...
3       c_73c5c0d98bbd c_bcf40879b895 c_540f1dc2cb9c c...
4       c_3f70e452b7b0 c_cc41357fcc90 c_8e48dd7a81bd c...
                              ...                        
6897    c_f4737ddf417c c_a56dde14a1fc c_97b8efd9f864 c...
6898    c_ce57efbdd3b8 c_4a81a6858b9f c_c1cf7fd3e45a c...
6899    c_60b8df5f7763 c_3dd41dcaf284 c_aa2cdf30fb59 c...
6900    c_0685d6dab7a4 c_792134ba7444 c_6764b0d499ab c...
6901    c_4a81a6858b9f c_4dc2cfdfe42f c_6cdb121e493e c...
Length: 6902, dtype: object
0.4353
Our max positive score is 0.56588
save 2nd valid data


100%|██████████| 4749/4749 [00:00<00:00, 5393.43it/s]


Our validation set has 23745 rows
        topic_ids     content_ids  \
0  t_000d1fb3f2f5  c_b8d730238789   
1  t_000d1fb3f2f5  c_d26e23e98356   
2  t_000d1fb3f2f5  c_3a2bf4a358da   
3  t_000d1fb3f2f5  c_7868f655c31e   
4  t_000d1fb3f2f5  c_acaa5b1ab542   

                                        topic_inputs  \
0  2.1.2 - Logarithms [T_SEP] Camara Education Et...   
1  2.1.2 - Logarithms [T_SEP] Camara Education Et...   
2  2.1.2 - Logarithms [T_SEP] Camara Education Et...   
3  2.1.2 - Logarithms [T_SEP] Camara Education Et...   
4  2.1.2 - Logarithms [T_SEP] Camara Education Et...   

                                      content_inputs  target language  \
0  Relationship between exponentials & logarithms...       1       en   
1  Relationship between exponentials & logarithms...       1       en   
2  Evaluate logarithms [C_SEP] Evaluate basic log...       1       en   
3  Relationship between exponentials & logarithms...       1       en   
4  Relationship between exponentials & lo

5375it [00:00, 10358.00it/s]
1477it [00:00, 10815.46it/s]
50it [00:00, 38444.58it/s]


0       c_b8d730238789 c_d26e23e98356 c_3a2bf4a358da c...
1       c_404baaa0a2fe c_830fb82cddde c_13ab18f45197 c...
2       c_83b06ac8a3ca c_6f1196a4e8f5 c_ca56e164a2af c...
3       c_73c5c0d98bbd c_bcf40879b895 c_540f1dc2cb9c c...
4       c_3f70e452b7b0 c_cc41357fcc90 c_8e48dd7a81bd c...
                              ...                        
6897    c_f4737ddf417c c_a56dde14a1fc c_97b8efd9f864 c...
6898    c_ce57efbdd3b8 c_4a81a6858b9f c_c1cf7fd3e45a c...
6899    c_60b8df5f7763 c_3dd41dcaf284 c_aa2cdf30fb59 c...
6900    c_0685d6dab7a4 c_792134ba7444 c_6764b0d499ab c...
6901    c_4a81a6858b9f c_4dc2cfdfe42f c_6cdb121e493e c...
Length: 6902, dtype: object
0.4073
Our max positive score is 0.6691
save 2nd valid data


100%|██████████| 4749/4749 [00:01<00:00, 4205.06it/s]


Our validation set has 47490 rows
        topic_ids     content_ids  \
0  t_000d1fb3f2f5  c_b8d730238789   
1  t_000d1fb3f2f5  c_d26e23e98356   
2  t_000d1fb3f2f5  c_3a2bf4a358da   
3  t_000d1fb3f2f5  c_7868f655c31e   
4  t_000d1fb3f2f5  c_acaa5b1ab542   

                                        topic_inputs  \
0  2.1.2 - Logarithms [T_SEP] Camara Education Et...   
1  2.1.2 - Logarithms [T_SEP] Camara Education Et...   
2  2.1.2 - Logarithms [T_SEP] Camara Education Et...   
3  2.1.2 - Logarithms [T_SEP] Camara Education Et...   
4  2.1.2 - Logarithms [T_SEP] Camara Education Et...   

                                      content_inputs  target language  \
0  Relationship between exponentials & logarithms...       1       en   
1  Relationship between exponentials & logarithms...       1       en   
2  Evaluate logarithms [C_SEP] Evaluate basic log...       1       en   
3  Relationship between exponentials & logarithms...       1       en   
4  Relationship between exponentials & lo

5375it [00:00, 10209.22it/s]
1477it [00:00, 10951.17it/s]
50it [00:00, 33748.83it/s]


0       c_b8d730238789 c_d26e23e98356 c_3a2bf4a358da c...
1       c_404baaa0a2fe c_830fb82cddde c_13ab18f45197 c...
2       c_83b06ac8a3ca c_6f1196a4e8f5 c_ca56e164a2af c...
3       c_73c5c0d98bbd c_bcf40879b895 c_540f1dc2cb9c c...
4       c_3f70e452b7b0 c_cc41357fcc90 c_8e48dd7a81bd c...
                              ...                        
6897    c_f4737ddf417c c_a56dde14a1fc c_97b8efd9f864 c...
6898    c_ce57efbdd3b8 c_4a81a6858b9f c_c1cf7fd3e45a c...
6899    c_60b8df5f7763 c_3dd41dcaf284 c_aa2cdf30fb59 c...
6900    c_0685d6dab7a4 c_792134ba7444 c_6764b0d499ab c...
6901    c_4a81a6858b9f c_4dc2cfdfe42f c_6cdb121e493e c...
Length: 6902, dtype: object
0.2072
Our max positive score is 0.81634
save 2nd valid data


100%|██████████| 4749/4749 [00:03<00:00, 1503.62it/s]


Our validation set has 237450 rows
        topic_ids     content_ids  \
0  t_000d1fb3f2f5  c_b8d730238789   
1  t_000d1fb3f2f5  c_d26e23e98356   
2  t_000d1fb3f2f5  c_3a2bf4a358da   
3  t_000d1fb3f2f5  c_7868f655c31e   
4  t_000d1fb3f2f5  c_acaa5b1ab542   

                                        topic_inputs  \
0  2.1.2 - Logarithms [T_SEP] Camara Education Et...   
1  2.1.2 - Logarithms [T_SEP] Camara Education Et...   
2  2.1.2 - Logarithms [T_SEP] Camara Education Et...   
3  2.1.2 - Logarithms [T_SEP] Camara Education Et...   
4  2.1.2 - Logarithms [T_SEP] Camara Education Et...   

                                      content_inputs  target language  \
0  Relationship between exponentials & logarithms...       1       en   
1  Relationship between exponentials & logarithms...       1       en   
2  Evaluate logarithms [C_SEP] Evaluate basic log...       1       en   
3  Relationship between exponentials & logarithms...       1       en   
4  Relationship between exponentials & l

5375it [00:00, 7159.95it/s]
1477it [00:00, 4513.80it/s]
50it [00:00, 16574.35it/s]


0       c_b8d730238789 c_d26e23e98356 c_3a2bf4a358da c...
1       c_404baaa0a2fe c_830fb82cddde c_13ab18f45197 c...
2       c_83b06ac8a3ca c_6f1196a4e8f5 c_ca56e164a2af c...
3       c_73c5c0d98bbd c_bcf40879b895 c_540f1dc2cb9c c...
4       c_3f70e452b7b0 c_cc41357fcc90 c_8e48dd7a81bd c...
                              ...                        
6897    c_f4737ddf417c c_a56dde14a1fc c_97b8efd9f864 c...
6898    c_ce57efbdd3b8 c_4a81a6858b9f c_c1cf7fd3e45a c...
6899    c_60b8df5f7763 c_3dd41dcaf284 c_aa2cdf30fb59 c...
6900    c_0685d6dab7a4 c_792134ba7444 c_6764b0d499ab c...
6901    c_4a81a6858b9f c_4dc2cfdfe42f c_6cdb121e493e c...
Length: 6902, dtype: object
0.1316
Our max positive score is 0.8592
save 2nd valid data


100%|██████████| 4749/4749 [00:05<00:00, 862.05it/s]


Our validation set has 472900 rows
        topic_ids     content_ids  \
0  t_000d1fb3f2f5  c_b8d730238789   
1  t_000d1fb3f2f5  c_d26e23e98356   
2  t_000d1fb3f2f5  c_3a2bf4a358da   
3  t_000d1fb3f2f5  c_7868f655c31e   
4  t_000d1fb3f2f5  c_acaa5b1ab542   

                                        topic_inputs  \
0  2.1.2 - Logarithms [T_SEP] Camara Education Et...   
1  2.1.2 - Logarithms [T_SEP] Camara Education Et...   
2  2.1.2 - Logarithms [T_SEP] Camara Education Et...   
3  2.1.2 - Logarithms [T_SEP] Camara Education Et...   
4  2.1.2 - Logarithms [T_SEP] Camara Education Et...   

                                      content_inputs  target language  \
0  Relationship between exponentials & logarithms...       1       en   
1  Relationship between exponentials & logarithms...       1       en   
2  Evaluate logarithms [C_SEP] Evaluate basic log...       1       en   
3  Relationship between exponentials & logarithms...       1       en   
4  Relationship between exponentials & l

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Batches:   0%|          | 0/151 [00:00<?, ?it/s]

Batches:   0%|          | 0/507 [00:00<?, ?it/s]

Batches:   0%|          | 0/145 [00:00<?, ?it/s]

Batches:   0%|          | 0/175 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/25 [00:00<?, ?it/s]

100%|██████████| 126/126 [00:00<00:00, 525.66it/s]


[5]


4805it [00:00, 9556.22it/s]
4636it [00:00, 10370.32it/s]
126it [00:00, 26119.82it/s]


0       c_978da5beda25 c_d3c78d8badaf c_54288b0808bf c...
1       c_132fb9cdf793 c_86f126d7f1f8 c_2a3d6c373f5e c...
2       c_7962308634d8 c_2496b6a87310 c_5c805ba5fa3a c...
3       c_52b430b084b3 c_e70ab14e83aa c_dc840c74de69 c...
4       c_64ba76cf3354 c_baa6fa0f289b c_db16c3e983d8 c...
                              ...                        
9562    c_0ad8b30c15bf c_aaee7b51d8a7 c_016cb9cf0b7b c...
9563    c_60859f80ce53 c_042362063860 c_2b6fe561d6ef c...
9564    c_8d4495e8dd6f c_c8afdc5d35e0 c_004db813868a c...
9565    c_5f678fecd934 c_4296e1773043 c_ec8c8f6480d0 c...
9566    c_29747ddb5fe0 c_60859f80ce53 c_042362063860 c...
Length: 9567, dtype: object
0.2334
Our max positive score is 0.29264
save 2nd valid data


100%|██████████| 7919/7919 [00:01<00:00, 5302.49it/s]


Our validation set has 39595 rows
        topic_ids     content_ids  \
0  t_0020bde404c1  c_978da5beda25   
1  t_0020bde404c1  c_d3c78d8badaf   
2  t_0020bde404c1  c_54288b0808bf   
3  t_0020bde404c1  c_87e75e053ccf   
4  t_0020bde404c1  c_a60e8bb62aa7   

                                        topic_inputs  \
0  6.4.2 Writing of the electron arrangement of i...   
1  6.4.2 Writing of the electron arrangement of i...   
2  6.4.2 Writing of the electron arrangement of i...   
3  6.4.2 Writing of the electron arrangement of i...   
4  6.4.2 Writing of the electron arrangement of i...   

                                      content_inputs  target language  \
0  Introduction to ions [C_SEP] Difference betwee...       0       en   
1                         Halogens [C_SEP]  [C_SEP]        0       en   
2            Alkaline Earth Metals [C_SEP]  [C_SEP]        0       en   
3  Mini-video on ion size [C_SEP] Correcting a mi...       0       en   
4   Alkaline Earth Metals Practice [C_SEP

4805it [00:00, 9714.64it/s]
4636it [00:00, 10467.75it/s]
126it [00:00, 35499.58it/s]


0       c_978da5beda25 c_d3c78d8badaf c_54288b0808bf c...
1       c_132fb9cdf793 c_86f126d7f1f8 c_2a3d6c373f5e c...
2       c_7962308634d8 c_2496b6a87310 c_5c805ba5fa3a c...
3       c_52b430b084b3 c_e70ab14e83aa c_dc840c74de69 c...
4       c_64ba76cf3354 c_baa6fa0f289b c_db16c3e983d8 c...
                              ...                        
9562    c_0ad8b30c15bf c_aaee7b51d8a7 c_016cb9cf0b7b c...
9563    c_60859f80ce53 c_042362063860 c_2b6fe561d6ef c...
9564    c_8d4495e8dd6f c_c8afdc5d35e0 c_004db813868a c...
9565    c_5f678fecd934 c_4296e1773043 c_ec8c8f6480d0 c...
9566    c_29747ddb5fe0 c_60859f80ce53 c_042362063860 c...
Length: 9567, dtype: object
0.2258
Our max positive score is 0.3576
save 2nd valid data


100%|██████████| 7919/7919 [00:02<00:00, 3938.13it/s]


Our validation set has 79190 rows
        topic_ids     content_ids  \
0  t_0020bde404c1  c_978da5beda25   
1  t_0020bde404c1  c_d3c78d8badaf   
2  t_0020bde404c1  c_54288b0808bf   
3  t_0020bde404c1  c_87e75e053ccf   
4  t_0020bde404c1  c_a60e8bb62aa7   

                                        topic_inputs  \
0  6.4.2 Writing of the electron arrangement of i...   
1  6.4.2 Writing of the electron arrangement of i...   
2  6.4.2 Writing of the electron arrangement of i...   
3  6.4.2 Writing of the electron arrangement of i...   
4  6.4.2 Writing of the electron arrangement of i...   

                                      content_inputs  target language  \
0  Introduction to ions [C_SEP] Difference betwee...       0       en   
1                         Halogens [C_SEP]  [C_SEP]        0       en   
2            Alkaline Earth Metals [C_SEP]  [C_SEP]        0       en   
3  Mini-video on ion size [C_SEP] Correcting a mi...       0       en   
4   Alkaline Earth Metals Practice [C_SEP

4805it [00:00, 7461.05it/s]
4636it [00:00, 9522.47it/s]
126it [00:00, 35887.70it/s]


0       c_978da5beda25 c_d3c78d8badaf c_54288b0808bf c...
1       c_132fb9cdf793 c_86f126d7f1f8 c_2a3d6c373f5e c...
2       c_7962308634d8 c_2496b6a87310 c_5c805ba5fa3a c...
3       c_52b430b084b3 c_e70ab14e83aa c_dc840c74de69 c...
4       c_64ba76cf3354 c_baa6fa0f289b c_db16c3e983d8 c...
                              ...                        
9562    c_0ad8b30c15bf c_aaee7b51d8a7 c_016cb9cf0b7b c...
9563    c_60859f80ce53 c_042362063860 c_2b6fe561d6ef c...
9564    c_8d4495e8dd6f c_c8afdc5d35e0 c_004db813868a c...
9565    c_5f678fecd934 c_4296e1773043 c_ec8c8f6480d0 c...
9566    c_29747ddb5fe0 c_60859f80ce53 c_042362063860 c...
Length: 9567, dtype: object
0.1284
Our max positive score is 0.4798
save 2nd valid data


100%|██████████| 7919/7919 [00:05<00:00, 1417.61it/s]


Our validation set has 395950 rows
        topic_ids     content_ids  \
0  t_0020bde404c1  c_978da5beda25   
1  t_0020bde404c1  c_d3c78d8badaf   
2  t_0020bde404c1  c_54288b0808bf   
3  t_0020bde404c1  c_87e75e053ccf   
4  t_0020bde404c1  c_a60e8bb62aa7   

                                        topic_inputs  \
0  6.4.2 Writing of the electron arrangement of i...   
1  6.4.2 Writing of the electron arrangement of i...   
2  6.4.2 Writing of the electron arrangement of i...   
3  6.4.2 Writing of the electron arrangement of i...   
4  6.4.2 Writing of the electron arrangement of i...   

                                      content_inputs  target language  \
0  Introduction to ions [C_SEP] Difference betwee...       0       en   
1                         Halogens [C_SEP]  [C_SEP]        0       en   
2            Alkaline Earth Metals [C_SEP]  [C_SEP]        0       en   
3  Mini-video on ion size [C_SEP] Correcting a mi...       0       en   
4   Alkaline Earth Metals Practice [C_SE

4805it [00:00, 9506.06it/s]
4636it [00:00, 10332.90it/s]
126it [00:00, 25558.94it/s]


0       c_978da5beda25 c_d3c78d8badaf c_54288b0808bf c...
1       c_132fb9cdf793 c_86f126d7f1f8 c_2a3d6c373f5e c...
2       c_7962308634d8 c_2496b6a87310 c_5c805ba5fa3a c...
3       c_52b430b084b3 c_e70ab14e83aa c_dc840c74de69 c...
4       c_64ba76cf3354 c_baa6fa0f289b c_db16c3e983d8 c...
                              ...                        
9562    c_0ad8b30c15bf c_aaee7b51d8a7 c_016cb9cf0b7b c...
9563    c_60859f80ce53 c_042362063860 c_2b6fe561d6ef c...
9564    c_8d4495e8dd6f c_c8afdc5d35e0 c_004db813868a c...
9565    c_5f678fecd934 c_4296e1773043 c_ec8c8f6480d0 c...
9566    c_29747ddb5fe0 c_60859f80ce53 c_042362063860 c...
Length: 9567, dtype: object
0.0847
Our max positive score is 0.52698
save 2nd valid data


100%|██████████| 7919/7919 [00:09<00:00, 816.24it/s]


Our validation set has 791900 rows
        topic_ids     content_ids  \
0  t_0020bde404c1  c_978da5beda25   
1  t_0020bde404c1  c_d3c78d8badaf   
2  t_0020bde404c1  c_54288b0808bf   
3  t_0020bde404c1  c_87e75e053ccf   
4  t_0020bde404c1  c_a60e8bb62aa7   

                                        topic_inputs  \
0  6.4.2 Writing of the electron arrangement of i...   
1  6.4.2 Writing of the electron arrangement of i...   
2  6.4.2 Writing of the electron arrangement of i...   
3  6.4.2 Writing of the electron arrangement of i...   
4  6.4.2 Writing of the electron arrangement of i...   

                                      content_inputs  target language  \
0  Introduction to ions [C_SEP] Difference betwee...       0       en   
1                         Halogens [C_SEP]  [C_SEP]        0       en   
2            Alkaline Earth Metals [C_SEP]  [C_SEP]        0       en   
3  Mini-video on ion size [C_SEP] Correcting a mi...       0       en   
4   Alkaline Earth Metals Practice [C_SE

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Batches:   0%|          | 0/113 [00:00<?, ?it/s]

Batches:   0%|          | 0/244 [00:00<?, ?it/s]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

Batches:   0%|          | 0/63 [00:00<?, ?it/s]

Batches:   0%|          | 0/80 [00:00<?, ?it/s]

Batches:   0%|          | 0/169 [00:00<?, ?it/s]

100%|██████████| 2534/2534 [00:07<00:00, 323.94it/s]


[5]


3610it [00:00, 9055.26it/s]
1015it [00:00, 10239.36it/s]
2534it [00:00, 14582.08it/s]


0       c_dbef0d3da6d3 c_51c75c9184c1 c_a8995fd091ac c...
1       c_361c5772f429 c_cf3a2ac1538f c_7f39ccde0259 c...
2       c_4c68661b1604 c_94c02e07bdca c_f8fb9147a295 c...
3       c_7f62b7dd5bc1 c_b0b0d61a7e1a c_d22165896a7f c...
4       c_1efc93fe0507 c_f812736b4a2a c_c1695e02abc0 c...
                              ...                        
7154    c_353342733063 c_3c19848ae033 c_4714ca4a7865 c...
7155    c_7f30f76217b5 c_f8e9ce6c9f28 c_f70526cf14d5 c...
7156    c_8514055e061e c_9f66e515aec9 c_c195810d9c18 c...
7157    c_c18a41736c67 c_c4308e528ecb c_4e68062121d0 c...
7158    c_e6d4f0640a84 c_63ed183780d9 c_771466f768d4 c...
Length: 7159, dtype: object
0.3397
Our max positive score is 0.43527
save 2nd valid data


100%|██████████| 4762/4762 [00:01<00:00, 4492.80it/s]


Our validation set has 23810 rows
        topic_ids     content_ids  \
0  t_006038bb5e62  c_4c68661b1604   
1  t_006038bb5e62  c_94c02e07bdca   
2  t_006038bb5e62  c_f8fb9147a295   
3  t_006038bb5e62  c_5fc9bb8ec933   
4  t_006038bb5e62  c_76046e1b5dd6   

                                        topic_inputs  \
0  Assessments [T_SEP] Maths G3 to G10 | Maths | ...   
1  Assessments [T_SEP] Maths G3 to G10 | Maths | ...   
2  Assessments [T_SEP] Maths G3 to G10 | Maths | ...   
3  Assessments [T_SEP] Maths G3 to G10 | Maths | ...   
4  Assessments [T_SEP] Maths G3 to G10 | Maths | ...   

                                      content_inputs  target language  \
0  Level 1: Symmetry about a line [C_SEP] v0.1 [C...       0       en   
1  Level 2: Symmetry about a line [C_SEP] v0.1 [C...       0       en   
2  Level 3: Reflectional Symmetry [C_SEP] v0.1 [C...       1       en   
3  Level 1: Reflectional Symmetry [C_SEP] v0.1 [C...       1       en   
4       Lines of Symmetry Practice [C_SEP

3610it [00:00, 10029.12it/s]
1015it [00:00, 11226.58it/s]
2534it [00:00, 11644.27it/s]


0       c_dbef0d3da6d3 c_51c75c9184c1 c_a8995fd091ac c...
1       c_361c5772f429 c_cf3a2ac1538f c_7f39ccde0259 c...
2       c_4c68661b1604 c_94c02e07bdca c_f8fb9147a295 c...
3       c_7f62b7dd5bc1 c_b0b0d61a7e1a c_d22165896a7f c...
4       c_1efc93fe0507 c_f812736b4a2a c_c1695e02abc0 c...
                              ...                        
7154    c_353342733063 c_3c19848ae033 c_4714ca4a7865 c...
7155    c_7f30f76217b5 c_f8e9ce6c9f28 c_f70526cf14d5 c...
7156    c_8514055e061e c_9f66e515aec9 c_c195810d9c18 c...
7157    c_c18a41736c67 c_c4308e528ecb c_4e68062121d0 c...
7158    c_e6d4f0640a84 c_63ed183780d9 c_771466f768d4 c...
Length: 7159, dtype: object
0.3305
Our max positive score is 0.53998
save 2nd valid data


100%|██████████| 4762/4762 [00:01<00:00, 4106.15it/s]


Our validation set has 47620 rows
        topic_ids     content_ids  \
0  t_006038bb5e62  c_4c68661b1604   
1  t_006038bb5e62  c_94c02e07bdca   
2  t_006038bb5e62  c_f8fb9147a295   
3  t_006038bb5e62  c_5fc9bb8ec933   
4  t_006038bb5e62  c_76046e1b5dd6   

                                        topic_inputs  \
0  Assessments [T_SEP] Maths G3 to G10 | Maths | ...   
1  Assessments [T_SEP] Maths G3 to G10 | Maths | ...   
2  Assessments [T_SEP] Maths G3 to G10 | Maths | ...   
3  Assessments [T_SEP] Maths G3 to G10 | Maths | ...   
4  Assessments [T_SEP] Maths G3 to G10 | Maths | ...   

                                      content_inputs  target language  \
0  Level 1: Symmetry about a line [C_SEP] v0.1 [C...       0       en   
1  Level 2: Symmetry about a line [C_SEP] v0.1 [C...       0       en   
2  Level 3: Reflectional Symmetry [C_SEP] v0.1 [C...       1       en   
3  Level 1: Reflectional Symmetry [C_SEP] v0.1 [C...       1       en   
4       Lines of Symmetry Practice [C_SEP

3610it [00:00, 9205.06it/s]
1015it [00:00, 10624.21it/s]
2534it [00:00, 11087.77it/s]


0       c_dbef0d3da6d3 c_51c75c9184c1 c_a8995fd091ac c...
1       c_361c5772f429 c_cf3a2ac1538f c_7f39ccde0259 c...
2       c_4c68661b1604 c_94c02e07bdca c_f8fb9147a295 c...
3       c_7f62b7dd5bc1 c_b0b0d61a7e1a c_d22165896a7f c...
4       c_1efc93fe0507 c_f812736b4a2a c_c1695e02abc0 c...
                              ...                        
7154    c_353342733063 c_3c19848ae033 c_4714ca4a7865 c...
7155    c_7f30f76217b5 c_f8e9ce6c9f28 c_f70526cf14d5 c...
7156    c_8514055e061e c_9f66e515aec9 c_c195810d9c18 c...
7157    c_c18a41736c67 c_c4308e528ecb c_4e68062121d0 c...
7158    c_e6d4f0640a84 c_63ed183780d9 c_771466f768d4 c...
Length: 7159, dtype: object
0.1841
Our max positive score is 0.73416
save 2nd valid data


100%|██████████| 4762/4762 [00:03<00:00, 1503.30it/s]


Our validation set has 238100 rows
        topic_ids     content_ids  \
0  t_006038bb5e62  c_4c68661b1604   
1  t_006038bb5e62  c_94c02e07bdca   
2  t_006038bb5e62  c_f8fb9147a295   
3  t_006038bb5e62  c_5fc9bb8ec933   
4  t_006038bb5e62  c_76046e1b5dd6   

                                        topic_inputs  \
0  Assessments [T_SEP] Maths G3 to G10 | Maths | ...   
1  Assessments [T_SEP] Maths G3 to G10 | Maths | ...   
2  Assessments [T_SEP] Maths G3 to G10 | Maths | ...   
3  Assessments [T_SEP] Maths G3 to G10 | Maths | ...   
4  Assessments [T_SEP] Maths G3 to G10 | Maths | ...   

                                      content_inputs  target language  \
0  Level 1: Symmetry about a line [C_SEP] v0.1 [C...       0       en   
1  Level 2: Symmetry about a line [C_SEP] v0.1 [C...       0       en   
2  Level 3: Reflectional Symmetry [C_SEP] v0.1 [C...       1       en   
3  Level 1: Reflectional Symmetry [C_SEP] v0.1 [C...       1       en   
4       Lines of Symmetry Practice [C_SE

3610it [00:00, 9709.14it/s]
1015it [00:00, 9700.72it/s]
2534it [00:00, 11562.15it/s]


0       c_dbef0d3da6d3 c_51c75c9184c1 c_a8995fd091ac c...
1       c_361c5772f429 c_cf3a2ac1538f c_7f39ccde0259 c...
2       c_4c68661b1604 c_94c02e07bdca c_f8fb9147a295 c...
3       c_7f62b7dd5bc1 c_b0b0d61a7e1a c_d22165896a7f c...
4       c_1efc93fe0507 c_f812736b4a2a c_c1695e02abc0 c...
                              ...                        
7154    c_353342733063 c_3c19848ae033 c_4714ca4a7865 c...
7155    c_7f30f76217b5 c_f8e9ce6c9f28 c_f70526cf14d5 c...
7156    c_8514055e061e c_9f66e515aec9 c_c195810d9c18 c...
7157    c_c18a41736c67 c_c4308e528ecb c_4e68062121d0 c...
7158    c_e6d4f0640a84 c_63ed183780d9 c_771466f768d4 c...
Length: 7159, dtype: object
0.1194
Our max positive score is 0.7939
save 2nd valid data


100%|██████████| 4762/4762 [00:05<00:00, 807.84it/s]


Our validation set has 476200 rows
        topic_ids     content_ids  \
0  t_006038bb5e62  c_4c68661b1604   
1  t_006038bb5e62  c_94c02e07bdca   
2  t_006038bb5e62  c_f8fb9147a295   
3  t_006038bb5e62  c_5fc9bb8ec933   
4  t_006038bb5e62  c_76046e1b5dd6   

                                        topic_inputs  \
0  Assessments [T_SEP] Maths G3 to G10 | Maths | ...   
1  Assessments [T_SEP] Maths G3 to G10 | Maths | ...   
2  Assessments [T_SEP] Maths G3 to G10 | Maths | ...   
3  Assessments [T_SEP] Maths G3 to G10 | Maths | ...   
4  Assessments [T_SEP] Maths G3 to G10 | Maths | ...   

                                      content_inputs  target language  \
0  Level 1: Symmetry about a line [C_SEP] v0.1 [C...       0       en   
1  Level 2: Symmetry about a line [C_SEP] v0.1 [C...       0       en   
2  Level 3: Reflectional Symmetry [C_SEP] v0.1 [C...       1       en   
3  Level 1: Reflectional Symmetry [C_SEP] v0.1 [C...       1       en   
4       Lines of Symmetry Practice [C_SE

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Batches:   0%|          | 0/274 [00:00<?, ?it/s]

Batches:   0%|          | 0/666 [00:00<?, ?it/s]

Batches:   0%|          | 0/19 [00:00<?, ?it/s]

Batches:   0%|          | 0/50 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

Batches:   0%|          | 0/51 [00:00<?, ?it/s]

100%|██████████| 493/493 [00:01<00:00, 436.54it/s]


[5]


8759it [00:00, 9082.34it/s]
605it [00:00, 12887.66it/s]
493it [00:00, 21711.84it/s]


0       c_6ddb1a9acadd c_7ff92a954a3d c_06571f90b3ba c...
1       c_c69fc19ab8f2 c_ac82abd6da1b c_e7daeadb242c c...
2       c_dde078b8ea7a c_2f241eb548e2 c_c062c17e9cba c...
3       c_f59f2a600907 c_87ad67b205d0 c_572923ccc03e c...
4       c_0d7305ef8165 c_01d56e0e228e c_9c51b0537c5b c...
                              ...                        
9852    c_74b9dc5c85ad c_8d4fd4381603 c_68b9aee7084a c...
9853    c_2eb451ff94a7 c_8cc184fbf715 c_ff90dd31104f c...
9854    c_5af91f956ee9 c_b5f3b0b26dc3 c_e903c8b2fef0 c...
9855    c_d91820d777f9 c_ce8ec8706696 c_de2d9c569e5f c...
9856    c_d45faaa9f758 c_3bcd31f4255f c_cdb915a15dc5 c...
Length: 9857, dtype: object
0.4596
Our max positive score is 0.53656
save 2nd valid data


100%|██████████| 7573/7573 [00:01<00:00, 5279.12it/s]


Our validation set has 37865 rows
        topic_ids     content_ids  \
0  t_0008a1bd84ba  c_6ddb1a9acadd   
1  t_0008a1bd84ba  c_7ff92a954a3d   
2  t_0008a1bd84ba  c_06571f90b3ba   
3  t_0008a1bd84ba  c_bc6425b43a74   
4  t_0008a1bd84ba  c_185375ddd0d4   

                                        topic_inputs  \
0  12. 20: Bird Reproduction [T_SEP] Libretext Op...   
1  12. 20: Bird Reproduction [T_SEP] Libretext Op...   
2  12. 20: Bird Reproduction [T_SEP] Libretext Op...   
3  12. 20: Bird Reproduction [T_SEP] Libretext Op...   
4  12. 20: Bird Reproduction [T_SEP] Libretext Op...   

                                      content_inputs  target language  \
0  12.3: Vertebrate Reproduction [C_SEP]  [C_SEP]...       0       en   
1  12. 20: Bird Reproduction [C_SEP]  [C_SEP] Is ...       1       en   
2  12. 19: Bird Structure and Function [C_SEP]  [...       0       en   
3  12. 15: Reptile Reproduction [C_SEP]  [C_SEP] ...       0       en   
4  12.2: Vertebrate Characteristics [C_SE

8759it [00:00, 9309.59it/s]
605it [00:00, 13370.18it/s]
493it [00:00, 20164.73it/s]


0       c_6ddb1a9acadd c_7ff92a954a3d c_06571f90b3ba c...
1       c_c69fc19ab8f2 c_ac82abd6da1b c_e7daeadb242c c...
2       c_dde078b8ea7a c_2f241eb548e2 c_c062c17e9cba c...
3       c_f59f2a600907 c_87ad67b205d0 c_572923ccc03e c...
4       c_0d7305ef8165 c_01d56e0e228e c_9c51b0537c5b c...
                              ...                        
9852    c_74b9dc5c85ad c_8d4fd4381603 c_68b9aee7084a c...
9853    c_2eb451ff94a7 c_8cc184fbf715 c_ff90dd31104f c...
9854    c_5af91f956ee9 c_b5f3b0b26dc3 c_e903c8b2fef0 c...
9855    c_d91820d777f9 c_ce8ec8706696 c_de2d9c569e5f c...
9856    c_d45faaa9f758 c_3bcd31f4255f c_cdb915a15dc5 c...
Length: 9857, dtype: object
0.4655
Our max positive score is 0.66863
save 2nd valid data


100%|██████████| 7573/7573 [00:02<00:00, 3550.78it/s]


Our validation set has 75730 rows
        topic_ids     content_ids  \
0  t_0008a1bd84ba  c_6ddb1a9acadd   
1  t_0008a1bd84ba  c_7ff92a954a3d   
2  t_0008a1bd84ba  c_06571f90b3ba   
3  t_0008a1bd84ba  c_bc6425b43a74   
4  t_0008a1bd84ba  c_185375ddd0d4   

                                        topic_inputs  \
0  12. 20: Bird Reproduction [T_SEP] Libretext Op...   
1  12. 20: Bird Reproduction [T_SEP] Libretext Op...   
2  12. 20: Bird Reproduction [T_SEP] Libretext Op...   
3  12. 20: Bird Reproduction [T_SEP] Libretext Op...   
4  12. 20: Bird Reproduction [T_SEP] Libretext Op...   

                                      content_inputs  target language  \
0  12.3: Vertebrate Reproduction [C_SEP]  [C_SEP]...       0       en   
1  12. 20: Bird Reproduction [C_SEP]  [C_SEP] Is ...       1       en   
2  12. 19: Bird Structure and Function [C_SEP]  [...       0       en   
3  12. 15: Reptile Reproduction [C_SEP]  [C_SEP] ...       0       en   
4  12.2: Vertebrate Characteristics [C_SE

8759it [00:00, 9029.21it/s]
605it [00:00, 12051.68it/s]
493it [00:00, 14873.42it/s]


0       c_6ddb1a9acadd c_7ff92a954a3d c_06571f90b3ba c...
1       c_c69fc19ab8f2 c_ac82abd6da1b c_e7daeadb242c c...
2       c_dde078b8ea7a c_2f241eb548e2 c_c062c17e9cba c...
3       c_f59f2a600907 c_87ad67b205d0 c_572923ccc03e c...
4       c_0d7305ef8165 c_01d56e0e228e c_9c51b0537c5b c...
                              ...                        
9852    c_74b9dc5c85ad c_8d4fd4381603 c_68b9aee7084a c...
9853    c_2eb451ff94a7 c_8cc184fbf715 c_ff90dd31104f c...
9854    c_5af91f956ee9 c_b5f3b0b26dc3 c_e903c8b2fef0 c...
9855    c_d91820d777f9 c_ce8ec8706696 c_de2d9c569e5f c...
9856    c_d45faaa9f758 c_3bcd31f4255f c_cdb915a15dc5 c...
Length: 9857, dtype: object
0.2585
Our max positive score is 0.85765
save 2nd valid data


100%|██████████| 7573/7573 [00:05<00:00, 1484.15it/s]


Our validation set has 378650 rows
        topic_ids     content_ids  \
0  t_0008a1bd84ba  c_6ddb1a9acadd   
1  t_0008a1bd84ba  c_7ff92a954a3d   
2  t_0008a1bd84ba  c_06571f90b3ba   
3  t_0008a1bd84ba  c_bc6425b43a74   
4  t_0008a1bd84ba  c_185375ddd0d4   

                                        topic_inputs  \
0  12. 20: Bird Reproduction [T_SEP] Libretext Op...   
1  12. 20: Bird Reproduction [T_SEP] Libretext Op...   
2  12. 20: Bird Reproduction [T_SEP] Libretext Op...   
3  12. 20: Bird Reproduction [T_SEP] Libretext Op...   
4  12. 20: Bird Reproduction [T_SEP] Libretext Op...   

                                      content_inputs  target language  \
0  12.3: Vertebrate Reproduction [C_SEP]  [C_SEP]...       0       en   
1  12. 20: Bird Reproduction [C_SEP]  [C_SEP] Is ...       1       en   
2  12. 19: Bird Structure and Function [C_SEP]  [...       0       en   
3  12. 15: Reptile Reproduction [C_SEP]  [C_SEP] ...       0       en   
4  12.2: Vertebrate Characteristics [C_S

8759it [00:00, 8818.21it/s]
605it [00:00, 7691.86it/s]
493it [00:00, 11258.31it/s]


0       c_6ddb1a9acadd c_7ff92a954a3d c_06571f90b3ba c...
1       c_c69fc19ab8f2 c_ac82abd6da1b c_e7daeadb242c c...
2       c_dde078b8ea7a c_2f241eb548e2 c_c062c17e9cba c...
3       c_f59f2a600907 c_87ad67b205d0 c_572923ccc03e c...
4       c_0d7305ef8165 c_01d56e0e228e c_9c51b0537c5b c...
                              ...                        
9852    c_74b9dc5c85ad c_8d4fd4381603 c_68b9aee7084a c...
9853    c_2eb451ff94a7 c_8cc184fbf715 c_ff90dd31104f c...
9854    c_5af91f956ee9 c_b5f3b0b26dc3 c_e903c8b2fef0 c...
9855    c_d91820d777f9 c_ce8ec8706696 c_de2d9c569e5f c...
9856    c_d45faaa9f758 c_3bcd31f4255f c_cdb915a15dc5 c...
Length: 9857, dtype: object
0.1614
Our max positive score is 0.89446
save 2nd valid data


100%|██████████| 7573/7573 [00:09<00:00, 788.39it/s]


Our validation set has 757298 rows
        topic_ids     content_ids  \
0  t_0008a1bd84ba  c_6ddb1a9acadd   
1  t_0008a1bd84ba  c_7ff92a954a3d   
2  t_0008a1bd84ba  c_06571f90b3ba   
3  t_0008a1bd84ba  c_bc6425b43a74   
4  t_0008a1bd84ba  c_185375ddd0d4   

                                        topic_inputs  \
0  12. 20: Bird Reproduction [T_SEP] Libretext Op...   
1  12. 20: Bird Reproduction [T_SEP] Libretext Op...   
2  12. 20: Bird Reproduction [T_SEP] Libretext Op...   
3  12. 20: Bird Reproduction [T_SEP] Libretext Op...   
4  12. 20: Bird Reproduction [T_SEP] Libretext Op...   

                                      content_inputs  target language  \
0  12.3: Vertebrate Reproduction [C_SEP]  [C_SEP]...       0       en   
1  12. 20: Bird Reproduction [C_SEP]  [C_SEP] Is ...       1       en   
2  12. 19: Bird Structure and Function [C_SEP]  [...       0       en   
3  12. 15: Reptile Reproduction [C_SEP]  [C_SEP] ...       0       en   
4  12.2: Vertebrate Characteristics [C_S

## train data

In [8]:
for fold_n in cfg["general"]["fold"]:
    topics_df, content_df, correlations_df = read_data(fold_n, mode="train")
    
    topics_en_df = topics_df[topics_df["language"]=="en"].reset_index(drop=True)
    content_en_df = content_df[content_df["language"]=="en"].reset_index(drop=True)
    topics_es_df = topics_df[topics_df["language"]=="es"].reset_index(drop=True)
    content_es_df = content_df[content_df["language"]=="es"].reset_index(drop=True)
    topics_other_df = topics_df[(topics_df["language"]!="en") & (topics_df["language"]!="es")].reset_index(drop=True)
    content_other_df = content_df[(content_df["language"]!="en") & (content_df["language"]!="es")].reset_index(drop=True)
    topics_df = pd.concat([topics_en_df, topics_es_df, topics_other_df]).reset_index(drop=True)
    content_df = pd.concat([content_en_df, content_es_df, content_other_df]).reset_index(drop=True)
    
    seed_everything(cfg["general"]["seed"], workers=True)

    # english
    model = SentenceTransformer(f"/kaggle/input/lecr-retriever-model/LECR_retriever_fold{fold_n}_model/english")
    topics_embeddings_en = model.encode(topics_en_df["topic_inputs"].tolist()) #.cpu().detach().numpy()
    content_embeddings_en = model.encode(content_en_df["content_inputs"].tolist()) #.cpu().detach().numpy()
    del model
    torch.cuda.empty_cache()
    gc.collect()

    # spanish
    model = SentenceTransformer(f"/kaggle/input/lecr-retriever-model/LECR_retriever_fold{fold_n}_model/spanish")
    topics_embeddings_es = model.encode(topics_es_df["topic_inputs"].tolist()) #.cpu().detach().numpy()
    content_embeddings_es = model.encode(content_es_df["content_inputs"].tolist()) #.cpu().detach().numpy()
    del model
    torch.cuda.empty_cache()
    gc.collect()

    # other
    model = SentenceTransformer(f"/kaggle/input/lecr-retriever-model/LECR_retriever_fold{fold_n}_model/alllang")
    topics_embeddings_other = model.encode(topics_other_df["topic_inputs"].tolist()) #.cpu().detach().numpy()
    content_embeddings_other = model.encode(content_other_df["content_inputs"].tolist()) #.cpu().detach().numpy()
    del model
    torch.cuda.empty_cache()
    gc.collect()

    topics_embeddings_en = cp.array(topics_embeddings_en)
    topics_embeddings_es = cp.array(topics_embeddings_es)
    topics_embeddings_other = cp.array(topics_embeddings_other)
    content_embeddings_en = cp.array(content_embeddings_en)
    content_embeddings_es = cp.array(content_embeddings_es)
    content_embeddings_other = cp.array(content_embeddings_other)
    
    #k_list = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 100]
    k_list = [5, 10, 50, 100]

    # english
    if len(topics_embeddings_en) != 0:
        n_neighbors = min(len(topics_embeddings_en), 1000)
        n_neighbors = n_neighbors if n_neighbors != 0 else 1
        nn = NearestNeighbors(n_neighbors, metric=cfg["task"]["metric"])
        nn.fit(content_embeddings_en)
        dists_en, indices_en = nn.kneighbors(topics_embeddings_en)
        del topics_embeddings_en, content_embeddings_en, nn
        torch.cuda.empty_cache()
        gc.collect()

    # spanish
    if len(topics_embeddings_es) != 0:
        n_neighbors = min(len(topics_embeddings_es), 1000)
        n_neighbors = n_neighbors if n_neighbors != 0 else 1
        nn = NearestNeighbors(n_neighbors, metric=cfg["task"]["metric"])
        nn.fit(content_embeddings_es)
        dists_es, indices_es = nn.kneighbors(topics_embeddings_es)
        del topics_embeddings_es, content_embeddings_es, nn
        torch.cuda.empty_cache()
        gc.collect()

    # other
    if len(topics_embeddings_other) != 0:
        n_neighbors = min(len(topics_embeddings_other), 1000)
        n_neighbors = n_neighbors if n_neighbors != 0 else 1
        nn = NearestNeighbors(n_neighbors, metric=cfg["task"]["metric"])
        nn.fit(content_embeddings_other)
        dists_other, indices_other = nn.kneighbors(topics_embeddings_other)
        del topics_embeddings_other, content_embeddings_other, nn
        torch.cuda.empty_cache()
        gc.collect()
        new_dists_other = []
        new_indices_other = []
        content_other_df["dists"] = np.nan
        for i in tqdm(topics_other_df.index):
            content_other_df.loc[indices_other[i].get(), "dists"] = dists_other[i].get()
            lang = topics_other_df.iloc[i]["language"]
            _content_df = content_other_df.iloc[indices_other[i].get()]
            _dists, _indices = _content_df[_content_df["language"]==lang]["dists"], _content_df[_content_df["language"]==lang].index
            new_dists_other.append(_dists.tolist())
            new_indices_other.append(_indices.tolist())
        content_other_df = content_other_df.drop(["dists"], axis=1)
        del _content_df, _indices, indices_other
        gc.collect()

    for k in k_list:
        print(f"[{k}]")

        # english
        predicts_en = []
        predicts_en_dists = []
        if len(topics_en_df) != 0:
            for i, d in tqdm(zip(indices_en, dists_en)):
                _content_id = content_en_df["id"].to_numpy()[i.get()].tolist()
                predicts_en.append(_content_id[:k])
                predicts_en_dists.append(d[:k].get())
        predicts_en = pd.Series(predicts_en)
        predicts_en_dists = pd.Series(predicts_en_dists)
        predicts_en = predicts_en.apply(lambda x: ' '.join(x))

        # spanish
        predicts_es = []
        predicts_es_dists = []
        if len(topics_es_df) != 0:
            for i, d in tqdm(zip(indices_es, dists_es)):
                _content_id = content_es_df["id"].to_numpy()[i.get()].tolist()
                predicts_es.append(_content_id[:k])
                predicts_es_dists.append(d[:k].get())
        predicts_es = pd.Series(predicts_es)
        predicts_es_dists = pd.Series(predicts_es_dists)
        predicts_es = predicts_es.apply(lambda x: ' '.join(x))

        # other
        predicts_other = []
        predicts_other_dists = []
        if len(topics_other_df) != 0:
            for i, d in tqdm(zip(new_indices_other, new_dists_other)):
                _content_id = content_other_df["id"].to_numpy()[i].tolist()
                predicts_other.append(_content_id[:k])
                predicts_other_dists.append(d[:k])
        predicts_other = pd.Series(predicts_other)
        predicts_other_dists = pd.Series(predicts_other_dists)
        predicts_other = predicts_other.apply(lambda x: ' '.join(x))

        # concat
        predicts = pd.concat([predicts_en, predicts_es, predicts_other]).reset_index(drop=True)
        dists = pd.concat([predicts_en_dists, predicts_es_dists, predicts_other_dists]).reset_index(drop=True)
        del predicts_en, predicts_es, predicts_other, predicts_en_dists, predicts_es_dists, predicts_other_dists
        gc.collect()
        print(predicts)

        _topics_df = topics_df.copy()
        _topics_df["predicts"] = predicts
        _topics_df["dists"] = dists
        _topics_df = _topics_df.merge(correlations_df, how="left", left_on=["id"], right_on=["topic_id"])
        _topics_df = _topics_df.dropna(subset=["topic_id", "content_ids"])
        score = f2_score(_topics_df["content_ids"], _topics_df["predicts"])
        print(score)
        pos_score = get_pos_score(_topics_df["content_ids"], _topics_df["predicts"])
        print(f"Our max positive score is {pos_score}")
        if k == 100:
            print("save 2nd train data")
            content_df_copy = content_df.copy()
            content_df_copy.set_index("id", inplace = True)
            train = build_training_set(_topics_df, content_df_copy)
            print(f"Our training set has {len(train)} rows")
            train.to_pickle(f"train_fold_k{k}_fold{fold_n}.pkl")
            print(train.head())
            del content_df_copy, train

        torch.cuda.empty_cache()
        gc.collect()

fold_0


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Batches:   0%|          | 0/963 [00:00<?, ?it/s]

Batches:   0%|          | 0/1858 [00:00<?, ?it/s]

Batches:   0%|          | 0/389 [00:00<?, ?it/s]

Batches:   0%|          | 0/942 [00:00<?, ?it/s]

Batches:   0%|          | 0/840 [00:00<?, ?it/s]

Batches:   0%|          | 0/1789 [00:00<?, ?it/s]

100%|██████████| 26851/26851 [01:36<00:00, 277.31it/s]


[5]


30786it [00:03, 8585.45it/s]
12433it [00:01, 8955.90it/s]
26851it [00:03, 8787.69it/s]


0        c_65ae9a83563d c_608736c267f1 c_99a94951d9ea c...
1        c_11a1dc0bfb99 c_447222ff490e c_de6f5fbe3f97 c...
2        c_70cee4e16c16 c_6426cd6190c2 c_7ff92a954a3d c...
3        c_ade5a4b4e971 c_f84bee7dc0d9 c_235f882bc66b c...
4        c_e7daeadb242c c_f16047c6f924 c_4a017a7b88b1 c...
                               ...                        
70065    c_3bb33bd78c3a c_802181e7ca75 c_ddcfa5783784 c...
70066    c_79903740e1e8 c_a484656c2956 c_730f3c40904f c...
70067    c_9e67dc64550d c_4ed38ea2bff2 c_036efdd9e8c1 c...
70068    c_46f852a49c08 c_0dd3eab0f444 c_bb1db83147d6 c...
70069    c_2c55fbee8ba3 c_ee583e1c7719 c_011eb3c737f4 c...
Length: 70070, dtype: object
0.5118
Our max positive score is 0.6206
[10]


30786it [00:03, 8422.19it/s]
12433it [00:01, 9134.84it/s]
26851it [00:03, 8720.12it/s]


0        c_65ae9a83563d c_608736c267f1 c_99a94951d9ea c...
1        c_11a1dc0bfb99 c_447222ff490e c_de6f5fbe3f97 c...
2        c_70cee4e16c16 c_6426cd6190c2 c_7ff92a954a3d c...
3        c_ade5a4b4e971 c_f84bee7dc0d9 c_235f882bc66b c...
4        c_e7daeadb242c c_f16047c6f924 c_4a017a7b88b1 c...
                               ...                        
70065    c_3bb33bd78c3a c_802181e7ca75 c_ddcfa5783784 c...
70066    c_79903740e1e8 c_a484656c2956 c_730f3c40904f c...
70067    c_9e67dc64550d c_4ed38ea2bff2 c_036efdd9e8c1 c...
70068    c_46f852a49c08 c_0dd3eab0f444 c_bb1db83147d6 c...
70069    c_2c55fbee8ba3 c_ee583e1c7719 c_011eb3c737f4 c...
Length: 70070, dtype: object
0.5038
Our max positive score is 0.76215
[50]


30786it [00:03, 8112.02it/s]
12433it [00:01, 9023.99it/s]
26851it [00:02, 9160.46it/s]


0        c_65ae9a83563d c_608736c267f1 c_99a94951d9ea c...
1        c_11a1dc0bfb99 c_447222ff490e c_de6f5fbe3f97 c...
2        c_70cee4e16c16 c_6426cd6190c2 c_7ff92a954a3d c...
3        c_ade5a4b4e971 c_f84bee7dc0d9 c_235f882bc66b c...
4        c_e7daeadb242c c_f16047c6f924 c_4a017a7b88b1 c...
                               ...                        
70065    c_3bb33bd78c3a c_802181e7ca75 c_ddcfa5783784 c...
70066    c_79903740e1e8 c_a484656c2956 c_730f3c40904f c...
70067    c_9e67dc64550d c_4ed38ea2bff2 c_036efdd9e8c1 c...
70068    c_46f852a49c08 c_0dd3eab0f444 c_bb1db83147d6 c...
70069    c_2c55fbee8ba3 c_ee583e1c7719 c_011eb3c737f4 c...
Length: 70070, dtype: object
0.2684
Our max positive score is 0.95262
[100]


30786it [00:03, 8509.75it/s]
12433it [00:01, 9049.41it/s]
26851it [00:03, 8680.16it/s]


0        c_65ae9a83563d c_608736c267f1 c_99a94951d9ea c...
1        c_11a1dc0bfb99 c_447222ff490e c_de6f5fbe3f97 c...
2        c_70cee4e16c16 c_6426cd6190c2 c_7ff92a954a3d c...
3        c_ade5a4b4e971 c_f84bee7dc0d9 c_235f882bc66b c...
4        c_e7daeadb242c c_f16047c6f924 c_4a017a7b88b1 c...
                               ...                        
70065    c_3bb33bd78c3a c_802181e7ca75 c_ddcfa5783784 c...
70066    c_79903740e1e8 c_a484656c2956 c_730f3c40904f c...
70067    c_9e67dc64550d c_4ed38ea2bff2 c_036efdd9e8c1 c...
70068    c_46f852a49c08 c_0dd3eab0f444 c_bb1db83147d6 c...
70069    c_2c55fbee8ba3 c_ee583e1c7719 c_011eb3c737f4 c...
Length: 70070, dtype: object
0.1662
Our max positive score is 0.9806
save 2nd train data


100%|██████████| 56768/56768 [01:12<00:00, 785.05it/s]


Our training set has 5674396 rows
        topic_ids     content_ids  \
0  t_00069b63a70a  c_11a1dc0bfb99   
1  t_00069b63a70a  c_447222ff490e   
2  t_00069b63a70a  c_de6f5fbe3f97   
3  t_00069b63a70a  c_71d6bae3f656   
4  t_00069b63a70a  c_f4b4a29231be   

                                        topic_inputs  \
0  Transcripts [T_SEP] MIT Blossoms | Engineering...   
1  Transcripts [T_SEP] MIT Blossoms | Engineering...   
2  Transcripts [T_SEP] MIT Blossoms | Engineering...   
3  Transcripts [T_SEP] MIT Blossoms | Engineering...   
4  Transcripts [T_SEP] MIT Blossoms | Engineering...   

                                      content_inputs  target language  \
0  Flow Charts: Logical..: Written Transcript of ...       1       en   
1  Thermodynamics: Ener..: Written Transcript of ...       0       en   
2  From Psychology to L..: Written Transcript of ...       0       en   
3  Engineering Innovati..: Written Transcript of ...       0       en   
4  Sustainable Energy: ..: Download Writt

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Batches:   0%|          | 0/980 [00:00<?, ?it/s]

Batches:   0%|          | 0/1781 [00:00<?, ?it/s]

Batches:   0%|          | 0/290 [00:00<?, ?it/s]

Batches:   0%|          | 0/810 [00:00<?, ?it/s]

Batches:   0%|          | 0/837 [00:00<?, ?it/s]

Batches:   0%|          | 0/1766 [00:00<?, ?it/s]

100%|██████████| 26775/26775 [01:40<00:00, 266.09it/s]


[5]


31356it [00:03, 8487.61it/s]
9274it [00:01, 7495.31it/s]
26775it [00:03, 7934.98it/s]


0        c_9ca4249fa1a8 c_57a9b8e9a3dc c_77092515597d c...
1        c_447222ff490e c_f37bb15a7a42 c_11a1dc0bfb99 c...
2        c_8790b074383e c_7ff92a954a3d c_70cee4e16c16 c...
3        c_235f882bc66b c_ade5a4b4e971 c_f84bee7dc0d9 c...
4        c_b8d730238789 c_3a2bf4a358da c_7cc189e7acb0 c...
                               ...                        
67400    c_86627e3a623c c_802181e7ca75 c_3bb33bd78c3a c...
67401    c_79903740e1e8 c_e630f8201f14 c_1079bc17c6e8 c...
67402    c_4ed38ea2bff2 c_9e67dc64550d c_112de3281469 c...
67403    c_0dd3eab0f444 c_46f852a49c08 c_fc0244e23044 c...
67404    c_0b197a901f63 c_e438b625e16f c_d7f2fb1ebedf c...
Length: 67405, dtype: object
0.5391
Our max positive score is 0.64886
[10]


31356it [00:03, 8520.54it/s]
9274it [00:01, 7704.74it/s]
26775it [00:03, 8047.88it/s]


0        c_9ca4249fa1a8 c_57a9b8e9a3dc c_77092515597d c...
1        c_447222ff490e c_f37bb15a7a42 c_11a1dc0bfb99 c...
2        c_8790b074383e c_7ff92a954a3d c_70cee4e16c16 c...
3        c_235f882bc66b c_ade5a4b4e971 c_f84bee7dc0d9 c...
4        c_b8d730238789 c_3a2bf4a358da c_7cc189e7acb0 c...
                               ...                        
67400    c_86627e3a623c c_802181e7ca75 c_3bb33bd78c3a c...
67401    c_79903740e1e8 c_e630f8201f14 c_1079bc17c6e8 c...
67402    c_4ed38ea2bff2 c_9e67dc64550d c_112de3281469 c...
67403    c_0dd3eab0f444 c_46f852a49c08 c_fc0244e23044 c...
67404    c_0b197a901f63 c_e438b625e16f c_d7f2fb1ebedf c...
Length: 67405, dtype: object
0.5288
Our max positive score is 0.78937
[50]


31356it [00:03, 8358.32it/s]
9274it [00:01, 7964.13it/s]
26775it [00:03, 8336.09it/s]


0        c_9ca4249fa1a8 c_57a9b8e9a3dc c_77092515597d c...
1        c_447222ff490e c_f37bb15a7a42 c_11a1dc0bfb99 c...
2        c_8790b074383e c_7ff92a954a3d c_70cee4e16c16 c...
3        c_235f882bc66b c_ade5a4b4e971 c_f84bee7dc0d9 c...
4        c_b8d730238789 c_3a2bf4a358da c_7cc189e7acb0 c...
                               ...                        
67400    c_86627e3a623c c_802181e7ca75 c_3bb33bd78c3a c...
67401    c_79903740e1e8 c_e630f8201f14 c_1079bc17c6e8 c...
67402    c_4ed38ea2bff2 c_9e67dc64550d c_112de3281469 c...
67403    c_0dd3eab0f444 c_46f852a49c08 c_fc0244e23044 c...
67404    c_0b197a901f63 c_e438b625e16f c_d7f2fb1ebedf c...
Length: 67405, dtype: object
0.2781
Our max positive score is 0.95883
[100]


31356it [00:03, 8154.92it/s]
9274it [00:01, 6271.39it/s]
26775it [00:03, 8279.84it/s]


0        c_9ca4249fa1a8 c_57a9b8e9a3dc c_77092515597d c...
1        c_447222ff490e c_f37bb15a7a42 c_11a1dc0bfb99 c...
2        c_8790b074383e c_7ff92a954a3d c_70cee4e16c16 c...
3        c_235f882bc66b c_ade5a4b4e971 c_f84bee7dc0d9 c...
4        c_b8d730238789 c_3a2bf4a358da c_7cc189e7acb0 c...
                               ...                        
67400    c_86627e3a623c c_802181e7ca75 c_3bb33bd78c3a c...
67401    c_79903740e1e8 c_e630f8201f14 c_1079bc17c6e8 c...
67402    c_4ed38ea2bff2 c_9e67dc64550d c_112de3281469 c...
67403    c_0dd3eab0f444 c_46f852a49c08 c_fc0244e23044 c...
67404    c_0b197a901f63 c_e438b625e16f c_d7f2fb1ebedf c...
Length: 67405, dtype: object
0.1718
Our max positive score is 0.98043
save 2nd train data


100%|██████████| 53598/53598 [01:08<00:00, 786.37it/s]


Our training set has 5357333 rows
        topic_ids     content_ids  \
0  t_00069b63a70a  c_447222ff490e   
1  t_00069b63a70a  c_f37bb15a7a42   
2  t_00069b63a70a  c_11a1dc0bfb99   
3  t_00069b63a70a  c_7604fa0eda36   
4  t_00069b63a70a  c_de6f5fbe3f97   

                                        topic_inputs  \
0  Transcripts [T_SEP] MIT Blossoms | Engineering...   
1  Transcripts [T_SEP] MIT Blossoms | Engineering...   
2  Transcripts [T_SEP] MIT Blossoms | Engineering...   
3  Transcripts [T_SEP] MIT Blossoms | Engineering...   
4  Transcripts [T_SEP] MIT Blossoms | Engineering...   

                                      content_inputs  target language  \
0  Thermodynamics: Ener..: Written Transcript of ...       0       en   
1  Quantifying the Ener..: Written Transcript of ...       0       en   
2  Flow Charts: Logical..: Written Transcript of ...       1       en   
3  Counting Systems..: Written Transcript of this...       0       en   
4  From Psychology to L..: Written Transc

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Batches:   0%|          | 0/1018 [00:00<?, ?it/s]

Batches:   0%|          | 0/1906 [00:00<?, ?it/s]

Batches:   0%|          | 0/403 [00:00<?, ?it/s]

Batches:   0%|          | 0/941 [00:00<?, ?it/s]

Batches:   0%|          | 0/762 [00:00<?, ?it/s]

Batches:   0%|          | 0/1684 [00:00<?, ?it/s]

100%|██████████| 24367/24367 [01:32<00:00, 262.25it/s]


[5]


32551it [00:03, 8716.35it/s]
12895it [00:01, 8960.63it/s]
24367it [00:03, 7948.56it/s]


0        c_f37bb15a7a42 c_11a1dc0bfb99 c_c900f57fae94 c...
1        c_8790b074383e c_a79ad9f018c8 c_70cee4e16c16 c...
2        c_ade5a4b4e971 c_235f882bc66b c_e9c9073483e6 c...
3        c_c1db2bde6960 c_b8d730238789 c_7868f655c31e c...
4        c_e7daeadb242c c_fbd300d2cc72 c_5f4b5200a725 c...
                               ...                        
69808    c_016def92802a c_26b8ceadf0da c_3bb33bd78c3a c...
69809    c_79903740e1e8 c_e630f8201f14 c_e52a63ed124b c...
69810    c_9e67dc64550d c_4ed38ea2bff2 c_036efdd9e8c1 c...
69811    c_0dd3eab0f444 c_46f852a49c08 c_bb1db83147d6 c...
69812    c_3dfa5cf20a06 c_2c55fbee8ba3 c_157e2611928f c...
Length: 69813, dtype: object
0.5176
Our max positive score is 0.62778
[10]


32551it [00:03, 8540.88it/s]
12895it [00:01, 8892.17it/s]
24367it [00:03, 8107.41it/s]


0        c_f37bb15a7a42 c_11a1dc0bfb99 c_c900f57fae94 c...
1        c_8790b074383e c_a79ad9f018c8 c_70cee4e16c16 c...
2        c_ade5a4b4e971 c_235f882bc66b c_e9c9073483e6 c...
3        c_c1db2bde6960 c_b8d730238789 c_7868f655c31e c...
4        c_e7daeadb242c c_fbd300d2cc72 c_5f4b5200a725 c...
                               ...                        
69808    c_016def92802a c_26b8ceadf0da c_3bb33bd78c3a c...
69809    c_79903740e1e8 c_e630f8201f14 c_e52a63ed124b c...
69810    c_9e67dc64550d c_4ed38ea2bff2 c_036efdd9e8c1 c...
69811    c_0dd3eab0f444 c_46f852a49c08 c_bb1db83147d6 c...
69812    c_3dfa5cf20a06 c_2c55fbee8ba3 c_157e2611928f c...
Length: 69813, dtype: object
0.5083
Our max positive score is 0.76841
[50]


32551it [00:04, 7487.23it/s]
12895it [00:01, 8581.59it/s]
24367it [00:02, 8725.58it/s]


0        c_f37bb15a7a42 c_11a1dc0bfb99 c_c900f57fae94 c...
1        c_8790b074383e c_a79ad9f018c8 c_70cee4e16c16 c...
2        c_ade5a4b4e971 c_235f882bc66b c_e9c9073483e6 c...
3        c_c1db2bde6960 c_b8d730238789 c_7868f655c31e c...
4        c_e7daeadb242c c_fbd300d2cc72 c_5f4b5200a725 c...
                               ...                        
69808    c_016def92802a c_26b8ceadf0da c_3bb33bd78c3a c...
69809    c_79903740e1e8 c_e630f8201f14 c_e52a63ed124b c...
69810    c_9e67dc64550d c_4ed38ea2bff2 c_036efdd9e8c1 c...
69811    c_0dd3eab0f444 c_46f852a49c08 c_bb1db83147d6 c...
69812    c_3dfa5cf20a06 c_2c55fbee8ba3 c_157e2611928f c...
Length: 69813, dtype: object
0.2683
Our max positive score is 0.95286
[100]


32551it [00:03, 8224.83it/s]
12895it [00:01, 8858.00it/s]
24367it [00:03, 7922.53it/s]


0        c_f37bb15a7a42 c_11a1dc0bfb99 c_c900f57fae94 c...
1        c_8790b074383e c_a79ad9f018c8 c_70cee4e16c16 c...
2        c_ade5a4b4e971 c_235f882bc66b c_e9c9073483e6 c...
3        c_c1db2bde6960 c_b8d730238789 c_7868f655c31e c...
4        c_e7daeadb242c c_fbd300d2cc72 c_5f4b5200a725 c...
                               ...                        
69808    c_016def92802a c_26b8ceadf0da c_3bb33bd78c3a c...
69809    c_79903740e1e8 c_e630f8201f14 c_e52a63ed124b c...
69810    c_9e67dc64550d c_4ed38ea2bff2 c_036efdd9e8c1 c...
69811    c_0dd3eab0f444 c_46f852a49c08 c_bb1db83147d6 c...
69812    c_3dfa5cf20a06 c_2c55fbee8ba3 c_157e2611928f c...
Length: 69813, dtype: object
0.1658
Our max positive score is 0.97912
save 2nd train data


100%|██████████| 56755/56755 [01:11<00:00, 788.47it/s]


Our training set has 5673320 rows
        topic_ids     content_ids  \
0  t_00069b63a70a  c_f37bb15a7a42   
1  t_00069b63a70a  c_11a1dc0bfb99   
2  t_00069b63a70a  c_c900f57fae94   
3  t_00069b63a70a  c_de6f5fbe3f97   
4  t_00069b63a70a  c_8d79af4224eb   

                                        topic_inputs  \
0  Transcripts [T_SEP] MIT Blossoms | Engineering...   
1  Transcripts [T_SEP] MIT Blossoms | Engineering...   
2  Transcripts [T_SEP] MIT Blossoms | Engineering...   
3  Transcripts [T_SEP] MIT Blossoms | Engineering...   
4  Transcripts [T_SEP] MIT Blossoms | Engineering...   

                                      content_inputs  target language  \
0  Quantifying the Ener..: Written Transcript of ...       0       en   
1  Flow Charts: Logical..: Written Transcript of ...       1       en   
2  An Introduction to t..: Written Transcript of ...       0       en   
3  From Psychology to L..: Written Transcript of ...       0       en   
4  Averages: Still Flaw..: Written Transc

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Batches:   0%|          | 0/857 [00:00<?, ?it/s]

Batches:   0%|          | 0/1771 [00:00<?, ?it/s]

Batches:   0%|          | 0/416 [00:00<?, ?it/s]

Batches:   0%|          | 0/924 [00:00<?, ?it/s]

Batches:   0%|          | 0/826 [00:00<?, ?it/s]

Batches:   0%|          | 0/1754 [00:00<?, ?it/s]

100%|██████████| 26408/26408 [01:40<00:00, 262.54it/s]


[5]


27402it [00:03, 8757.76it/s]
13305it [00:01, 7998.19it/s]
26408it [00:02, 9190.12it/s]


0        c_9ca4249fa1a8 c_dbef0d3da6d3 c_65ae9a83563d c...
1        c_de6f5fbe3f97 c_447222ff490e c_f37bb15a7a42 c...
2        c_ade5a4b4e971 c_84b23a7b4b8c c_f84bee7dc0d9 c...
3        c_c1db2bde6960 c_d26e23e98356 c_b8d730238789 c...
4        c_2429552008e3 c_22eeee248488 c_699022324f4f c...
                               ...                        
67110    c_67ecb6eebadf c_140c2d4b333c c_e6b95de6962f c...
67111    c_79903740e1e8 c_e630f8201f14 c_1079bc17c6e8 c...
67112    c_4ed38ea2bff2 c_9e67dc64550d c_036efdd9e8c1 c...
67113    c_0dd3eab0f444 c_46f852a49c08 c_bb1db83147d6 c...
67114    c_e438b625e16f c_21e246a8609e c_d7f2fb1ebedf c...
Length: 67115, dtype: object
0.5057
Our max positive score is 0.62022
[10]


27402it [00:03, 8348.83it/s]
13305it [00:01, 7837.53it/s]
26408it [00:03, 8203.94it/s]


0        c_9ca4249fa1a8 c_dbef0d3da6d3 c_65ae9a83563d c...
1        c_de6f5fbe3f97 c_447222ff490e c_f37bb15a7a42 c...
2        c_ade5a4b4e971 c_84b23a7b4b8c c_f84bee7dc0d9 c...
3        c_c1db2bde6960 c_d26e23e98356 c_b8d730238789 c...
4        c_2429552008e3 c_22eeee248488 c_699022324f4f c...
                               ...                        
67110    c_67ecb6eebadf c_140c2d4b333c c_e6b95de6962f c...
67111    c_79903740e1e8 c_e630f8201f14 c_1079bc17c6e8 c...
67112    c_4ed38ea2bff2 c_9e67dc64550d c_036efdd9e8c1 c...
67113    c_0dd3eab0f444 c_46f852a49c08 c_bb1db83147d6 c...
67114    c_e438b625e16f c_21e246a8609e c_d7f2fb1ebedf c...
Length: 67115, dtype: object
0.4934
Our max positive score is 0.75701
[50]


27402it [00:03, 8476.25it/s]
13305it [00:01, 7894.60it/s]
26408it [00:03, 8093.40it/s]


0        c_9ca4249fa1a8 c_dbef0d3da6d3 c_65ae9a83563d c...
1        c_de6f5fbe3f97 c_447222ff490e c_f37bb15a7a42 c...
2        c_ade5a4b4e971 c_84b23a7b4b8c c_f84bee7dc0d9 c...
3        c_c1db2bde6960 c_d26e23e98356 c_b8d730238789 c...
4        c_2429552008e3 c_22eeee248488 c_699022324f4f c...
                               ...                        
67110    c_67ecb6eebadf c_140c2d4b333c c_e6b95de6962f c...
67111    c_79903740e1e8 c_e630f8201f14 c_1079bc17c6e8 c...
67112    c_4ed38ea2bff2 c_9e67dc64550d c_036efdd9e8c1 c...
67113    c_0dd3eab0f444 c_46f852a49c08 c_bb1db83147d6 c...
67114    c_e438b625e16f c_21e246a8609e c_d7f2fb1ebedf c...
Length: 67115, dtype: object
0.2624
Our max positive score is 0.94883
[100]


27402it [00:03, 8423.23it/s]
13305it [00:01, 7551.44it/s]
26408it [00:03, 7343.32it/s]


0        c_9ca4249fa1a8 c_dbef0d3da6d3 c_65ae9a83563d c...
1        c_de6f5fbe3f97 c_447222ff490e c_f37bb15a7a42 c...
2        c_ade5a4b4e971 c_84b23a7b4b8c c_f84bee7dc0d9 c...
3        c_c1db2bde6960 c_d26e23e98356 c_b8d730238789 c...
4        c_2429552008e3 c_22eeee248488 c_699022324f4f c...
                               ...                        
67110    c_67ecb6eebadf c_140c2d4b333c c_e6b95de6962f c...
67111    c_79903740e1e8 c_e630f8201f14 c_1079bc17c6e8 c...
67112    c_4ed38ea2bff2 c_9e67dc64550d c_036efdd9e8c1 c...
67113    c_0dd3eab0f444 c_46f852a49c08 c_bb1db83147d6 c...
67114    c_e438b625e16f c_21e246a8609e c_d7f2fb1ebedf c...
Length: 67115, dtype: object
0.1627
Our max positive score is 0.97785
save 2nd train data


100%|██████████| 53944/53944 [01:07<00:00, 793.56it/s]


Our training set has 5391535 rows
        topic_ids     content_ids  \
0  t_00069b63a70a  c_de6f5fbe3f97   
1  t_00069b63a70a  c_447222ff490e   
2  t_00069b63a70a  c_f37bb15a7a42   
3  t_00069b63a70a  c_c900f57fae94   
4  t_00069b63a70a  c_71d6bae3f656   

                                        topic_inputs  \
0  Transcripts [T_SEP] MIT Blossoms | Engineering...   
1  Transcripts [T_SEP] MIT Blossoms | Engineering...   
2  Transcripts [T_SEP] MIT Blossoms | Engineering...   
3  Transcripts [T_SEP] MIT Blossoms | Engineering...   
4  Transcripts [T_SEP] MIT Blossoms | Engineering...   

                                      content_inputs  target language  \
0  From Psychology to L..: Written Transcript of ...       0       en   
1  Thermodynamics: Ener..: Written Transcript of ...       0       en   
2  Quantifying the Ener..: Written Transcript of ...       0       en   
3  An Introduction to t..: Written Transcript of ...       0       en   
4  Engineering Innovati..: Written Transc