In [32]:
#drive mounting for timestamping
#from google.colab import drive
#drive.mount('/content/drive')
import os

working_directory = '/home/jacky_macpro/workspaces/project_2/'
data_directory = os.path.join(working_directory, 'data/')

In [33]:
import pandas as pd
import numpy as np


In [34]:
topics = pd.read_csv(data_directory + 'topics.csv')
correlations = pd.read_csv(data_directory + 'correlations.csv')
content = pd.read_csv(data_directory + 'content.csv')

In [35]:
submission = pd.read_csv(data_directory + 'sample_submission.csv')

In [36]:
def explore_curriculum_data(topics_df, content):
    """
    Perform exploratory data analysis on curriculum recommendation datasets

    Parameters:
    topics_df (pd.DataFrame): Topics dataset
    content (pd.DataFrame): Content dataset

    Returns:
    dict: Dictionary containing analysis results
    """
    analysis = {}

    # Basic dataset info
    analysis['topics_shape'] = topics_df.shape
    analysis['content_shape'] = content.shape

    # Check missing values
    analysis['topics_missing'] = topics_df.isnull().sum()
    analysis['content_missing'] = content.isnull().sum()

    # Analyze topics dataset
    analysis['topic_level_counts'] = topics_df['level'].value_counts()
    analysis['topic_language_counts'] = topics_df['language'].value_counts()

    # Analyze content dataset
    analysis['content_language_counts'] = content['language'].value_counts()
    analysis['content_kind_counts'] = content['kind'].value_counts()

    # Calculate text length statistics
    content['title_length'] = content['title'].str.len()
    content['description_length'] = content['description'].fillna('').str.len()

    analysis['content_title_stats'] = content['title_length'].describe()
    analysis['content_desc_stats'] = content['description_length'].describe()

    # Check relationships
    topics_per_language = topics_df.groupby('language').size()
    content_per_language = content.groupby('language').size()
    analysis['topics_to_content_ratio'] = (topics_per_language / content_per_language).fillna(0)

    return analysis

# Run the analysis
results = explore_curriculum_data(topics, content)

# Print key findings
print("Dataset Dimensions:")
print(f"Topics dataset: {results['topics_shape'][0]:,} rows × {results['topics_shape'][1]} columns")
print(f"Content dataset: {results['content_shape'][0]:,} rows × {results['content_shape'][1]} columns\n")

print("Topic Levels Distribution:")
print(results['topic_level_counts'])
print("\nTop 5 Languages by Topic Count:")
print(results['topic_language_counts'].head())
print("\nTop 5 Content Types:")
print(results['content_kind_counts'].head())
print("\nContent Title Length Statistics:")
print(results['content_title_stats'].round(2))

Dataset Dimensions:
Topics dataset: 76,972 rows × 9 columns
Content dataset: 154,047 rows × 8 columns

Topic Levels Distribution:
level
4     38810
3     14898
5      9744
6      6210
2      4874
1      1104
7      1028
0       171
8       119
9        12
10        2
Name: count, dtype: int64

Top 5 Languages by Topic Count:
language
en    36161
es    13910
pt     4177
ar     3701
fr     3701
Name: count, dtype: int64

Top 5 Content Types:
kind
video       61487
document    33873
html5       32563
exercise    25925
audio         199
Name: count, dtype: int64

Content Title Length Statistics:
count    154038.00
mean         33.83
std          18.11
min           1.00
25%          21.00
50%          31.00
75%          44.00
max         177.00
Name: title_length, dtype: float64


In [37]:
print(topics.shape)
topics.head(10)

(76972, 9)


Unnamed: 0,id,title,description,channel,category,level,language,parent,has_content
0,t_00004da3a1b2,Откриването на резисторите,"Изследване на материали, които предизвикват на...",000cf7,source,4,bg,t_16e29365b50d,True
1,t_000095e03056,Unit 3.3 Enlargements and Similarities,,b3f329,aligned,2,en,t_aa32fb6252dc,False
2,t_00068291e9a4,Entradas e saídas de uma função,Entenda um pouco mais sobre funções.,8e286a,source,4,pt,t_d14b6c2a2b70,True
3,t_00069b63a70a,Transcripts,,6e3ba4,source,3,en,t_4054df11a74e,True
4,t_0006d41a73a8,Графики на експоненциални функции (Алгебра 2 н...,Научи повече за графиките на сложните показате...,000cf7,source,4,bg,t_e2452e21d252,True
5,t_0008768bdee6,100 સુધીનો સરવાળો,37 અને 49 જેવી બે-અંકની સંખ્યાઓ ઉમેરતા શીખો.,5223e0,supplemental,4,gu,t_0da7a331d666,True
6,t_0008a1bd84ba,12. 20: Bird Reproduction,,ebc86c,supplemental,5,en,t_c44ac9711007,True
7,t_000c0c854f0b,Nepal,,f2e966,source,1,en,t_f9c4ef0d6290,False
8,t_000d1fb3f2f5,2.1.2 - Logarithms,,e77b55,aligned,5,en,t_b897d168db90,True
9,t_000feba42136,As vacinas,,8e286a,source,4,pt,t_dfc8ec591573,True


In [38]:
print(content.shape)
content.head(10)

(154047, 10)


Unnamed: 0,id,title,description,kind,text,language,copyright_holder,license,title_length,description_length
0,c_00002381196d,"Sumar números de varios dígitos: 48,029+233,930","Suma 48,029+233,930 mediante el algoritmo está...",video,,es,,,48.0,53
1,c_000087304a9e,Trovare i fattori di un numero,Sal trova i fattori di 120.\n\n,video,,it,,,30.0,29
2,c_0000ad142ddb,Sumar curvas de demanda,Cómo añadir curvas de demanda\n\n,video,,es,,,23.0,31
3,c_0000c03adc8d,Nado de aproximação,Neste vídeo você vai aprender o nado de aproxi...,document,\nNado de aproximação\nSaber nadar nas ondas ...,pt,Sikana Education,CC BY-NC-ND,19.0,118
4,c_00016694ea2a,geometry-m3-topic-a-overview.pdf,geometry-m3-topic-a-overview.pdf,document,Estándares Comunes del Estado de Nueva York\n\...,es,Engage NY,CC BY-NC-SA,32.0,32
5,c_00019840d110,5.12E: Regulation of the Calvin Cycle,,html5,LEARNING OBJECTIVES\n\nOutline the three major...,en,CSU and Merlot,CC BY-NC-SA,37.0,0
6,c_0001ec56e20f,Reflexionemos sobre lo que vemos y escuchamos,,document,Lección\n\n7\n\nReflexionemos sobre lo que\nve...,es,Publicado por el Lic. Edelberto Andino(edelber...,CC BY-NC-SA,45.0,0
7,c_00025aaa1533,अंग्रेजी ओके प्लीज 1.2,source_url=http://www.prathamopenschool.org/Co...,video,,mr,,,22.0,76
8,c_00027d03ca7d,4.E: Genomes and Chromosomes (Exercises),,html5,4.3\n\n(BPA) Answer the following questions wi...,en,CSU and Merlot,CC BY-NC-SA,40.0,0
9,c_000314eb850f,La banca 12: los bonos del tesoro (deuda pública),Introducción a la deuda y los fondos públicos....,video,,es,,,49.0,194


In [39]:
correlations.head()

Unnamed: 0,topic_id,content_ids
0,t_00004da3a1b2,c_1108dd0c7a5d c_376c5a8eb028 c_5bc0e1e2cba0 c...
1,t_00068291e9a4,c_639ea2ef9c95 c_89ce9367be10 c_ac1672cdcd2c c...
2,t_00069b63a70a,c_11a1dc0bfb99
3,t_0006d41a73a8,c_0c6473c3480d c_1c57a1316568 c_5e375cf14c47 c...
4,t_0008768bdee6,c_34e1424229b4 c_7d1a964d66d5 c_aab93ee667f4


In [40]:
#print(submission.shape)
submission.head()

Unnamed: 0,topic_id,content_ids
0,t_00004da3a1b2,c_1108dd0c7a5d c_376c5a8eb028 c_5bc0e1e2cba0 c...
1,t_00068291e9a4,c_639ea2ef9c95 c_89ce9367be10 c_ac1672cdcd2c c...
2,t_00069b63a70a,c_11a1dc0bfb99
3,t_0006d41a73a8,c_0c6473c3480d c_1c57a1316568 c_5e375cf14c47 c...
4,t_4054df11a74e,c_3695c5dc1df6 c_f2d184a98231


In [41]:
import os
import sys
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from transformers import AutoConfig, AutoModel
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

# Constant for defining sequence length
MAX_LEN = 128  # Adjust as required


class VecModel(nn.Module):
    """
    VecModel is a neural network wrapper for a pretrained transformer model,
    with optional batch normalization and linear projection layer for output vectors.

    Attributes:
    -----------
    backbone : torch.nn.Module
        The base transformer model loaded from Hugging Face's transformers library.
    has_top : bool
        Flag to determine if a batch normalization and linear projection layer should be applied.
    bn : torch.nn.BatchNorm1d (optional)
        Batch normalization layer for the output vector.
    top : torch.nn.Linear (optional)
        Linear layer to apply a projection to the output vector if `has_top` is True.
    """

    def __init__(self, model_name: str, size: int, has_top: bool = True):
        """
        Initializes the VecModel with a specified transformer model backbone,
        batch normalization, and projection layer.

        Parameters:
        -----------
        model_name : str
            Name of the pretrained model to be used as backbone.
        size : int
            Dimension of the output vector after the transformer model.
        has_top : bool, default=True
            Indicates if batch normalization and projection layer are included.
        """
        super(VecModel, self).__init__()
        config = AutoConfig.from_pretrained(model_name)
        self.backbone = AutoModel.from_config(config)
        self.has_top = has_top

        # Define batch normalization and projection layers if has_top is True
        if self.has_top:
            self.bn = nn.BatchNorm1d(size)
            self.top = nn.Linear(size, size)

    def forward(self, ids: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
        """
        Defines the forward pass through the model.

        Parameters:
        -----------
        ids : torch.Tensor
            Tensor containing token ids.
        mask : torch.Tensor
            Attention mask to indicate which tokens are padded.

        Returns:
        --------
        torch.Tensor
            Normalized output vector after optional batch normalization and projection.
        """
        # Pass input through the transformer model
        outputs = self.backbone(input_ids=ids, attention_mask=mask)[0]

        # Masking and mean pooling on the hidden states
        masked_output = (outputs[:, 1:MAX_LEN//2, :] * mask[:, 1:MAX_LEN//2, None])
        out = masked_output.mean(dim=1)

        # Apply batch normalization and projection if required
        if self.has_top:
            out = self.top(self.bn(out))

        return F.normalize(out, p=2, dim=1)

    def save(self, path: str):
        """
        Saves the model state to the specified file path.

        Parameters:
        -----------
        path : str
            File path where the model state should be saved.
        """
        torch.save(self.state_dict(), path)

    def load(self, path: str):
        """
        Loads the model state from the specified file path.

        Parameters:
        -----------
        path : str
            File path from which to load the model state.
        """
        self.load_state_dict(torch.load(path, map_location='cpu'))


In [42]:
#ignoring the topics without content
print('before dropping', topics.shape)
topics = topics[topics["has_content"]].reset_index(drop=True)
print('after dropping',topics.shape)

before dropping (76972, 9)
after dropping (61517, 9)


In [43]:
#creating mapping dictionaries for parent and title
title_map = topics.set_index("id")["title"].to_dict()
parent_map = topics.set_index("id")["parent"].to_dict()

#addign parent title columns
topics["parent_title"] = topics["parent"].apply(lambda x: title_map.get(x, ""))
print(topics.shape)

#addding grandpa title
topics["grandpa"] = topics["parent"].apply(lambda x: parent_map.get(x))
topics["grandpa_title"] = topics["grandpa"].apply(lambda x: title_map.get(x, ""))
print(topics.shape)

#adding great grandpa title
topics["ggrandpa"] = topics["grandpa"].apply(lambda x: parent_map.get(x))
topics["ggrandpa_title"] = topics["ggrandpa"].apply(lambda x: title_map.get(x, ""))
print(topics.shape)

#fill in missing parent values
topics["parent"] = topics["parent"].fillna(topics["id"])

sub_df = pd.read_csv(data_directory + "sample_submission.csv")

topics["sub"] = topics["id"].isin(sub_df["topic_id"])
topics.shape


(61517, 10)
(61517, 12)
(61517, 14)


(61517, 15)

In [44]:
def extract_number(x, is_subtopic=False):
    chapter, rest = x.split(":", 1)

    if is_subtopic:
        chapter = chapter.split(".", 1)[0]
        x = f"{chapter}:{rest}"
    chapter = int(chapter)

    return chapter, x

def extract_chapters(df, is_subtopic):
    titles = df["title"].fillna("").values

    chapters = np.zeros(len(titles))
    new_titles = np.array(titles)

    for i, title in enumerate(titles):
        try:
            chapters[i], new_titles[i] = extract_number(title, is_subtopic)
        except:
            pass

    df["chapter"], df["title"] = chapters, new_titles
    df["chapter"] = df["chapter"].astype(int)

    df.loc[df["chapter"] == 0, "chapter"] = None
    return df

content["t"] = content["title"].fillna("") + " | " + content["kind"].fillna("") + " | " + content["description"].fillna("")

topics = extract_chapters(topics, False)
content = extract_chapters(content, True)

topics["t"] = topics["title"] + " @ " + topics["parent_title"] + " @ " + topics["grandpa_title"] + " @ " + topics["ggrandpa_title"] + " | " + topics["description"].fillna("")



In [45]:
corr_df = pd.read_csv(data_directory + "correlations.csv")

corr_df["content_ids"] = corr_df["content_ids"].apply(lambda x: x.split())

corr_df = corr_df.explode('content_ids').reset_index(drop=True).rename(columns={"content_ids": "content_id"})

corr_df.head()

Unnamed: 0,topic_id,content_id
0,t_00004da3a1b2,c_1108dd0c7a5d
1,t_00004da3a1b2,c_376c5a8eb028
2,t_00004da3a1b2,c_5bc0e1e2cba0
3,t_00004da3a1b2,c_76231f9d0b5e
4,t_00068291e9a4,c_639ea2ef9c95


In [46]:
# google/mt5-base, good for multilingual
# google/electra-base-discriminator OR google/electra-smnall-discriminator
# xlnet-large-cased OR xlnet-base-cased
# roberta-base OR roberta-large

from transformers import AutoTokenizer

# used later in inference
MODEL_NAME = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [47]:
#initialize model weight
model_w = [1.0]

#define array of models (expansion for later use)
models = [VecModel(MODEL_NAME, 768, has_top=False)]

# evaluate
for model in models:
    model.eval()


#models[0].load("/kaggle/input/lecr-models-v36-mini-full/vec_model_v36_mini_full.pth")

In [48]:
import torch
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from tqdm import tqdm

# matching the nearest neighbors for the content df, takes in topics/content (vectorized/text)
def get_matches(V_topic, V_content, topic_ids, content_ids, n_neighbors=5):
    # Initialize the NearestNeighbors model with parallel processing (n_jobs=-1 for all CPU cores)
    neighbors_model = NearestNeighbors(n_neighbors=n_neighbors, metric='cosine', n_jobs=-1)
    neighbors_model.fit(V_content)
    
    # Perform the nearest neighbor search
    dists, indices = neighbors_model.kneighbors(V_topic)
    
    # Create the result dataframe
    res_df = pd.DataFrame({
        "topic_id": np.repeat(topic_ids, n_neighbors),
        'content_id': content_ids[indices.ravel()],
        'vec_dist': dists.ravel()
    })
    
    return res_df

# Convert string to vector (with GPU acceleration for inference)
def inference_one(text, device):
    encoding = tokenizer(text, truncation=True, max_length=MAX_LEN)
    encoding = {key: torch.as_tensor(val).unsqueeze(0).to(device) for key, val in encoding.items()}

    with torch.no_grad():
        ids, mask = encoding["input_ids"], encoding["attention_mask"]
        vec = torch.cat([model_w[i] * models[i](ids, mask) for i in range(len(models))], axis=1)

    return vec.detach().cpu().numpy()[0]  # Move tensor back to CPU after computation

# Applies inference_one to entire df (using GPU for inference)
def inference(df, device):
    V = np.zeros((df.shape[0], 768))  # Assuming 768 is the output vector size

    # Process each text and generate the corresponding vector, using GPU for inference
    for i, text in tqdm(enumerate(df["t"].values), total=df.shape[0]):
        V[i] = inference_one(text, device)

    return V

# Assuming you have the topics dataframe (topics) already
topic_sub_df = topics[topics["sub"]].reset_index(drop=True)

# Initialize the device (GPU) - You can set this to 'cuda' for Nvidia or 'cpu' for AMD (ROCm compatible)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Get vectorized topics using GPU for inference
V_topic = inference(topic_sub_df, device)
V_topic.shape


100%|██████████| 5/5 [00:01<00:00,  4.61it/s]


(5, 768)

In [49]:
#content created by language
content = content[content["language"].isin(topic_sub_df["language"].unique())].reset_index(drop=True)
content.shape

(82424, 12)

In [50]:
# vectorize content as well inference(topic_sub_df, tokenizer, model_w, models, MAX_LEN)
V_content = inference(content, device)
V_content.shape

100%|██████████| 82424/82424 [2:48:37<00:00,  8.15it/s]


(82424, 768)

In [51]:
# only use the topics that are not subtopics
train_topics_df = topics[~topics["sub"]].reset_index(drop=True)

# vectorize primary topics (excludes subtopics)
V_topic_train = inference(train_topics_df, device)
V_topic_train.shape



100%|██████████| 61512/61512 [2:09:11<00:00,  7.94it/s]


(61512, 768)

In [52]:
res_dfs = list()

# aquiring matches by language (don't want to recommend Russian Geometry videos to American 6th graders)
for lang in topic_sub_df["language"].unique():
    topic_ix = np.where(topic_sub_df["language"] == lang)[0]
    content_ix = np.where(content["language"] == lang)[0]

    res_dfs.append(get_matches(V_topic[topic_ix], V_content[content_ix],
                               topic_sub_df["id"].values[topic_ix],
                               content["id"].values[content_ix],
                               n_neighbors=20))

# resulting dataframe
res_df = pd.concat(res_dfs)
res_df.shape

(100, 3)

In [53]:
# function for obtaining best score for each content item in the dataset when computing the nearest topic
def get_max_train(df, V):
    max_train_scores = np.zeros(df.shape[0])
    matches = np.zeros(df.shape[0])

    for lang in df["language"].unique():
        topic_ix = np.where(train_topics_df["language"] == lang)[0]
        ix = np.where(df["language"] == lang)[0]

        neighbors_model = NearestNeighbors(n_neighbors=1, metric='cosine', n_jobs=-1)
        neighbors_model.fit(V_topic_train[topic_ix])
        dists, idx = neighbors_model.kneighbors(V[ix])

        max_train_scores[ix] = dists.ravel()
        matches[ix] = topic_ix[idx.ravel()]

    return max_train_scores, matches

In [54]:
# getting the model where
content_max_train_scores, _ = get_max_train(content, V_content)

content["max_train_score"] = content_max_train_scores

In [55]:
topic_max_train_scores, topic_matches = get_max_train(topic_sub_df, V_topic)

topic_sub_df["max_train_score"] = topic_max_train_scores
topic_sub_df["matched_topic"] = train_topics_df["id"].values[topic_matches.astype(int)]

In [56]:
del V_content, V_topic, V_topic_train
del models

import gc
gc.collect()

1199

In [57]:
second_degree_match_df = topic_sub_df.merge(corr_df, left_on="matched_topic", right_on="topic_id")[["id", "content_id"]]
second_degree_match_df["second_degree"] = True
second_degree_match_df.rename(columns={"id": "topic_id"}, inplace=True)
second_degree_match_df.shape

(22, 3)

In [58]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sparse_dot_topn import awesome_cossim_topn

# applies the same transformations to the entire network but this time uses tfidf for the concatenated topics
def get_matches(topics, content):
    tfidf = TfidfVectorizer(analyzer="char", ngram_range=(4, 4), min_df=2)

    tfidf.fit(pd.concat([content["t"], topics["t"]]))
    print(len(tfidf.vocabulary_), topics.shape[0], content.shape[0])

    topics = topics[topics["sub"]].reset_index(drop=True)

    V_topic = tfidf.transform(topics["title"] + " " + topics["t"])
    V_content = tfidf.transform(content["title"] + " " + content["t"])

    d = awesome_cossim_topn(V_topic, V_content.T, 20, 0.01, use_threads=True, n_jobs=4)

    non_zeros = d.nonzero()

    name_indices = non_zeros[0]
    gt_indices = non_zeros[1]

    content_ids = content["id"].values
    topic_ids = topics["id"].values

    left_side = np.empty(gt_indices.size, dtype=object)
    right_side = np.empty(gt_indices.size, dtype=object)
    match_score = np.zeros(gt_indices.size)

    for index in range(gt_indices.size):
        left_side[index] = topic_ids[name_indices[index]]
        right_side[index] = content_ids[gt_indices[index]]
        match_score[index] = d.data[index]

    res_df = pd.DataFrame({"topic_id": left_side,
                           'content_id': right_side,
                           'match_score': match_score
                          })
    return res_df

In [59]:
# calculating matches for each language
res_dfs = list()

for lang in topic_sub_df["language"].unique():
    print(lang)
    content_lang = content[(content["language"] == lang)].reset_index(drop=True)
    topics_lang = topics[(topics["language"] == lang)].reset_index(drop=True)

    if content_lang.shape[0] > 0:
        res_dfs.append(get_matches(topics_lang, content_lang))

    print("----")

res_df2 = pd.concat(res_dfs)
res_df2.shape

bg
33279 2420 6050


  d = awesome_cossim_topn(V_topic, V_content.T, 20, 0.01, use_threads=True, n_jobs=4)


----
pt
35721 3425 10435
----
en
92662 28053 65939
----


(100, 3)

In [60]:
# rename columns to avoid name clashes when merging
topics.columns = [f"topic_{col}" for col in topics.columns]
content.columns = [f"content_{col}" for col in content.columns]

corr_df["target"] = 1

topics["key"] = topics["topic_title"].fillna("") + " | " + topics["topic_description"].fillna("")

train_df = topics[~topics["topic_sub"]]

#create table 'lookup' to indicate duplicated relationships between content and topics
lookup = train_df.merge(corr_df, on="topic_id").groupby(["key", "content_id"])["topic_channel"].count().reset_index()
lookup.rename(columns={"topic_channel":"dup_count"}, inplace=True)


dup_df = topics[topics["topic_sub"]][["topic_id", "key"]].merge(lookup, on=["key"])[["topic_id", 'content_id', "dup_count"]]
dup_df

Unnamed: 0,topic_id,content_id,dup_count
0,t_00069b63a70a,c_01a0e6e59063,1
1,t_00069b63a70a,c_037b8be79422,1
2,t_00069b63a70a,c_03b5ed305bcc,1
3,t_00069b63a70a,c_04a65a739d47,1
4,t_00069b63a70a,c_05ff8bd1fd30,1
...,...,...,...
149,t_00069b63a70a,c_fbb631d460b8,2
150,t_00069b63a70a,c_fbf85d018b8a,1
151,t_00069b63a70a,c_fda21411f22d,1
152,t_00069b63a70a,c_fe0ef07ada86,1


In [61]:
res_df2.rename(columns={'match_score':'match_score_df2'}, inplace=True)

In [62]:
dup_df.rename(columns={'pdup_count':'pdup_count_dup'}, inplace=True)

In [63]:
# Define working directory and ensure the folder exists
timestamps_dir = os.path.join(working_directory, 'timestamps_1.1/')
os.makedirs(timestamps_dir, exist_ok=True)

# 1. Save res_df to CSV
res_df.to_csv(os.path.join(timestamps_dir, 'res_df.csv'), index=False)
res_df2.to_csv(os.path.join(timestamps_dir, 'res_df2.csv'), index=False)

# 2. Save dup_df to CSV
dup_df.to_csv(os.path.join(timestamps_dir, 'dup_df.csv'), index=False)

# 3. Save second_degree_match_df to CSV
second_degree_match_df.to_csv(os.path.join(timestamps_dir, 'second_degree_match_df.csv'), index=False)

# 4. Save topics DataFrame to CSV
topics.to_csv(os.path.join(timestamps_dir, 'topics.csv'), index=False)
train_df.to_csv(os.path.join(timestamps_dir, 'train_df.csv'), index=False)
lookup.to_csv(os.path.join(timestamps_dir, 'lookup.csv'), index=False)

# 5. Save corr_df to CSV
corr_df.to_csv(os.path.join(timestamps_dir, 'corr_df.csv'), index=False)

# 6. Save content DataFrame to CSV
content.to_csv(os.path.join(timestamps_dir, 'content.csv'), index=False)

# 7. Save topic_sub_df DataFrame to CSV
topic_sub_df.to_csv(os.path.join(timestamps_dir, 'topic_sub_df.csv'), index=False)



In [64]:
# 1. Read res_df from CSV
res_df = pd.read_csv(os.path.join(timestamps_dir, 'res_df.csv'))
res_df2 = pd.read_csv(os.path.join(timestamps_dir, 'res_df2.csv'))

# 2. Read dup_df from CSV
dup_df = pd.read_csv(os.path.join(timestamps_dir, 'dup_df.csv'))

# 3. Read second_degree_match_df from CSV
second_degree_match_df = pd.read_csv(os.path.join(timestamps_dir, 'second_degree_match_df.csv'))

# 4. Read topics DataFrame from CSV
topics = pd.read_csv(os.path.join(timestamps_dir, 'topics.csv'))
train_df = pd.read_csv(os.path.join(timestamps_dir, 'train_df.csv'))
lookup = pd.read_csv(os.path.join(timestamps_dir, 'lookup.csv'))


# 5. Read corr_df from CSV
corr_df = pd.read_csv(os.path.join(timestamps_dir, 'corr_df.csv'))

# 6. Read content DataFrame from CSV
content = pd.read_csv(os.path.join(timestamps_dir, 'content.csv'))

# 7. Read topic_sub_df DataFrame from CSV
topic_sub_df = pd.read_csv(os.path.join(timestamps_dir, 'topic_sub_df.csv'))

In [65]:
res_df = res_df.merge(res_df2, how="outer", on=["topic_id", "content_id"])
res_df = res_df.merge(dup_df, how="outer", on=["topic_id", "content_id"])
res_df = res_df.merge(second_degree_match_df, how="outer", on=["topic_id", "content_id"])
#res_df.shape


topics["key"] = topics["topic_title"].fillna("")

train_df = topics[~topics["topic_sub"]]


lookup = train_df.merge(corr_df, on="topic_id").groupby(["key", "content_id"])["topic_channel"].count().reset_index()
lookup.rename(columns={"topic_channel": "tdup_count"}, inplace=True)

dup_df = topics[topics["topic_sub"]].merge(lookup, on=["key"])[["topic_id", 'content_id', "tdup_count"]]

res_df = res_df.merge(dup_df, how="outer", on=["topic_id", "content_id"])
#res_df.shape

topics["key"] = topics["topic_parent"].fillna(topics["topic_id"])

train_df = topics[~topics["topic_sub"]]


lookup = train_df.merge(corr_df, on="topic_id").groupby(["key", "content_id"])["topic_channel"].count().reset_index()
lookup.rename(columns={"topic_channel": "pdup_count"}, inplace=True)

dup_df = topics[topics["topic_sub"]].merge(lookup, on=["key"])[["topic_id", 'content_id', "pdup_count"]]
dup_df.rename(columns={'pdup_count':'pdup_count_dup'}, inplace=True) 
res_df = res_df.merge(dup_df, how="outer", on=["topic_id", "content_id"])
#res_df.shape


res_df = res_df.merge(topics[topics["topic_sub"]], on="topic_id", how="left")
res_df = res_df.merge(content, on="content_id", how="left")

#res_df.shape



res_df["topic_language"] = res_df["topic_language"].astype("category")
res_df["topic_category"] = res_df["topic_category"].astype("category")
res_df["content_kind"] = res_df["content_kind"].astype("category")
res_df["topic_channel"] = res_df["topic_channel"].astype("category")

res_df["len_topic_title"] = res_df["topic_title"].fillna("").apply(len)
res_df["len_topic_description"] = res_df["topic_description"].fillna("").apply(len)
res_df["len_content_title"] = res_df["content_title"].fillna("").apply(len)
res_df["len_content_description"] = res_df["content_description"].fillna("").apply(len)
res_df["len_content_text"] = res_df["content_text"].fillna("").apply(len)
res_df["match_score_max"] = res_df.groupby("topic_id")["match_score_df2"].transform("max")
res_df["match_score_min"] = res_df.groupby("topic_id")["match_score_df2"].transform("min")

res_df["vec_dist_max"] = res_df.groupby("topic_id")["vec_dist"].transform("max")
res_df["vec_dist_min"] = res_df.groupby("topic_id")["vec_dist"].transform("min")

res_df["dup_count"] = res_df["dup_count"].fillna(0)
res_df["total_count"] = res_df.groupby("topic_id")["content_id"].transform("count")
res_df["dup_count_mean"] = res_df.groupby("topic_id")["dup_count"].transform("mean")

res_df["tdup_count"] = res_df["tdup_count"].fillna(0)
res_df["tdup_count_mean"] = res_df.groupby("topic_id")["tdup_count"].transform("mean")

res_df["pdup_count"] = res_df["pdup_count_dup"].fillna(0)
res_df["pdup_count_mean"] = res_df.groupby("topic_id")["pdup_count_dup"].transform("mean")

res_df["same_chapter"] = res_df["topic_chapter"] == res_df["content_chapter"]
res_df["starts_same"] = res_df["topic_title"].apply(lambda x: x.split(" ", 1)[0]) == res_df["content_title"].apply(lambda x: str(x).split(" ", 1)[0])

#res_df["content_is_train"] = res_df["content_is_train"].astype(bool)

#res_df.loc[~res_df["content_is_train"], "content_max_train_score"] = None
res_df["second_degree"].fillna(False, inplace=True)
res_df["topic_max_train_score"] = res_df["topic_id"].map(topic_sub_df.set_index("id")["max_train_score"].to_dict())

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  res_df["second_degree"].fillna(False, inplace=True)
  res_df["second_degree"].fillna(False, inplace=True)


In [66]:
res_df.columns

Index(['topic_id', 'content_id', 'vec_dist', 'match_score_df2', 'dup_count',
       'second_degree', 'tdup_count', 'pdup_count_dup', 'topic_title',
       'topic_description', 'topic_channel', 'topic_category', 'topic_level',
       'topic_language', 'topic_parent', 'topic_has_content',
       'topic_parent_title', 'topic_grandpa', 'topic_grandpa_title',
       'topic_ggrandpa', 'topic_ggrandpa_title', 'topic_sub', 'topic_chapter',
       'topic_t', 'key', 'content_title', 'content_description',
       'content_kind', 'content_text', 'content_language',
       'content_copyright_holder', 'content_license', 'content_title_length',
       'content_description_length', 'content_t', 'content_chapter',
       'content_max_train_score', 'len_topic_title', 'len_topic_description',
       'len_content_title', 'len_content_description', 'len_content_text',
       'match_score_max', 'match_score_min', 'vec_dist_max', 'vec_dist_min',
       'total_count', 'dup_count_mean', 'tdup_count_mean', 'pdu

In [69]:
from sklearn.model_selection import GroupKFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgb

N_FOLDS = 4
gkf = GroupKFold(N_FOLDS)


lgb_param = {"objective": "binary",
             "metric": 'auc',
             "boosting": "gbdt",
             "learning_rate": 0.05,
             "is_unbalance": True,
             "min_data_in_leaf": 1024,
             "num_leaves": 31,
             "feature_fraction": 0.8,
             "subsample": 0.5,
             "subsample_freq": 1,
            }

y_oof = np.zeros(res_df.shape[0])

features = ["match_score_df2", "match_score_max", "match_score_min",
            "vec_dist", "vec_dist_max", "vec_dist_min", "len_content_text",
            "len_topic_title", "len_topic_description", "len_content_title", "len_content_description",
             "dup_count", "total_count", "dup_count_mean",
            "tdup_count", "tdup_count_mean", "pdup_count", "pdup_count_mean",
            "topic_language", "topic_category", "content_kind", "topic_level",
            "same_chapter", "starts_same", "topic_channel", #'content_is_train',
            "content_max_train_score", "topic_max_train_score", "second_degree"
           ]

scores = []


for f, (train_ind, val_ind) in enumerate(gkf.split(res_df, groups=res_df["topic_id"])):
    print("Fold", f)
    train_df, val_df = res_df.iloc[train_ind].reset_index(drop=True), res_df.iloc[val_ind].reset_index(drop=True)

    lgb_train = lgb.Dataset(train_df[features], train_df["target"])
    lgb_val = lgb.Dataset(val_df[features], val_df["target"])

    model = lgb.train(lgb_param, lgb_train, num_boost_round=800, #early_stopping_rounds=20,
                      valid_sets=[lgb_val],
                      verbose_eval=200)
    model.save_model(working_directory + f'/lgbv1/lgb_{f}.txt')

    y_oof[val_ind] += model.predict(val_df[features])
    scores.append(roc_auc_score(val_df["target"], y_oof[val_ind]))
    print()

print(scores)

Fold 0


KeyError: 'target'

In [None]:
# lgbm time hehehe
import lightgbm as lgb

N_FOLDS = 4
res_df["pred"] = 0

features = ["match_score", "match_score_max", "match_score_min",
            "vec_dist", "vec_dist_max", "vec_dist_min", "len_content_text",
            "len_topic_title", "len_topic_description", "len_content_title",
            "len_content_description", "dup_count", "total_count", "dup_count_mean",
            "tdup_count", "tdup_count_mean", "pdup_count", "pdup_count_mean",
            "topic_language", "topic_category", "content_kind", "topic_level",
            "same_chapter", "starts_same", "topic_channel", #"content_is_train", 
            "content_max_train_score", "topic_max_train_score", "second_degree"
           ]

model_dir = working_directory + 'lgbv1/'

for f in range(N_FOLDS):
    lgb_model = lgb.Booster(model_file=model_dir + f"lgb_{f}.txt")
    res_df["pred"] += lgb_model.predict(res_df[features]) / N_FOLDS