In [None]:
# Citation link prediction: all-MiniLM-L6-v2 sentence embeddings + LightGBM

import numpy as np
import pandas as pd
from pathlib import Path
import logging
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import gc 
import random 


#!pip3 install lightgbm
import lightgbm as lgb
from sklearn.metrics import roc_auc_score, accuracy_score


#!pip3 install sentence-transformers


from sentence_transformers import SentenceTransformer

# Timestamped, INFO-level log lines for everything that runs below.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s : %(levelname)s : %(message)s",
)

# Folder holding the input text files; the embedding cache and the
# submission file are written into the same folder.
data_path = Path("/kaggle/working") / "nlp-cse-uoi-2025" / "data_new"

# Total number of article pairs (positives plus negatives) sampled for training.
TARGET_TOTAL_SAMPLES = 800 * 1000
# Sentence-transformers checkpoint used to embed the abstracts.
SBERT_MODEL_NAME = "all-MiniLM-L6-v2"
# Number of components in one abstract embedding.
EMBEDDING_DIM = 384


print("Loading data...")


def _read_id_text_file(path, text_column):
    """Load a ``<article_id>|--|<text>`` file into a two-column DataFrame.

    Args:
        path: Text file with one record per line.
        text_column: Name of the column that receives the text part.

    Returns:
        DataFrame with an int32 ``article_id`` column and ``text_column``.

    Raises:
        ValueError: If a non-empty line has no ``|--|`` separator or its id
            part is not an integer.
    """
    article_ids = []
    texts = []
    with open(path, encoding="utf-8") as f:
        for line_no, raw_line in enumerate(f, start=1):
            line = raw_line.rstrip("\n")
            if not line:
                continue
            # Split on the first separator only, so text that itself contains
            # "|--|" stays in one piece instead of producing a ragged row.
            article_id, separator, text = line.partition("|--|")
            if not separator:
                raise ValueError(f"{path}:{line_no}: missing '|--|' separator")
            article_ids.append(int(article_id))
            texts.append(text)
    return pd.DataFrame({
        "article_id": np.array(article_ids, dtype=np.int32),
        text_column: texts,
    })


authors = _read_id_text_file(data_path / "authors.txt", "authors")

edgelist = pd.read_csv(data_path / "edgelist.txt", names=["article_id", "cited_id"], header=None, sep=",", dtype=np.int32)

abstracts = _read_id_text_file(data_path / "abstracts.txt", "abstract")

test_df = pd.read_csv(data_path / "test.txt", header=None, names=['col1', 'col2'], dtype=np.int32)

# Explicit check instead of `assert`, which is stripped when Python runs with -O.
if len(authors) != len(abstracts):
    raise ValueError(
        f"authors.txt has {len(authors)} records but abstracts.txt has {len(abstracts)}"
    )

data = authors.merge(abstracts, on="article_id")
if len(data) != len(authors):
    # The inner join drops ids present in only one file; report it rather than
    # losing rows silently.
    logging.warning(
        "Merge kept %d of %d articles; some ids are missing from one of the files.",
        len(data), len(authors),
    )

# Row position in `data` is the lookup key for per-article arrays built later,
# so the order is fixed by sorting on article_id.
data = data.sort_values(by='article_id').reset_index(drop=True)

# article_id -> row position in `data`.
article_id_to_idx = pd.Series(data.index, index=data['article_id']).to_dict()

del authors, abstracts
gc.collect()

data.info(verbose=True, memory_usage="deep")
print(test_df.head())


# NOTE(review): this directory is created below but never passed to
# SentenceTransformer -- confirm whether it was meant as the model cache folder.
sbert_pretrained_dir = data_path / "sbert_pretrained"
# Cache file for the abstract embeddings, one row per row of `data`.
sbert_means_csv = data_path / f"sbert_mean_vec_{SBERT_MODEL_NAME.replace('/', '_')}.csv"

sbert_mean_vectors = None

if sbert_means_csv.exists():
    print(f"Loading pre-computed SBERT mean vectors from {sbert_means_csv}")
    sbert_mean_vectors = pd.read_csv(sbert_means_csv, header=None).values.astype(np.float32)
else:
    sbert_pretrained_dir.mkdir(parents=True, exist_ok=True)
    print(f"Downloading/Loading SBERT model: {SBERT_MODEL_NAME}... This may take a while.")

    model_sbert = SentenceTransformer(SBERT_MODEL_NAME)

    print("Computing SBERT mean vectors for abstracts...")

    # Abstracts are encoded in `data` row order, so row i of the result
    # belongs to data.iloc[i].
    sbert_mean_vectors = model_sbert.encode(data['abstract'].tolist(), convert_to_numpy=True, show_progress_bar=True, batch_size=32)
    sbert_mean_vectors = sbert_mean_vectors.astype(np.float32)

    pd.DataFrame(sbert_mean_vectors).to_csv(sbert_means_csv, header=False, index=False)
    print(f"Saved SBERT mean vectors to {sbert_means_csv}")

    del model_sbert
    gc.collect()

print(f"Shape of SBERT_mean_vectors: {sbert_mean_vectors.shape}")

# Embeddings are looked up by row position in `data`; a cache written for a
# different article set or a different model would otherwise be used silently.
expected_embedding_shape = (len(data), EMBEDDING_DIM)
if sbert_mean_vectors.shape != expected_embedding_shape:
    raise ValueError(
        f"SBERT vectors have shape {sbert_mean_vectors.shape}, expected "
        f"{expected_embedding_shape}. Delete {sbert_means_csv} to recompute them."
    )

all_positive_edges = edgelist.values.tolist()
print(f"Loaded {len(all_positive_edges)} raw positive edges from edgelist.txt.")

# Positives take at most half of the target; negatives fill the remainder.
num_positive_samples_to_use = min(len(all_positive_edges), TARGET_TOTAL_SAMPLES // 2)
num_negative_samples_to_generate = TARGET_TOTAL_SAMPLES - num_positive_samples_to_use

random.seed(42)
positive_edges = random.sample(all_positive_edges, num_positive_samples_to_use)
print(f"Using {len(positive_edges)} positive edges for training.")

# Every known edge stored as an ordered (low, high) pair, so a candidate is
# rejected regardless of the direction in which it was drawn.
existing_edges_set = {
    (p_id, c_id) if p_id <= c_id else (c_id, p_id)
    for p_id, c_id in all_positive_edges
}

print(f"Generating {num_negative_samples_to_generate} negative samples...")
negative_edges = []
all_article_ids = data['article_id'].unique()
num_articles = len(all_article_ids)

if num_negative_samples_to_generate > 0 and num_articles < 2:
    raise ValueError("Need at least two distinct articles to draw negative pairs.")

# Seeded NumPy generator: `random.seed` above does not affect NumPy, so
# without this the negatives and the shuffle would differ on every run.
rng = np.random.default_rng(42)

while len(negative_edges) < num_negative_samples_to_generate:
    shortfall = num_negative_samples_to_generate - len(negative_edges)
    # Draw all missing pairs in one call, then reject self-pairs and known
    # edges; the loop repeats only to replace the rejected candidates.
    draws = rng.integers(0, num_articles, size=(shortfall, 2))
    draws = draws[draws[:, 0] != draws[:, 1]]
    candidates = all_article_ids[draws].tolist()
    negative_edges.extend(
        (article_id1, article_id2)
        for article_id1, article_id2 in candidates
        if ((article_id1, article_id2) if article_id1 < article_id2
            else (article_id2, article_id1)) not in existing_edges_set
    )

print(f"Generated {len(negative_edges)} negative samples.")

pairs = np.array(positive_edges + negative_edges, dtype=np.int32)
labels = np.array([1] * len(positive_edges) + [0] * len(negative_edges), dtype=np.float32)

# Positives and negatives are concatenated in blocks above; mix them.
indices = rng.permutation(len(pairs))
pairs_shuffled = pairs[indices]
labels_shuffled = labels[indices]

print(f"Total samples for training: {len(pairs_shuffled)} (Positive: {len(positive_edges)}, Negative: {len(negative_edges)})")

del positive_edges, negative_edges
gc.collect()

# Per-pair feature layout: four EMBEDDING_DIM-wide blocks (vec_i, vec_j,
# |vec_i - vec_j|, vec_i * vec_j) followed by author Jaccard similarity,
# shared-author count and abstract cosine similarity.
FEATURE_SIZE = 4 * EMBEDDING_DIM + 3

def _author_sets_by_article(data_df):
    """Map each article id in ``data_df`` to its set of normalised author names."""
    author_sets = {}
    for article_id, authors_str in zip(data_df['article_id'].tolist(), data_df['authors'].tolist()):
        if isinstance(authors_str, str) and authors_str:
            # Lower-case and drop spaces so spelling variants of a name compare equal.
            author_sets[article_id] = set(authors_str.lower().replace(" ", "").split(';'))
        else:
            # Empty or non-string cell (e.g. NaN): treat as "no authors".
            author_sets[article_id] = set()
    return author_sets


def compute_features_batch(article_id_pairs, embeddings, data_df, article_id_map, chunk_size=50_000):
    """Build the feature matrix for a sequence of article-id pairs.

    With ``D = embeddings.shape[1]``, each output row holds
    ``[vec_i, vec_j, |vec_i - vec_j|, vec_i * vec_j]`` (``4 * D`` values)
    followed by the Jaccard similarity of the two author sets, the number of
    shared authors and the cosine similarity of the two embeddings.

    Args:
        article_id_pairs: Sequence of ``(id1, id2)`` pairs, e.g. an ``(N, 2)``
            integer array.
        embeddings: 2-D array; row ``article_id_map[a]`` is the embedding of
            article ``a``.
        data_df: DataFrame with ``article_id`` and ``authors`` columns; authors
            are ``;``-separated within a cell.
        article_id_map: Mapping from article id to row index in ``embeddings``.
        chunk_size: Number of pairs processed per vectorised step; bounds the
            size of the temporary arrays.

    Returns:
        ``float32`` array of shape ``(N, 4 * D + 3)``. A row stays all zeros
        when either id is missing from ``article_id_map``.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    embeddings = np.asarray(embeddings)
    dim = embeddings.shape[1]
    num_pairs = len(article_id_pairs)
    features_array = np.zeros((num_pairs, 4 * dim + 3), dtype=np.float32)
    if num_pairs == 0:
        return features_array

    author_sets = _author_sets_by_article(data_df)
    no_authors = frozenset()

    for start in range(0, num_pairs, chunk_size):
        # Plain Python ints hash the same as the ids used as dictionary keys.
        id_pairs = np.asarray(article_id_pairs[start:start + chunk_size]).tolist()
        chunk_len = len(id_pairs)

        rows_i = np.full(chunk_len, -1, dtype=np.intp)  # -1 marks an unknown id
        rows_j = np.full(chunk_len, -1, dtype=np.intp)
        author_feats = np.zeros((chunk_len, 2), dtype=np.float32)

        # Set arithmetic on authors has no array form, so it stays a Python loop.
        for k, (id1, id2) in enumerate(id_pairs):
            idx1 = article_id_map.get(id1)
            idx2 = article_id_map.get(id2)
            if idx1 is None or idx2 is None:
                continue
            rows_i[k] = idx1
            rows_j[k] = idx2

            authors_i = author_sets.get(id1, no_authors)
            authors_j = author_sets.get(id2, no_authors)
            shared = len(authors_i & authors_j)
            union = len(authors_i) + len(authors_j) - shared
            if union > 0:
                author_feats[k, 0] = shared / union
            author_feats[k, 1] = shared

        known = rows_i >= 0
        if not known.any():
            continue

        vec_i = embeddings[rows_i[known]]
        vec_j = embeddings[rows_j[known]]

        # Cosine similarity of unit-normalised rows; defined as 0 when either
        # vector has zero length.
        norm_i = np.linalg.norm(vec_i, axis=1)
        norm_j = np.linalg.norm(vec_j, axis=1)
        both_nonzero = (norm_i > 0) & (norm_j > 0)
        unit_i = vec_i / np.where(norm_i > 0, norm_i, 1.0)[:, None]
        unit_j = vec_j / np.where(norm_j > 0, norm_j, 1.0)[:, None]
        cos_sim = np.sum(unit_i * unit_j, axis=1)
        cos_sim[~both_nonzero] = 0.0

        # `out` is a view, so these assignments fill `features_array` in place.
        out = features_array[start:start + chunk_len]
        out[known, 0:dim] = vec_i
        out[known, dim:2 * dim] = vec_j
        out[known, 2 * dim:3 * dim] = np.abs(vec_i - vec_j)
        out[known, 3 * dim:4 * dim] = vec_i * vec_j
        out[known, 4 * dim:4 * dim + 2] = author_feats[known]
        out[known, 4 * dim + 2] = cos_sim

    return features_array

print("Computing features for training data...")
X_features = compute_features_batch(pairs_shuffled, sbert_mean_vectors, data, article_id_to_idx)
y = labels_shuffled

# Split BEFORE scaling: the scaler must learn its mean/variance from the
# training part only, otherwise statistics of the hold-out rows leak into the
# features the model is trained on.
X_train, X_test, y_train, y_test = train_test_split(
    X_features, y,
    test_size=0.2,
    stratify=y,  # keep the positive/negative ratio equal in both parts
    random_state=42
)

del X_features, labels_shuffled, pairs, y
gc.collect()

# `scaler` is reused later to transform the test.txt features.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
gc.collect()

print(f"Shape of X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"Shape of X_test: {X_test.shape}, y_test: {y_test.shape}")


print("\nStarting LightGBM model training...")

# Settings handed unchanged to the LightGBM scikit-learn wrapper.
lgb_params = dict(
    objective='binary',
    metric='auc',
    boosting_type='gbdt',
    num_leaves=31,
    learning_rate=0.05,
    n_estimators=2000,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=1,
    lambda_l1=0.1,
    lambda_l2=0.1,
    num_threads=-1,
    verbose=-1,
    seed=42,
    n_jobs=-1,
)

model = lgb.LGBMClassifier(**lgb_params)

# Log the validation metric every 100 rounds and stop once it has not
# improved for 100 consecutive rounds.
training_callbacks = [
    lgb.log_evaluation(period=100),
    lgb.early_stopping(100, verbose=False),
]
model.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    eval_metric='auc',
    callbacks=training_callbacks,
)

# Column 1 of predict_proba is the probability of the positive class.
y_pred_proba_train = model.predict_proba(X_train)[:, 1]
y_pred_proba_test = model.predict_proba(X_test)[:, 1]
# Hard labels for the accuracy figure: positive when the probability exceeds 0.5.
y_pred_test_class = (y_pred_proba_test > 0.5).astype(int)

# Ranking quality on both parts, plus thresholded accuracy on the hold-out part.
train_auc = roc_auc_score(y_train, y_pred_proba_train)
test_auc = roc_auc_score(y_test, y_pred_proba_test)
test_accuracy = accuracy_score(y_test, y_pred_test_class)

print(f"\nLightGBM Train AUC: {train_auc:.4f}")
print(f"LightGBM Test AUC: {test_auc:.4f}")
print(f"LightGBM Test Accuracy: {test_accuracy*100:.2f}%")



print("\nMaking predictions on test.txt...")

test_pairs = test_df.values
# Pairs handled per feature/predict step. Kept large because every call to
# compute_features_batch parses the author column of `data` again, so tiny
# batches would repeat that work many times; it still bounds peak memory.
PREDICTION_BATCH_SIZE = 100_000

predictions = np.empty(len(test_pairs), dtype=np.float64)

for start in range(0, len(test_pairs), PREDICTION_BATCH_SIZE):
    batch_pairs = test_pairs[start:start + PREDICTION_BATCH_SIZE]

    # Reuse the training feature builder so that training and inference
    # features cannot drift apart.
    batch_feature_vectors = compute_features_batch(
        batch_pairs, sbert_mean_vectors, data, article_id_to_idx
    )

    scaled_batch_vectors = scaler.transform(batch_feature_vectors)
    # Column 1 of predict_proba is the probability of the positive class.
    predictions[start:start + len(batch_pairs)] = model.predict_proba(scaled_batch_vectors)[:, 1]

    del batch_feature_vectors, scaled_batch_vectors

gc.collect()

# ID is the 0-based row position of the pair in test.txt, written as text.
result_df = pd.DataFrame({
    "ID": [str(k) for k in range(len(test_pairs))],
    "Label": predictions,
})
result_df.to_csv(data_path / "submission.csv", index=False)

print(f"Submission file created at {data_path / 'submission.csv'}")
print(result_df.head())

2025-06-15 15:26:39.256752: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750001199.539223      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750001199.618137      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Loading data...
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 138499 entries, 0 to 138498
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   article_id  138499 non-null  int32 
 1   authors     138499 non-null  object
 2   abstract    138499 non-null  object
dtypes: int32(1), object(2)
memory usage: 162.7 MB
    col1    col2
0  34977   59394
1  22518   46602
2  36762   22813
3  44960  110384
4  29015   26366
Loading pre-computed SBERT mean vectors from /kaggle/working/nlp-cse-uoi-2025/data_new/sbert_mean_vec_all-MiniLM-L6-v2.csv
Shape of SBERT_mean_vectors: (138499, 384)
Loaded 1091955 raw positive edges from edgelist.txt.
Using 400000 positive edges for training.
Generating 400000 negative samples...
Generated 400000 negative samples.
Total samples for training: 800000 (Positive: 400000, Negative: 400000)
Computing features for training data...
Shape of X_train: (640000, 1539), y_train: (640000,)
Shape of X_test: 