In [None]:
# 0. imports
import os
import pandas as pd
import numpy as np
import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import log_loss
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

# Root folder holding abstracts.txt, authors.txt, edgelist.txt and test.txt.
# Set the CITATION_DATA_DIR environment variable to run on another machine;
# the original hard-coded location stays the default.
DATA_DIR = os.environ.get(
    "CITATION_DATA_DIR",
    "C:/Users/Dell/Desktop/nlp/kaggle/data_new/",
)

# 1. load two‐column text files 

def load_two_col_txt(path, col_names):
    """Read a ``|--|``-delimited text file into a two-column DataFrame.

    Every line is split on the first ``|--|`` only, so the second field may
    itself contain the delimiter.  A line without the delimiter is kept with
    ``None`` in the first column and the whole line in the second.  The file
    is decoded as UTF-8 and undecodable bytes are ignored.

    Args:
        path: Location of the text file.
        col_names: The two column names for the resulting DataFrame.

    Returns:
        A DataFrame with one row per input line.
    """
    def _split(raw_line):
        head, delimiter, tail = raw_line.rstrip("\n").partition("|--|")
        # An empty `delimiter` means the separator was not found at all.
        return [head, tail] if delimiter else [None, head]

    with open(path, encoding="utf-8", errors="ignore") as handle:
        rows = [_split(raw_line) for raw_line in handle]
    return pd.DataFrame(rows, columns=col_names)


# 2.  Load abstracts, authors, citation edges, and test pairs

# Both text tables use the "<paper_id>|--|<text>" layout.
abstracts = load_two_col_txt(
    os.path.join(DATA_DIR, "abstracts.txt"), ["paper_id", "abstract"]
)
authors = load_two_col_txt(
    os.path.join(DATA_DIR, "authors.txt"), ["paper_id", "authors"]
)

# The edge list and the test pairs are header-less "src,dst" CSV files; IDs
# are read as strings so they compare equal to the IDs of the text tables.
pair_csv_opts = dict(sep=",", names=["src", "dst"], dtype=str, header=None)
edgelist = pd.read_csv(os.path.join(DATA_DIR, "edgelist.txt"), **pair_csv_opts)
test_pairs = pd.read_csv(os.path.join(DATA_DIR, "test.txt"), **pair_csv_opts)

# Strip surrounding whitespace from every ID column.
for df in (abstracts, authors):
    df["paper_id"] = df["paper_id"].str.strip()
for pairs in (edgelist, test_pairs):
    for col in ("src", "dst"):
        pairs[col] = pairs[col].str.strip()

# Source IDs of the edge list that also have an abstract.
inter = set(edgelist["src"]) & set(abstracts["paper_id"])

# 3.  Build directed citation graph

G = nx.DiGraph()
G.add_edges_from(zip(edgelist["src"], edgelist["dst"]))

# 4. negative sampling and TF–IDF feature batching

# tf-idf helper

def tfidf_in_batches(texts, batch_size=100_000):
    """Transform ``texts`` with the module-level ``vectorizer`` slice by slice.

    Args:
        texts: Sliceable collection of documents offering ``fillna`` on its
            slices (a pandas Series in this file); missing entries are
            replaced by the empty string before transforming.
        batch_size: Maximum number of documents per ``transform`` call.

    Returns:
        A list with one transformed matrix per slice, in input order.  The
        list is empty when ``texts`` is empty.
    """
    return [
        vectorizer.transform(texts[start:start + batch_size].fillna(''))
        for start in range(0, len(texts), batch_size)
    ]

# negative sampling 

def negative_sampling(G, n_neg, max_rounds=20):
    """Sample distinct ordered node pairs that are not edges of ``G``.

    Candidate pairs are drawn uniformly with replacement.  Self-pairs,
    existing edges and already-chosen pairs are rejected.  Because rejections
    can leave a single draw short of ``n_neg``, sampling is repeated in
    rounds of three times the number of pairs still missing.

    Args:
        G: Graph object exposing ``nodes()`` and ``has_edge(u, v)``.
        n_neg: Number of negative pairs wanted.
        max_rounds: Upper bound on sampling rounds, so that a graph with
            fewer than ``n_neg`` available non-edges cannot loop forever.

    Returns:
        DataFrame with columns ``src`` and ``dst`` holding at most ``n_neg``
        rows; fewer only if ``max_rounds`` was exhausted or the graph has
        fewer than two nodes.
    """
    # Sample positions in a plain list rather than the nodes themselves, so
    # node objects keep their original Python type.
    nodes = list(G.nodes())
    n_nodes = len(nodes)

    neg = []
    seen = set()

    # A negative pair needs two different endpoints.
    if n_neg > 0 and n_nodes >= 2:
        for _ in range(max_rounds):
            missing = n_neg - len(neg)
            if missing <= 0:
                break

            k = missing * 3
            us = np.random.choice(n_nodes, size=k, replace=True)
            vs = np.random.choice(n_nodes, size=k, replace=True)

            for i, j in zip(us.tolist(), vs.tolist()):
                if i == j:
                    continue
                u, v = nodes[i], nodes[j]
                if (u, v) in seen or G.has_edge(u, v):
                    continue
                seen.add((u, v))
                neg.append((u, v))
                if len(neg) >= n_neg:
                    break

    return pd.DataFrame(neg, columns=['src', 'dst'])

def build_samples(pos_edges, neg_ratio=1):
    """Combine positive edges with sampled negatives into one labelled table.

    Args:
        pos_edges: DataFrame of existing edges; it is not modified.
        neg_ratio: Negatives requested per positive edge.

    Returns:
        DataFrame with the positives (``label`` = 1) followed by the
        negatives drawn from the module-level graph ``G`` (``label`` = 0),
        re-indexed from zero.
    """
    positives = pos_edges.assign(label=1)
    wanted = int(len(positives) * neg_ratio)
    negatives = negative_sampling(G, wanted).assign(label=0)
    return pd.concat([positives, negatives], ignore_index=True)

# jaccard helper

def jaccard_auth(a: str, b: str):
    """Return the Jaccard similarity of two comma-separated author lists.

    Names are compared after stripping surrounding whitespace, and empty
    names are ignored, so an empty string denotes "no authors" rather than a
    single author called ``''``.  Non-string input (``None``, NaN) is treated
    as having no authors.

    Args:
        a: Comma-separated author names of the first paper.
        b: Comma-separated author names of the second paper.

    Returns:
        ``|A ∩ B| / |A ∪ B|`` as a float, or ``0.0`` when neither side has
        any author.
    """
    def _names(raw):
        if not isinstance(raw, str):
            return set()
        return {name.strip() for name in raw.split(',') if name.strip()}

    sa, sb = _names(a), _names(b)
    union = sa | sb
    return len(sa & sb) / len(union) if union else 0.0

# 5.  Fit TF–IDF vectorizer on all abstracts once

# The vocabulary is capped at 5,000 terms and English stop words are removed.
# `fit` returns the estimator itself, so construction and fitting are chained.
vectorizer = TfidfVectorizer(max_features=5_000, stop_words='english').fit(
    abstracts['abstract']
)

# Pairwise feature extraction

def extract_features(df: pd.DataFrame):
    """Compute pairwise features for every ``(src, dst)`` row of ``df``.

    The result has exactly one row per input row, in the same order, so it
    can be paired positionally with labels or test IDs.  Texts are looked up
    by paper id in the module-level ``abstracts`` and ``authors`` tables.  A
    pair whose paper has no record is kept and treated as having empty text,
    instead of being dropped as an inner join would do.

    Args:
        df: DataFrame with string columns ``src`` and ``dst``.

    Returns:
        DataFrame (default integer index) with columns

        * ``cosine`` – row-wise dot product of the two abstracts' TF-IDF
          vectors; this equals cosine similarity provided the vectorizer
          L2-normalises its rows (scikit-learn's default ``norm='l2'``).
        * ``author_jac`` – ``jaccard_auth`` of the two author strings, or
          ``0.0`` when either side has no author text.
    """
    def _text_by_id(table, column):
        # One text per paper id: rows without an id and repeated ids are
        # discarded so the lookup is unambiguous and cannot duplicate pairs.
        return (
            table.dropna(subset=['paper_id'])
                 .drop_duplicates(subset='paper_id')
                 .set_index('paper_id')[column]
        )

    def _lookup(ids, text_by_id):
        # Unknown ids map to NaN; normalise them to the empty string.
        return ids.map(text_by_id).fillna('').reset_index(drop=True)

    abstract_by_id = _text_by_id(abstracts, 'abstract')
    authors_by_id = _text_by_id(authors, 'authors')

    abstract_src = _lookup(df['src'], abstract_by_id)
    abstract_dst = _lookup(df['dst'], abstract_by_id)
    authors_src = _lookup(df['src'], authors_by_id)
    authors_dst = _lookup(df['dst'], authors_by_id)

    # tf-idf cosine similarity
    tf_src_chunks = tfidf_in_batches(abstract_src)
    tf_dst_chunks = tfidf_in_batches(abstract_dst)
    if tf_src_chunks:
        cosine_sim = np.concatenate([
            # np.asarray(...).ravel() accepts both np.matrix and ndarray sums.
            np.asarray(a.multiply(b).sum(axis=1)).ravel()
            for a, b in zip(tf_src_chunks, tf_dst_chunks)
        ])
    else:
        # Empty input: np.concatenate would raise on an empty list.
        cosine_sim = np.zeros(0)

    # author jaccard
    author_jac = [
        jaccard_auth(a, b) if a and b else 0.0
        for a, b in zip(authors_src, authors_dst)
    ]

    return pd.DataFrame({
        'cosine': cosine_sim,
        'author_jac': author_jac,
    })


# 6.  Prepare training & validation data

samples = build_samples(edgelist, neg_ratio=1)

# Diagnostics: how many distinct sample IDs have an abstract.  Both numbers
# in each "X of Y" pair count unique IDs, so they are directly comparable.
known_ids = set(abstracts["paper_id"])
inter_src = set(samples["src"]) & known_ids
inter_dst = set(samples["dst"]) & known_ids
print("Matching src IDs:", len(inter_src), "of", samples["src"].nunique(),
      " | Matching dst IDs:", len(inter_dst), "of", samples["dst"].nunique())

print("Samples[0:5] src,dst =", samples[["src","dst"]].iloc[:5].values)
print("Abstract IDs sample:", list(abstracts["paper_id"].iloc[:5]))
X = extract_features(samples)

# Labels are paired with feature rows by position, so the two tables must
# have the same number of rows.
if len(X) != len(samples):
    raise ValueError(
        f"extract_features returned {len(X)} rows for {len(samples)} samples; "
        "features and labels would be misaligned"
    )

y = samples['label']
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 7.  PyTorch Dataset and DataLoader for batching

class PairDataset(Dataset):
    """Tensor-backed dataset of feature rows with optional labels.

    Features are stored as a float32 tensor.  When labels are supplied they
    are stored as float32 with an extra axis inserted at position 1, and
    indexing yields ``(features, label)``; without labels indexing yields the
    features alone.
    """

    def __init__(self, X_np, y_np=None):
        self.X = torch.tensor(X_np, dtype=torch.float32)
        if y_np is None:
            self.y = None
        else:
            labels = torch.tensor(y_np, dtype=torch.float32)
            self.y = labels[:, None]

    def __len__(self):
        return self.X.size(0)

    def __getitem__(self, idx):
        features = self.X[idx]
        return features if self.y is None else (features, self.y[idx])

# 8.  Define MLP model architecture

class MLP(nn.Module):
    """Feed-forward network that outputs one raw logit per input row.

    Layer stack: Linear(in_dim, 128) -> ReLU -> BatchNorm1d(128) ->
    Linear(128, 64) -> ReLU -> Linear(64, 1).  No sigmoid is applied here.
    """

    def __init__(self, in_dim):
        super().__init__()
        hidden_1, hidden_2 = 128, 64
        layers = [
            nn.Linear(in_dim, hidden_1),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_1),
            nn.Linear(hidden_1, hidden_2),
            nn.ReLU(),
            nn.Linear(hidden_2, 1),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        logits = self.net(x)
        return logits

# 9.  Training & evaluation functions

def train_epoch(model, loader, loss_fn, optim, device):
    """Run one optimisation pass over ``loader``.

    The model is switched to training mode, and every batch is moved to
    ``device`` before the forward pass, backward pass and optimiser step.

    Args:
        model: Network to train.
        loader: Iterable of ``(features, targets)`` batches with a
            ``dataset`` attribute supporting ``len``.
        loss_fn: Callable mapping ``(logits, targets)`` to a scalar loss.
        optim: Optimiser holding the model's parameters.
        device: Device the batches are moved to.

    Returns:
        Sum of per-batch losses weighted by batch size, divided by
        ``len(loader.dataset)``.
    """
    model.train()
    weighted_loss_sum = 0.0
    for features, targets in loader:
        features = features.to(device)
        targets = targets.to(device)

        optim.zero_grad()
        batch_loss = loss_fn(model(features), targets)
        batch_loss.backward()
        optim.step()

        weighted_loss_sum += batch_loss.item() * features.size(0)

    n_samples = len(loader.dataset)
    return weighted_loss_sum / n_samples

def eval_epoch(model, loader, loss_fn, device):
    """Evaluate ``model`` on ``loader`` without tracking gradients.

    Args:
        model: Network to evaluate; it is switched to eval mode.
        loader: Iterable of ``(features, targets)`` batches with a
            ``dataset`` attribute supporting ``len``.
        loss_fn: Callable mapping ``(logits, targets)`` to a scalar loss.
        device: Device the batches are moved to.

    Returns:
        ``(avg_loss, sk_loss)``: the batch-size-weighted mean of ``loss_fn``
        over the dataset, and ``log_loss`` of the sigmoid probabilities
        against the collected targets.
    """
    model.eval()
    loss_total = 0.0
    logit_batches = []
    label_batches = []
    with torch.no_grad():
        for features, targets in loader:
            features, targets = features.to(device), targets.to(device)
            batch_logits = model(features)
            loss_total += loss_fn(batch_logits, targets).item() * features.size(0)
            logit_batches.append(batch_logits.cpu())
            label_batches.append(targets.cpu())

    mean_loss = loss_total / len(loader.dataset)
    y_prob = torch.cat(logit_batches).sigmoid().numpy()
    y_true = torch.cat(label_batches).numpy()
    return mean_loss, log_loss(y_true, y_prob)

# 10.  5‐fold Stratified Cross‐Validation with early stopping

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = []

# The device does not depend on the fold, so it is chosen once.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

for fold, (tr_idx, val_idx) in enumerate(skf.split(X_scaled, y)):
    X_tr, X_val = X_scaled[tr_idx], X_scaled[val_idx]
    y_tr, y_val = y.iloc[tr_idx].values, y.iloc[val_idx].values

    train_ds = PairDataset(X_tr, y_tr)
    val_ds   = PairDataset(X_val, y_val)
    # BatchNorm1d cannot normalise a training batch holding a single sample,
    # so the trailing batch is dropped only when it would have size one.
    train_loader = DataLoader(train_ds, batch_size=1024, shuffle=True,
                              drop_last=(len(train_ds) % 1024 == 1))
    val_loader   = DataLoader(val_ds,   batch_size=2048, shuffle=False)

    # A fresh model, loss and optimiser for every fold.
    model = MLP(in_dim=X.shape[1]).to(device)
    loss_fn = nn.BCEWithLogitsLoss()
    optim   = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5)

    # Early stopping: halt after `patience` consecutive epochs in which the
    # validation log-loss failed to improve by more than 1e-4.
    best_sk = np.inf
    patience, patience_cnt = 5, 0
    for epoch in range(30):
        train_epoch(model, train_loader, loss_fn, optim, device)
        _, val_sk = eval_epoch(model, val_loader, loss_fn, device)
        if val_sk < best_sk - 1e-4:
            best_sk = val_sk
            patience_cnt = 0
        else:
            patience_cnt += 1
        if patience_cnt >= patience:
            break

    cv_scores.append(best_sk)
    print(f"Fold {fold}: best log-loss = {best_sk:.5f}")

print(f"Mean CV log-loss: {np.mean(cv_scores):.5f}")

# 11.  Train final model on full dataset

device = 'cuda' if torch.cuda.is_available() else 'cpu'
full_ds = PairDataset(X_scaled, y.values)
# BatchNorm1d cannot normalise a training batch holding a single sample, so
# the trailing batch is dropped only when it would have size one.
full_dl = DataLoader(full_ds, batch_size=1024, shuffle=True,
                     drop_last=(len(full_ds) % 1024 == 1))

final_model = MLP(X_scaled.shape[1]).to(device)
crit = nn.BCEWithLogitsLoss()
opt  = torch.optim.Adam(final_model.parameters(), lr=1e-3, weight_decay=1e-5)

# Fixed 30-epoch schedule on all samples; there is no held-out set here, so
# no early stopping is applied.
for epoch in range(30):
    loss = train_epoch(final_model, full_dl, crit, opt, device)
print(f"Full training finished – last epoch loss {loss:.5f}")

# 12.  Prepare test features, predict, and save submission

# Test pairs go through the same feature extraction and the scaler that was
# fitted on the training features.
X_test_df = extract_features(test_pairs)
X_test    = scaler.transform(X_test_df)
test_ds = PairDataset(X_test)
test_dl = DataLoader(test_ds, batch_size=2048, shuffle=False)

# Inference only: eval mode and no gradient tracking.
final_model.eval()
with torch.no_grad():
    logits = [final_model(xb.to(device)).cpu() for xb in test_dl]

# Logits -> probabilities, flattened from (n, 1) to (n,).
probs = torch.cat(logits).sigmoid().squeeze(1).numpy()

# One row per test pair, identified by its row index in `test_pairs`.
submission = pd.DataFrame({'ID': test_pairs.index, 'probability': probs})
submission.to_csv('submission.csv', index=False)
print(submission.head())
print('Saved submission.csv ')

Loaded: 138499 abstracts; 138499 authors; 1091955 edges; 106692 test pairs
end of load
Common IDs: 100336 from 1091955 edges
end of load
end of build citation graph
1
2
3
4
7
7.1
Matching src IDs: 138480 of 2183910  | Matching dst IDs: 138499 of 2183910
Samples[0:5] src,dst = [['0' '1']
 ['0' '2']
 ['1' '3']
 ['1' '5']
 ['1' '6']]
Abstract IDs sample: ['0', '1', '2', '3', '4']
5
6
6.1
6.1
7.2
8
9
10
 entered mlp 
11
12
13
14
15
16
END of train epoch
END of eval epoch
END of train epoch
END of eval epoch
END of train epoch
END of eval epoch
END of train epoch
END of eval epoch
END of train epoch
END of eval epoch
END of train epoch
END of eval epoch
END of train epoch
END of eval epoch
END of train epoch
END of eval epoch
finished inside k forld for epoch
Fold 0: best log-loss = 0.34963
15
16
END of train epoch
END of eval epoch
END of train epoch
END of eval epoch
END of train epoch
END of eval epoch
END of train epoch
END of eval epoch
END of train epoch
END of eval epoch
END of train