In [None]:
#imports
import os, warnings, multiprocessing as mp
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import networkx as nx

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [None]:
# load
def load_two_col_txt(path, cols):
    """Read a ``<id>|--|<text>`` file into a two-column DataFrame.

    Parameters
    ----------
    path : str
        File to read. It is opened as UTF-8 and undecodable bytes are ignored.
    cols : list of str
        The two column names of the returned frame (id column first).

    Returns
    -------
    pandas.DataFrame
        One row per input line. Lines are split on the first ``|--|`` only;
        a line without the separator gets ``None`` as id and the whole line
        (minus its trailing newline) as text.
    """
    separator = '|--|'
    rows = []
    with open(path, encoding='utf-8', errors='ignore') as handle:
        for raw_line in handle:
            line = raw_line.rstrip('\n')
            head, found, tail = line.partition(separator)
            # `found` is empty when the separator is absent -> fallback row
            rows.append([head, tail] if found else [None, line])
    return pd.DataFrame(rows, columns=cols)


In [None]:

# Data directory. Set the NLP_KAGGLE_DATA_DIR environment variable to point the
# notebook at another machine's copy of the data; when it is unset the original
# local folder is used.
DATA_DIR = os.environ.get("NLP_KAGGLE_DATA_DIR",
                          "C:/Users/Dell/Desktop/NLP/kaggle/data_new/")

# Text tables, parsed by load_two_col_txt
abstracts = load_two_col_txt(os.path.join(DATA_DIR, 'abstracts.txt'),
                             ['paper_id', 'abstract'])
authors   = load_two_col_txt(os.path.join(DATA_DIR, 'authors.txt'),
                             ['paper_id', 'authors'])

# Pair tables: two header-less columns, read as strings so ids are not coerced to numbers
edges     = pd.read_csv(os.path.join(DATA_DIR, 'edgelist.txt'),
                        names=['src', 'dst'], dtype=str, header=None)
test_pairs= pd.read_csv(os.path.join(DATA_DIR, 'test.txt'),
                        names=['src', 'dst'], dtype=str, header=None)

# Strip surrounding whitespace from every id column so that ids compare equal across tables
for df in (abstracts, authors):
    df['paper_id'] = df['paper_id'].str.strip()
for pair_df in (edges, test_pairs):
    for id_col in ('src', 'dst'):
        pair_df[id_col] = pair_df[id_col].str.strip()


In [None]:
def simple_tok(t):
    """Lower-case ``t``, split it on whitespace and keep only alphabetic tokens.

    Tokens containing any non-letter character (digits, punctuation attached
    to a word, hyphens, ...) are dropped entirely rather than cleaned.
    """
    lowered_words = t.lower().split()
    return list(filter(str.isalpha, lowered_words))
WINDOW = 3  # sliding-window width in tokens (anchor token plus the WINDOW-1 tokens after it)

# Per-abstract graph-of-words statistics, keyed by paper id:
#   pr_sum[pid] - sum of the PageRank scores of the abstract's word graph
#   cc_avg[pid] - mean clustering coefficient of the same graph
# NOTE(review): networkx normalises PageRank scores so they sum to 1, which would make
# pr_sum ~1.0 for every abstract that has at least one token - confirm this is intended.
pr_sum, cc_avg = {}, {}
for pid, raw_text in zip(abstracts['paper_id'], abstracts['abstract']):
    toks = simple_tok(raw_text or '')

    # Nothing to build a graph from: record zeros and move on
    if not toks:
        pr_sum[pid] = cc_avg[pid] = 0.0
        continue

    # Co-occurrence graph: one node per distinct token, an edge between two different
    # tokens whenever they fall inside the same sliding window
    n_tok = len(toks)
    Gt = nx.Graph()
    Gt.add_nodes_from(toks)
    Gt.add_edges_from(
        (toks[i], toks[j])
        for i in range(n_tok)
        for j in range(i + 1, min(i + WINDOW, n_tok))
        if toks[i] != toks[j]
    )

    pagerank_scores = nx.pagerank(Gt, alpha=.85)
    clustering_coefs = nx.clustering(Gt)
    pr_sum[pid] = sum(pagerank_scores.values())
    cc_avg[pid] = np.mean(list(clustering_coefs.values()))


KeyboardInterrupt



In [None]:
# Build an undirected graph from the edge list.
# Only the two endpoint columns are passed on: a later cell adds a 'label' column to
# `edges`, and re-running this cell with `edges.values` would then hand networkx
# three-element rows, which it interprets as (u, v, data) triples instead of pairs.
Gu = nx.Graph()
Gu.add_edges_from(edges[['src', 'dst']].values)     # undirected graph for Node2Vec

# Train a Node2Vec model to obtain low-dimensional node embeddings
from node2vec import Node2Vec
n2v = Node2Vec(Gu, dimensions=32, walk_length=20, num_walks=20,
               workers=1, seed=42, quiet=True)
n2v_model = n2v.fit(window=10, min_count=1, batch_words=4)

# Collect the trained embeddings into a convenient dictionary (node id as str -> vector)
dim = n2v_model.wv.vector_size
zero_vec = np.zeros(dim, dtype=np.float32)   # placeholder used for nodes without an embedding
emb = {str(n): n2v_model.wv[str(n)] for n in Gu.nodes()}

def n2v_cos(u, v):
    """Cosine similarity between the Node2Vec embeddings of nodes ``u`` and ``v``.

    Returns 0.0 when either node has no entry in ``emb``. A small constant is
    added to the denominator so a zero-norm vector cannot cause a division by zero.
    """
    vec_u = emb.get(u)
    vec_v = emb.get(v)
    if vec_u is None or vec_v is None:
        return 0.0
    denom = np.linalg.norm(vec_u) * np.linalg.norm(vec_v) + 1e-8
    return float(np.dot(vec_u, vec_v) / denom)


In [None]:
# Label the observed edges as positive
edges['label'] = 1

# Sampling distribution over nodes, proportional to node degree
nodes = np.array(list(Gu.nodes()))
deg   = np.array([Gu.degree(n) for n in nodes], dtype=float)
deg_p = deg / deg.sum()

# 1 negative candidate for every 4 positives
NEG_RATIO = 0.25
n_neg = int(len(edges) * NEG_RATIO)

# Seeded generator so the sampled negatives are reproducible across runs
neg_rng = np.random.default_rng(42)
neg = pd.DataFrame({
        'src': neg_rng.choice(nodes, n_neg, p=deg_p),
        'dst': neg_rng.choice(nodes, n_neg, p=deg_p)})

# Drop candidates that are not genuine negatives:
#  * pairs that are real edges - checked on the undirected graph, so both orientations
#    are covered (comparing row tuples against `edges` cannot work here because `edges`
#    now carries the extra 'label' column and only stores one orientation);
#  * pairs where both endpoints are the same node.
is_real_edge = np.array([Gu.has_edge(u, v) for u, v in zip(neg['src'], neg['dst'])],
                        dtype=bool)
is_self_pair = (neg['src'] == neg['dst']).to_numpy()
neg = neg.loc[~(is_real_edge | is_self_pair)].assign(label=0)

# Merge positive and negative samples
samples = pd.concat([edges, neg], ignore_index=True)

In [None]:
# TF-IDF representation of every abstract (missing abstracts become empty strings).
# Row i of A belongs to the i-th row of `abstracts`; pid2idx maps paper id -> that row.
abstract_texts = abstracts['abstract'].fillna('')
tfv = TfidfVectorizer(max_features=10000, stop_words='english')
A = tfv.fit(abstract_texts).transform(abstract_texts)
pid2idx = dict(zip(abstracts['paper_id'], range(len(abstracts))))

def tfidf_cos(u, v):
    """Cosine-like similarity between papers ``u`` and ``v`` from their TF-IDF vectors.

    Computed as the dot product of the two rows of ``A`` (element-wise product,
    then sum). Returns 0.0 when either paper id is missing from ``pid2idx``.
    NOTE(review): this equals the cosine only if the rows of ``A`` are
    unit-normalised - presumably the vectorizer's default; confirm.
    """
    row_u = pid2idx.get(u, -1)
    row_v = pid2idx.get(v, -1)
    if row_u < 0 or row_v < 0:
        return 0.0
    return A[row_u].multiply(A[row_v]).sum()

# Author-based similarity: Jaccard on author lists
# Lookup table: paper id -> raw author string of that paper
auth = {paper: names for paper, names in zip(authors['paper_id'], authors['authors'])}
def jac(a, b):
    """Jaccard similarity between two comma-separated author strings.

    Each string is split on commas; names are stripped of surrounding
    whitespace and blank fragments are discarded, so an empty string means
    "no known authors" rather than one author named ''.

    Parameters
    ----------
    a, b : str
        Comma-separated author names. A non-string value is treated as
        having no authors.

    Returns
    -------
    float
        |A & B| / |A | B|, or 0.0 when neither side has any author.
    """
    def _names(raw):
        # ''.split(',') yields [''], so blank pieces must be filtered explicitly
        if not isinstance(raw, str):
            return set()
        return {piece.strip() for piece in raw.split(',') if piece.strip()}

    sa, sb = _names(a), _names(b)
    union = sa | sb
    return len(sa & sb) / len(union) if union else 0.

# Topology-based similarity: number of common neighbors
def com_nei(u, v):
    """Number of neighbours that ``u`` and ``v`` share in the undirected graph ``Gu``.

    Returns 0 when either node is not part of the graph. The membership test
    replaces a blanket ``except`` so that unrelated errors - and a manual
    interrupt of a long feature loop - are no longer silently turned into 0.
    """
    if u not in Gu or v not in Gu:
        return 0
    return sum(1 for _ in nx.common_neighbors(Gu, u, v))


In [None]:
# Training feature matrix: one (N, 1) column per pairwise feature, stacked side by side.
train_pairs = list(zip(samples.src, samples.dst))


def _as_column(values):
    """Collect an iterable of per-pair scalars into an (N, 1) array."""
    return np.array(list(values)).reshape(-1, 1)


X_cos  = _as_column(tfidf_cos(u, v) for u, v in train_pairs)                       # text similarity
X_jac  = _as_column(jac(auth.get(u, ''), auth.get(v, '')) for u, v in train_pairs) # author overlap
X_comm = _as_column(com_nei(u, v) for u, v in train_pairs)                         # shared neighbours
X_pr   = _as_column(pr_sum.get(u, 0) + pr_sum.get(v, 0) for u, v in train_pairs)   # summed PageRank stat
X_cc   = _as_column(cc_avg.get(u, 0) + cc_avg.get(v, 0) for u, v in train_pairs)   # summed clustering stat
X_n2v  = _as_column(n2v_cos(u, v) for u, v in train_pairs)                         # embedding similarity

X_all = np.hstack([X_cos, X_jac, X_comm, X_pr, X_cc, X_n2v])
y_all = samples['label'].values
print("Feature matrix:", X_all.shape)


In [None]:
# Hold out 20% of the labelled pairs for validation, keeping the class ratio in both parts
X_tr, X_va, y_tr, y_va = train_test_split(
    X_all, y_all,
    test_size=0.2,
    random_state=42,
    stratify=y_all,
)

# Scale each feature by its standard deviation (no centring); the statistics are
# learned from the training part only and then applied to both parts
scaler = StandardScaler(with_mean=False).fit(X_tr)
X_tr, X_va = scaler.transform(X_tr), scaler.transform(X_va)

In [None]:
# Class imbalance: weight the positive class by the negative-to-positive count ratio
num_pos = y_all.sum()
num_neg = len(y_all) - num_pos
pos_w = num_neg / num_pos

# Candidate hyper-parameters for the RandomForest
param_grid = dict(
    n_estimators=[200],
    max_depth=[10, None],
    min_samples_leaf=[2],
)

# Class-weighted RandomForest used as the template for every grid point
base_rf = RandomForestClassifier(
    n_jobs=-1,
    random_state=42,
    class_weight={0: 1.0, 1: pos_w},
)

# 3-fold stratified CV, shuffled with a fixed seed
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Grid search scored by "neg_log_loss": the score is the negated log-loss,
# so a higher score (closer to 0) means a better model
grid = GridSearchCV(
    base_rf,
    param_grid,
    scoring="neg_log_loss",
    cv=cv,
    verbose=1,
    n_jobs=-1,
)
grid.fit(X_tr, y_tr)

# Report the winning configuration and score it on the held-out validation split
print("Best RF params:", grid.best_params_)
rf_best = grid.best_estimator_

va_pos_proba = rf_best.predict_proba(X_va)[:, 1]
print("Hold-out log-loss:", log_loss(y_va, va_pos_proba))


In [None]:
# Feature columns for the test pairs, in the same order as the training matrix
test_uv = list(zip(test_pairs.src, test_pairs.dst))

xt_cos = np.array([tfidf_cos(u, v) for u, v in test_uv])[:, None]
xt_jac = np.array([jac(auth.get(u, ''), auth.get(v, '')) for u, v in test_uv])[:, None]
xt_com = np.array([com_nei(u, v) for u, v in test_uv])[:, None]
xt_pr  = np.array([pr_sum.get(u, 0) + pr_sum.get(v, 0) for u, v in test_uv])[:, None]
xt_cc  = np.array([cc_avg.get(u, 0) + cc_avg.get(v, 0) for u, v in test_uv])[:, None]
xt_n2v = np.array([n2v_cos(u, v) for u, v in test_uv])[:, None]

# Stack the six columns and apply the scaler that was fitted on the training features
xt_stacked = np.hstack([xt_cos, xt_jac, xt_com, xt_pr, xt_cc, xt_n2v])
X_test = scaler.transform(xt_stacked)

# Probability of the positive ("link exists") class from the tuned RandomForest
probs = rf_best.predict_proba(X_test)[:, 1]

# Write the submission: row index of test_pairs as ID, predicted probability as Label
submission = pd.DataFrame({"ID": test_pairs.index, "Label": probs})
submission.to_csv("submission_node2vec_rf.csv", index=False)

print("Saved submission_node2vec_rf.csv")