Connect to Google Drive and define the data paths

In [4]:
# `os` is imported here because this cell sits above the notebook's main import
# cell; on a fresh kernel (Restart & Run All) `os.path.join` below would
# otherwise raise NameError.
import os

from google.colab import drive

# Mount Google Drive into the Colab filesystem so the data folder is reachable.
drive.mount('/content/drive')

# Folder holding the input files, and the destination of the predictions CSV.
BASE_DIR = '/content/drive/MyDrive/data_nlp'
PRED_PATH = os.path.join(BASE_DIR, 'predictions.csv')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Install the required packages

In [5]:
# Install the listed third-party packages into the runtime (quiet output).
!pip install --quiet spacy networkx scikit-learn joblib
# Download spaCy's small English pipeline (en_core_web_sm).
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m100.9 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


Importing the libraries

In [3]:
import os
import numpy as np
import pandas as pd
import networkx as nx
import spacy
import re
import joblib

from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn.preprocessing import Normalizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.random_projection import SparseRandomProjection, GaussianRandomProjection
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier
from sentence_transformers import SentenceTransformer

Load the dataset

In [6]:
# Abstracts and author lists: one "<id>|--|<payload>" record per line.
# The separator is given as a regex, which requires the python parsing engine.
abstracts = pd.read_csv(
    f'{BASE_DIR}/abstracts.txt',
    sep=r'\|\-\-\|',
    engine='python',
    names=['id', 'text'],
    header=None,
)
authors = pd.read_csv(
    f'{BASE_DIR}/authors.txt',
    sep=r'\|\-\-\|',
    engine='python',
    names=['id', 'auth'],
    header=None,
)
# Edge list: one comma-separated "src,tgt" pair per line, no header row.
edgelist = pd.read_csv(
    f'{BASE_DIR}/edgelist.txt',
    names=['src', 'tgt'],
    header=None,
)

# Inner join on the paper id, re-indexed from 0 so row position == index.
meta = abstracts.merge(authors, on='id').reset_index(drop=True)
all_ids = meta['id'].values

# Paper id -> row position in `meta` (and in every array aligned with it).
id2idx = {pid: i for i, pid in enumerate(all_ids)}

Graph creation for identifying edges between papers

In [7]:
# Undirected graph whose edges are the rows of the edge list.
G = nx.Graph()
G.add_edges_from(edgelist.values.tolist())

# Register every known paper as a node, including papers that never appear in
# the edge list. Without this, the degree lookup below and any later
# `neighbors[paper_id]` access fail for such papers instead of yielding
# degree 0 / an empty neighbour set. Nodes already present are left unchanged.
G.add_nodes_from(all_ids.tolist())

# Neighbour set of every node, materialised once for cheap set intersections.
neighbors = {node: set(G.neighbors(node)) for node in G.nodes()}

# Degree of each paper, in the same order as `all_ids`.
degrees = np.array([G.degree(pid) for pid in all_ids], dtype=float)

Text preprocessing

In [8]:
# Missing abstracts become empty strings so every paper has a text entry.
raw_texts = meta['text'].fillna('').tolist()

# Lower-case each abstract and keep only runs of three or more ASCII letters
# delimited by word boundaries, re-joined with single spaces.
proc_texts = []
for abstract in raw_texts:
    words = re.findall(r"\b[a-zA-Z]{3,}\b", abstract.lower())
    proc_texts.append(" ".join(words))

Embeddings with SentenceTransformer()

In [9]:
# Pretrained sentence-embedding model, loaded by name.
model_embed = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Encode every preprocessed abstract in batches of 64; the result is requested
# as a NumPy array and is indexed by row position in `proc_texts`.
X_doc_vecs = model_embed.encode(
    proc_texts,
    batch_size=64,
    convert_to_numpy=True,
    show_progress_bar=True,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/2165 [00:00<?, ?it/s]

Author sets

In [10]:
# One set of author names per paper, in the row order of `meta`.
# Names are whitespace-stripped and empty entries are dropped, so a missing
# author field (or a stray/trailing comma) yields an empty set rather than {''}.
# Otherwise two papers without listed authors would share the "author" '' and
# receive a Jaccard similarity of 1.0.
auth_sets = (
    meta['auth']
    .fillna('')
    .str.split(',')
    .map(lambda names: {a.strip() for a in names if a.strip()})
    .tolist()
)

Function for JaccardSimilarity

In [11]:
def jaccard(a, b):
    """Return the Jaccard similarity |a & b| / |a | b| of two sets.

    Args:
        a: First set.
        b: Second set.

    Returns:
        A float in [0, 1]; 0.0 when both sets are empty.
    """
    union = a | b
    if not union:
        return 0.0
    shared = a & b
    return len(shared) / len(union)

Negative/Positive pair creation

In [12]:
# Positive examples: every pair listed in the edge list.
pos = edgelist.values.tolist()

# Negative examples: randomly drawn pairs of distinct papers that are not
# connected in G, sampled (with a fixed seed) until there are as many
# negatives as positives.
rng = np.random.default_rng(42)
neg = []
while len(neg) < len(pos):
    u, v = rng.choice(all_ids, 2, replace=False)
    if G.has_edge(u, v):
        continue
    neg.append([u, v])

# Positives first, then negatives; labels follow the same order (1 then 0).
pairs = np.array(pos + neg)
labels = np.repeat([1, 0], [len(pos), len(neg)])

Feature Extraction

In [18]:
n = len(pairs)

# Word count of every preprocessed abstract, computed once per document so the
# pair loop does not re-split both abstracts for every single pair.
doc_token_counts = [len(text.split()) for text in proc_texts]

# One list per feature; entry k of each list describes pair k.
similarities = []
distances = []
author_similarities = []
degree_differences = []
common_neighbors = []
adamic_adar_scores = []
token_differences = []

for u, v in pairs:
    # Row positions of the two papers in the metadata-aligned arrays.
    idx_u = id2idx[u]
    idx_v = id2idx[v]

    vec_u = X_doc_vecs[idx_u]
    vec_v = X_doc_vecs[idx_v]

    # Text features: dot product and Euclidean distance of the two embeddings.
    sim = np.dot(vec_u, vec_v)
    dist = np.linalg.norm(vec_u - vec_v)

    # Author overlap and absolute difference in node degree.
    auth_sim = jaccard(auth_sets[idx_u], auth_sets[idx_v])
    deg_diff = abs(degrees[idx_u] - degrees[idx_v])

    # Graph features based on the neighbours shared by both papers.
    neighbors_u = neighbors[u]
    neighbors_v = neighbors[v]
    common = neighbors_u & neighbors_v
    cn = len(common)

    # Adamic-Adar style score: sum of 1 / log(degree) over shared neighbours.
    # Degree <= 1 is skipped because log(1) == 0 would divide by zero.
    aa_score = sum(1 / np.log(degrees[id2idx[w]]) for w in common if degrees[id2idx[w]] > 1)

    # Absolute difference in abstract length, measured in kept words.
    tok_diff = abs(doc_token_counts[idx_u] - doc_token_counts[idx_v])

    similarities.append(sim)
    distances.append(dist)
    author_similarities.append(auth_sim)
    degree_differences.append(deg_diff)
    common_neighbors.append(cn)
    adamic_adar_scores.append(aa_score)
    token_differences.append(tok_diff)

# Feature matrix: one row per pair, columns in the order listed here.
X = np.array([
    similarities,
    distances,
    author_similarities,
    degree_differences,
    common_neighbors,
    adamic_adar_scores,
    token_differences
]).T

Train-Test split

In [19]:
# Hold out 10% of the labelled pairs for validation; the split is stratified
# on the labels and uses a fixed random seed.
X_tr, X_val, y_tr, y_val = train_test_split(
    X,
    labels,
    test_size=0.1,
    stratify=labels,
    random_state=42,
)

XGBoost model training

In [20]:
# Hyper-parameters of the gradient-boosted classifier, gathered in one place.
xgb_params = dict(
    objective='binary:logistic',
    n_estimators=300,
    learning_rate=0.1,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=1,
    random_state=42,
    n_jobs=-1,
)

model = XGBClassifier(**xgb_params)
# Fit on the training split; left as the last expression so the notebook
# displays the fitted estimator.
model.fit(X_tr, y_tr)

validation

In [21]:
# Predicted probability of the positive class (column 1) for each held-out pair.
val_preds = model.predict_proba(X_val)[:, 1]

# Score the hold-out predictions; without this step the validation split is
# computed but never used to judge the model.
val_auc = roc_auc_score(y_val, val_preds)
print(f'Validation ROC-AUC: {val_auc:.4f}')

Load the test-set pairs

In [22]:
# Pairs to score: test.txt is read as two comma-separated columns (src, tgt)
# and converted to a plain array with one row per pair.
test_path = os.path.join(BASE_DIR, 'test.txt')
pairs_test = pd.read_csv(test_path, names=['src', 'tgt']).values

extract test set features

In [23]:
n_test = len(pairs_test)

# Word count of every preprocessed abstract, computed once per document so the
# pair loop does not re-split both abstracts for every single pair.
doc_token_counts = [len(text.split()) for text in proc_texts]

# One list per feature; entry k of each list describes test pair k.
similarities_test = []
distances_test = []
author_similarities_test = []
degree_differences_test = []
common_neighbors_test = []
adamic_adar_scores_test = []
token_differences_test = []

for u, v in pairs_test:
    # Row positions of the two papers in the metadata-aligned arrays.
    idx_u = id2idx[u]
    idx_v = id2idx[v]

    vec_u = X_doc_vecs[idx_u]
    vec_v = X_doc_vecs[idx_v]

    # Text features: dot product and Euclidean distance of the two embeddings.
    sim = np.dot(vec_u, vec_v)
    dist = np.linalg.norm(vec_u - vec_v)

    # Author overlap and absolute difference in node degree.
    auth_sim = jaccard(auth_sets[idx_u], auth_sets[idx_v])
    deg_diff = abs(degrees[idx_u] - degrees[idx_v])

    # Graph features based on the neighbours shared by both papers.
    common = neighbors[u] & neighbors[v]
    cn = len(common)

    # Adamic-Adar style score: sum of 1 / log(degree) over shared neighbours.
    # Degree <= 1 is skipped because log(1) == 0 would divide by zero.
    aa_score = sum(1 / np.log(degrees[id2idx[w]]) for w in common if degrees[id2idx[w]] > 1)

    # Absolute difference in abstract length, measured in kept words.
    tok_diff = abs(doc_token_counts[idx_u] - doc_token_counts[idx_v])

    similarities_test.append(sim)
    distances_test.append(dist)
    author_similarities_test.append(auth_sim)
    degree_differences_test.append(deg_diff)
    common_neighbors_test.append(cn)
    adamic_adar_scores_test.append(aa_score)
    token_differences_test.append(tok_diff)

# Feature matrix: one row per test pair, columns in the order listed here.
X_test = np.array([
    similarities_test,
    distances_test,
    author_similarities_test,
    degree_differences_test,
    common_neighbors_test,
    adamic_adar_scores_test,
    token_differences_test
]).T


Model prediction and save .csv file

In [24]:
# Predicted probability of the positive class (column 1) for every test pair.
preds = model.predict_proba(X_test)[:, 1]

# Output table: a running 0-based ID per test pair plus its predicted score,
# written without the index column and with six decimal places.
submission = pd.DataFrame({
    'ID': range(len(preds)),
    'label': preds,
})
submission.to_csv(PRED_PATH, index=False, float_format='%.6f')