In [None]:
# 0.imports
import os
import pandas as pd
import numpy as np
import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV

# 1. Helper to read "id|--|text" files into a DataFrame
def load_two_col_txt(path, col_names):
    """Read an ``id|--|text`` file into a two-column DataFrame.

    Every line is split on the first ``|--|`` only, so the text part may
    itself contain the delimiter. A line without the delimiter is kept with
    ``None`` as its id and the whole line as its text. Bytes that cannot be
    decoded as UTF-8 are dropped.

    Args:
        path: Path of the file to read.
        col_names: The two column labels for the resulting frame.

    Returns:
        A ``pandas.DataFrame`` with one row per line of the file.
    """
    def _parse(raw):
        # Drop the trailing newline(s), then split once on the delimiter.
        stripped = raw.rstrip("\n")
        head, sep, tail = stripped.partition("|--|")
        return [head, tail] if sep else [None, stripped]

    with open(path, encoding="utf-8", errors="ignore") as handle:
        rows = [_parse(raw) for raw in handle]
    return pd.DataFrame(rows, columns=col_names)

# 2.  Load input files

# Data directory. Defaults to the original location but can be redirected by
# setting the DATA_DIR environment variable, so the script also runs on
# machines where this absolute path does not exist.
DATA_DIR = os.environ.get(
    "DATA_DIR",
    "C:/Users/gkara/Desktop/5th year/summer/epeksergasia glwssas/data_new/",
)

# "id|--|text" files: one row per paper
abstracts = load_two_col_txt(os.path.join(DATA_DIR, "abstracts.txt"),
                             ["paper_id", "abstract"])
authors = load_two_col_txt(os.path.join(DATA_DIR, "authors.txt"),
                           ["paper_id", "authors"])

# Comma-separated "src,dst" pairs; ids are read as strings so they compare
# equal to the paper ids loaded above
edgelist = pd.read_csv(os.path.join(DATA_DIR, "edgelist.txt"),
                       sep=",", names=["src", "dst"], dtype=str, header=None)
test_pairs = pd.read_csv(os.path.join(DATA_DIR, "test.txt"),
                         sep=",", names=["src", "dst"], dtype=str, header=None)

# Clean whitespace from all paper IDs so lookups across the files agree
for frame, id_cols in (
    (abstracts, ["paper_id"]),
    (authors, ["paper_id"]),
    (edgelist, ["src", "dst"]),
    (test_pairs, ["src", "dst"]),
):
    for col in id_cols:
        frame[col] = frame[col].str.strip()

# 3.  Build positive & negative samples

# Mark all existing citations as positive examples
edgelist["label"] = 1
nodes = pd.unique(edgelist[["src", "dst"]].values.ravel())

# Set negative sampling ratio
NEG_RATIO = 0.2
n_pos = len(edgelist)
n_neg = int(n_pos * NEG_RATIO)

# Known (src, dst) citations. Only the two id columns are used: comparing
# sampled pairs against full rows (which also carry "label") never matches,
# which would let true citations slip in as negatives.
known_edges = set(zip(edgelist["src"], edgelist["dst"]))

# Seeded generator so the negative sample is reproducible across runs
rng = np.random.default_rng(42)

# Sample random node pairs as negative examples, rejecting self-pairs,
# existing citations and duplicates, and topping up until n_neg pairs are
# collected. The round limit guards against graphs too dense to supply them.
MAX_ROUNDS = 100
neg_pairs = []
chosen = set()
for _ in range(MAX_ROUNDS):
    missing = n_neg - len(neg_pairs)
    if missing <= 0 or len(nodes) < 2:
        break
    cand_src = rng.choice(nodes, size=missing, replace=True)
    cand_dst = rng.choice(nodes, size=missing, replace=True)
    for pair in zip(cand_src, cand_dst):
        if pair[0] == pair[1] or pair in known_edges or pair in chosen:
            continue
        chosen.add(pair)
        neg_pairs.append(pair)

neg = pd.DataFrame(neg_pairs, columns=["src", "dst"])
neg["label"] = 0

samples = pd.concat([edgelist, neg], ignore_index=True)

# 4. TF-IDF + cosine feature
tfv = TfidfVectorizer(stop_words="english", max_features=10000)

# Learn vocabulary / IDF weights and build the TF–IDF matrix in one pass
# (one row per abstract; missing abstracts are treated as empty text)
A = tfv.fit_transform(abstracts["abstract"].fillna(""))

# paper_id -> row index of that paper in A
id2idx = dict(zip(abstracts["paper_id"], range(len(abstracts))))

def pair_cosine_batch(df):
    """Return the TF–IDF similarity of every (src, dst) pair in *df*.

    The score is the dot product of the two papers' rows of the module-level
    matrix ``A``; this equals cosine similarity when the rows of ``A`` are
    L2-normalised (the ``TfidfVectorizer`` default). Pairs where either id
    is missing from ``id2idx`` get a score of 0.

    Args:
        df: DataFrame with ``src`` and ``dst`` paper-id columns.

    Returns:
        A float32 array of shape ``(len(df), 1)``.
    """
    # Row index in A for each endpoint; -1 marks an id without an abstract.
    src_rows = df["src"].map(id2idx).fillna(-1).astype(int).values
    dst_rows = df["dst"].map(id2idx).fillna(-1).astype(int).values
    known = (src_rows >= 0) & (dst_rows >= 0)

    left = A[src_rows[known]]
    right = A[dst_rows[known]]

    out = np.zeros(len(df), dtype=np.float32)
    # Element-wise product summed per row == row-wise dot product.
    out[known] = left.multiply(right).sum(axis=1).A1
    return out.reshape(-1, 1)

# Text-similarity feature for every training pair
X_cos = pair_cosine_batch(samples)

# 5. Author-Jaccard

# paper_id -> raw authors string
auth_map = {
    pid: names
    for pid, names in zip(authors["paper_id"], authors["authors"])
}

# Authors string of each sample's endpoints ("" when the paper has no entry)
au_src_list, au_dst_list = (
    samples[col].map(auth_map).fillna("").values for col in ("src", "dst")
)

def jaccard_auth(a, b):
    """Return the Jaccard similarity of two comma-separated author lists.

    Names are trimmed of surrounding whitespace and empty entries (from a
    trailing or doubled comma) are ignored, so ``"A, B"`` and ``"B,C"``
    share the author ``B``.

    Args:
        a: Comma-separated author names of the first paper (may be empty).
        b: Comma-separated author names of the second paper (may be empty).

    Returns:
        ``|A ∩ B| / |A ∪ B|`` as a float, or ``0.0`` when neither paper has
        any author.
    """
    def _names(raw):
        # Falsy input (empty string / None) means "no authors known".
        if not raw:
            return set()
        return {name.strip() for name in raw.split(",") if name.strip()}

    sa = _names(a)
    sb = _names(b)
    union = sa | sb
    if not union:
        return 0.0
    return len(sa & sb) / len(union)

# Author-overlap feature for every training pair
author_jac = np.array(
    [jaccard_auth(a, b) for a, b in zip(au_src_list, au_dst_list)]
).reshape(-1, 1)

# 6.  Compute Common Neighbors feature

# Citation graph; neighbours are counted on the undirected view so both
# citing and cited papers count as neighbours
G = nx.DiGraph()
G.add_edges_from(edgelist[["src", "dst"]].values)
Gu = G.to_undirected()

cn = []
for u, v in zip(samples["src"], samples["dst"]):
    try:
        cn.append(sum(1 for _ in nx.common_neighbors(Gu, u, v)))
    except nx.NetworkXError:
        # Raised when u or v is not a node of the graph: no shared neighbours.
        # Only this error is caught so real failures (and Ctrl-C) still surface.
        cn.append(0)
X_common = np.array(cn).reshape(-1, 1)

# 7.  Assemble feature matrix and labels

# Columns: TF-IDF similarity, author Jaccard, common-neighbour count
feature_blocks = [X_cos, author_jac, X_common]
X_all = np.hstack(feature_blocks)
y = samples["label"].astype(int).values

# 8. Split + scale

# Stratified 80/20 hold-out split
X_tr, X_va, y_tr, y_va = train_test_split(
    X_all, y, test_size=0.2, random_state=42, stratify=y
)

# Scale by the training-set standard deviation only (no centring); the
# validation set reuses the statistics learned from the training set
scaler = StandardScaler(with_mean=False)
scaler.fit(X_tr)
X_tr = scaler.transform(X_tr)
X_va = scaler.transform(X_va)


# 9. GridSearchCV LogisticRegression with class_weight

# L2-regularised logistic regression; classes are re-weighted because the
# sample holds far more positives than negatives
base_clf = LogisticRegression(
    solver="saga",
    penalty="l2",
    class_weight="balanced",
    max_iter=5000,
    random_state=42,
)

# Candidate inverse regularisation strengths
param_grid = {"C": [0.01, 0.1, 1, 10, 100]}

# 5-fold stratified CV, scored by (negated) log-loss
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid = GridSearchCV(
    estimator=base_clf,
    param_grid=param_grid,
    scoring="neg_log_loss",
    cv=cv,
    n_jobs=-1,
    verbose=1,
)
grid.fit(X_tr, y_tr)

# Best model found by the search
clf = grid.best_estimator_
print("Best C:", grid.best_params_["C"])

# Log-loss of the positive-class probability on the held-out split
proba_va = clf.predict_proba(X_va)[:, 1]
print("Validation log-loss:", log_loss(y_va, proba_va))

# 10.  Predict on test set and save submission

def _common_neighbor_count(graph, u, v):
    """Return how many neighbours u and v share in graph (0 if either is absent)."""
    # nx.common_neighbors raises NetworkXError for nodes missing from the
    # graph; test papers that never appear in the edge list must score 0
    # instead of aborting the whole prediction run.
    if u not in graph or v not in graph:
        return 0
    return sum(1 for _ in nx.common_neighbors(graph, u, v))


test_src = test_pairs["src"]
test_dst = test_pairs["dst"]

# Same three features, in the same column order, as the training matrix
X_test = np.hstack([
    pair_cosine_batch(test_pairs),
    np.array([
        jaccard_auth(auth_map.get(u, ""), auth_map.get(v, ""))
        for u, v in zip(test_src, test_dst)
    ]).reshape(-1, 1),
    np.array([
        _common_neighbor_count(Gu, u, v)
        for u, v in zip(test_src, test_dst)
    ]).reshape(-1, 1),
])

# Reuse the scaler fitted on the training split
X_test = scaler.transform(X_test)
proba_test = clf.predict_proba(X_test)[:, 1]

# One row per test pair: running index and predicted citation probability
submission = pd.DataFrame({
    "ID": np.arange(len(test_pairs)),
    "Label": proba_test
})
submission.to_csv("submission_improved.csv", index=False)
print("Saved submission_improved.csv")


Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best C: 100
Validation log-loss: 0.22143236893686868
Saved submission_improved.csv
