In [9]:
!pip install networkx pandas numpy scikit-learn tqdm




In [10]:
import networkx as nx
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

# Uyarıları gizle
import warnings
warnings.filterwarnings('ignore')

In [11]:
# Cargar el grafo de entrenamiento (GEXF)
G = nx.read_gexf('social_network_training_1.gexf')
print(f'Nodos: {G.number_of_nodes()}, Enlaces: {G.number_of_edges()}')

# Cargar el conjunto de prueba (pares a predecir)
test_df = pd.read_csv('test_1.csv')
test_df.head()

Nodos: 1133, Enlaces: 4451


Unnamed: 0,Id
0,458-558
1,449-849
2,278-324
3,97-717
4,425-951


In [12]:
# Funciones para calcular características
def get_features(u, v, G):
    cn = len(list(nx.common_neighbors(G, u, v)))
    try:
        jaccard = list(nx.jaccard_coefficient(G, [(u, v)]))[0][2]
    except:
        jaccard = 0
    try:
        adamic = list(nx.adamic_adar_index(G, [(u, v)]))[0][2]
    except:
        adamic = 0
    try:
        pref_attach = list(nx.preferential_attachment(G, [(u, v)]))[0][2]
    except:
        pref_attach = 0
    return [cn, jaccard, adamic, pref_attach]

In [13]:
# Enlaces existentes (positivos)
edges = list(G.edges())
positive_samples = [(u, v, 1) for u, v in edges]

# No enlaces (negativos): seleccionar aleatoriamente
import random
nodes = list(G.nodes())
negatives = set()
while len(negatives) < len(positive_samples):
    u, v = random.sample(nodes, 2)
    if not G.has_edge(u, v):
        negatives.add((u, v))
negative_samples = [(u, v, 0) for u, v in negatives]

# Combinar y extraer características
all_samples = positive_samples + negative_samples
X = []
y = []
for u, v, label in tqdm(all_samples):
    feats = get_features(u, v, G)
    X.append(feats)
    y.append(label)

  0%|          | 0/8902 [00:00<?, ?it/s]

In [14]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict_proba(X_val)[:, 1]
roc_auc_score(y_val, y_pred)

np.float64(0.8698475871966185)

In [15]:
X_test = []
pairs = []
for row in tqdm(test_df['Id']):
    u, v = row.split('-')
    pairs.append((u, v))
    X_test.append(get_features(u, v, G))

y_test_pred = clf.predict_proba(X_test)[:, 1]
submission = pd.DataFrame({'Id': test_df['Id'], 'Predicted': y_test_pred})
submission.to_csv('submission.csv', index=False)
submission.head()

  0%|          | 0/636827 [00:00<?, ?it/s]

Unnamed: 0,Id,Predicted
0,458-558,0.42908
1,449-849,0.139929
2,278-324,0.642154
3,97-717,0.028607
4,425-951,0.199439
