In [None]:
import dask.dataframe as dd
from dask_ml.model_selection import train_test_split
from dask_ml.linear_model import LogisticRegression
import mlflow
import mlflow.sklearn
import networkx as nx

In [None]:
# Open the parent MLflow run; it stays active until mlflow.end_run() is
# called at the end of the scoring cell further down.
mlflow.start_run()
# Read the edge list with Dask. The read is lazy: rows are only loaded when a
# result is actually requested (e.g. by .head() or .compute()).
df = dd.read_csv("train.csv")


In [None]:
# Preview the first rows to check the column names used below
# (source_node, destination_node).
df.head()

Unnamed: 0,source_node,destination_node
0,1,690569
1,1,315892
2,1,189226
3,2,834328
4,2,1615927


In [None]:
# Expose both endpoint columns as Dask arrays (lengths=True resolves the
# chunk sizes up front).
source_nodes = df['source_node'].to_dask_array(lengths=True)
destination_nodes = df['destination_node'].to_dask_array(lengths=True)

# Build a directed NetworkX graph: pair the computed endpoints row by row so
# that every CSV row becomes one source -> destination edge.
G = nx.DiGraph()
edges = list(zip(source_nodes.compute(), destination_nodes.compute()))
G.add_edges_from(edges)

In [None]:

# Calculate Common Neighbors
def common_neighbors_score(G, u, v):
    """Count the nodes that both ``u`` and ``v`` point to.

    Args:
        G: Graph object exposing ``successors(node)``.
        u: First node.
        v: Second node.

    Returns:
        int: Number of successors shared by ``u`` and ``v``.
    """
    successors_of_u = set(G.successors(u))
    successors_of_v = set(G.successors(v))
    return len(successors_of_u & successors_of_v)

In [None]:

# Calculate Jaccard's Coefficient
def jaccard_coefficient_score(G, u, v):
    """Jaccard coefficient of the successor sets of ``u`` and ``v``.

    The score is ``|S(u) & S(v)| / |S(u) | S(v)|`` where ``S(n)`` is the set
    of successors of ``n`` in ``G``.

    Args:
        G: Graph object exposing ``successors(node)``.
        u: First node.
        v: Second node.

    Returns:
        float: Value in ``[0.0, 1.0]``. When neither node has any successor
        the union is empty and the score is defined as ``0.0`` instead of
        raising ``ZeroDivisionError``.
    """
    u_neighbors = set(G.successors(u))
    v_neighbors = set(G.successors(v))
    union = u_neighbors | v_neighbors
    if not union:
        # Both nodes are sinks: there is nothing to compare, so avoid 0 / 0.
        return 0.0
    return len(u_neighbors & v_neighbors) / len(union)


In [None]:
# Calculate Preferential Attachment
def preferential_attachment_score(G, u, v):
    """Product of the distinct-successor counts of ``u`` and ``v``.

    Args:
        G: Graph object exposing ``successors(node)``.
        u: First node.
        v: Second node.

    Returns:
        int: ``len(set(successors(u))) * len(set(successors(v)))``.
    """
    successor_count_u = len(set(G.successors(u)))
    successor_count_v = len(set(G.successors(v)))
    return successor_count_u * successor_count_v

In [None]:


# Score every existing edge (u, v) of the graph. Only node pairs that are
# already connected are visited, not every possible pair of nodes.
for u, v in G.edges():
    common_neighbors = common_neighbors_score(G, u, v)
    jaccard_coefficient = jaccard_coefficient_score(G, u, v)
    preferential_attachment = preferential_attachment_score(G, u, v)

    # Record each edge as its own nested MLflow run; the `with` block closes
    # the nested run on exit.
    with mlflow.start_run(nested=True):
        mlflow.log_param("Source Node", u)
        mlflow.log_param("Destination Node", v)
        mlflow.log_metric("Common Neighbors Score", common_neighbors)
        # Log the Jaccard score alongside the other two. The metric name omits
        # the apostrophe because MLflow restricts the characters allowed in
        # metric names.
        mlflow.log_metric("Jaccard Coefficient Score", jaccard_coefficient)
        mlflow.log_metric("Preferential Attachment Score", preferential_attachment)

    print(f"Nodes {u} and {v}:")
    print(f"Common Neighbors Score: {common_neighbors}")
    print(f"Jaccard's Coefficient Score: {jaccard_coefficient}")
    print(f"Preferential Attachment Score: {preferential_attachment}")
    print("-------------")

# End the run that is still active once the loop has finished (the nested
# per-edge runs were already closed by their `with` blocks).
mlflow.end_run()


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Jaccard's Coefficient Score: 0.0
Preferential Attachment Score: 0
-------------
Nodes 1519887 and 23572:
Common Neighbors Score: 0
Jaccard's Coefficient Score: 0.0
Preferential Attachment Score: 98
-------------
Nodes 1214432 and 553971:
Common Neighbors Score: 0
Jaccard's Coefficient Score: 0.0
Preferential Attachment Score: 180
-------------
Nodes 1214432 and 201115:
Common Neighbors Score: 0
Jaccard's Coefficient Score: 0.0
Preferential Attachment Score: 450
-------------
Nodes 1214432 and 38390:
Common Neighbors Score: 0
Jaccard's Coefficient Score: 0.0
Preferential Attachment Score: 90
-------------
Nodes 1214432 and 1420628:
Common Neighbors Score: 0
Jaccard's Coefficient Score: 0.0
Preferential Attachment Score: 135
-------------
Nodes 1214432 and 1149460:
Common Neighbors Score: 1
Jaccard's Coefficient Score: 0.018518518518518517
Preferential Attachment Score: 600
-------------
Nodes 1214432 and 1047741:
Common Ne