# Exploratory Data Analysis of graph data

**Libraries**

1. NetworkX: https://networkx.org/
2. scikit-network: https://scikit-network.readthedocs.io/en/latest/

In [None]:
# Load modules
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
sns.set(rc={'figure.figsize':(13,6)})

import networkx as nx

In [None]:
%%bash
cd ../input
ls

In [None]:
# TODO
# - Create synthetic datasets or Find an undirected graph
# - Plot graph, degrees distribution, edge prob distributions
# - Find central nodes

## Datasets

You are free to use any **probabilistic graph** available. A small dataset will be given. However, since not many
real-life probabilistic networks are publicly available, you may use synthetic datasets as follows: you
may take any undirected network and simply assign probabilities to the edges by using a probability
distribution such as uniform, normal, power-law, etc.

Two excellent repositories for graph data are:
1. http://snap.stanford.edu/data/index.html
2. http://networkrepository.com/

**Graph Embedding with Self Clustering: Facebook** 

http://snap.stanford.edu/data/gemsec-Facebook.html

# Collins

In [None]:
# Load datasets
collins_df = pd.read_csv("../input/collins.csv",header=None)

In [None]:
collins_df.columns = ["src","dst", "probability"]

In [None]:
collins_df.head(6)

In [None]:
collins_df.shape

In [None]:
collins_df["probability"].plot.hist(title="Edge Probabilities distribution", bins=100, xlim=[0,1]);

In [None]:
# Create graph
G = nx.from_pandas_edgelist(df=collins_df, source='src', target='dst', edge_attr="probability")

In [None]:
# Print graph properties
print(f" Number of nodes: {G.number_of_nodes()}")
print(f" Number of edges: {G.number_of_edges()}")
print(f" Is directed?: {G.is_directed()}")

In [None]:
# list(G.nodes)

In [None]:
# Plot the degree distribution of the graph G.
# G.degree() yields (node, degree) pairs; the degrees are materialised as a
# plain list so matplotlib receives a real sequence instead of a dict view.
node_degrees = [degree for _, degree in G.degree()]
plt.hist(node_degrees, bins=100)
plt.xlabel("degree")
plt.ylabel("number of nodes")
plt.title("Degrees distribution");

In [None]:
# Plot graph
# nx.draw(G)

In [None]:
# nx.draw_spring(G, node_size=100, node_color="#00C98D", with_labels=False)

# Facebook datasets

In [None]:
import numpy as np

In [None]:
# Load datasets
artist_edges_df = pd.read_csv("../input/gemsec_facebook_dataset/artist_edges.csv")

In [None]:
artist_edges_df.head()

In [None]:
# Number of edges
artist_edges_df.shape

In [None]:
# Number of nodes: size of the sorted union of the ids found in either
# endpoint column of the edge list.
np.union1d(artist_edges_df["node_1"].values, artist_edges_df["node_2"].values).size

**Artist Dataset**
- Number of nodes : 50,515
- Number of edges : 819,306

**Probability distributions**

1. Uniform
2. Normal
3. Power-Law

In [None]:
# Drop self loops, i.e. edges whose two endpoints are the same node.
# The filtered selection is copied so that artist_edges_df is an independent
# frame: later cells add a "probability" column to it, and assigning a column
# to a boolean-mask selection triggers pandas' SettingWithCopyWarning.
print("Before:", artist_edges_df.shape)
is_self_loop = artist_edges_df["node_1"] == artist_edges_df["node_2"]
artist_edges_df = artist_edges_df[~is_self_loop].copy()
print("After:", artist_edges_df.shape)

In [None]:
artist_edges_df[artist_edges_df["node_1"]==11760]

In [None]:
## 1. Uniform
# One probability per edge, drawn uniformly from [0, 1).
# A dedicated seeded generator is used so that re-running the notebook
# regenerates exactly the same probabilities.
UNIFORM_SEED = 0
rng = np.random.default_rng(UNIFORM_SEED)
artist_edges_df["probability"] = rng.uniform(0, 1, size=len(artist_edges_df))

In [None]:
(artist_edges_df["probability"]
 .plot
 .hist(title="Edge Probabilities distribution (Uniform)", bins=100, xlim=[0,1]));

In [None]:
# Save dataset
artist_edges_df.to_csv("../input/artists_uniform.csv", index=False, header=False)

In [None]:
## 2. Normal
# One probability per edge, drawn from a normal distribution with mean 0.5 and
# standard deviation 0.10, then clamped into the valid probability range:
# draws below 0 saturate at 0 and draws above 1 saturate at 1 (an over-range
# draw must not collapse to probability 0).
# A dedicated seeded generator is used so that re-running the notebook
# regenerates exactly the same probabilities.
NORMAL_SEED = 1
rng = np.random.default_rng(NORMAL_SEED)
probs = np.clip(rng.normal(0.5, 0.10, size=len(artist_edges_df)), 0.0, 1.0)

artist_edges_df["probability"] = probs

In [None]:
(artist_edges_df["probability"]
 .plot
 .hist(title="Edge Probabilities distribution (Normal)", bins=100, xlim=[0,1]));

In [None]:
# Save dataset
artist_edges_df.to_csv("../input/artists_normal.csv", index=False, header=False)

In [None]:
## 3. Power Law
# One probability per edge, drawn from NumPy's power distribution with
# parameter a=0.7. The clamp keeps every value inside [0, 1]; an over-range
# value saturates at 1 instead of being reset to 0.
# A dedicated seeded generator is used so that re-running the notebook
# regenerates exactly the same probabilities.
POWER_LAW_SEED = 2
rng = np.random.default_rng(POWER_LAW_SEED)
probs = np.clip(rng.power(0.7, size=len(artist_edges_df)), 0.0, 1.0)

artist_edges_df["probability"] = probs

In [None]:
(artist_edges_df["probability"]
 .plot
 .hist(title="Edge Probabilities distribution (Power Law)", bins=100, xlim=[0,1]));

In [None]:
# Save dataset
artist_edges_df.to_csv("../input/artists_power_law.csv", index=False, header=False)

# Experiment results

In [1]:
import plotly.express as px
import plotly.graph_objects as go
import pandas as pd

## Load experiments

**Experiments**

1. experiments.csv
2. experiments_threshold_0.5.csv
3. experiments_k_large.csv


In [2]:
# Load experimentation results: one CSV file per experiment batch
# (main grid, threshold=0.5 runs, and the large-k runs).
experiments_df = pd.read_csv("../experiments/experiments.csv")
experiments_threshold_05_df = pd.read_csv("../experiments/experiments_threshold_0.5.csv")
experiments_k_large_df = pd.read_csv("../experiments/experiments_k_large.csv")

In [3]:
# Check total experiments per file
print("experiments_df:", experiments_df.shape)
print("experiments_threshold_05_df:", experiments_threshold_05_df.shape)
print("experiments_k_large_df:",experiments_k_large_df.shape)

experiments_df: (162, 5)
experiments_threshold_05_df: (12, 5)
experiments_k_large_df: (18, 5)


In [4]:
# Check experiments
pd.set_option('display.max_rows', 200)
experiments_df

Unnamed: 0,script,cores,dataset,k,time
0,graphframe_bs.py,1,artists_uniform,100,210.744795
1,graphframe_bs.py,1,artists_uniform,1000,202.733053
2,graphframe_bs.py,1,artists_uniform,10000,205.797122
3,graphframe_bs.py,2,artists_uniform,100,134.53694
4,graphframe_bs.py,2,artists_uniform,1000,136.553836
5,graphframe_bs.py,2,artists_uniform,10000,136.371078
6,graphframe_bs.py,8,artists_uniform,100,121.458325
7,graphframe_bs.py,8,artists_uniform,1000,114.910138
8,graphframe_bs.py,8,artists_uniform,10000,122.056432
9,rdd_bs.py,1,artists_uniform,100,965.181294


**There are 4 different variables (degrees of freedom). I must visualize the effect of each one on running times.**

1. Scripts (Algorithms)
    * Graphframes
    * Rdd
    * Best graphframes + rdd

2. Cores

3. Distribution

4. k

**`Experiments` file: Degrees of Freedom**

1. scripts = ["graphframe_bs.py", "graphframe_fast.py", "graphframe_fast_bv.py", "rdd_bs.py", "rdd_fast.py", "rdd_fast_bv.py"]
2. cores = [1,2,8]
3. dataset in ["artists_uniform", "artists_normal", "artists_power_law"]
4. k in [100, 1000, 10000]

In [5]:
print(experiments_df.shape)
print(6 * 3 * 3 * 3)

(162, 5)
162


In [6]:
# Check experiments_threshold_05_df
experiments_threshold_05_df

Unnamed: 0,script,cores,dataset,k,time
0,graphframe_fast.py,8,artists_normal,100,91.375761
1,graphframe_fast.py,8,artists_normal,1000,96.046723
2,graphframe_fast.py,8,artists_normal,10000,100.08475
3,graphframe_fast_bv.py,8,artists_normal,100,124.936311
4,graphframe_fast_bv.py,8,artists_normal,1000,141.315205
5,graphframe_fast_bv.py,8,artists_normal,10000,166.221728
6,rdd_fast.py,8,artists_normal,100,612.636081
7,rdd_fast.py,8,artists_normal,1000,601.980574
8,rdd_fast.py,8,artists_normal,10000,631.10798
9,rdd_fast_bv.py,8,artists_normal,100,509.034063


**Experiments `experiments_threshold_05_df`: Degrees of Freedom**

* scripts = ["graphframe_fast.py", "graphframe_fast_bv.py", "rdd_fast.py", "rdd_fast_bv.py"]  
* cores = [8]  
* dataset in ["artists_normal"]  
* k in [100, 1000, 10000]  

In [7]:
print(experiments_threshold_05_df.shape)
print(4*1*1*3)

(12, 5)
12


In [8]:
# Check experiments_k_large_df
experiments_k_large_df

Unnamed: 0,script,cores,dataset,k,time
0,graphframe_fast.py,8,artists_normal,2300000,495.97256
1,graphframe_fast_bv.py,8,artists_normal,2300000,603.951977
2,rdd_fast.py,8,artists_normal,2300000,2809.191476
3,rdd_fast_bv.py,8,artists_normal,2300000,2392.731071
4,graphframe_fast.py,8,artists_uniform,2300000,388.31084
5,graphframe_fast_bv.py,8,artists_uniform,2300000,1151.034513
6,rdd_fast.py,8,artists_uniform,2300000,1752.329388
7,rdd_fast_bv.py,8,artists_uniform,2300000,1583.311131
8,graphframe_fast.py,8,artists_power_law,2300000,356.513438
9,graphframe_fast_bv.py,8,artists_power_law,2300000,1092.251604


**Experiments `experiments_k_large_df`: Degrees of Freedom**

* scripts = ["graphframe_bs.py", "graphframe_fast.py", "graphframe_fast_bv.py", "rdd_bs.py", "rdd_fast.py", "rdd_fast_bv.py"]
* cores = [8]
* dataset in ["artists_uniform", "artists_normal", "artists_power_law"]
* k = 2_300_000

In [9]:
print(experiments_k_large_df.shape)
print(6*1*3*1)

(18, 5)
18


## Prepare Files

In [10]:
# Tidy experiments_df for plotting:
#   * "script" becomes "algorithm", keeping only the text before the first "."
#   * "time" becomes "time (sec)"
#   * "distribution" is the dataset name with everything up to and including
#     its first "_" removed (e.g. "artists_power_law" -> "power_law")
experiments_df = experiments_df.rename(columns={"script": "algorithm", "time": "time (sec)"})

experiments_df["algorithm"] = experiments_df["algorithm"].map(
    lambda script_name: script_name.partition(".")[0]
)

experiments_df["distribution"] = experiments_df["dataset"].map(
    lambda dataset_name: dataset_name.partition("_")[2]
)

In [11]:
# Tidy experiments_threshold_05_df for plotting:
#   * "script" becomes "algorithm", keeping only the text before the first "."
#   * "time" becomes "time (sec)"
#   * "distribution" is the dataset name with everything up to and including
#     its first "_" removed (e.g. "artists_normal" -> "normal")
experiments_threshold_05_df = experiments_threshold_05_df.rename(
    columns={"script": "algorithm", "time": "time (sec)"}
)

experiments_threshold_05_df["algorithm"] = experiments_threshold_05_df["algorithm"].map(
    lambda script_name: script_name.partition(".")[0]
)

experiments_threshold_05_df["distribution"] = experiments_threshold_05_df["dataset"].map(
    lambda dataset_name: dataset_name.partition("_")[2]
)

In [12]:
# Tidy experiments_k_large_df for plotting:
#   * "script" becomes "algorithm", keeping only the text before the first "."
#   * "time" becomes "time (sec)"
#   * "distribution" is the dataset name with everything up to and including
#     its first "_" removed (e.g. "artists_uniform" -> "uniform")
experiments_k_large_df = experiments_k_large_df.rename(
    columns={"script": "algorithm", "time": "time (sec)"}
)

experiments_k_large_df["algorithm"] = experiments_k_large_df["algorithm"].map(
    lambda script_name: script_name.partition(".")[0]
)

experiments_k_large_df["distribution"] = experiments_k_large_df["dataset"].map(
    lambda dataset_name: dataset_name.partition("_")[2]
)

In [13]:
# Fix data types
# experiments_df["k"] = experiments_df["k"].astype(str)

In [14]:
# Fastest method
experiments_df.sort_values("time (sec)").head(10)

Unnamed: 0,algorithm,cores,dataset,k,time (sec),distribution
61,graphframe_fast,8,artists_uniform,1000,30.346403,uniform
151,rdd_fast,8,artists_power_law,1000,30.633403,power_law
60,graphframe_fast,8,artists_uniform,100,30.663337,uniform
150,rdd_fast,8,artists_power_law,100,30.933137,power_law
160,rdd_fast_bv,8,artists_power_law,1000,31.303999,power_law
159,rdd_fast_bv,8,artists_power_law,100,31.49306,power_law
133,graphframe_fast,8,artists_power_law,1000,31.587927,power_law
132,graphframe_fast,8,artists_power_law,100,31.978121,power_law
141,graphframe_fast_bv,8,artists_power_law,100,31.990183,power_law
142,graphframe_fast_bv,8,artists_power_law,1000,32.145957,power_law


In [15]:
# Slowest method
experiments_df.sort_values("time (sec)", ascending=False).head(10)

Unnamed: 0,algorithm,cores,dataset,k,time (sec),distribution
119,rdd_fast_bv,1,artists_normal,10000,1792.721759,normal
110,rdd_fast,1,artists_normal,10000,1560.367499,normal
45,rdd_bs,1,artists_power_law,100,1298.60767,power_law
109,rdd_fast,1,artists_normal,1000,1292.269979,normal
108,rdd_fast,1,artists_normal,100,1250.160017,normal
118,rdd_fast_bv,1,artists_normal,1000,1227.704351,normal
122,rdd_fast_bv,2,artists_normal,10000,1211.058084,normal
46,rdd_bs,1,artists_power_law,1000,1149.37348,power_law
29,rdd_bs,1,artists_normal,10000,1095.256856,normal
47,rdd_bs,1,artists_power_law,10000,1001.590389,power_law


## Full experiments

**Total graphframes experiments**: 3 rows * 3 columns * 3 lines * 3 points = 81

In [47]:
# 2. GraphFrames implementations across all distributions.
# Facet rows are distributions, facet columns are core counts, and each line
# follows one implementation across k.
graphframe_labels = {
    "graphframe_bs": "baseline",
    "graphframe_fast": "optimization 1",
    "graphframe_fast_bv": "optimization 2",
}

is_graphframe_run = experiments_df["algorithm"].isin(list(graphframe_labels))
filtered_dataset = experiments_df[is_graphframe_run].copy()
filtered_dataset["algorithm"] = filtered_dataset["algorithm"].replace(graphframe_labels)

fig = px.line(
    filtered_dataset,
    x="k",
    y="time (sec)",
    log_x=True,
    log_y=False,
    facet_row="distribution",
    color="algorithm",
    facet_col="cores",
)

fig.update_layout(title="Experiment times of GraphFrames implementations")

# Draw a marker on every measured point in addition to the connecting line.
fig.update_traces(mode="markers+lines")

fig.update_layout(autosize=False, width=1000, height=700)
fig.show()

**Total rdd experiments**: 3 rows * 3 columns * 3 lines * 3 points = 81

In [50]:
# 3. RDD implementations across all distributions.
# Facet rows are distributions, facet columns are core counts, and each line
# follows one implementation across k.
filtered_dataset = experiments_df[(experiments_df["algorithm"].isin(["rdd_bs",
                                                                     "rdd_fast",
                                                                     "rdd_fast_bv"]))].copy()

# Legend names are set by renaming the algorithm values themselves.
filtered_dataset["algorithm"] = (filtered_dataset["algorithm"]
                                 .replace({"rdd_bs":"baseline",
                                           "rdd_fast":"optimization 1",
                                           "rdd_fast_bv":"optimization 2"}))

# No `labels=` override is passed: that argument maps column names to display
# titles, so a mapping keyed by algorithm values does not rename anything.
fig = px.line(filtered_dataset,
                 x="k",
                 y="time (sec)",
                 log_x=True,
                 log_y=False,
                 facet_row="distribution",
                 color="algorithm",
                 facet_col="cores")

fig.update_layout(
    title="Experiment times of RDD implementations",
    yaxis_title="time (sec)"
)

# Draw a marker on every measured point in addition to the connecting line.
for i in range(len(fig.data)):
    fig.data[i].update(mode='markers+lines')

fig.update_layout(
    autosize=False,
    width=1000,
    height=700)
fig.show()

## Experiments (threshold=0.5)

In [51]:
experiments_threshold_05_df

Unnamed: 0,algorithm,cores,dataset,k,time (sec),distribution
0,graphframe_fast,8,artists_normal,100,91.375761,[normal]
1,graphframe_fast,8,artists_normal,1000,96.046723,[normal]
2,graphframe_fast,8,artists_normal,10000,100.08475,[normal]
3,graphframe_fast_bv,8,artists_normal,100,124.936311,[normal]
4,graphframe_fast_bv,8,artists_normal,1000,141.315205,[normal]
5,graphframe_fast_bv,8,artists_normal,10000,166.221728,[normal]
6,rdd_fast,8,artists_normal,100,612.636081,[normal]
7,rdd_fast,8,artists_normal,1000,601.980574,[normal]
8,rdd_fast,8,artists_normal,10000,631.10798,[normal]
9,rdd_fast_bv,8,artists_normal,100,509.034063,[normal]


In [55]:
## GraphFrames: graphframe_fast and graphframe_fast_bv at threshold=0.8 and
## threshold=0.5, next to the threshold=0.8 baseline.
## Normal distribution, 8 cores.

# Rows of the main experiment grid for 8 cores on the normal dataset.
on_normal_8_cores = (experiments_df["cores"] == 8) & (experiments_df["dataset"] == "artists_normal")

# threshold=0.8 runs come from experiments_df ...
df_bs_08 = experiments_df[on_normal_8_cores & (experiments_df["algorithm"] == "graphframe_bs")]
df_opt1_08 = experiments_df[on_normal_8_cores & (experiments_df["algorithm"] == "graphframe_fast")]
df_opt2_08 = experiments_df[on_normal_8_cores & (experiments_df["algorithm"] == "graphframe_fast_bv")]

# ... threshold=0.5 runs come from experiments_threshold_05_df.
df_opt1_05 = experiments_threshold_05_df[experiments_threshold_05_df["algorithm"] == "graphframe_fast"]
df_opt2_05 = experiments_threshold_05_df[experiments_threshold_05_df["algorithm"] == "graphframe_fast_bv"]

# (frame, legend name, line style) per trace, in drawing order. Each
# optimization keeps one colour; dashed lines mark threshold=0.5. The baseline
# uses plotly's default line style.
trace_specs = [
    (df_bs_08, "baseline", None),
    (df_opt1_08, "optimization 1, threshold=0.8", dict(color='firebrick')),
    (df_opt2_08, "optimization 2, threshold=0.8", dict(color='#00CC96')),
    (df_opt1_05, "optimization 1, threshold=0.5", dict(color='firebrick', dash='dash')),
    (df_opt2_05, "optimization 2, threshold=0.5", dict(color='#00CC96', dash='dash')),
]

fig = go.Figure()
for frame, legend_name, line_style in trace_specs:
    fig.add_trace(go.Scatter(
        x=frame["k"],
        y=frame["time (sec)"],
        name=legend_name,
        line=line_style,
    ))

fig.update_layout(
    title="Experiment times of GraphFrames implementations for threshold=0.8 and threshold=0.5 | Normal Distribution",
    xaxis_title="k",
    yaxis_title="time (sec)",
)

fig.update_xaxes(type="log")

fig.show()

In [56]:
## RDD: rdd_fast and rdd_fast_bv at threshold=0.8 and threshold=0.5, next to
## the threshold=0.8 baseline.
## Normal distribution, 8 cores.

# Rows of the main experiment grid for 8 cores on the normal dataset.
on_normal_8_cores = (experiments_df["cores"] == 8) & (experiments_df["dataset"] == "artists_normal")

# threshold=0.8 runs come from experiments_df ...
df_bs_08 = experiments_df[on_normal_8_cores & (experiments_df["algorithm"] == "rdd_bs")]
df_opt1_08 = experiments_df[on_normal_8_cores & (experiments_df["algorithm"] == "rdd_fast")]
df_opt2_08 = experiments_df[on_normal_8_cores & (experiments_df["algorithm"] == "rdd_fast_bv")]

# ... threshold=0.5 runs come from experiments_threshold_05_df.
df_opt1_05 = experiments_threshold_05_df[experiments_threshold_05_df["algorithm"] == "rdd_fast"]
df_opt2_05 = experiments_threshold_05_df[experiments_threshold_05_df["algorithm"] == "rdd_fast_bv"]

# (frame, legend name, line style) per trace, in drawing order. Each
# optimization keeps one colour; dashed lines mark threshold=0.5. The baseline
# uses plotly's default line style.
trace_specs = [
    (df_bs_08, "baseline", None),
    (df_opt1_08, "optimization 1, threshold=0.8", dict(color='firebrick')),
    (df_opt2_08, "optimization 2, threshold=0.8", dict(color='#00CC96')),
    (df_opt1_05, "optimization 1, threshold=0.5", dict(color='firebrick', dash='dash')),
    (df_opt2_05, "optimization 2, threshold=0.5", dict(color='#00CC96', dash='dash')),
]

fig = go.Figure()
for frame, legend_name, line_style in trace_specs:
    fig.add_trace(go.Scatter(
        x=frame["k"],
        y=frame["time (sec)"],
        name=legend_name,
        line=line_style,
    ))

fig.update_layout(
    title="Experiment times of RDD implementations for threshold=0.8 and threshold=0.5 | Normal Distribution",
    xaxis_title="k",
    yaxis_title="time (sec)",
)

fig.update_xaxes(type="log")

fig.show()

## Experiments (k = triangles + 1)

Compare `optimization 1` and `optimization 2` (with and without broadcast variable)


These experiments did not run due to memory limitations.
```
java.lang.OutOfMemoryError
```

In [98]:
# experiments_df[(experiments_df["cores"] == 8) &
#                (experiments_df["algorithm"] == "graphframe_bs")]

In [99]:
experiments_k_large_df

Unnamed: 0,algorithm,cores,dataset,k,time (sec),distribution
0,graphframe_fast,8,artists_normal,2300000,495.97256,normal
1,graphframe_fast_bv,8,artists_normal,2300000,603.951977,normal
2,rdd_fast,8,artists_normal,2300000,2809.191476,normal
3,rdd_fast_bv,8,artists_normal,2300000,2392.731071,normal
4,graphframe_fast,8,artists_uniform,2300000,388.31084,uniform
5,graphframe_fast_bv,8,artists_uniform,2300000,1151.034513,uniform
6,rdd_fast,8,artists_uniform,2300000,1752.329388,uniform
7,rdd_fast_bv,8,artists_uniform,2300000,1583.311131,uniform
8,graphframe_fast,8,artists_power_law,2300000,356.513438,power_law
9,graphframe_fast_bv,8,artists_power_law,2300000,1092.251604,power_law


In [100]:
# Stress test for k > total number of triangles, GraphFrames implementations.
# One bar per implementation, grouped by edge-probability distribution
# (every row of experiments_k_large_df for that implementation is plotted).

fig = go.Figure()

# (algorithm value in the frame, legend name), in drawing order.
bar_specs = [
    ("graphframe_bs", "Baseline"),
    ("graphframe_fast", "Optimization 1"),
    ("graphframe_fast_bv", "Optimization 2"),
]

for algorithm, legend_name in bar_specs:
    runs = experiments_k_large_df[experiments_k_large_df["algorithm"] == algorithm]
    fig.add_trace(go.Bar(
        x=runs["distribution"].values,
        y=runs["time (sec)"].values,
        name=legend_name,
    ))

fig.update_layout(
    title="Stress test: GraphFrames implementations for k > total number of triangles",
    xaxis_title="distribution",
    yaxis_title="time (sec)",
)

fig.show()

In [101]:
# Stress test for k > total number of triangles, RDD implementations.
# One bar per implementation, grouped by edge-probability distribution
# (every row of experiments_k_large_df for that implementation is plotted).

fig = go.Figure()

# (algorithm value in the frame, legend name), in drawing order.
bar_specs = [
    ("rdd_bs", "Baseline"),
    ("rdd_fast", "Optimization 1"),
    ("rdd_fast_bv", "Optimization 2"),
]

for algorithm, legend_name in bar_specs:
    runs = experiments_k_large_df[experiments_k_large_df["algorithm"] == algorithm]
    fig.add_trace(go.Bar(
        x=runs["distribution"].values,
        y=runs["time (sec)"].values,
        name=legend_name,
    ))

fig.update_layout(
    title="Stress test: RDD implementations for k > total number of triangles",
    xaxis_title="distribution",
    yaxis_title="time (sec)",
)

fig.show()

## GraphFrames vs RDD

In [24]:
# GraphFrames vs RDD on 8 cores: graphframe_fast against rdd_fast and
# rdd_fast_bv, one facet column per distribution, log-log axes.
comparison_labels = {
    "graphframe_fast": "optimization 1 - GraphFrames",
    "rdd_fast": "optimization 1 - RDD",
    "rdd_fast_bv": "optimization 2 - RDD",
}

uses_8_cores = experiments_df["cores"] == 8
is_compared_algorithm = experiments_df["algorithm"].isin(list(comparison_labels))

filtered_dataset = experiments_df[uses_8_cores & is_compared_algorithm].copy()
filtered_dataset["algorithm"] = filtered_dataset["algorithm"].replace(comparison_labels)

fig = px.line(
    filtered_dataset,
    x="k",
    y="time (sec)",
    log_x=True,
    log_y=True,
    facet_col="distribution",
    color="algorithm",
)

fig.update_layout(title="Results comparison: GraphFrames vs RDD implementations")

# Draw a marker on every measured point in addition to the connecting line.
fig.update_traces(mode="markers+lines")

fig.update_layout(autosize=False, width=1000, height=500)
fig.show()

In [None]:
## Compare graphframes_fast and graphframes_fast_bv
# with threshold=0.8 and threshold=0.5 for Normal distribution (8 cores).
# NOTE(review): every trace below is a GraphFrames run, while the figure title
# announces "GraphFrames vs RDD" - confirm whether RDD traces were meant to be
# added here.

fig = go.Figure()


## --> Extract times:
# bs, optimization 1, optimization 2
# threshold 0.5, 0.8

# Rows of the main experiment grid for 8 cores on the normal dataset.
on_normal_8_cores = (experiments_df["cores"] == 8) & (experiments_df["dataset"] == "artists_normal")

df_bs_08 = experiments_df[on_normal_8_cores & (experiments_df["algorithm"] == "graphframe_bs")]
df_opt1_08 = experiments_df[on_normal_8_cores & (experiments_df["algorithm"] == "graphframe_fast")]
df_opt2_08 = experiments_df[on_normal_8_cores & (experiments_df["algorithm"] == "graphframe_fast_bv")]

df_opt1_05 = (experiments_threshold_05_df
              [experiments_threshold_05_df["algorithm"] == "graphframe_fast"])

df_opt2_05 = (experiments_threshold_05_df
              [experiments_threshold_05_df["algorithm"] == "graphframe_fast_bv"])


## Add lines
# (frame, legend name, line style) per trace, in drawing order. Each
# optimization keeps one colour; dashed lines mark threshold=0.5. The baseline
# uses plotly's default line style.
trace_specs = [
    (df_bs_08, "baseline", None),
    (df_opt1_08, "optimization 1, threshold=0.8", dict(color='firebrick')),
    (df_opt2_08, "optimization 2, threshold=0.8", dict(color='#00CC96')),
    (df_opt1_05, "optimization 1, threshold=0.5", dict(color='firebrick', dash='dash')),
    (df_opt2_05, "optimization 2, threshold=0.5", dict(color='#00CC96', dash='dash')),
]

for frame, legend_name, line_style in trace_specs:
    fig.add_trace(go.Scatter(
        x=frame["k"],
        y=frame["time (sec)"],
        name=legend_name,
        line=line_style,
    ))


fig.update_xaxes(type="log")


# The x values are k (on a log axis), so the axis is labelled "k".
fig.update_layout(
    title="Results comparison: GraphFrames vs RDD",
    xaxis_title="k",
    yaxis_title="time (sec)",
)

fig.show()

## Legacy plots

In [None]:
## Optimization 1 vs optimization 2 for GraphFrames and RDD on the large-k
## runs (experiments_k_large_df), one point per dataset.

fig = go.Figure()

# (algorithm value in the frame, legend name, line style), in drawing order.
# GraphFrames and RDD each keep one colour; dashed lines mark optimization 2.
line_specs = [
    ("graphframe_fast", "Graphframes, optimization 1", dict(color='#00CC96')),
    ("graphframe_fast_bv", "Graphframes, optimization 2", dict(color='#00CC96', dash='dash')),
    ("rdd_fast", "RDD, optimization 1", dict(color='royalblue')),
    ("rdd_fast_bv", "RDD, optimization 2", dict(color='royalblue', dash='dash')),
]

for algorithm, legend_name, line_style in line_specs:
    runs = experiments_k_large_df[experiments_k_large_df["algorithm"] == algorithm]
    fig.add_trace(go.Scatter(
        x=runs["dataset"].values,
        y=runs["time (sec)"].values,
        name=legend_name,
        line=line_style,
    ))

fig.update_layout(
    title="Compare Optimization 1 - Optimization 2 (Broadcast variable) | k > total number of triangles",
    xaxis_title="distribution",
    yaxis_title="time (secs.)",
)

fig.show()

In [None]:
# 
# graphframe_fast vs graphframe_fast_bv for cores=8, one facet per dataset.

filtered_dataset = experiments_df[(experiments_df["cores"]==8) &
                                  (experiments_df["algorithm"].isin(["graphframe_fast", "graphframe_fast_bv"]))]

# The timing column is named "time (sec)" once the "Prepare Files" cells have
# run, so that is the name passed as y.
fig = px.line(filtered_dataset,
                 x="k",
                 y="time (sec)",
                 log_x=True,
                 log_y=True,
                 color="algorithm",
                 facet_col="dataset")

fig.update_layout(
    title="Test",
)

# Draw a marker on every measured point in addition to the connecting line.
for i in range(len(fig.data)):
    fig.data[i].update(mode='markers+lines')

fig.show()

In [None]:
# fig.for_each_annotation(lambda a: a.update(text="time (sec.)"))

In [None]:
# 1. Compare graphframes implementations (Single distribution: uniform),
# one facet per core count.
# Uses the column names and values produced by the "Prepare Files" cells:
# "algorithm" (script name without ".py") and "time (sec)".
filtered_dataset = experiments_df[(experiments_df["dataset"]=="artists_uniform") &
                                  (experiments_df["algorithm"].isin(["graphframe_bs",
                                                                     "graphframe_fast",
                                                                     "graphframe_fast_bv"]))]

fig = px.line(filtered_dataset,
                 x="k",
                 y="time (sec)",
                 log_x=True,
                 log_y=True,
                 color="algorithm",
                 facet_col="cores")

fig.update_layout(
    title="Results of graphframes implementations - Uniform edge-probability distribution",
    yaxis_title="time (sec.)",
)


# Draw a marker on every measured point in addition to the connecting line.
for i in range(len(fig.data)):
    fig.data[i].update(mode='markers+lines')

fig.show()

In [None]:
# Compare graphframe_fast, graphframe_fast_bv, rdd_fast, rdd_fast_bv implementations
# for cores=8, one facet per dataset.
# Uses the column names and values produced by the "Prepare Files" cells:
# "algorithm" (script name without ".py") and "time (sec)".

filtered_dataset = experiments_df[(experiments_df["cores"]==8) &
                                  (experiments_df["algorithm"].isin(["graphframe_fast",
                                                                     "graphframe_fast_bv",
                                                                     "rdd_fast",
                                                                     "rdd_fast_bv"]))]

fig = px.line(filtered_dataset,
                 x="k",
                 y="time (sec)",
                 log_x=True,
                 log_y=True,
                 color="algorithm",
                 facet_col="dataset")

fig.update_layout(
    title="graphframe_fast vs rdd_vs implementations for 8 cores",
)

# Draw a marker on every measured point in addition to the connecting line.
for i in range(len(fig.data)):
    fig.data[i].update(mode='markers+lines')

fig.show()

In [None]:
# 2. Compare rdd implementations on the uniform dataset, one facet per core count.
# Uses the column names and values produced by the "Prepare Files" cells:
# "algorithm" (script name without ".py") and "time (sec)".
filtered_dataset = experiments_df[(experiments_df["dataset"]=="artists_uniform") &
                                  (experiments_df["algorithm"].isin(["rdd_bs",
                                                                     "rdd_fast",
                                                                     "rdd_fast_bv"]))]

# `line_dash` takes the column that selects the dash style;
# `line_dash_map` is a different argument that expects a dict of overrides.
fig = px.line(filtered_dataset,
                 x="k",
                 y="time (sec)",
                 log_x=True,
                 log_y=True,
                 color="algorithm",
                 facet_col="cores",
                 line_dash="dataset")

fig.update_layout(
    title="RDD implementations for uniform edge distribution",
)

# Draw a marker on every measured point in addition to the connecting line.
for i in range(len(fig.data)):
    fig.data[i].update(mode='markers+lines')

fig.show()

In [None]:
# 2. Compare distributions for graphframes implementations (cores=8),
# one facet per dataset.
# Uses the column names and values produced by the "Prepare Files" cells:
# "algorithm" (script name without ".py") and "time (sec)".
filtered_dataset = experiments_df[(experiments_df["cores"]==8) &
                                  (experiments_df["algorithm"].isin(["graphframe_bs",
                                                                     "graphframe_fast",
                                                                     "graphframe_fast_bv"]))]

fig = px.line(filtered_dataset,
                 x="k",
                 y="time (sec)",
                 log_x=True,
                 log_y=True,
                 color="algorithm",
                 facet_col="dataset")

fig.update_layout(
    title="Compare distributions for graphframes implementations and cores=8",
)

# Draw a marker on every measured point in addition to the connecting line.
for i in range(len(fig.data)):
    fig.data[i].update(mode='markers+lines')

fig.show()

In [None]:
# 2. Compare distributions for rdd implementations (cores=8),
# one facet per dataset.
# Uses the column names and values produced by the "Prepare Files" cells:
# "algorithm" (script name without ".py") and "time (sec)".
filtered_dataset = experiments_df[(experiments_df["cores"]==8) &
                                  (experiments_df["algorithm"].isin(["rdd_bs",
                                                                     "rdd_fast",
                                                                     "rdd_fast_bv"]))]

fig = px.line(filtered_dataset,
                 x="k",
                 y="time (sec)",
                 log_x=True,
                 log_y=True,
                 color="algorithm",
                 facet_col="dataset")

# The title describes what is plotted here: RDD runs on 8 cores, faceted by
# dataset.
fig.update_layout(
    title="Compare distributions for rdd implementations and cores=8",
)

# Draw a marker on every measured point in addition to the connecting line.
for i in range(len(fig.data)):
    fig.data[i].update(mode='markers+lines')

fig.show()