In [55]:
import dask.dataframe as dd
from dask.distributed import Client, progress
import glob
from ipywidgets import interact

## Spin up Dask Cluster

In [6]:
client = Client(n_workers=2, threads_per_worker=2, memory_limit='1GB')
client

Port 8787 is already in use. 
Perhaps you already have a cluster running?
Hosting the diagnostics dashboard on a random port instead.


0,1
Client  Scheduler: tcp://127.0.0.1:34263  Dashboard: http://127.0.0.1:33185/status,Cluster  Workers: 2  Cores: 4  Memory: 2.00 GB


## Analyse Data

In [65]:
def analyse_dataset(x):
    df = dd.read_csv(x, names=["user", "tweet", "edge"])
    
    print("Basic Stats")
    print("Number of users", df["user"].nunique().compute())
    print("Number of tweets", df["tweet"].nunique().compute())
    print("Number of edges", df["edge"].count().compute())

    print("\nUser Degree Centrality")
    tweets_per_user = df.groupby(["user"])["tweet"].count()
    print("Average tweet interactions per user", tweets_per_user.mean().compute())
    print("Max tweet interactions per user", tweets_per_user.max().compute())
    print("Min tweet interactions per user", tweets_per_user.min().compute())
    print("50th Percentile user interactions per tweet", tweets_per_user.quantile(0.5).compute())
    print("99th Percentile user interactions per tweet", tweets_per_user.quantile(0.9).compute())

    print("\nTweet Degree Centrality")
    users_per_tweet = df.groupby(["tweet"])["user"].count()
    print("Average user interactions per tweet", users_per_tweet.mean().compute())
    print("Min user interactions per tweet", users_per_tweet.min().compute())
    print("Max user interactions per tweet", users_per_tweet.max().compute())
    print("50th Percentile user interactions per tweet", users_per_tweet.quantile(0.5).compute())
    print("99th Percentile user interactions per tweet", users_per_tweet.quantile(0.9).compute())

In [66]:
datasets = glob.glob("data/*.csv")

interact(analyse_dataset, x=datasets)

interactive(children=(Dropdown(description='x', options=('data/tweets-v2.csv', 'data/tweets-v1.csv'), value='d…

<function __main__.analyse_dataset(x)>