In [18]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm.auto import tqdm
from matplotlib.path import Path
import ast
import pickle
import matplotlib.pyplot as plt
from networkx.algorithms import bipartite
import networkx as nx
from collections import Counter

In [19]:
def find_points(art, canvas_size=2000):
    '''
    Return every canvas pixel that lies inside the polygon of an artwork.

    Parameters
    ----------
    art : sequence of (x, y) vertices
        Polygon outline of the artwork, handed to matplotlib's ``Path``.
    canvas_size : int, optional
        Side length of the square canvas that is scanned (default 2000).

    Returns
    -------
    list of (x, y) tuples
        Integer pixel coordinates inside the polygon, in the SAME (x, y)
        order as the vertices of ``art``.
    '''
    # Enumerate every pixel of the canvas as an (x, y) row.
    xs, ys = np.meshgrid(np.arange(canvas_size), np.arange(canvas_size))
    points = np.column_stack((xs.ravel(), ys.ravel()))

    # Boolean flag per pixel: True when the pixel falls inside the polygon.
    inside = Path(art).contains_points(points, radius=0)

    # Select the matching (x, y) rows directly. The earlier version reshaped
    # the flags into a 2-D mask and unpacked `mask.nonzero()` as (x, y); that
    # mask is indexed [row=y, col=x], so the returned pairs were transposed.
    return [(px, py) for px, py in points[inside].tolist()]

## Which artworks should we consider?
We have at least two possible choices:
1) Choose the artworks ourselves. In this example I first consider the 5 largest artworks on the canvas, or the artworks whose name contains certain strings (Flag of Ukraine, Flag of France, etc.)
2) Pick n artworks at random (with at least some constraint on area, for uniformity)

In [16]:
# Load the artwork table and preview its first five rows.
# NOTE(review): this shows the 5 largest artworks only if the CSV is already
# sorted by descending area, as the file name suggests - confirm.
arts = pd.read_csv("data/artworks_ordered.csv")
arts.head(5)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,id,name,path,center,area
0,5372,5372,txrg84,Flag of France,"{'109-164, T': [[249, 1469], [0, 1469], [0, 19...","{'109-164, T': [125, 1719]}",124500.0
1,1187,1187,twpyje,Flag of Germany,"{'109-166, T': [[0, 1121], [448, 1121], [447, ...","{'109-166, T': [992, 1148]}",96223.0
2,258,258,000354,The Nordic Union,"{'1-166, T': [[187, 93], [187, 97], [207, 97],...","{'1-166, T': [506, 129]}",89953.5
3,4866,4866,txd8wt,Central Alliance,"{'1-166, T': [[448, 890], [512, 890], [512, 90...","{'1-166, T': [447, 994]}",75325.5
4,7916,7916,ubyhtf,Netherlands,"{'56-166, T': [[297, 13], [297, 35], [886, 35]...","{'56-166, T': [1171, 19]}",54044.0


In [12]:
# Select artworks whose name matches one of the given patterns,
# then keep only those with an area strictly between 1000 and 100000.
name_pattern = 'Flag of Ukraine|Flag of France|Flag of Spain|Nordic Union'
matches_name = arts['name'].str.contains(name_pattern)
arts = arts[matches_name]

area_in_range = (arts['area'] > 1000) & (arts['area'] < 100000)
arts = arts[area_in_range]

print(len(arts))
arts


17


Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,id,name,path,center,area
0,5372,5372,txrg84,Flag of France,"{'109-164, T': [[249, 1469], [0, 1469], [0, 19...","{'109-164, T': [125, 1719]}",124500.0
2,258,258,000354,The Nordic Union,"{'1-166, T': [[187, 93], [187, 97], [207, 97],...","{'1-166, T': [506, 129]}",89953.5
10,6479,6479,tzavbw,Flag of Spain,"{'109-166, T': [[1158, 1491], [1505, 1498], [1...","{'109-166, T': [1327, 1551]}",38713.5
13,4159,4159,tx86fp,Flag of Ukraine,"{'1-165, T': [[0, 177], [27, 177], [27, 169], ...","{'1-165, T': [216, 211]}",35299.0
42,1216,1216,twpuro,Flag of Spain,"{'56-166, T': [[865, 281], [865, 308], [1489, ...","{'56-166, T': [1145, 295]}",17398.0
89,310,310,twpee3,Flag of France,"{'1-165, T': [[125, 497], [175, 497], [175, 28...","{'1-165, T': [152, 397]}",9633.5
136,692,692,twlnhq,Flag of France,"{'56-165, T': [[1129, 730], [1129, 829], [1200...","{'56-165, T': [1163, 777]}",7292.5
238,702,702,twlk86,Flag of France,"{'1-165, T': [[433, 757], [433, 829], [373, 82...","{'1-165, T': [403, 793]}",4320.0
1297,3110,3110,tx1ghf,The Nordic Union taskbar button,"{'109-166, T': [[835, 1972], [834, 2000], [866...","{'109-166, T': [850, 1986]}",882.0
4224,3581,3581,tx5nb9,The Flag of Spain,"{'109-165, T': [[1062, 1590], [1062, 1596], [1...","{'109-165, T': [1076, 1593]}",168.0


In [20]:
# Random artworks

SEED = 42     # fixed seed so the same artworks are drawn on every run
n_arts = 100  # number of artworks considered

# NOTE(review): an earlier cell reads "data/artworks_ordered.csv"; confirm
# which of the two relative paths is the intended one.
arts = pd.read_csv("artworks_ordered.csv")

# Keep artworks with an area strictly between 5000 and 100000.
arts =  arts[arts['area'] > 5000]
arts =  arts[arts['area'] < 100000]
print(len(arts))

# Draw n_arts rows without replacement. Without random_state the selection,
# and every file derived from it, would change at each execution.
arts = arts.sample(n_arts, random_state=SEED)
len(arts)

197


100

In [6]:
# Store the names of the selected artworks so that later plots and analyses
# can refer back to them.
# NOTE(review): the CSV keeps the index of `arts`, while the cell that builds
# `artworks` numbers them 0, 1, 2, ... in row order - match them by position.
name = arts.loc[:, 'name']
name.to_csv(path_or_buf='../output/name_arts.csv')

In [7]:
# Build a dict mapping a sequential artwork id (0, 1, 2, ... in row order)
# to the polygon vertices of that artwork.
artworks = {}
for record in arts.itertuples():
    # `path` is stored as the string form of a dict; parse it safely.
    path_by_period = ast.literal_eval(record.path)
    # Only the first entry of the dict is used as the artwork outline.
    first_outline = next(iter(path_by_period.values()))
    artworks[len(artworks)] = first_outline

# Number of artworks stored (same final value the explicit counter had).
id_index = len(artworks)

## Processing of the dataset

In [8]:
# Time window of interest (both bounds are exclusive in the filter below).
BEGIN = 3270   # the time when the canvas doubled up for the second time
ENDING = 4908
# Other reference timestamps, kept for information only:
#   FIRST_TIMESTAMP = 180
#   LAST_TIMESTAMP_before_WHITE = 4907
#   LAST_TIMESTAMP = 5189

# Compact unsigned dtypes for each column of the pixel log.
pixel_dtypes = {
    'timestamp': np.uint16,
    'user_id': np.uint32,
    'pixel_color': np.uint8,
    'x': np.uint16,
    'y': np.uint16,
}

df = pd.read_csv('../data/reddit_trimmed.csv', engine="c", dtype=pixel_dtypes)

# Keep only the rows strictly inside the (BEGIN, ENDING) window.
in_window = (df['timestamp'] > BEGIN) & (df['timestamp'] < ENDING)
df = df[in_window]
ltot = len(df)

In [9]:
# For every artwork id: the set of pixels it covers (dict_set) and a
# {user_id: number of pixel placements inside the artwork} counter
# (user_actions).
dict_set = {}
user_actions = {}
for key in artworks.keys():
    dict_set[key] = set(find_points(artworks[key]))
    user_actions[key] = {}

# Inverted index: pixel -> ids of the artworks that contain it. With it each
# row of the log needs a single dict lookup instead of one set-membership
# test per artwork; the resulting counts are identical.
pixel_owners = {}
for s_key, pixels in dict_set.items():
    for pixel in pixels:
        pixel_owners.setdefault(pixel, []).append(s_key)

row_counter = 0
for user, x, y in zip(df['user_id'], df['x'], df['y']):
    # Credit the action to every artwork that covers this pixel.
    for s_key in pixel_owners.get((x, y), ()):
        per_user = user_actions[s_key]
        per_user[user] = per_user.get(user, 0) + 1
    row_counter = row_counter + 1

    # Progress report (percentage of rows processed) every 5 million rows.
    if (row_counter % 5e6) == 0:
        print(f'{row_counter / ltot * 100:.2f}')

6.79
13.58
20.37
27.15
33.94
40.73
47.52
54.31
61.10
67.89
74.68
81.46
88.25
95.04


In [17]:
# Write the artwork-user network as CSV lines "artwork_id,user_id,n_actions".
MIN_TILES = 2  # minimum number of actions for a user-artwork link to be kept
counter = 0    # number of links written

# The context manager guarantees the file is closed even if a write fails.
with open('../output/network_random_arts.csv', 'w') as f:
    for s_key in dict_set.keys():
        for user_key, n_actions in user_actions[s_key].items():
            if n_actions >= MIN_TILES:
                f.write(str(s_key) + ',' + str(user_key) + ',' + str(n_actions) + '\n')
                counter += 1