# From Annotations to Features

This Python notebook describes how the following five input files are processed:

- `users_infected_diffusion.graphml` with the users, their attributes, and the diffusion;

- `tweets.csv` with the tweets and their respective users;

- `users_to_annotate.csv` a csv file with the 5071 users to be annotated.

- `annotated.csv` a csv file with the results of the annotation.

- `created_at.csv` a csv file with the creation date for the annotated users. This was collected after the main data collection, due to a bug in the data collection script (which has been fixed).

Into the following files:

- `users_all_neigh.csv` a csv file with the features extracted for the $100000$ users.

- `users_all_neigh_anon.csv` an anonymous version of the previous file.

- A set of files to be used by GraphSage:

    - `sw-G.json` -- A networkx-specified json file describing the input graph. Nodes have 'val' and 'test' attributes specifying if they are a part of the validation and test sets, respectively.
    - `sw-id_map.json` -- A json-stored dictionary mapping the graph node ids to consecutive integers.
    - `sw-class_map.json` -- A json-stored dictionary mapping the graph node ids to classes.
    - `sw-feats.npy` --- A numpy-stored array of node features; ordering given by id_map.json. Can be omitted and only identity features will be used.
    
We begin by extracting the median and average time between tweets for each user using the `tweets.csv` file:

`tweets.csv` $\rightarrow$ `time_diff.csv`

In [None]:
import pandas as pd

# Load every tweet and order the rows per user by creation time, so that
# consecutive rows of one user are consecutive tweets.
tweets = pd.read_csv("../data/tweets.csv")
tweets = tweets.sort_values(by=["user_id", "tweet_creation"], ascending=True)

# Gap between each tweet and the previous tweet of the same user
# (the first tweet of every user has no predecessor, hence no gap).
tweets["time_diff"] = tweets.groupby("user_id", sort=False)["tweet_creation"].diff()

# Aggregate the gaps per user: mean and median.
gaps_by_user = tweets.groupby("user_id", sort=False)["time_diff"]
time_diff_series_mean = gaps_by_user.mean()
time_diff_series_median = gaps_by_user.median()

# One row per user: column "time_diff" holds the mean,
# column "time_diff_median" holds the median.
time_diff = time_diff_series_mean.to_frame()
time_diff["time_diff_median"] = time_diff_series_median
time_diff.to_csv("../data/time_diff.csv")

We then combine this time difference with the diffusion graph and the annotations, and calculate centrality measures for the graph: betweenness, eigenvector, in-degree and out-degree.

We also set a flag for the **neighbors** of the users who are hateful or normal.

`time_diff.csv` `users_infected_diffusion.graphml` `annotated.csv` $\rightarrow$ `users_hate.graphml`

In [None]:
import networkx as nx
import time
import csv

# Read annotated users: map user_id -> 1 (hateful) or 0 (normal).
# Rows whose "hate" field is neither "1" nor "0" are left out of the map.

set_users = dict()

# `with` guarantees the file is closed even if a row is malformed;
# newline="" is what the csv module expects for files it parses.
with open("../data/annotated.csv", "r", newline="") as f:
    for line in csv.DictReader(f):
        if line["hate"] == "1":
            set_users[line["user_id"]] = 1
        elif line["hate"] == "0":
            set_users[line["user_id"]] = 0

# Read intervals between tweets. The values are kept exactly as they appear
# in the CSV (strings, no numeric conversion).

users_interval_median = dict()
users_interval_average = dict()

with open("../data/time_diff.csv", "r", newline="") as f:
    for line in csv.DictReader(f):
        users_interval_median[line["user_id"]] = line["time_diff_median"]
        users_interval_average[line["user_id"]] = line["time_diff"]

# Set hate attributes: every node starts at -1 (not annotated), then the
# annotated users are overwritten with their 0/1 label.

nx_graph = nx.read_graphml("../data/users_infected_diffusion.graphml")
nx.set_node_attributes(nx_graph, name="hate", values=-1)
nx.set_node_attributes(nx_graph, name="hate", values=set_users)

# Set hateful and normal neighbors attribute

nodes = nx_graph.nodes(data='hate')

hateful_neighbors = dict()
normal_neighbors = dict()

# NOTE(review): on a directed graph `neighbors()` yields successors only --
# confirm that flagging only that direction is intended.
for node_id, label in nodes:
    if label == 1:  # hateful node
        for neighbor in nx_graph.neighbors(node_id):
            hateful_neighbors[neighbor] = True
    elif label == 0:  # normal node
        for neighbor in nx_graph.neighbors(node_id):
            normal_neighbors[neighbor] = True

# Default both flags to False everywhere, then mark the collected neighbors.
nx.set_node_attributes(nx_graph, name="hateful_neighbors", values=False)
nx.set_node_attributes(nx_graph, name="hateful_neighbors", values=hateful_neighbors)
nx.set_node_attributes(nx_graph, name="normal_neighbors", values=False)
nx.set_node_attributes(nx_graph, name="normal_neighbors", values=normal_neighbors)

# Set median and average interval attributes (only for users present in
# time_diff.csv; other nodes get no such attribute).

nx.set_node_attributes(nx_graph, name="median_interval", values=users_interval_median)
nx.set_node_attributes(nx_graph, name="average_interval", values=users_interval_average)

# Set node network-based attributes, such as betweenness and eigenvector
vt = time.time()
# Betweenness is computed with k=16258 rather than over all nodes.
# NOTE(review): no seed is passed, so the sampled estimate may differ
# between runs -- confirm whether reproducibility matters here.
betweenness = nx.betweenness_centrality(nx_graph, k=16258, normalized=False)
eigenvector = nx.eigenvector_centrality(nx_graph)
in_degree = nx.in_degree_centrality(nx_graph)
out_degree = nx.out_degree_centrality(nx_graph)

nx.set_node_attributes(nx_graph, name="betweenness", values=betweenness)
nx.set_node_attributes(nx_graph, name="eigenvector", values=eigenvector)
nx.set_node_attributes(nx_graph, name="in_degree", values=in_degree)
nx.set_node_attributes(nx_graph, name="out_degree", values=out_degree)

nx.write_graphml(nx_graph, "../data/users_hate.graphml")


We then create a csv file with users and these attributes:

    user_id            - unique identifier of a user 
    hate               - hateful|normal|other
    hate_neigh         - True|False
    normal_neigh       - True|False
    statuses_count     - number of statuses
    followers_count    - number of followers
    followees_count    - number of followees
    favorites_count    - number of favorites
    listed_count       - number of listed
    median_int         - median interval between tweets
    average_int        - average interval between tweets
    betweenness        - centrality measure
    eigenvector        - centrality measure
    in_degree          - centrality measure
    out_degree         - centrality measure

In [None]:
import networkx as nx
import pandas as pd

# Reload the annotated graph and pull each node attribute out as its own
# {node_id: value} mapping.
nx_graph = nx.read_graphml("../data/users_hate.graphml")

# Annotation-derived attributes
hate = nx.get_node_attributes(nx_graph, "hate")
hate_n = nx.get_node_attributes(nx_graph, "hateful_neighbors")
normal_n = nx.get_node_attributes(nx_graph, "normal_neighbors")

# Centrality measures
betweenness = nx.get_node_attributes(nx_graph, "betweenness")
eigenvector = nx.get_node_attributes(nx_graph, "eigenvector")
in_degree = nx.get_node_attributes(nx_graph, "in_degree")
out_degree = nx.get_node_attributes(nx_graph, "out_degree")

# Account counters
statuses_count = nx.get_node_attributes(nx_graph, "statuses_count")
followers_count = nx.get_node_attributes(nx_graph, "followers_count")
followees_count = nx.get_node_attributes(nx_graph, "followees_count")
favorites_count = nx.get_node_attributes(nx_graph, "favorites_count")
listed_count = nx.get_node_attributes(nx_graph, "listed_count")

# Tweet-interval statistics (absent for some nodes)
median_interval = nx.get_node_attributes(nx_graph, "median_interval")
average_interval = nx.get_node_attributes(nx_graph, "average_interval")

# Textual label for each value of the "hate" attribute; any other value
# is reported as "other".
hate_labels = {1: "hateful", 0: "normal"}

users = []

for user_id, hate_flag in hate.items():
    hateful = hate_labels.get(hate_flag, "other")

    # Interval attributes fall back to None when the node has none.
    median_int = median_interval.get(user_id)
    average_int = average_interval.get(user_id)

    general_part = (user_id, hateful, hate_n[user_id], normal_n[user_id])
    numeric_part = (
        statuses_count[user_id],
        followers_count[user_id],
        followees_count[user_id],
        favorites_count[user_id],
        listed_count[user_id],
        median_int,
        average_int,
    )
    network_part = (
        betweenness[user_id],
        eigenvector[user_id],
        in_degree[user_id],
        out_degree[user_id],
    )

    users.append(general_part + numeric_part + network_part)

# Column order mirrors the order of the tuple fields built above.
columns = [
    "user_id", "hate", "hate_neigh", "normal_neigh",
    "statuses_count", "followers_count", "followees_count",
    "favorites_count", "listed_count", "median_int", "average_int",
    "betweenness", "eigenvector", "in_degree", "out_degree",
]

df = pd.DataFrame.from_records(users, columns=columns)
df.to_csv("../data/users_attributes.csv", index=False)
