# From Annotations to Features

This Python notebook describes the process of transforming:

- `users_hate.graphml` with the users, their attributes, and the diffusion;

- `tweets.csv` with the tweets and their respective users;

- `users_to_annotate.csv` a csv file with the 5071 users to be annotated.

- `annotated_full.csv` a csv file with the results of the annotation.

- `created_at.csv` a csv file with the creation date for the annotated users. This was collected after the main data collection, due to a bug in the data collection script (which has been fixed).

- `deleted_account.csv` a csv file with whether the accounts have been deleted after 3 months.

Into the following files:

- `users_anon.graphml` graph with all user attributes anonymized.

- `users_all_neighborhood_anon.csv` an anonymous version of the previous file.

- A set of files to be used by GraphSage:

    - `sw-G.json` -- A networkx-specified json file describing the input graph. Nodes have 'val' and 'test' attributes specifying if they are a part of the validation and test sets, respectively.
    - `sw-id_map.json` -- A json-stored dictionary mapping the graph node ids to consecutive integers.
    - `sw-class_map.json` -- A json-stored dictionary mapping the graph node ids to classes.
    - `sw-feats.npy` --- A numpy-stored array of node features; ordering given by id_map.json. Can be omitted and only identity features will be used.
    
## User Attributes
    
We begin by using `annotated_full.csv` and `users_infected_diffusion.graphml` to create a new graph which has annotations, and also centrality measures (why not do the two things at the same time, right?)!

`annotated_full.csv` `users_infected_diffusion.graphml` $\rightarrow$ `users_hate.graphml`

`1_average_intervals.py`

In [None]:
import networkx as nx
import time
import csv

# Read annotated users.
# Builds a mapping user_id -> 1 (annotated hateful) or 0 (annotated normal).
# Rows whose "hate" field is neither "1" nor "0" are left out of the mapping.

set_users = dict()
# The context manager guarantees the file is closed even if parsing fails.
with open("../data/annotated_full.csv", "r", newline="") as f:
    csv_reader = csv.DictReader(f)
    for line in csv_reader:
        if line["hate"] == "1":
            set_users[line["user_id"]] = 1
        elif line["hate"] == "0":
            set_users[line["user_id"]] = 0


# Set hate attributes.
# Every node starts at -1 (not annotated) and is then overwritten with the
# annotation for the users present in `set_users`.

nx_graph = nx.read_graphml("../data/users_infected_diffusion.graphml")

nx.set_node_attributes(nx_graph, name="hate", values=-1)
nx.set_node_attributes(nx_graph, name="hate", values=set_users)

# Set hateful and normal neighbors attributes.
# A node is flagged when it appears among the neighbors of at least one
# hateful (resp. normal) annotated node.

hateful_neighbors = dict()
normal_neighbors = dict()

for node, label in nx_graph.nodes(data="hate"):
    if label == 1:  # hateful node
        for j in nx_graph.neighbors(node):
            hateful_neighbors[j] = True
    elif label == 0:  # normal node
        for j in nx_graph.neighbors(node):
            normal_neighbors[j] = True

# Default every node to False first, then flip the flagged ones to True.
nx.set_node_attributes(nx_graph, name="hateful_neighbors", values=False)
nx.set_node_attributes(nx_graph, name="hateful_neighbors", values=hateful_neighbors)
nx.set_node_attributes(nx_graph, name="normal_neighbors", values=False)
nx.set_node_attributes(nx_graph, name="normal_neighbors", values=normal_neighbors)


# Set node network-based attributes, such as betweenness and eigenvector.

# Betweenness is approximated from a sample of pivot nodes. The sample size
# is capped at the number of nodes because sampling more pivots than there
# are nodes raises an error.
betweenness_samples = min(16258, nx_graph.number_of_nodes())
betweenness = nx.betweenness_centrality(nx_graph, k=betweenness_samples, normalized=False)
eigenvector = nx.eigenvector_centrality(nx_graph)
in_degree = nx.in_degree_centrality(nx_graph)
out_degree = nx.out_degree_centrality(nx_graph)

nx.set_node_attributes(nx_graph, name="betweenness", values=betweenness)
nx.set_node_attributes(nx_graph, name="eigenvector", values=eigenvector)
nx.set_node_attributes(nx_graph, name="in_degree", values=in_degree)
nx.set_node_attributes(nx_graph, name="out_degree", values=out_degree)

nx.write_graphml(nx_graph, "../data/users_hate.graphml")

We then create a csv file with users and these attributes:

    user_id            - unique identifier of a user 
    hate               - hateful|normal|other
    hate_neigh         - True|False
    normal_neigh       - True|False
    statuses_count     - number of statuses
    followers_count    - number of followers
    followees_count    - number of followees
    favorites_count    - number of favorites
    listed_count       - number of listed
    median_int         - median interval between tweets
    average_int        - average interval between tweets
    betweenness        - centrality measure
    eigenvector        - centrality measure
    in_degree          - centrality measure
    out_degree         - centrality measure
    
`users_hate.graphml` $\rightarrow$ `users_attributes.csv`

`2_get_user_attributes.py`

In [None]:
import networkx as nx
import pandas as pd

nx_graph = nx.read_graphml("../data/users_hate.graphml")

# Annotation and neighborhood flags.
hate = nx.get_node_attributes(nx_graph, "hate")
hate_n = nx.get_node_attributes(nx_graph, "hateful_neighbors")
normal_n = nx.get_node_attributes(nx_graph, "normal_neighbors")

# Numeric account attributes stored on the nodes.
statuses_count = nx.get_node_attributes(nx_graph, "statuses_count")
followers_count = nx.get_node_attributes(nx_graph, "followers_count")
followees_count = nx.get_node_attributes(nx_graph, "followees_count")
favorites_count = nx.get_node_attributes(nx_graph, "favorites_count")
listed_count = nx.get_node_attributes(nx_graph, "listed_count")

# Network (centrality) attributes.
betweenness = nx.get_node_attributes(nx_graph, "betweenness")
eigenvector = nx.get_node_attributes(nx_graph, "eigenvector")
in_degree = nx.get_node_attributes(nx_graph, "in_degree")
out_degree = nx.get_node_attributes(nx_graph, "out_degree")

# Numeric annotation code -> label; any other code is reported as "other".
label_by_code = {1: "hateful", 0: "normal"}

users = []

for user_id in hate:
    hateful = label_by_code.get(hate[user_id], "other")

    # One record per user, in the same order as `columns` below.
    record = (
        user_id,
        hateful,
        hate_n[user_id],
        normal_n[user_id],
        statuses_count[user_id],
        followers_count[user_id],
        followees_count[user_id],
        favorites_count[user_id],
        listed_count[user_id],
        betweenness[user_id],
        eigenvector[user_id],
        in_degree[user_id],
        out_degree[user_id],
    )
    users.append(record)

columns = [
    "user_id", "hate", "hate_neigh", "normal_neigh",
    "statuses_count", "followers_count", "followees_count",
    "favorites_count", "listed_count",
    "betweenness", "eigenvector", "in_degree", "out_degree",
]

df = pd.DataFrame.from_records(users, columns=columns)

df.to_csv("../data/users_attributes.csv", index=False)

---

## Content-based attributes

We now start dealing with the content-based attributes, which are calculated by the next two scripts. We begin by calculating "simple" attributes based on content, extracting the following from the tweets of each user:

    number hashtags          - number of hashtags used by the user
    hashtags                 - string with the hashtags
    tweet number             - number of tweets (original)
    retweet number           - number of retweets
    quote number             - number of quotes
    status length            - length of the tweets (average)
    number urls              - number of urls in the tweets
    baddies                  - number of bad words
    mentions                 - number of mentions
    
`tweets.csv` `bad_words.txt` $\rightarrow$ `./tmp2/users_content_*`

`3_content_attributes1.py`

In [None]:
%%bash
# Run the first content-attributes script with python3 (path is relative to
# the notebook's working directory).
python3 ./5_content_attributes1.py

#### Then we proceed to extract more complicated features from text; more specifically, we create vector representations based on Empath and on GloVe. We also perform sentiment analysis using VADER.

    *_empath hashtags        - 100+ empath categories
    *_glove                  - 300 dim glove vector
    sentiment                - sentiment score
    subjectivity             - subjectivity score


`tweets.csv` $\rightarrow$ `./tmp/users_content_*`

`4_content_attributes2.py`

In [None]:
%%bash
# Run the second content-attributes script with python3 (path is relative to
# the notebook's working directory).
python3 ./6_content_attributes2.py

We use a bash script to put together these files (which were separated in the first place so we could parallelize the process):

`tmp/*.csv` $\rightarrow$ `users_content.csv`

`tmp2/*.csv` $\rightarrow$ `users_content2.csv`

`7_concat_files.sh`

In [None]:
%%bash
# Concatenate every CSV file of a directory into a single output file.
#   $1 - path of the output file
#   $2 - directory holding the partial *.csv files
# The header (first line) is taken from the first file only; from every file
# all lines starting at the 2nd one are appended.
concat_csv() {
    local out_file="$1"
    local src_dir="$2"
    local i=0                                      # Number of files merged so far
    local filename

    for filename in "$src_dir"/*.csv; do
        # An unmatched glob is left as the literal pattern; skip it.
        [ -e "$filename" ] || continue

        if [ "$filename" != "$out_file" ]; then    # Avoid recursion
            if [[ $i -eq 0 ]]; then
                head -n 1 "$filename" > "$out_file"    # Copy header if it is the first file
            fi
            tail -n +2 "$filename" >> "$out_file"      # Append from the 2nd line each file
            i=$(( i + 1 ))                             # Increase the counter
        fi
    done

    if [[ $i -eq 0 ]]; then
        echo "warning: no CSV files found in $src_dir" >&2
    fi
}

concat_csv "../data/users_content.csv" "../data/tmp"
concat_csv "../data/users_content2.csv" "../data/tmp2"

## Getting it all together

Finally, this last script gets the mean and median interval between tweets, and concatenates pretty much all the other files we created previously. Additionally, it averages out a bunch of values over each user's 1-neighborhood and creates a csv with them as additional features.

`./tmp/users_content_*` `deleted_account.csv` `users_attributes` $\rightarrow$ `users_all.csv` `users_all_neighborhood.csv`

In [None]:
from LikeSheepsAmongWolves.tmp.utils import cols_attr, cols_glove, cols_empath
import networkx as nx
import pandas as pd
import numpy as np

# Gets mean and median between tweets.
# Tweets are ordered per user by creation time, so `diff()` within each user
# group yields the interval between consecutive tweets.
tweets = pd.read_csv("../data/tweets.csv")
tweets.sort_values(by=["user_id", "tweet_creation"], ascending=True, inplace=True)
tweets["time_diff"] = tweets.groupby("user_id", sort=False).tweet_creation.diff()
time_diff_series_mean = tweets.groupby("user_id", sort=False).time_diff.mean()
time_diff_series_median = tweets.groupby("user_id", sort=False).time_diff.median()
time_diff = time_diff_series_mean.to_frame()
time_diff["time_diff_median"] = time_diff_series_median
time_diff.to_csv("../data/time_diff.csv")

# Load every per-user table produced so far.
users_attributes = pd.read_csv("../data/users_attributes.csv")
users_content = pd.read_csv("../data/users_content.csv")
users_content2 = pd.read_csv("../data/users_content2.csv")
users_deleted = pd.read_csv("../data/deleted_account_before_guideline.csv")
users_deleted_after_guideline = pd.read_csv("../data/deleted_account_after_guideline.csv")
users_time = pd.read_csv("../data/time_diff.csv")
users_date = pd.read_csv("../data/created_at.csv")

# Left-join everything onto the attribute table so that every user of
# users_attributes.csv is kept, with or without a match in the other files.
df = users_attributes
for right in (users_content, users_content2, users_deleted,
              users_deleted_after_guideline, users_time, users_date):
    df = pd.merge(left=df, right=right, on="user_id", how="left")

df.to_csv("../data/users_all.csv", index=False)

# Checkpoint: uncomment to resume from the file written above.
# df = pd.read_csv("../data/users_all.csv")

# verify_integrity makes this fail loudly if a user_id appears twice.
df1 = df.set_index("user_id", verify_integrity=True)

cols = cols_attr + cols_glove + cols_empath
num_cols = len(cols)
features = df1[cols]
graph = nx.read_graphml("../data/users_hate.graphml")
users = list()
for user_id in graph.nodes():
    # Node ids read from GraphML are strings, the table is indexed by int.
    node_id = int(user_id)
    # Membership on the pandas Index is a hash lookup; testing against
    # `.index.values` would scan the whole array for every node.
    if node_id not in df1.index:
        continue

    neighbor_ids = [int(neighbor) for neighbor in graph.neighbors(user_id)
                    if int(neighbor) in df1.index]

    if neighbor_ids:
        averaged = np.average(np.asarray(features.loc[neighbor_ids].values, dtype=float), axis=0)
    else:
        # No neighbor with known features: keep the user, with missing values,
        # instead of failing on the average of an empty array.
        averaged = np.full(num_cols, np.nan)

    # Store the id as int so it has the same dtype as df["user_id"]; otherwise
    # the merge below cannot match the two key columns.
    users.append([node_id] + list(averaged))


df2 = pd.DataFrame.from_records(users, columns=["user_id"] + ["c_" + v for v in cols])
df2.to_csv("../data/users_neighborhood.csv", index=False)

# Checkpoint: uncomment to resume from the files written above.
# df = pd.read_csv("../data/users_all.csv")
# df2 = pd.read_csv("../data/users_neighborhood.csv")

df3 = pd.merge(left=df, right=df2, on="user_id", how="left")
df3.to_csv("../data/users_all_neighborhood.csv", index=False)


## Getting final (anonymous) versions!

We begin by getting a clean graph, just with the structure!

`users_hate.graphml` $\rightarrow$ `users_clean.graphml`

In [None]:
from LikeSheepsAmongWolves.tmp.utils import cols_attr, cols_glove, cols_empath, graph_attributes
import networkx as nx
import pandas as pd
import numpy as np
import gc

# = = = = = = = = = = = = = = = = = = = = = = = = = = = = cleans graph = = = = = = = = = = = = = = = = = = = = = = = = =
# Strips every attribute listed in `graph_attributes` from every node and
# writes the resulting graph to users_clean.graphml.
graph = nx.read_graphml("../data/users_hate.graphml")

for user_id in graph.nodes():

    # `graph.nodes[...]` replaces the `graph.node[...]` accessor, which was
    # removed from NetworkX.
    node_attrs = graph.nodes[user_id]

    for att in graph_attributes:
        # pop with a default: attributes missing on this node are ignored.
        node_attrs.pop(att, None)

nx.write_graphml(graph, "../data/users_clean.graphml")

# Drop the reference to the graph now that it has been written out.
del graph

Then we save all the attributes of each user in the corresponding node of the graph!

`users_clean.graphml` `users_all.csv` $\rightarrow$ `users_all.graphml`

In [None]:
# Only the column names are needed here; the frame is dropped right away and
# the file is re-read one column at a time below.
df = pd.read_csv("../data/users_all.csv", index_col=0)

cols = df.columns.values

del df
gc.collect()

graph = nx.read_graphml("../data/users_clean.graphml")

for col in cols:

    # The "hashtags" column is not stored on the graph.
    if col == "hashtags":
        continue

    df = pd.read_csv("../data/users_all.csv", usecols=["user_id", col])

    col_dict = dict()

    for i, v in zip(df["user_id"].values, df[col].values):
        # Convert numpy scalars (float64, int64, bool_, ...) to the matching
        # builtin Python type; other values (e.g. str) are stored unchanged.
        # `np.generic` is used rather than the `np.bool` alias, which is not
        # available in every NumPy release.
        # NOTE(review): presumably needed because write_graphml only accepts
        # builtin types - confirm.
        if isinstance(v, np.generic):
            v = v.item()

        # Node ids in the graph are strings, so the key is stringified.
        col_dict[str(i)] = v

    nx.set_node_attributes(graph, values=col_dict, name=col)

nx.write_graphml(graph, "../data/users_all.graphml")

Now we anonymize `users_all.csv` and `users_all_neighborhood.csv`;

`users_all.csv` `users_all_neighborhood.csv` $\rightarrow$ `users_anon.csv` `users_neighborhood_anon.csv`

In [None]:
# Anonymize users_all.csv: the real user ids (the index) are replaced by
# consecutive integers 0..n-1, still under the column name "user_id".
df = pd.read_csv("../data/users_all.csv", index_col=0)

# Original ids, kept so the old -> new correspondence stays available.
old_index = df.index

df.index = pd.Index(np.arange(len(df)), name="user_id")

new_index = df.index

df.to_csv("../data/users_anon.csv")

# Same renumbering for the neighborhood file.
df = pd.read_csv("../data/users_all_neighborhood.csv", index_col=0)

df.index = pd.Index(np.arange(len(df)), name="user_id")

df.to_csv("../data/users_neighborhood_anon.csv")