# From Database to Annotation Ready


This Python notebook describes the process of transforming the graph stored in a Neo4j database into three files:

- `users_infected_diffusion.graphml` with the users, their attributes, and the diffusion;

- `tweets.csv` with the tweets and their respective users;

- `users_to_annotate.csv` a csv file with the 5071 users to be annotated.

We begin by creating a `.graphml` file with the users and all their attributes:

`database` $\rightarrow$ `users.graphml`

In [None]:
from py2neo import Graph
import networkx as nx
import json

# Directed retweet graph: one node per user, one edge a -> b when a retweeted b.
nx_graph = nx.DiGraph()

# Connection settings are read from a JSON file kept outside the notebook.
# The context manager closes the file even when the JSON cannot be parsed.
with open("../secrets/twitter_neo4jsecret.json", 'r') as f:
    config_neo4j = json.load(f)
graph = Graph(config_neo4j["host"], password=config_neo4j["password"])

# Every non-virtual user becomes a node carrying all of its stored properties.
for node in graph.data("""MATCH (a:User) WHERE a.virtual="F" RETURN a as val"""):
    n = dict(node["val"])
    nx_graph.add_node(n["id"], **n)
    # Each user also gets a self-loop.
    # NOTE(review): presumably so that a user's own value takes part in the
    # later neighbour-averaging diffusion step and no row of the adjacency
    # matrix is empty - confirm.
    nx_graph.add_edge(n["id"], n["id"])

# Retweet relations between two non-virtual users: edge a -> b means "a retweeted b".
for node in graph.data(
        """MATCH (a:User)-[:retweeted]->(b:User) WHERE a.virtual="F" AND b.virtual="F" RETURN a.id as a, b.id as b"""):
    nx_graph.add_edge(node['a'], node['b'])

nx.write_graphml(nx_graph, "../data/users.graphml")

Notice that our initial graph will look like the following:

![](../imgs/users_net1.png)

Where each one of the nodes in the graph has the following attributes:

    id                      - unique identifier of a user     
    statuses_count          - number of statuses
    lang                    - language inferred by twitter
    listed_count            - number of lists that the user is on
    number                  - number of user in data collection
    favorites_count         - number of favorites the user has
    uname                   - user name
    verified                - the user is verified or not
    default_profile         - whether the user still uses the default profile
    default_profile_image   - whether the user still uses the default profile image
    profile_image_url       - url to the user image
    screen_name             - user screen name
    geo_enabled             - whether the user's tweets are geo enabled
    followees_count         - number of people the user follows
    followers_count         - number of people following the user
    description             - textual description of the user
    location                - user's location
    time_zone               - user's timezone, unstructured
    
    
Interestingly, some characters are invalid in the graphml format (emojis, for example). Thus we have to remove the invalid characters, which we do using the following script.

`users.graphml` $\rightarrow$ `users.graphml`

In [None]:
%%bash
# Delete every character outside the ranges listed in the character class
# (these ranges are the XML 1.0 `Char` production, which GraphML inherits).
# The cleaned copy is written to a temporary file and only moved over the
# original when perl succeeds, so a failed run cannot destroy users.graphml.
perl -CSDA -pe 's/[^\x9\xA\xD\x20-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]+//g;' ../data/users.graphml > ../data/users2.graphml \
    && mv ../data/users2.graphml ../data/users.graphml

We now get the csv file with the texts created by the users and the following values:

    user_id           - unique identifier of the user
    screen_name       - screen name of the user on twitter
    tweet_id          - number with the identifier of the tweet
    tweet_text        - if status is a tweet/quote, the text written
    tweet_creation    - date in unix time when the tweet was tweeted
    tweet_fav         - number of favorites
    tweet_rt          - number of retweets
    rp_flag           - flag that indicates if the tweet is a reply
    rp_status         - id of the replied status
    rp_user           - id of the replied user
    qt_flag           - flag that indicates if the tweet is a quote
    qt_user_id        - id of the quoted user
    qt_status_id      - id of the quoted status
    qt_text           - text of the quoted status
    qt_creation       - date of creation of the quoted status
    qt_fav            - number of favorites of the quoted status
    qt_rt             - number of retweets of the quoted status
    rt_flag           - flag that indicates if a tweet is a retweet
    rt_user_id        - id of the retweeted user
    rt_status_id      - id of the retweeted status
    rt_text           - text of the retweeted status
    rt_creation       - creation date of the retweeted status
    rt_fav            - number of favorites of the retweeted status
    rt_rt             - number of retweets of the retweeted status
    
The procedure to get that is quite complicated, mostly because the delimiter used in the crawler was ';', which is also a legal character in tweets; thus we had to do some clever regex matching to get around this issue.

`database` $\rightarrow$ `tweets.csv`

In [None]:
from py2neo import Graph
import json
import csv
import re

# Patterns are compiled once and written as raw strings, so that "\." is a
# regex escape instead of an invalid Python string escape.
TWEET_ID_RE = re.compile(r"([0-9])+")
# Lazily matches everything that precedes the first ";1ddddddddd.0" (a ';'
# followed by a 10-digit number starting with 1 and ".0", i.e. the creation
# date in unix time).  This is how a free-text field, which may itself contain
# the ';' delimiter, is separated from the field that follows it.
TEXT_BEFORE_TIMESTAMP_RE = re.compile(r".*?(?=;1[0-9]{9}\.0)")


def _take_text(record):
    """Split ``record`` into ``(text, remainder)`` at the first ``;<timestamp>``.

    Returns ``None`` when ``record`` contains no such timestamp.  The ';' that
    separates the text from the timestamp is dropped from the remainder.
    """
    match = TEXT_BEFORE_TIMESTAMP_RE.match(record)
    if match is None:
        return None
    end = match.end()
    return record[:end], record[end + 1:]


def _take_fields(record, count):
    """Return the first ``count`` ';'-separated fields and the re-joined rest."""
    parts = record.split(";")
    return parts[:count], ";".join(parts[count:])


def _split_tweet_record(tweet):
    """Turn one ';'-delimited crawler record into the list of tweet columns.

    The result holds the columns that follow ``user_id`` and ``screen_name``
    in the CSV header, in order.

    Raises:
        ValueError: if the record does not start with a numeric id, or if a
            mandatory text field is not followed by a timestamp.
    """
    id_match = TWEET_ID_RE.match(tweet)
    if id_match is None:
        raise ValueError(
            "tweet record does not start with a numeric id: {0!r}".format(tweet[:80]))
    fields = [id_match.group(0)]
    rest = tweet[id_match.end() + 1:]  # +1 skips the ';' after the id

    # tweet_text, delimited by the creation timestamp that follows it.
    split = _take_text(rest)
    if split is None:
        raise ValueError(
            "no creation timestamp after the tweet text: {0!r}".format(rest[:80]))
    text, rest = split
    fields.append(text)

    # tweet_creation ... qt_status_id: nine fields that cannot contain ';'.
    head, rest = _take_fields(rest, 9)
    fields += head

    if fields[-3] == 'True':  # qt_flag: the quoted text may contain ';'
        split = _take_text(rest)
        if split is None:
            raise ValueError(
                "no creation timestamp after the quoted text: {0!r}".format(rest[:80]))
        text, rest = split
        fields.append(text)
        head, rest = _take_fields(rest, 6)
    else:
        # Not a quote: the quote text slot is taken as a plain field too.
        head, rest = _take_fields(rest, 7)
    fields += head

    # The retweeted text is only delimited by a timestamp when one is present;
    # otherwise whatever is left is split on ';' as is.
    split = _take_text(rest)
    if split is not None:
        text, rest = split
        fields.append(text)
    fields += rest.split(";")
    return fields


with open("../secrets/twitter_neo4jsecret.json", 'r') as f:
    config_neo4j = json.load(f)
graph = Graph(config_neo4j["host"], password=config_neo4j["password"])

q = """MATCH (u:User) where u.virtual="F" return count(u) as number"""
df = graph.data(q)
max_entries = df[0]["number"]

# Users are fetched in windows of at most 10000 values of u.number.  Windows
# are half-open, [lower, upper), and the last bound is pushed one past
# max_entries so that every number in 0..max_entries belongs to exactly one
# window.  (With strict comparisons on both ends the users sitting exactly on
# a window boundary were never exported.)
aux = (list(range(max_entries, 0, -10000)) + [0])[::-1]
aux[-1] += 1
ranges = zip(aux[:-1], aux[1:])

# newline="" is what the csv module expects for files it writes to.
with open("../data/tweets.csv", "w", newline="") as f:
    csv_writer = csv.writer(f)

    csv_writer.writerow(["user_id", "screen_name",
                         "tweet_id", "tweet_text", "tweet_creation", "tweet_fav", "tweet_rt",
                         "rp_flag", "rp_status", "rp_user",
                         "qt_flag", "qt_user_id", "qt_status_id", "qt_text", "qt_creation", "qt_fav", "qt_rt",
                         "rt_flag", "rt_user_id", "rt_status_id", "rt_text", "rt_creation", "rt_fav", "rt_rt"])

    for lower, upper in ranges:
        print(lower, upper)

        query = """ MATCH (u:User)-[:tweeted]->(t:Tweet)
                    WHERE u.number >= {0} AND u.number < {1}
                    RETURN u.id as id, u.screen_name as screen_name, t.content as content """.format(lower, upper)

        df = graph.data(query)

        for row in df:
            # t.content is iterated as a collection of raw ';'-delimited records.
            for tweet in row["content"]:
                csv_writer.writerow(
                    [row["id"], row["screen_name"]] + _split_tweet_record(tweet))

Our ultimate goal is to find hateful users in the network we extracted from twitter. We will then select a subsample of the users to be annotated as hateful or not hateful. However, this is not trivial, as hateful users are probably a minority on Twitter, making it undesirable to simply take a random sample (as annotating the users is expensive). Our technique for selecting the subsample thus follows these broad strokes:

1. Select a subset of users which are likely to be hateful.
2. Find users that are close to these users in the network.
3. Select the users to annotate from the users which are hateful and the ones which are close to them in the network.

This method follows the principle of homophily, which is commonly used in the Social Network Analysis literature. We now explain in some detail how we implement the steps $1$, $2$ and $3$.

## 1. Select a subset of users which are likely to be hateful.

We will mark as *infected* users that used a word in a lexicon of words highly correlated with hateful speech. Some examples are:

    goy, kike, heeb, nigga, white genocide, anti white, racial realism, holohoax, racemixing, racial treason
    
We will find the users who used such words in the texts we have just extracted, and then mark them as infected in the graph we generated! Notice that in our previous graph a directed edge ($u_1$,$u_2$) meant that user $u_1$ retweeted $u_2$. However, influence in the retweet network is interpreted the other way around: if $u_1$ retweeted $u_2$, it actually means that $u_2$ influences $u_1$. This is clearly not always the case, as users may retweet other users sarcastically, but it is a good model, commonly used in the Social Network Analysis literature.

![](../imgs/users_net2.png)

`users.graphml` `tweets.csv` `lexicon.txt` $\rightarrow$ `users_infected.graphml`

In [None]:
import networkx as nx
import csv
import re

# Build a single alternation "(term1)|(term2)|..." from the lexicon, one term
# per line.  Blank lines are skipped: an empty alternative "()" matches every
# string and would mark every user as infected.
with open("../data/lexicon.txt", "r") as lexicon_file:
    terms = [line.rstrip() for line in lexicon_file]
terms = [term for term in terms if term]
if not terms:
    # An empty pattern would also match everything, so refuse to continue.
    raise ValueError("../data/lexicon.txt contains no terms")
# Terms are inserted verbatim, so the lexicon may contain regex syntax.
regexp = re.compile("|".join("({0})".format(term) for term in terms))

# Columns in which a lexicon hit marks the authoring user as infected.
TEXT_COLUMNS = ("tweet_text", "rt_text", "qt_text")

# user_id -> True for every user with at least one matching tweet/retweet/quote.
infected_users = dict()

# newline="" is what the csv module expects for files it reads from.
with open("../data/tweets.csv", "r", newline="") as f:
    csv_reader = csv.DictReader(f)
    for line in csv_reader:
        # DictReader fills missing trailing columns with None; treat as empty text.
        if any(regexp.search(line[column] or "") for column in TEXT_COLUMNS):
            infected_users[line["user_id"]] = True


nx_graph = nx.read_graphml("../data/users.graphml")
# Flip every edge: (u1, u2) meant "u1 retweeted u2"; after reversing, the edge
# (u2, u1) reads "u2 influences u1".  copy=False returns a reversed view.
nx_graph = nx_graph.reverse(copy=False)
# Only the users found above receive the "slur" attribute; the others have none.
nx.set_node_attributes(nx_graph, name="slur", values=infected_users)
nx.write_graphml(nx_graph, "../data/users_infected.graphml")

Now we have some users who are suspected of being hateful. However, the problem is that we are limiting ourselves to the users who used one of the words in our lexicon. This will be addressed in the next step of our modelling. First, however, let us visualize the network "as is", with the infected users:

<img src="../imgs/users_diffusion.png" alt="" style="width: 550px;"/>


## 2. Find users that are close to these users in the network.

To get a more general sample, one that is not heavily biased towards our lexicon, we employ a diffusion process on the graph, considering the users that employed the lexicon as "infected". Our model is based on the DeGroot learning model, as described in Golub & Jackson (2010). Given the matrix $G$ of the retweet-induced graph:

- Let $T$ be $G$ with its rows normalized; in other words, every row of $T$ sums to $1$. In practice, this means that every user $u$ is influenced equally by all its neighbors in the graph. 
- Let $p^{(0)}$ be a vector representing the infected users: $p^{(0)}_i = 1$ if the $i$-th user was marked as infected and $p^{(0)}_i = 0$ otherwise.

Then we run the following diffusion process for $t = 1, \dots, k$:

$$
p^{(t)} = T \, p^{(t-1)}
$$

`users_infected.graphml` $\rightarrow$ `users_infected_diffusion.graphml`

In [None]:
import networkx as nx
import numpy as np

initial_belief = 1  # belief assigned to users carrying the "slur" attribute
k = 2               # number of diffusion steps

np.random.seed(1)
graph = nx.read_graphml("../data/users_infected.graphml")

# Order the nodes so that the infected users come first; this lets the initial
# belief vector be filled with a single slice assignment below.
slur_nodes = list(nx.get_node_attributes(graph, "slur"))
other_nodes = list(set(graph.nodes()).difference(set(slur_nodes)))
node_list = slur_nodes + other_nodes

# Floating-point adjacency matrix in CSR form.  astype(float) returns a copy,
# so scaling its data in place below cannot touch anything owned by networkx.
transition_matrix = nx.adjacency_matrix(graph, nodelist=node_list).astype(float).tocsr()

# Row-normalize: divide every stored entry by the sum of its row.  In CSR
# format row i owns indptr[i+1] - indptr[i] consecutive entries of .data, so
# repeating each row total that many times lines the divisors up with .data.
# This replaces a Python loop that assigned into the sparse matrix row by row.
row_totals = np.asarray(transition_matrix.sum(axis=1)).ravel()
entries_per_row = np.diff(transition_matrix.indptr)
divisors = np.repeat(row_totals, entries_per_row)
divisors[divisors == 0] = 1.0  # rows that sum to zero are left untouched
transition_matrix.data /= divisors


beliefs = np.zeros(len(node_list))
beliefs[:len(slur_nodes)] = initial_belief

# k steps of p <- T p: each user's belief becomes the average of the beliefs
# of the nodes its row points to.
for _ in range(k):
    beliefs = transition_matrix.dot(beliefs)


# Plain Python floats, keyed by node, so the values can be written to GraphML.
final_beliefs_dict = {node: float(belief) for node, belief in zip(node_list, beliefs)}

nx.set_node_attributes(graph, name="diffusion_slur", values=final_beliefs_dict)
nx.write_graphml(graph, "../data/users_infected_diffusion.graphml")

Using $k=2$, we have then assigned to each user a value in $[0,1]$ that shows how close they are to the users that used the words in the lexicon. We get something like the following:
![](../imgs/users_net3.png)

We can also visualize the real graph:

<img src="../imgs/users.png" alt="" style="width: 550px;"/>


## 3. Select the users to annotate from the users which are hateful and the ones which are close to them in the network.

Finally, we divide the users into $4$ strata according to their associated beliefs after the diffusion process, and perform a stratified sampling, obtaining up to $1500$ users per stratum. Notice that $p_{i}^{(t)} \in [0,1]$. With this real value associated with each user, we get the 4 strata by randomly selecting up to $1500$ users with $p_{i}$ in the intervals $[0,.25)$, $[.25,.50)$, $[.50,.75)$ and $[.75,1]$.

`users_infected_diffusion.graphml` $\rightarrow$ `users_to_annotate.csv`

In [None]:
import networkx as nx
import numpy as np
import random
import csv

# Fixed seeds: numpy drives the per-stratum sampling, `random` the final shuffle.
np.random.seed(1234)
random.seed(1234)

N = 6000                # target size of the whole sample
PER_STRATUM = N // 4    # at most this many users are drawn from each stratum

nx_graph = nx.read_graphml("../data/users_infected_diffusion.graphml")
diffusion_slur = nx.get_node_attributes(nx_graph, name="diffusion_slur")
screen_names = nx.get_node_attributes(nx_graph, name="screen_name")

# Bucket the users by belief into [0,.25), [.25,.5), [.5,.75) and [.75, ...).
# Keys are visited in sorted order so the sampling below is reproducible.
strata1, strata2, strata3, strata4 = [], [], [], []
for key in sorted(diffusion_slur):
    belief = diffusion_slur[key]
    if belief < .25:
        strata1.append(int(key))
    elif belief < .50:
        strata2.append(int(key))
    elif belief < .75:
        strata3.append(int(key))
    elif belief >= .75:
        strata4.append(int(key))


def _sample_stratum(stratum):
    """Draw up to PER_STRATUM distinct users from ``stratum``.

    The sample size is capped at the stratum's population for every stratum,
    so a stratum with fewer users is taken whole instead of making
    ``np.random.choice`` raise.
    """
    size = min(PER_STRATUM, len(stratum))
    if size == 0:
        return []
    return np.random.choice(stratum, size=size, replace=False)


# The call order is kept (1, 2, 3, 4) because each draw advances the numpy RNG.
sample_strata1 = _sample_stratum(strata1)
sample_strata2 = _sample_stratum(strata2)
sample_strata3 = _sample_stratum(strata3)
sample_strata4 = _sample_stratum(strata4)

# One row per sampled user; the last column is the 1-based stratum number.
sample = []
for count, strata in enumerate(
        [sample_strata1, sample_strata2, sample_strata3, sample_strata4], start=1):
    for key in strata:
        # Graph attributes are keyed by the string form of the user id.
        sample.append([int(key),
                       screen_names[str(key)],
                       "https://twitter.com/{0}".format(screen_names[str(key)]),
                       diffusion_slur[str(key)], count])

print(sample)
# Shuffle so the rows of the output file are not grouped by stratum.
random.shuffle(sample)

# newline="" is what the csv module expects for files it writes to.
with open("../data/users_to_annotate.csv", "w", newline="") as f:
    csv_writer = csv.writer(f)
    csv_writer.writerow(["user_id", "screen_name", "twitter", "diffusion_slur", "stratum"])
    csv_writer.writerows(sample)

-----------------------------------------------------------------------------------------------------------------------