# Step 0 - Preparations

## Step 0.1 Install & Load the Necessary Libraries

In [3]:
#!pip install stellargraph

In [4]:
#standard libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#graph library
import networkx as nx

#stellargraph - NOTE: first execute the pip command above
# from stellargraph import StellarGraph, datasets
# from stellargraph.data import EdgeSplitter
# from stellargraph.data import BiasedRandomWalk

#word2vec
from gensim.models import Word2Vec

#sklearn
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegressionCV
from sklearn.decomposition import PCA

#further libraries
import multiprocessing

## Step 0.2 Load the Data

### Path Structure

These constants are used for dataset reading/writing. The `EDGES_PATH` is a subfolder of the dataset directory, and will hold the dataframes with the edge information to build the individual networks.
The intended directory structure is:
```
- network_structures.ipynb
- sna_python_handson.ipynb

- dataset/

-- raw_data/
----- [files downloaded from TUWEL]

-- edges/
----- [files containing edge information]

-- embeddings/
----- [contains the produced embeddings]

```

In [5]:
# Root folder of the dataset; every other path is derived from it.
ROOT_PATH = "./dataset/"

# Sub-folders for the raw input files, the generated edge lists and the embeddings.
RAW_DATA = ROOT_PATH + "raw_data/"
EDGES_PATH = ROOT_PATH + "edges/"
EMBEDDINGS_PATH = ROOT_PATH + "embeddings/"

# Files inside the raw-data folder. Votes and postings each come as two
# date-range part files plus one file name for the combined table.
FOLLOW_IGNORE_PATH = RAW_DATA + "Following_Ignoring_Relationships_01052019_31052019.csv"
VOTES_1_PATH = RAW_DATA + "Votes_01052019_15052019.csv"
VOTES_2_PATH = RAW_DATA + "Votes_16052019_31052019.csv"
VOTES_PATH = RAW_DATA + "Votes.csv"
POSTINGS_1_PATH = RAW_DATA + "Postings_01052019_15052019.csv"
POSTINGS_2_PATH = RAW_DATA + "Postings_16052019_31052019.csv"
POSTINGS_PATH = RAW_DATA + "Postings.csv"

The votes and postings tables have each been split into two parts for size reasons. However, combining them does not produce an overly large file, and it simplifies things later on.

In [6]:
# Read both part files of the postings table and of the votes table
# (semicolon-separated CSVs).
data_postings_1 = pd.read_csv(POSTINGS_1_PATH, sep=";")
data_postings_2 = pd.read_csv(POSTINGS_2_PATH, sep=";")
data_votes_1 = pd.read_csv(VOTES_1_PATH, sep=";")
data_votes_2 = pd.read_csv(VOTES_2_PATH, sep=";")

# Stack the parts with pd.concat: DataFrame.append was deprecated in pandas 1.4
# and removed in pandas 2.0. ignore_index=True gives the result a fresh
# 0..n-1 index, which is unique by construction, so no verify_integrity check
# is needed.
data_postings = pd.concat([data_postings_1, data_postings_2], ignore_index=True)
data_votes = pd.concat([data_votes_1, data_votes_2], ignore_index=True)

# Persist the combined tables in the same semicolon-separated format,
# without writing the index as an extra column.
data_postings.to_csv(POSTINGS_PATH, sep=";", index=False)
data_votes.to_csv(VOTES_PATH, sep=";", index=False)

In [7]:
# Show the column index of the votes table, then of the postings table,
# one per line.
print(data_votes.columns, data_postings.columns, sep="\n")

Index(['ID_CommunityIdentity', 'ID_Posting', 'VoteNegative', 'VotePositive',
       'VoteCreatedAt', 'UserCommunityName', 'UserGender', 'UserCreatedAt'],
      dtype='object')
Index(['ID_Posting', 'ID_Posting_Parent', 'ID_CommunityIdentity',
       'PostingHeadline', 'PostingComment', 'PostingCreatedAt', 'ID_Article',
       'ArticlePublishingDate', 'ArticleTitle', 'ArticleChannel',
       'ArticleRessortName', 'UserCommunityName', 'UserGender',
       'UserCreatedAt'],
      dtype='object')


## Step 1 Create the graphs

### Follow/Ignore Network

In this block, we build the follow network and the ignore network (they are separate networks, but could be combined as well). A directed edge from node `v` to node `w` indicates that user `v` follows/ignores user `w`.

In [8]:
# Load the semicolon-separated follow/ignore relationship table and preview
# its first five rows.
data_follow_ignore = pd.read_csv(FOLLOW_IGNORE_PATH, delimiter=";")
data_follow_ignore.head(n=5)

Unnamed: 0,ID_CommunityIdentity,ID_CommunityIdentityConnectedTo,ID_CommunityConnectionType
0,1778,246490,1
1,5872,5872,1
2,9030,23875,1
3,9030,508504,1
4,10569,10569,1


In [9]:
# Split the relationship table by connection type:
# rows with type 1 go to the follow table, rows with type 2 to the ignore table.
data_follow = data_follow_ignore.loc[data_follow_ignore["ID_CommunityConnectionType"].eq(1)]
data_ignore = data_follow_ignore.loc[data_follow_ignore["ID_CommunityConnectionType"].eq(2)]

In [10]:
# Build an (n, 2) array with one row per follow relationship:
# source user id in the first column, target user id in the second.
edge_list = np.column_stack((
    data_follow["ID_CommunityIdentity"].to_numpy(),
    data_follow["ID_CommunityIdentityConnectedTo"].to_numpy(),
))

# Write the pairs as a space-separated edge list without header or index.
edges_df = pd.DataFrame(edge_list, columns=["follower", "followed"])
edges_df.to_csv(f"{EDGES_PATH}follow_edges.csv", sep=" ", header=False, index=False)

In [11]:
# Build an (n, 2) array with one row per ignore relationship:
# source user id in the first column, target user id in the second.
edge_list = np.column_stack((
    data_ignore["ID_CommunityIdentity"].to_numpy(),
    data_ignore["ID_CommunityIdentityConnectedTo"].to_numpy(),
))

# Write the pairs as a space-separated edge list without header or index.
edges_df = pd.DataFrame(edge_list, columns=["ignorer", "ignored"])
edges_df.to_csv(f"{EDGES_PATH}ignore_edges.csv", sep=" ", header=False, index=False)

### Positive/Negative Votes Network

In this block, we construct the network of positive/negative votes (they are again separate networks, but could be combined). A directed edge from node `v` to node `w` indicates that user `v` up-/down-votes user `w`.

In [12]:
# Reload the combined postings and votes tables (in that order) from their
# semicolon-separated CSV files.
data_postings, data_votes = (
    pd.read_csv(csv_path, sep=";") for csv_path in (POSTINGS_PATH, VOTES_PATH)
)

In [13]:
# Keep only the vote columns needed for the network. They are selected by
# name rather than by position so the cell does not depend on the column
# order of the CSV file.
data_votes = data_votes[["ID_CommunityIdentity", "ID_Posting", "VoteNegative", "VotePositive"]]

# Reduce the postings to (posting id, author id). The rename happens before
# the selection so that re-running this cell is harmless: rename ignores
# labels that are no longer present, whereas selecting the old column name
# after it has been renamed would raise a KeyError.
data_postings = data_postings.rename(columns={"ID_CommunityIdentity": "ID_Poster"})[["ID_Posting", "ID_Poster"]]

In [14]:
# Attach the author of the voted posting to every vote. The left join keeps
# all votes, including any whose posting id has no match in data_postings.
votes_and_postings = pd.merge(data_votes, data_postings, how="left", on="ID_Posting")
votes_and_postings.head(n=15)

Unnamed: 0,ID_CommunityIdentity,ID_Posting,VoteNegative,VotePositive,ID_Poster
0,675862,1041076570,1,0,691035
1,689023,1041076570,1,0,691035
2,24810,1041076745,0,1,581068
3,673781,1041076745,0,1,581068
4,24810,1041076831,0,1,76577
5,40177,1041076831,0,1,76577
6,581068,1041076831,0,1,76577
7,68791,1041077081,1,0,107237
8,45168,1041077474,0,1,105566
9,79653,1041077474,0,1,105566


In [15]:
# Separate the votes by their flag columns: rows with VotePositive == 1 and
# rows with VoteNegative == 1.
pos_votes = votes_and_postings.loc[votes_and_postings["VotePositive"].eq(1)]
neg_votes = votes_and_postings.loc[votes_and_postings["VoteNegative"].eq(1)]

In [16]:
# Build an (n, 2) array with one row per positive vote:
# voting user id in the first column, id of the posting's author in the second.
edge_list = np.column_stack((
    pos_votes["ID_CommunityIdentity"].to_numpy(),
    pos_votes["ID_Poster"].to_numpy(),
))

# Write the pairs as a space-separated edge list without header or index.
edges_df = pd.DataFrame(edge_list, columns=["pos_voter", "poster"])
edges_df.to_csv(f"{EDGES_PATH}pos_vote_edges.csv", sep=" ", header=False, index=False)

In [17]:
# Build an (n, 2) array with one row per negative vote:
# voting user id in the first column, id of the posting's author in the second.
edge_list = np.column_stack((
    neg_votes["ID_CommunityIdentity"].to_numpy(),
    neg_votes["ID_Poster"].to_numpy(),
))

# Write the pairs as a space-separated edge list without header or index.
edges_df = pd.DataFrame(edge_list, columns=["neg_voter", "poster"])
edges_df.to_csv(f"{EDGES_PATH}neg_vote_edges.csv", sep=" ", header=False, index=False)

### Reply Network

In this block we construct the reply network. Note that we do this by joining the postings data with itself, instead of using the extremely inefficient loop presented in the hands-on exercise. A directed edge from node `v` to node `w` indicates that user `v` replied to a post made by user `w`. The weight of the edge indicates the number of times `v` replied to posts by `w`.

In [18]:
# Reload the full combined postings table (all columns) and preview its
# first five rows.
data_postings = pd.read_csv(POSTINGS_PATH, delimiter=";")
data_postings.head(n=5)

Unnamed: 0,ID_Posting,ID_Posting_Parent,ID_CommunityIdentity,PostingHeadline,PostingComment,PostingCreatedAt,ID_Article,ArticlePublishingDate,ArticleTitle,ArticleChannel,ArticleRessortName,UserCommunityName,UserGender,UserCreatedAt
0,1041073586,1041073000.0,671476,Das hat gestern bereits der Voggenhuber angefü...,schieder hatte dem inhaltlich nichts entgegenz...,2019-05-01 18:21:15.127,2000102330973,2019-05-01 10:28:57.49,1. Mai in Wien: SPÖ fordert von Strache Rücktritt,Inland,Parteien,Ravenspower,,2018-04-14 13:42:28.470
1,1041073839,1041073000.0,566938,,...und meinen Bezirk bekommst du als Erbe mit.,2019-05-01 18:28:22.040,2000102330973,2019-05-01 10:28:57.49,1. Mai in Wien: SPÖ fordert von Strache Rücktritt,Inland,Parteien,AlphaRomeo,m,2015-08-28 17:07:41.110
2,1041073872,1041069000.0,669286,,"Nein, bei der ÖVP/FPÖ genauso passiert. Ich wo...",2019-05-01 18:29:05.533,2000102330973,2019-05-01 10:28:57.49,1. Mai in Wien: SPÖ fordert von Strache Rücktritt,Inland,Parteien,Hpolditsch,,2018-03-06 20:03:42.737
3,1041080734,1041080000.0,671476,Sie haben doch nichts gefordert??,sie haben nur die regierung kritisiert. das di...,2019-05-01 22:37:56.010,2000102330973,2019-05-01 10:28:57.49,1. Mai in Wien: SPÖ fordert von Strache Rücktritt,Inland,Parteien,Ravenspower,,2018-04-14 13:42:28.470
4,1041080828,,671476,Heute wäre der perfekte Tag für die SPÖ gewese...,"ihr noch nicht erfülltes versprechen, den silb...",2019-05-01 22:42:06.310,2000102330973,2019-05-01 10:28:57.49,1. Mai in Wien: SPÖ fordert von Strache Rücktritt,Inland,Parteien,Ravenspower,,2018-04-14 13:42:28.470


In [19]:
# Reply-side view: each posting with the id of its parent posting and the id
# of the user who wrote it.
data_postings_1 = data_postings.loc[
    :, ["ID_Posting", "ID_Posting_Parent", "ID_CommunityIdentity"]
].rename(
    columns={
        "ID_Posting_Parent": "ID_OriginalPost",
        "ID_CommunityIdentity": "ID_Replier",
    }
)

# Original-side view: each posting id (under the same join key name) with the
# id of the user who wrote it.
data_postings_2 = data_postings.loc[
    :, ["ID_Posting", "ID_CommunityIdentity"]
].rename(
    columns={
        "ID_Posting": "ID_OriginalPost",
        "ID_CommunityIdentity": "ID_Poster",
    }
)

# The full postings table is not needed beyond this point; drop the name.
del data_postings

In [20]:
# Discard rows that contain a missing value in any column
# (e.g. postings without a parent id).
data_postings_1 = data_postings_1.dropna()

In [21]:
# Cast the parent posting id to a 64-bit integer; the other columns keep
# their dtypes.
data_postings_1 = data_postings_1.astype({"ID_OriginalPost": np.int64})

In [22]:
# Join each reply to the posting it answers. Rows whose parent posting has no
# match get a missing ID_Poster from the left join and are dropped, after
# which the poster id is cast to a 64-bit integer.
replies = (
    pd.merge(data_postings_1, data_postings_2, how="left", on="ID_OriginalPost")
    .dropna()
    .astype({"ID_Poster": np.int64})
)
replies.head(n=5)

Unnamed: 0,ID_Posting,ID_OriginalPost,ID_Replier,ID_Poster
0,1041073586,1041073234,671476,233191
1,1041073839,1041072504,566938,640123
2,1041073872,1041068600,669286,680772
3,1041080734,1041080236,671476,51817
4,1041080938,1041080782,671476,678196


In [23]:
# Build an (n, 2) array with one row per reply:
# replying user id in the first column, id of the answered posting's author
# in the second.
edge_list = np.column_stack((
    replies["ID_Replier"].to_numpy(),
    replies["ID_Poster"].to_numpy(),
))

# Collapse duplicate (replier, poster) rows and count how often each occurs.
unique, count = np.unique(edge_list, return_counts=True, axis=0)

# Turn the counts into a column vector and attach it as a third column.
count = count[:, np.newaxis]
weighted_edge_list = np.hstack((unique, count))

# Write "replier poster count" rows as a space-separated file without header
# or index.
edges_df = pd.DataFrame(weighted_edge_list, columns=["replier", "poster", "count"])
edges_df.to_csv(f"{EDGES_PATH}reply_edges.csv", sep=" ", header=False, index=False)