# Network Analysis Preparation

This notebook prepares the data for the import into Gephi. Networks consist of a file for the edges and a file for the nodes.

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns

In [3]:
user_friendships_evaluation = pd.read_csv('../data/processed/user_friendships_evaluation.csv', sep=";", na_values="", dtype={'source_id': str, 'target_id': str})
user_friendships = pd.read_csv('../data/processed/user_friendships.csv', sep=";", na_values="")

user_list = pd.read_csv('../data/processed/user_list.csv', sep=";", na_values="", dtype={'twitter_id': str})

tweet_list = pd.read_csv('../data/processed/tweet_list.csv', sep=";", na_values="", lineterminator='\n')
quote_list = pd.read_csv('../data/processed/quote_list.csv', sep=";", na_values="", lineterminator='\n')
retweet_list = pd.read_csv('../data/processed/retweet_list.csv', sep=";", na_values="", lineterminator='\n')

### User Network

* Directed Graph
* Nodes = Twitter-Accounts
* Edges = Acccount X follows Y

In [23]:
nodes = user_list[['twitter_handle', 'fraktion']]
nodes.rename(columns = {'twitter_handle': 'Id', 'fraktion': 'Fraktion'}, inplace=True)

nodes.to_csv('../data/processed/network_analysis/user_network/nodes.csv', index=False, decimal=',', sep=";", float_format='%.0f')

edges = user_friendships[user_friendships['following'] == True]
edges.rename(columns = {'source_screen_name': 'Source', 'target_screen_name': 'Target'}, inplace=True)

edges = pd.merge(edges, user_friendships_evaluation, how='left', left_on=['Source', 'Target'], right_on=['source_screen_name', 'target_screen_name'])
edges = pd.merge(edges, user_friendships_evaluation, how='left', left_on=['Source', 'Target'], right_on=['target_screen_name', 'source_screen_name'])
edges['tie_type_x'].update(edges.pop('tie_type_y'))
edges['tie_type_x'][edges['Source'] == edges['Target']] = 'same person' 
edges['tie_type_x'][edges['tie_type_x'].isna()] = 'not available'

edges = edges[['Source', 'Target', 'tie_type_x']]

edges.to_csv('../data/processed/network_analysis/user_network/edges.csv', index=False, decimal=',', sep=";", float_format='%.f')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nodes.rename(columns = {'twitter_handle': 'Id', 'fraktion': 'Fraktion'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  edges['tie_type_x'][edges['Source'] == edges['Target']] = 'same person'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  edges['tie_type_x'][edges['tie_type_x'].isna()] = 'not available'


### Retweet Network

* Directed Graph
* Nodes = Twitter-Accounts
* Edges = Account X retweeted Y
* Edge-Weight = how often X retweeted Y

In [5]:
nodes = user_list
nodes = nodes[['twitter_handle', 'fraktion']]
nodes.rename(columns = {'twitter_handle': 'Id', 'fraktion': 'Fraktion'}, inplace=True)

nodes.to_csv('../data/processed/network_analysis/retweet_network/nodes.csv', index=False, decimal=',', sep=";", float_format='%.0f')

edges = retweet_list

user_list['twitter_id'] = pd.to_numeric(user_list['twitter_id'])
edges = pd.merge(edges, user_list, left_on='author_twitter_id', right_on='twitter_id')
edges = edges[['retweeter_twitter_handle', 'twitter_handle', 'tweet_text']] 
edges.rename(columns = {'twitter_handle': 'author_twitter_handle'}, inplace=True)
# display(edges)
edges = pd.merge(edges, user_friendships_evaluation, how='left', left_on=['retweeter_twitter_handle', 'author_twitter_handle'], right_on=['source_screen_name', 'target_screen_name'])
edges = pd.merge(edges, user_friendships_evaluation, how='left', left_on=['retweeter_twitter_handle', 'author_twitter_handle'], right_on=['target_screen_name', 'source_screen_name'])

# display(edges[edges['tie_type_y'].notna()])
edges['tie_type_x'].update(edges.pop('tie_type_y'))

# Set tie_type for same person
edges['tie_type_x'][edges['retweeter_twitter_handle'] == edges['author_twitter_handle']] = 'same person' 

edges['tie_type_x'][edges['tie_type_x'].isna()] = 'not available'

# display(edges[edges['tie_type_x'].isna()])

edges['retweet_count'] = 1
edges = edges.groupby(['retweeter_twitter_handle','author_twitter_handle', 'tie_type_x']).retweet_count.count().reset_index()
edges.drop_duplicates()
display(edges.sort_values(by='retweet_count'))
edges.rename(columns = {'retweeter_twitter_handle': 'Source', 'author_twitter_handle': 'Target', 'retweet_count': 'Weight', 'tie_type_x': 'Tie_type'}, inplace=True)

edges.to_csv('../data/processed/network_analysis/retweet_network/edges.csv', index=False, decimal=',', sep=";", float_format='%.f')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nodes.rename(columns = {'twitter_handle': 'Id', 'fraktion': 'Fraktion'}, inplace=True)


Unnamed: 0,retweeter_twitter_handle,author_twitter_handle,tie_type_x,retweet_count
2698,RenateKuenast,karlbaer,strong,1
3042,StephanThomae,StephanThomae,same person,1
3046,StephanThomae,cad59,strong,1
3048,StephanThomae,florian_toncar,strong,1
3049,StephanThomae,hacker_fdp,strong,1
...,...,...,...,...
4888,reinholdmdb,c_lindner,strong,49
2353,NicoTippelt,torstenherbst,strong,50
1804,LINKEPELLI,DietmarBartsch,strong,52
2285,Mi_Muentefering,HeikoMaas,strong,67


### Quote Network

* Directed Graph
* Nodes = Twitter-Accounts
* Edges = Account X retweeted Y
* Edge-Weight = how often X retweeted Y

In [6]:
nodes = user_list
nodes = nodes[['twitter_handle', 'fraktion']]
nodes.rename(columns = {'twitter_handle': 'Id', 'fraktion': 'Fraktion'}, inplace=True)

nodes.to_csv('../data/processed/network_analysis/quote_network/nodes.csv', index=False, decimal=',', sep=";", float_format='%.0f')

edges = quote_list

user_list['twitter_id'] = pd.to_numeric(user_list['twitter_id'])
edges = pd.merge(edges, user_list, left_on='author_twitter_id', right_on='twitter_id')
edges = edges[['quoter_twitter_handle', 'twitter_handle', 'tweet_text']] 
edges.rename(columns = {'twitter_handle': 'author_twitter_handle'}, inplace=True)
# display(edges)
edges = pd.merge(edges, user_friendships_evaluation, how='left', left_on=['quoter_twitter_handle', 'author_twitter_handle'], right_on=['source_screen_name', 'target_screen_name'])
edges = pd.merge(edges, user_friendships_evaluation, how='left', left_on=['quoter_twitter_handle', 'author_twitter_handle'], right_on=['target_screen_name', 'source_screen_name'])

# display(edges[edges['tie_type_y'].notna()])
edges['tie_type_x'].update(edges.pop('tie_type_y'))

# Set tie_type for same person
edges['tie_type_x'][edges['quoter_twitter_handle'] == edges['author_twitter_handle']] = 'same person' 

edges['tie_type_x'][edges['tie_type_x'].isna()] = 'not available'

# display(edges[edges['tie_type_x'].isna()])

edges['quote_count'] = 1
edges = edges.groupby(['quoter_twitter_handle','author_twitter_handle', 'tie_type_x']).quote_count.count().reset_index()
edges.drop_duplicates()
display(edges.sort_values(by='quote_count'))
edges.rename(columns = {'quoter_twitter_handle': 'Source', 'author_twitter_handle': 'Target', 'quote_count': 'Weight', 'tie_type_x': 'Tie_type'}, inplace=True)

edges.to_csv('../data/processed/network_analysis/quote_network/edges.csv', index=False, decimal=',', sep=";", float_format='%.f')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nodes.rename(columns = {'twitter_handle': 'Id', 'fraktion': 'Fraktion'}, inplace=True)


Unnamed: 0,quoter_twitter_handle,author_twitter_handle,tie_type_x,quote_count
0,ABaerbockArchiv,ABaerbockArchiv,same person,1
1381,VriesChristoph,KonstantinNotz,weak,1
1380,VriesChristoph,ArminLaschet,weak,1
1379,VolkerUllrich,groehe,strong,1
1378,VolkerUllrich,c_lindner,weak,1
...,...,...,...,...
1758,julia_verlinden,julia_verlinden,same person,29
2167,victorperli,victorperli,same person,45
932,MatthiasHauer,MatthiasHauer,same person,54
60,AndrejHunko,AndrejHunko,same person,139
