In [273]:
%pip install gremlinpython

/bin/bash: /anaconda/envs/jupyter_env/lib/libtinfo.so.6: no version information available (required by /bin/bash)
Note: you may need to restart the kernel to use updated packages.


In [274]:
# General imports
import os
import pickle
import time
from itertools import combinations

import numpy as np
import pandas as pd
from tqdm import tqdm

tqdm.pandas()

# Graph related imports
import nest_asyncio
from gremlin_python.driver import client, serializer

# Azure imports
from azureml.core import Workspace, Dataset

In [275]:
# The notebook kernel already runs an event loop; nest_asyncio lets it be re-entered.
# Necessary to avoid "RuntimeError: Cannot run the event loop while another loop is running"
nest_asyncio.apply()

In [276]:
# Gremlin client for the Cosmos DB collection "year-graph-2010".
# The account key is read from the environment instead of being written into the
# notebook, so it cannot leak through version control or shared copies.
# Set COSMOS_GREMLIN_PASSWORD before starting the kernel; a missing variable raises
# KeyError right here rather than an authentication failure on the first query.
gremlin_client = client.Client('wss://leomathesis-cosmos-gremlin.gremlin.cosmos.azure.com:443/', 'g',
                               username="/dbs/mathesisleo-database/colls/year-graph-2010",
                               password=os.environ["COSMOS_GREMLIN_PASSWORD"],
                               message_serializer=serializer.GraphSONSerializersV2d0()
                              )

In [277]:
def add_vertex(label, id, type, prop_dict=None):
    """Submit an asynchronous Gremlin query that adds one vertex.

    Args:
        label: Vertex label passed to ``g.addV``.
        id: Identifier of the object within its type; stored as ``object_id``.
        type: Kind of object; stored as ``type`` and used as prefix of the
            vertex ``id`` property (``"{type}_{id}"``).
        prop_dict: Optional extra properties. A list value produces one
            ``.property`` step per entry; every value is written as a quoted
            string. The mapping is copied, so the caller's dict is not modified.

    Returns:
        None. The query is handed to ``submit_async``; its result is not awaited.
    """
    # Copy before adding the bookkeeping keys: writing into the argument would
    # change the caller's dict and, with a mutable default, leak state between calls.
    properties = dict(prop_dict) if prop_dict else {}
    properties['object_id'] = id
    properties['id'] = f"{type}_{id}"
    properties['type'] = type

    # NOTE(review): values are interpolated without escaping, so a double quote
    # inside a value would break the generated query.
    gremlin_query = f'g.addV("{label}")'
    for key, value in properties.items():
        if isinstance(value, list):
            for value_entry in value:
                gremlin_query += f'.property("{key}", "{value_entry}")'
        else:
            gremlin_query += f'.property("{key}", "{value}")'

    # Wait (up to 500 checks, 10 ms apart) for a free connection in the pool;
    # the query is submitted afterwards even if none became available.
    for _attempt in range(500):
        if gremlin_client.available_pool_size > 0:
            break
        time.sleep(0.01)
    gremlin_client.submit_async(gremlin_query)

In [278]:
def add_edge(label, from_id, to_id, prop_dict=None):
    """Submit an asynchronous Gremlin query that adds one edge between two vertices.

    Args:
        label: Edge label passed to ``addE``.
        from_id: Vertex id of the edge's source.
        to_id: Vertex id of the edge's target.
        prop_dict: Optional edge properties. Numeric values (Python ``int`` /
            ``float`` and NumPy integer / floating scalars) are written as
            unquoted numbers; everything else, including booleans, is written
            as a quoted string.

    Returns:
        None. The query is handed to ``submit_async``; its result is not awaited.
    """
    gremlin_query = f'g.V("{from_id}").as("a").V("{to_id}").as("b").addE("{label}").from("a").to("b")'
    for key, value in (prop_dict or {}).items():
        # An exact type() comparison misses NumPy scalars such as np.float64 or
        # np.int64, which would then be stored as strings instead of numbers.
        # bool is excluded explicitly because it is a subclass of int.
        is_number = (isinstance(value, (int, float, np.integer, np.floating))
                     and not isinstance(value, bool))
        if is_number:
            # NOTE(review): NaN / inf are emitted verbatim here — confirm the
            # server accepts them or clean the values before calling.
            gremlin_query += f'.property("{key}", {value})'
        else:
            gremlin_query += f'.property("{key}", "{value}")'

    # Wait (up to 50 checks, 100 ms apart) for a free connection in the pool;
    # the query is submitted afterwards even if none became available.
    for _attempt in range(50):
        if gremlin_client.available_pool_size > 0:
            break
        time.sleep(0.1)

    gremlin_client.submit_async(gremlin_query)

## Clear graph

In [279]:
# Ask the server to drop every vertex; the returned ResultSet is not consumed here.
gremlin_client.submit("g.V().drop()")

<gremlin_python.driver.resultset.ResultSet at 0x7fe61f039600>

In [280]:
# Poll the vertex count until the graph is empty: at most 200 checks, 10 s apart.
# Whenever the count did not move since the previous check, the drop is issued again.
last_cnt = 0
for attempt in range(200):
    cnt = gremlin_client.submit_async("g.V().count()").result().all().result()[0]
    print(cnt)
    if cnt == 0:
        break
    if cnt == last_cnt:
        gremlin_client.submit("g.V().drop()")
    time.sleep(10)
    last_cnt = cnt

0


## Adding vertices for politicians and parties, and membership edges between them

In [281]:
# Coordinates of the Azure ML workspace; the subscription id is a redacted placeholder.
subscription_id = '<<subscription_id>>'
resource_group = 'rg-leore-001'
workspace_name = 'leomathesisML'

workspace = Workspace(subscription_id, resource_group, workspace_name)

# Look up the registered dataset by name and materialise it as a pandas DataFrame.
dataset = Dataset.get_by_name(workspace, name='db_politician_data_asset')
df_politicians = dataset.to_pandas_dataframe()

Failed to extract subscription information, Exception=AttributeError; 'Logger' object has no attribute 'activity_info'
Failed to extract subscription information, Exception=AttributeError; 'Logger' object has no attribute 'activity_info'
Failed to extract subscription information, Exception=AttributeError; 'Logger' object has no attribute 'activity_info'
Failed to extract subscription information, Exception=AttributeError; 'Logger' object has no attribute 'activity_info'
Failed to extract subscription information, Exception=AttributeError; 'Logger' object has no attribute 'activity_info'
Message: rslex failed, falling back to clex.
Payload: {"pid": 44075, "source": "azureml.dataprep", "version": "4.8.4", "trace": "azureml|data|tabular_dataset.py, line 169 in function <lambda>.\nazureml|data|dataset_error_handling.py, line 106 in function _try_execute.\nazureml|data|tabular_dataset.py, line 169 in function to_pandas_dataframe.", "subscription": "", "run_id": "", "resource_group": "", "w

In [282]:
# Party labels that are renamed before building the graph.
# Several source labels may collapse into one target (CVP and BDP both become 'Die Mitte').
party_translate = {'CVP': 'Die Mitte',
                   'BDP': 'Die Mitte',
                   'GP': 'Grüne',
                   'FPS': 'APS',
                   'LdU': 'DaP/LdU'}

In [283]:
# Rename parties via party_translate; labels without an entry map to NaN and are
# then filled back in from the original column, i.e. they stay unchanged.
df_politicians['party'] = df_politicians['party'].map(party_translate).fillna(df_politicians['party'])

In [284]:
# Distinct party labels that get a vertex, in order of first appearance.
# Missing values and the 'unknown' placeholder are filtered out rather than removed
# with list.remove(), which raises ValueError when the value is not in the list.
parties = [party for party in df_politicians['party'].unique()
           if pd.notna(party) and party != 'unknown']
parties

['SP',
 'Die Mitte',
 'FDP',
 'SVP',
 'SD',
 'DaP/LdU',
 'Grüne',
 'APS',
 'EVP',
 'FraP',
 'GLP',
 'EDU',
 'SaS',
 'parteilos',
 'AL',
 'CSP']

In [285]:
# One vertex per party, labelled with the party name. The list position is used as
# the object id, so add_vertex gives the vertex the id "party_<position>".
for idx, party in enumerate(parties):
    add_vertex(label=party, id=idx, type='party', prop_dict={'name': party})

In [286]:
# One row per distinct (person_id, first_name, last_name) combination.
politicians = df_politicians[['person_id', 'first_name', 'last_name']].drop_duplicates()

In [287]:
def add_politician(df_row):
    """Create the vertex for a single politician.

    ``df_row`` has to provide ``person_id``, ``first_name`` and ``last_name``.
    The vertex is labelled "<first_name> <last_name>", gets the type
    'politician' and carries the three fields as properties.
    """
    person_id = df_row['person_id']
    first_name = df_row['first_name']
    last_name = df_row['last_name']
    add_vertex(label=f"{first_name} {last_name}",
               id=person_id,
               type='politician',
               prop_dict=dict(first_name=first_name,
                              last_name=last_name,
                              person_id=person_id))

In [288]:
# Create one politician vertex per row; the Series returned by apply is discarded.
_ = politicians.apply(add_politician, axis=1)

In [289]:
def add_politician_party(df_row):
    """Link a politician to their party with an 'is_member_of' edge.

    Rows whose ``party`` is ``None`` or 'unknown' are ignored. For all other
    rows the party vertex id is fetched from the graph by label (one blocking
    query per row) and the edge carries ``valid_from``, ``valid_to`` and
    ``council`` from the row.
    """
    party = df_row['party']
    if party in (None, 'unknown'):
        return

    membership = {'valid_from': df_row['valid_from'],
                  'valid_to': df_row['valid_to'],
                  'council': df_row['council']}
    # Blocks until the server answers; the first returned id is taken as the party vertex.
    lookup = gremlin_client.submit_async(f'g.V().hasLabel("{party}").values("id")')
    party_id = lookup.result().all().result()[0]
    add_edge(label='is_member_of',
             from_id=f"politician_{df_row['person_id']}",
             to_id=party_id,
             prop_dict=membership)

In [290]:
# Create an 'is_member_of' edge for every row of df_politicians that has a usable party.
_ = df_politicians.apply(add_politician_party, axis=1)

## Adding vertices for topics

In [291]:
# Load the pickled topics mapping. The file is opened in a with-block so the handle
# is closed again instead of being left to the garbage collector.
# NOTE: unpickling can execute arbitrary code — only load files from a trusted source.
with open('../data/topics_manual-finetune_2010.pkl', 'rb') as topics_file:
    topics = pickle.load(topics_file)

In [292]:
# One vertex per topic; entries with a negative topic number are skipped.
# The label is the topic number followed by the first element of each of the
# topic's first four entries, joined with underscores.
for topic_nr, topic in topics.items():
    if topic_nr < 0:
        continue
    leading_words = [topic[position][0] for position in range(4)]
    topic_name = "_".join([str(topic_nr), *leading_words])
    add_vertex(label=topic_name,
               id=topic_nr,
               type='topic',
               prop_dict={'topic_id': topic_nr,
                          'topic_name': topic_name,
                          'topic_words': [entry[0] for entry in topic]})

## Adding edges between topics and politicians / parties

In [293]:
# Load the pickled topic and sentiment DataFrames; both are joined on 'window_id' below.
# NOTE: read_pickle unpickles the file — only load files from a trusted source.
topic_df = pd.read_pickle('../data/df_topic_manual-finetune_2010.pkl')
sent_df = pd.read_pickle('../data/df_sent_2010.pkl')

In [294]:
# Left-join the sentiment rows onto the topic rows by window_id (every topic row is kept),
# then derive the calendar year from the 'date' column.
df = topic_df.merge(sent_df, how='left', on='window_id')
df['year'] = df['date'].dt.year

In [295]:
# Keep rows with a non-negative topic number and a known person_id.
# .copy() makes df_graph independent, so the column assignments below do not touch df.
df_graph = df[(df['Topic'] >= 0) & (df['person_id'].notnull())].copy()

In [296]:
# Column overview before the per-year columns are added.
df_graph.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 169156 entries, 135 to 673959
Data columns (total 24 columns):
 #   Column                   Non-Null Count   Dtype         
---  ------                   --------------   -----         
 0   window_id                169156 non-null  int64         
 1   paragraph_id             169156 non-null  int64         
 2   item_of_business         169156 non-null  object        
 3   person_id                169156 non-null  object        
 4   first_name               169156 non-null  object        
 5   last_name                169156 non-null  object        
 6   council                  169156 non-null  object        
 7   party                    169156 non-null  object        
 8   in_admin_role            169156 non-null  bool          
 9   text                     169156 non-null  object        
 10  date                     169156 non-null  datetime64[ns]
 11  session_title            169156 non-null  object        
 12  session_id    

In [297]:
# For every year, add a 'Probability_<year>' and a 'sentiment_<year>' column that
# holds the row's value when the row belongs to that year and NaN otherwise.
# agg_dict records how each new column is aggregated later: probabilities are
# summed, sentiments averaged.
agg_dict = {}
for year in df_graph['year'].unique():
    in_year = df_graph['year'] == year
    for source_col, agg_func in (('Probability', 'sum'), ('sentiment', 'mean')):
        year_col = f'{source_col}_{year}'
        agg_dict[year_col] = agg_func
        df_graph[year_col] = df_graph[source_col].where(in_year)

In [298]:
# Column overview after the per-year columns were added.
df_graph.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 169156 entries, 135 to 673959
Data columns (total 52 columns):
 #   Column                   Non-Null Count   Dtype         
---  ------                   --------------   -----         
 0   window_id                169156 non-null  int64         
 1   paragraph_id             169156 non-null  int64         
 2   item_of_business         169156 non-null  object        
 3   person_id                169156 non-null  object        
 4   first_name               169156 non-null  object        
 5   last_name                169156 non-null  object        
 6   council                  169156 non-null  object        
 7   party                    169156 non-null  object        
 8   in_admin_role            169156 non-null  bool          
 9   text                     169156 non-null  object        
 10  date                     169156 non-null  datetime64[ns]
 11  session_title            169156 non-null  object        
 12  session_id    

In [299]:
# Rename parties in df_graph via party_translate; labels without an entry stay unchanged.
df_graph['party'] = df_graph['party'].map(party_translate).fillna(df_graph['party'])

In [300]:
# Aggregate the overall columns as well, then collapse to one row per (person_id, Topic):
# probabilities are summed, sentiments averaged.
agg_dict['Probability'] = 'sum'
agg_dict['sentiment'] = 'mean'
df_person_topic_grouped = df_graph.groupby(['person_id', 'Topic'], as_index=False).agg(agg_dict)

In [301]:
# Preview the per-politician, per-topic aggregates.
df_person_topic_grouped

Unnamed: 0,person_id,Topic,Probability_2010,sentiment_2010,Probability_2011,sentiment_2011,Probability_2012,sentiment_2012,Probability_2013,sentiment_2013,...,Probability_2020,sentiment_2020,Probability_2021,sentiment_2021,Probability_2022,sentiment_2022,Probability_2023,sentiment_2023,Probability,sentiment
0,14988,0,0.000000,,0.000000,,0.000000,,2.000000,-0.601759,...,0.000000,,0.0,,0.000000,,0.0,,9.660662,-0.211594
1,14988,1,0.000000,,9.031365,-0.380621,6.462950,-0.152682,23.000000,-0.294839,...,0.000000,,0.0,,0.000000,,0.0,,96.599387,-0.240894
2,14988,2,2.769931,-0.31405,0.881466,0.028439,31.339367,0.208662,26.030048,0.144966,...,0.000000,,0.0,,0.000000,,0.0,,181.200253,0.032078
3,14988,3,0.000000,,8.000000,0.357199,0.000000,,0.000000,,...,0.000000,,0.0,,0.000000,,0.0,,19.000000,0.202943
4,14988,4,0.000000,,0.731819,-0.042855,0.000000,,0.000000,,...,0.000000,,0.0,,0.000000,,0.0,,1.546479,-0.331252
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12288,9060,213,0.000000,,0.000000,,0.000000,,0.000000,,...,0.000000,,0.0,,0.000000,,0.0,,8.966395,0.070897
12289,9060,218,0.000000,,0.000000,,0.000000,,0.000000,,...,0.000000,,0.0,,9.187772,-0.021847,0.0,,9.187772,-0.021847
12290,9060,233,0.000000,,0.000000,,0.000000,,0.000000,,...,0.000000,,0.0,,0.000000,,0.0,,2.997832,-0.726120
12291,9060,239,0.000000,,0.000000,,0.000000,,0.000000,,...,0.000000,,0.0,,0.000000,,0.0,,9.000000,-0.834733


In [302]:
# List the aggregated column names in the order in which the edge builders iterate them.
for col in agg_dict.keys():
    print(col)

Probability_2010
sentiment_2010
Probability_2011
sentiment_2011
Probability_2012
sentiment_2012
Probability_2013
sentiment_2013
Probability_2014
sentiment_2014
Probability_2015
sentiment_2015
Probability_2016
sentiment_2016
Probability_2017
sentiment_2017
Probability_2018
sentiment_2018
Probability_2019
sentiment_2019
Probability_2020
sentiment_2020
Probability_2021
sentiment_2021
Probability_2022
sentiment_2022
Probability_2023
sentiment_2023
Probability
sentiment


In [303]:
def add_politician_topic(df_row):
    """Create the 'discusses' edge from a politician to a topic.

    ``df_row`` is one row of the per-person aggregation. The edge carries the
    overall weight (``Probability`` times 4) and sentiment, plus one weight and
    one sentiment property per year column listed in ``agg_dict``. Missing
    yearly sentiments (NaN) are stored as 0.0.
    """
    edge_props = dict(weight_total=df_row['Probability'] * 4,
                      sentiment_total=df_row['sentiment'])
    for column in agg_dict:
        if column.startswith('Probability_'):
            suffix = column.split('_')[1]
            edge_props[f"weight_{suffix}"] = df_row[column] * 4
        elif column.startswith('sentiment_'):
            suffix = column.split('_')[1]
            # nan_to_num turns a missing sentiment into 0.0 before it is written.
            edge_props[f"sentiment_{suffix}"] = float(np.nan_to_num(df_row[column]))
    source_id = f"politician_{df_row['person_id']}"
    target_id = f"topic_{df_row['Topic']}"
    add_edge(label='discusses',
             from_id=source_id,
             to_id=target_id,
             prop_dict=edge_props)

In [304]:
# Create one 'discusses' edge per (person_id, Topic) row, with a tqdm progress bar.
_ = df_person_topic_grouped.progress_apply(add_politician_topic, axis=1)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12293/12293 [02:43<00:00, 75.11it/s]


In [305]:
# Same aggregation as for politicians, but per (party, Topic); rows without a party
# or with the 'unknown' placeholder are left out.
has_known_party = df_graph['party'].notnull() & (df_graph['party'] != 'unknown')
df_party_topic_grouped = (
    df_graph[has_known_party]
    .groupby(['party', 'Topic'], as_index=False)
    .agg(agg_dict)
)

In [306]:
# Preview the per-party, per-topic aggregates.
df_party_topic_grouped

Unnamed: 0,party,Topic,Probability_2010,sentiment_2010,Probability_2011,sentiment_2011,Probability_2012,sentiment_2012,Probability_2013,sentiment_2013,...,Probability_2020,sentiment_2020,Probability_2021,sentiment_2021,Probability_2022,sentiment_2022,Probability_2023,sentiment_2023,Probability,sentiment
0,AL,0,29.674695,-0.238971,34.040140,-0.486399,21.755519,-0.461124,14.953877,-0.532488,...,63.077538,-0.283283,52.609028,-0.216220,46.336336,-0.410074,9.735440,-0.243692,636.054410,-0.391647
1,AL,1,0.000000,,7.000000,-0.404518,6.000000,-0.740836,0.000000,,...,46.556087,-0.194274,23.000920,-0.187283,85.640247,-0.120240,0.000000,,262.307944,-0.222302
2,AL,2,0.000000,,0.000000,,0.000000,,0.000000,,...,29.502974,0.017799,17.171707,-0.332358,15.872681,0.032907,17.450583,-0.034866,181.854870,-0.061487
3,AL,3,36.000000,0.056570,8.000000,-0.317958,35.000000,0.232998,10.000000,0.243998,...,17.000000,0.135072,14.000000,-0.137308,11.000000,0.194526,0.000000,,255.276022,0.070253
4,AL,4,27.384266,-0.168635,3.222739,-0.491469,5.128765,-0.015500,23.988941,-0.248426,...,43.971881,-0.168215,40.613542,-0.387522,32.314276,-0.264617,0.698360,-0.834019,483.960052,-0.251889
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2119,parteilos,146,0.000000,,0.000000,,0.000000,,0.000000,,...,4.470289,-0.800510,1.780314,-0.967500,3.295349,-0.458486,3.308876,-0.900142,12.854829,-0.758137
2120,parteilos,170,0.000000,,0.000000,,0.000000,,0.000000,,...,0.534259,-0.182488,0.000000,,0.000000,,0.000000,,0.534259,-0.182488
2121,parteilos,172,0.000000,,0.000000,,0.000000,,0.000000,,...,7.641944,-0.812061,0.000000,,7.764906,-0.336162,0.000000,,15.406850,-0.574112
2122,parteilos,189,0.000000,,0.000000,,0.000000,,0.000000,,...,1.069339,-0.843898,0.000000,,1.391455,-0.910598,0.000000,,2.460794,-0.877248


In [307]:
def add_party_topic(df_row):
    """Create the 'member_discusses' edge from a party to a topic.

    ``df_row`` is one row of the per-party aggregation. The edge carries the
    overall weight (``Probability`` times 4) and sentiment, plus one weight and
    one sentiment property per year column listed in ``agg_dict``. Missing
    yearly sentiments (NaN) are stored as 0.0. The party vertex id is fetched
    from the graph by label with one blocking query per row.
    """
    edge_props = dict(weight_total=df_row['Probability'] * 4,
                      sentiment_total=df_row['sentiment'])
    for column in agg_dict:
        if column.startswith('Probability_'):
            suffix = column.split('_')[1]
            edge_props[f"weight_{suffix}"] = df_row[column] * 4
        elif column.startswith('sentiment_'):
            suffix = column.split('_')[1]
            # nan_to_num turns a missing sentiment into 0.0 before it is written.
            edge_props[f"sentiment_{suffix}"] = float(np.nan_to_num(df_row[column]))

    # Blocks until the server answers; the first returned id is taken as the party vertex.
    lookup = gremlin_client.submit_async(f'g.V().hasLabel("{df_row["party"]}").values("id")')
    party_id = lookup.result().all().result()[0]
    add_edge(label='member_discusses',
             from_id=party_id,
             to_id=f"topic_{df_row['Topic']}",
             prop_dict=edge_props)

In [308]:
# Create one 'member_discusses' edge per (party, Topic) row.
_ = df_party_topic_grouped.apply(add_party_topic, axis=1)

## Adding edges among topics

In [309]:
# Topic rows with a non-negative topic number, plus the calendar year of each row.
tt_df = topic_df[topic_df['Topic'] >= 0].copy()
tt_df['year'] = tt_df['date'].dt.year
# Distinct paragraph ids, in order of first appearance.
paragraphs = list(tt_df['paragraph_id'].unique())

In [310]:
# Template for the per-pair accumulators: a 'total' bucket followed by one bucket per year.
years = tt_df['year'].unique()
start_dict = {'total': 0, **dict.fromkeys(years, 0)}

In [311]:
# Accumulate co-occurrence weights for every pair of topics that share a paragraph.
# Within a paragraph each (Topic, year) keeps its maximum probability; for every pair
# the product of the two probabilities is added to the pair's 'total' and to the
# bucket of the first topic's year.
# The paragraphs are visited in a single groupby pass instead of filtering the whole
# frame with a boolean mask once per paragraph; sort=False keeps the order of first
# appearance, so pairs are inserted in the same order as before.
topic_combo_dict = {}
paragraph_groups = tt_df.groupby('paragraph_id', sort=False)
for _paragraph_id, paragraph_rows in tqdm(paragraph_groups, total=paragraph_groups.ngroups, ncols=100):
    df_para = (paragraph_rows
               .groupby(['Topic', 'year'], as_index=False)['Probability']
               .max()
               .sort_values(by='Topic', ignore_index=True))
    if len(df_para.index) < 2:
        continue
    # Plain tuples avoid a .loc row lookup for every field of every pair.
    topic_rows = list(df_para[['Topic', 'year', 'Probability']].itertuples(index=False, name=None))
    for (topic_a, year_a, prob_a), (topic_b, _year_b, prob_b) in combinations(topic_rows, 2):
        topic_combo = (int(topic_a), int(topic_b))
        if topic_combo not in topic_combo_dict:
            topic_combo_dict[topic_combo] = start_dict.copy()
        pair_weight = prob_a * prob_b
        topic_combo_dict[topic_combo][year_a] += pair_weight
        topic_combo_dict[topic_combo]['total'] += pair_weight

100%|████████████████████████████████████████████████████████| 28047/28047 [01:21<00:00, 344.69it/s]


In [312]:
# One 'discussed_together' edge per topic pair; every accumulator bucket becomes a
# float property named 'weight_<bucket>'.
for topic_combo, year_dict in tqdm(topic_combo_dict.items(), ncols=100):
    first_topic, second_topic = topic_combo[0], topic_combo[1]
    add_edge(label='discussed_together',
             from_id=f"topic_{first_topic}",
             to_id=f"topic_{second_topic}",
             prop_dict={f"weight_{bucket}": float(weight)
                        for bucket, weight in year_dict.items()})

100%|███████████████████████████████████████████████████████████| 3456/3456 [00:44<00:00, 78.27it/s]
