In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import os
import requests 
import io
import seaborn as sns

In [None]:
# Load the raw members table; its `member_id` column and row count are used by later cells.
orig_members = pd.read_csv('kad_mrgcn_public/data/ironmarch/raw_files/orig_members.csv')

In [None]:
# Map every raw member_id to a dense 1-based position, assigned in ascending
# member_id order. A comprehension keeps the loop variable local, so the
# builtin `id` is no longer shadowed at notebook scope.
sorted_member_id = orig_members['member_id'].sort_values()
member_id_map = {
    member_id: position
    for position, member_id in enumerate(sorted_member_id, start=1)
}

# Posts

### Topic: Starter_Id Mapping

In [None]:
# `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0; `on_bad_lines`
# is its replacement (requires pandas >= 1.3). 'warn' still skips malformed rows
# but reports each one, so dropped topics are visible rather than silent.
orig_topics = pd.read_csv(
    "kad_mrgcn_public/data/ironmarch/raw_files/orig_topics.csv",
    on_bad_lines='warn',
)

In [None]:
# Inspect the topics schema: column names, non-null counts and dtypes.
orig_topics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5073 entries, 0 to 5072
Data columns (total 37 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Unnamed: 0            5073 non-null   int64  
 1   tid                   5073 non-null   int64  
 2   title                 5073 non-null   object 
 3   description           0 non-null      float64
 4   state                 5073 non-null   object 
 5   posts                 5073 non-null   int64  
 6   starter_id            5073 non-null   int64  
 7   start_date            5073 non-null   int64  
 8   last_poster_id        5073 non-null   int64  
 9   last_post             5073 non-null   int64  
 10  icon_id               46 non-null     float64
 11  starter_name          5073 non-null   object 
 12  last_poster_name      5073 non-null   object 
 13  poll_state            5073 non-null   int64  
 14  last_vote             5073 non-null   int64  
 15  views                

In [None]:
# Lookup table: topic id (`tid`) -> member id of the user who started the topic.
# If a tid occurs more than once, the last row wins.
topic_starter_map = dict(zip(orig_topics['tid'], orig_topics['starter_id']))

### Topic Starter Database

In [None]:
# Two-column table (topic_id, starter_id) taken from the topics frame.
topic_user_db = orig_topics[['tid', 'starter_id']].rename(columns={'tid': 'topic_id'})

In [None]:
# Display the topic -> starter table.
topic_user_db

Unnamed: 0,topic_id,starter_id
0,5182,9491
1,3,1
2,3674,7346
3,7,1
4,8,1
...,...,...
5068,6220,9939
5069,6219,9304
5070,6222,9353
5071,6223,9916


### Associating a Starter_Id with every Post

In [None]:
# `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0; `on_bad_lines`
# is its replacement (requires pandas >= 1.3). 'warn' still skips malformed rows
# but reports each one, so dropped posts are visible rather than silent.
orig_posts = pd.read_csv(
    "kad_mrgcn_public/data/ironmarch/raw_files/orig_posts.csv",
    on_bad_lines='warn',
)

In [None]:
# Inspect the raw posts schema: column names, non-null counts and dtypes.
orig_posts.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 139068 entries, 0 to 139067
Data columns (total 25 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Unnamed: 0        139068 non-null  int64  
 1   pid               139068 non-null  int64  
 2   append_edit       139068 non-null  int64  
 3   edit_time         32550 non-null   float64
 4   author_id         139068 non-null  int64  
 5   author_name       139068 non-null  object 
 6   use_sig           139068 non-null  int64  
 7   use_emo           139068 non-null  int64  
 8   ip_address        139068 non-null  object 
 9   post_date         139068 non-null  int64  
 10  icon_id           0 non-null       float64
 11  post              139068 non-null  object 
 12  queued            139068 non-null  int64  
 13  topic_id          139068 non-null  int64  
 14  post_title        0 non-null       float64
 15  new_topic         139068 non-null  int64  
 16  edit_name         32

In [None]:
# Keep only the columns that link a post to its author and its topic.
# Selecting by name (instead of dropping 22 columns by position, in place) does
# not depend on the CSV's column order, and the cell can be re-run on the
# already-trimmed frame without raising.
orig_posts = orig_posts[['pid', 'author_id', 'topic_id']].copy()

In [None]:
# Confirm which columns remain after trimming.
orig_posts.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 139068 entries, 0 to 139067
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype
---  ------     --------------   -----
 0   pid        139068 non-null  int64
 1   author_id  139068 non-null  int64
 2   topic_id   139068 non-null  int64
dtypes: int64(3)
memory usage: 3.2 MB


In [None]:
# Left join on topic_id: every post keeps its row, and posts whose topic_id has
# no match in topic_user_db get NaN in starter_id.
topic_author_starter = orig_posts.merge(topic_user_db, how='left', on='topic_id')

In [None]:
# Drop rows containing NaN (posts whose topic has no known starter), then
# restore the integer dtype: NaNs introduced by a left merge upcast starter_id
# to float64, which would otherwise store ids as e.g. 9491.0.
topic_author_starter = (
    topic_author_starter
    .dropna()
    .astype({'starter_id': 'int64'})
)

In [None]:
# Check the merged frame after dropping NaN rows: row count, columns and dtypes.
topic_author_starter.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 139050 entries, 0 to 139067
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   pid         139050 non-null  int64  
 1   author_id   139050 non-null  int64  
 2   topic_id    139050 non-null  int64  
 3   starter_id  139050 non-null  float64
dtypes: float64(1), int64(3)
memory usage: 5.3 MB


In [None]:
# Persist the post / author / topic / starter table as CSV. `index=False` is not
# passed, so the DataFrame index is written as an extra leading column.
topic_author_starter.to_csv('kad_mrgcn_public/data/ironmarch/processed/orig_topic_author_starter.csv')

### Topic to Every User Except Starter
We build a mapping from each topic ID to the IDs of the users who posted in it. A DataFrame is not a good fit here because we do not know in advance how many users a particular topic will have.

In [None]:
# Collect, per topic, the authors of its posts other than the topic starter.
# One entry is appended per post, so an author who posted several times in a
# topic appears several times in that topic's list.
topic_user_map = {}
for _, post in topic_author_starter.iterrows():
    topic, author = post['topic_id'], post['author_id']
    if topic_starter_map[topic] == author:
        continue  # the starter's own posts are not recorded
    topic_user_map.setdefault(topic, []).append(author)

### AdjMatrix

In [None]:
# Square all-zero matrix with one row and one column per row of orig_members.
adjMatrix = np.zeros((len(orig_members), len(orig_members)))

### Starter to Authors

In [None]:
# Count one author -> starter edge per post.
for _, post in topic_author_starter.iterrows():
    author, starter = post['author_id'], post['starter_id']

    # Skip self edges (author is the starter) and ids missing from orig_members.
    if author == starter or author not in member_id_map or starter not in member_id_map:
        continue

    # member_id_map positions are 1-based; the matrix is 0-based.
    adjMatrix[member_id_map[author] - 1, member_id_map[starter] - 1] += 1

### Sparsity of AdjMatrix from Indirect Connections

In [None]:
# Fraction of matrix entries that are exactly zero.
countZero = np.count_nonzero(adjMatrix == 0)
countTotal = adjMatrix.size
countZero / countTotal

0.9568939603448483

In [None]:
# Persist the post-based adjacency matrix; np.save appends ".npy" to the given path.
np.save("kad_mrgcn_public/data/ironmarch/adjMatrix/adjMatrix_creator_post", adjMatrix)

## Message Posts

Similar analysis for message posts.

In [None]:
# Start again from an all-zero square matrix (one row/column per row of
# orig_members) for the message-post edges.
adjMatrix = np.zeros((len(orig_members), len(orig_members)))

In [None]:
# `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0; `on_bad_lines`
# is its replacement (requires pandas >= 1.3). 'warn' still skips malformed rows
# but reports each one, so dropped messages are visible rather than silent.
orig_message_posts = pd.read_csv(
    "kad_mrgcn_public/data/ironmarch/raw_files/orig_message_posts.csv",
    on_bad_lines='warn',
)

In [None]:
# Inspect the raw message-posts schema: column names, non-null counts and dtypes.
orig_message_posts.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13341 entries, 0 to 13340
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Unnamed: 0         13341 non-null  int64 
 1   msg_id             13341 non-null  int64 
 2   msg_topic_id       13341 non-null  int64 
 3   msg_date           13341 non-null  int64 
 4   msg_post           13341 non-null  object
 5   msg_post_key       13341 non-null  object
 6   msg_author_id      13341 non-null  int64 
 7   msg_ip_address     13341 non-null  object
 8   msg_is_first_post  13341 non-null  int64 
dtypes: int64(6), object(3)
memory usage: 938.2+ KB


In [None]:
# Keep only the message id, its topic and its author. Selecting by name (instead
# of dropping columns by position, in place) does not depend on the CSV's
# column order and makes the kept columns explicit.
orig_message_posts = orig_message_posts[['msg_id', 'msg_topic_id', 'msg_author_id']].copy()

In [None]:
# Rename the key column to `topic_id`, the name used as the merge key later on.
orig_message_posts = orig_message_posts.rename({'msg_topic_id': 'topic_id'}, axis='columns')

In [None]:
# Confirm the remaining columns and the renamed `topic_id` key.
orig_message_posts.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13341 entries, 0 to 13340
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   msg_id         13341 non-null  int64
 1   topic_id       13341 non-null  int64
 2   msg_author_id  13341 non-null  int64
dtypes: int64(3)
memory usage: 312.8 KB


In [None]:
# `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0; `on_bad_lines`
# is its replacement (requires pandas >= 1.3). 'warn' still skips malformed rows
# but reports each one.
orig_message_topics = pd.read_csv(
    "kad_mrgcn_public/data/ironmarch/raw_files/orig_message_topics.csv",
    on_bad_lines='warn',
)

# Lookup table: message-topic id (`mt_id`) -> member id of its starter
# (`mt_starter_id`). If an mt_id occurs more than once, the last row wins.
mp_topic_starter_map = dict(
    zip(orig_message_topics['mt_id'], orig_message_topics['mt_starter_id'])
)

In [None]:
# Two-column table (topic_id, starter_id) taken from the message-topics frame.
mp_topic_user_db = orig_message_topics[['mt_id', 'mt_starter_id']].rename(
    columns={'mt_id': 'topic_id', 'mt_starter_id': 'starter_id'}
)

In [None]:
# Left join on topic_id: every message keeps its row and gains the starter_id of
# its topic (NaN when the topic is not in mp_topic_user_db).
mp_topic_author_starter = orig_message_posts.merge(mp_topic_user_db, how='left', on='topic_id')

In [None]:
# Count missing values per column after the merge.
mp_topic_author_starter.isnull().sum()

msg_id           0
topic_id         0
msg_author_id    0
starter_id       0
dtype: int64

In [None]:
# Persist the message / topic / author / starter table as CSV. `index=False` is
# not passed, so the DataFrame index is written as an extra leading column.
mp_topic_author_starter.to_csv('kad_mrgcn_public/data/ironmarch/processed/orig_mp_topic_author_starter.csv')

In [None]:
# Collect, per message topic, the authors of its messages other than the topic
# starter. One entry is appended per message, so repeat authors appear
# repeatedly in a topic's list.
mp_topic_user_map = {}
for _, message in mp_topic_author_starter.iterrows():
    topic, author = message['topic_id'], message['msg_author_id']
    if mp_topic_starter_map[topic] == author:
        continue  # the starter's own messages are not recorded
    mp_topic_user_map.setdefault(topic, []).append(author)

In [None]:
# Count one author -> starter edge per message.
for _, message in mp_topic_author_starter.iterrows():
    author, starter = message['msg_author_id'], message['starter_id']

    # Skip self edges (author is the starter) and ids missing from orig_members.
    if author == starter or author not in member_id_map or starter not in member_id_map:
        continue

    # member_id_map positions are 1-based; the matrix is 0-based.
    adjMatrix[member_id_map[author] - 1, member_id_map[starter] - 1] += 1

### Sparsity of AdjMatrix

In [None]:
# Fraction of matrix entries that are exactly zero.
countZero = np.count_nonzero(adjMatrix == 0)
countTotal = adjMatrix.size
countZero / countTotal

0.9973856388780578

In [None]:
# Persist the message-based adjacency matrix; np.save appends ".npy" to the given path.
np.save("kad_mrgcn_public/data/ironmarch/adjMatrix/adjMatrix_creator_msg_post", adjMatrix)